In [None]:
# Attach Google Drive to this Colab runtime so the shared project files are reachable.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import pickle

# Load Data

In [None]:
# Read the train and test splits from the shared project folder into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "drive/Shared drives/MATH156 Project/Data/train.csv",
        "drive/Shared drives/MATH156 Project/Data/test.csv",
    )
)

In [None]:
# Pull the 'review' column (inputs) and the 'sentiment' column (targets) of the
# training frame out as plain Python lists.
X_train, y_train = (train[column].tolist() for column in ('review', 'sentiment'))

In [None]:
# Pull the 'review' column (inputs) and the 'sentiment' column (targets) of the
# test frame out as plain Python lists.
X_test, y_test = (test[column].tolist() for column in ('review', 'sentiment'))

In [None]:
import nltk

In [None]:
# Download the NLTK data packages needed by this notebook (the recorded output
# shows them being unpacked under /root/nltk_data).
# NOTE(review): presumably these back the sentence tokenizer, the English
# stop-word list, the WordNet lemmatizer and the POS tagger used by the
# vectorizer defined later in this notebook — confirm against the NLTK docs.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Vectorization
* CountVectorizer
* Unigram + Bigram
* Remove non-alphanumeric characters (only letters and digits are kept)
* Remove stopwords
* Lemmatization

In [None]:
"""
vectorizer
"""
import re
import string

from nltk import PorterStemmer, WordNetLemmatizer, sent_tokenize, wordpunct_tokenize, pos_tag
from nltk.corpus import wordnet, stopwords
from sklearn.feature_extraction.text import CountVectorizer


class YelpSentCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer is a custom NLTK-based token pipeline.

    For every document the analyzer optionally replaces runs of
    non-alphanumeric characters with a space, splits the text into sentences
    and tokens, lower-cases each token, drops stop words and punctuation-only
    tokens, optionally lemmatizes (or, failing that, stems) what is left, and
    finally expands the surviving tokens into word n-grams with
    ``CountVectorizer._word_ngrams``.

    Parameters
    ----------
    ngram_range : tuple (min_n, max_n), default (1, 1)
        Forwarded to ``CountVectorizer``; controls the n-gram sizes produced.
    remove_nonwords : bool, default False
        If true, every run of characters outside ``[A-Za-z0-9]`` is replaced
        by a single space before tokenization.
    remove_stopwords : bool, default False
        If true, tokens found in NLTK's English stop-word list are dropped.
    stem : bool, default False
        If true, a ``PorterStemmer`` is created; it is only applied when
        ``lemmatize`` is false.
    lemmatize : bool, default False
        If true, a ``WordNetLemmatizer`` is created and applied to each token
        (takes precedence over stemming).
    min_df : default 1
        Forwarded to ``CountVectorizer``.
    binary : bool, default False
        Forwarded to ``CountVectorizer``.

    NOTE(review): the ``stem`` and ``lemmatize`` constructor flags share their
    names with the methods below, so ``get_params()`` reports the bound
    methods for them instead of the flags; ``sklearn.base.clone`` and
    ``set_params`` are therefore presumably not reliable for those two
    options — confirm before using this class in a grid search.
    """

    def __init__(self, ngram_range=(1, 1),
                 remove_nonwords=False, remove_stopwords=False,
                 stem=False, lemmatize=False, min_df=1, binary=False):
        # Hand the options CountVectorizer already understands to the parent
        # instead of patching the attributes afterwards.
        super().__init__(ngram_range=ngram_range, min_df=min_df, binary=binary)
        self.punct = set(string.punctuation)
        self.remove_nonwords = remove_nonwords
        # Store the flag under its constructor name: BaseEstimator.get_params()
        # reads every __init__ argument back with getattr(), and it is used by
        # repr() and, in recent scikit-learn releases, by the parameter
        # validation that runs during fit. Without this attribute those calls
        # raise AttributeError.
        self.remove_stopwords = remove_stopwords
        self.stop_words = set(stopwords.words('english')) if remove_stopwords else set()
        self.stemmer = PorterStemmer() if stem else None
        self.lemmatizer = WordNetLemmatizer() if lemmatize else None

    def lemmatize(self, token, tag):
        """Lemmatize ``token``, choosing the WordNet part of speech from ``tag``.

        Only the first letter of ``tag`` is inspected (N, V, R or J); any
        other tag falls back to treating the token as a noun.
        """
        wordnet_pos = {
            'N': wordnet.NOUN,
            'V': wordnet.VERB,
            'R': wordnet.ADV,
            'J': wordnet.ADJ
        }.get(tag[0], wordnet.NOUN)
        return self.lemmatizer.lemmatize(token, wordnet_pos)

    def stem(self, token):
        """Return the stem of ``token`` produced by the configured stemmer."""
        return self.stemmer.stem(token)

    def build_analyzer(self):
        """Return the callable that turns one raw document into its n-gram features."""
        # Compiled once per analyzer rather than looked up for every document.
        nonword_run = re.compile('[^A-Za-z0-9]+')

        def analyser(doc):
            # Keep only letters and digits. With this enabled, sentence-ending
            # punctuation is already gone, so sent_tokenize will typically
            # return the document as a single sentence.
            if self.remove_nonwords:
                doc = nonword_run.sub(' ', doc)
            cleaned_tokens = []
            # Break the document into sentences
            for sent in sent_tokenize(doc):
                words = wordpunct_tokenize(sent)
                # POS tags are only consumed by the lemmatizer, so skip the
                # (expensive) tagger entirely when no lemmatizer is configured.
                if self.lemmatizer:
                    tagged = pos_tag(words)
                else:
                    tagged = ((word, None) for word in words)
                for token, tag in tagged:
                    # Lower case and strip spaces
                    token = token.lower().strip()
                    # If stopword, ignore token and continue
                    if token in self.stop_words:
                        continue
                    # If the token is made of punctuation only (or is empty), skip it
                    if all(char in self.punct for char in token):
                        continue
                    # Lemmatize/stem the token; lemmatization wins if both are set
                    if self.lemmatizer:
                        token = self.lemmatize(token, tag)
                    elif self.stemmer:
                        token = self.stem(token)
                    cleaned_tokens.append(token)
            # use CountVectorizer's _word_ngrams built in method to extract n-grams
            return self._word_ngrams(cleaned_tokens)

        return analyser

In [None]:
# Vectorizer configuration: unigrams and bigrams, non-alphanumeric characters
# stripped, English stop words removed, lemmatization on and stemming off.
vectorizer_options = {
    'ngram_range': (1, 2),
    'remove_nonwords': True,
    'remove_stopwords': True,
    'stem': False,
    'lemmatize': True,
}
vect = YelpSentCountVectorizer(**vectorizer_options)

In [None]:
# Fit the vectorizer on the training reviews; %time reports how long the fit
# takes (the recorded run needed roughly 40 minutes of wall time).
# NOTE(review): the following cell runs the analyzer over X_train a second time
# via transform(); vect.fit_transform(X_train) would presumably yield the same
# matrix in a single pass — consider merging the two steps.
%time cv = vect.fit(X_train)

CPU times: user 39min 57s, sys: 18.6 s, total: 40min 16s
Wall time: 40min 21s


In [None]:
# Encode both splits as document-term matrices with the already-fitted vectorizer.
X_train_dtm, X_test_dtm = (vect.transform(documents) for documents in (X_train, X_test))

In [None]:
import pickle

# Persist both document-term matrices to the shared project folder so later
# notebooks can load them without repeating the vectorization.
dtm_pickles = [
    ('drive/Shared drives/MATH156 Project/Data/pickles/X_train_dtm.pickle', X_train_dtm),
    ('drive/Shared drives/MATH156 Project/Data/pickles/X_test_dtm.pickle', X_test_dtm),
]
for pickle_path, dtm in dtm_pickles:
    with open(pickle_path, 'wb') as f:
        pickle.dump(dtm, f)