In [1]:
import os
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score

import json

### Load In Data

#### Data Class

In [2]:
class Review:
    def __init__(self, category, text):
        self.category = category
        self.text = text    
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
    
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_y(self):
        return [x.category for x in self.reviews]

#### Prep Training/Test Data

In [3]:
train_reviews = []
all_categories = []
for file in os.listdir('./data/training'):
    category = file.strip('train_').split('.')[0]
    all_categories.append(category)
    with open(f'./data/training/{file}') as f:
        for line in f:
            review_json = json.loads(line)
            review = Review(category, review_json['reviewText'])
            train_reviews.append(review)

train_container = ReviewContainer(train_reviews)

In [4]:
test_reviews = []
for file in os.listdir('./data/test'):
    category = file.strip('test_').split('.')[0]
    with open(f'./data/test/{file}') as f:
        for line in f:
            review_json = json.loads(line)
            review = Review(category, review_json['reviewText'])
            test_reviews.append(review)
            
test_container = ReviewContainer(test_reviews)

#### Train Model (Bag of words)

In [5]:
from sklearn import svm

corpus = train_container.get_text()
vectorizer = CountVectorizer(binary=True)
train_x = vectorizer.fit_transform(corpus) # training text converted to vectors

clf = svm.SVC(kernel='linear')
clf.fit(train_x, train_container.get_y())

SVC(kernel='linear')

#### Evaluate Performance (Bag of words)

In [6]:
# make sure to convert test text to vector form
test_corpus = test_container.get_text()
test_x = vectorizer.transform(test_corpus)

In [7]:
print("Overall Accuracy:", clf.score(test_x, test_container.get_y()))

y_pred = clf.predict(test_x)

print("f1 scores by category")
print(all_categories)
print(f1_score(test_container.get_y(), y_pred, average=None, labels=all_categories))

Overall Accuracy: 0.6522222222222223
f1 scores by category
['Electronics', 'Automotive', 'Digital_Music', 'Patio_Lawn_Garden', 'Grocery', 'Beauty', 'Pet_Supplies', 'Clothing', 'Books']
[0.5484222  0.46201074 0.71557971 0.46501129 0.70614035 0.79538905
 0.66816143 0.71020408 0.82866044]


#### Bigram approach to Bag of Words

The regular approach to bag of words model is unigram, meaning that the model assesses individual words, however, When trying to infer sentiment from a sentence, context matters, "great" is quite different from "not great", so now let's try to use combination of words.

In [8]:
corpus = train_container.get_text()
vectorizer = CountVectorizer(binary=True, ngram_range=(1,2))
train_x = vectorizer.fit_transform(corpus) # training text converted to vector

clf = svm.SVC(kernel='linear')
clf.fit(train_x, train_container.get_y())

test_corpus = test_container.get_text()
test_x = vectorizer.transform(test_corpus)

In [9]:
# check if performance has improved

print("Overall Accuracy:", clf.score(test_x, test_container.get_y()))

y_pred = clf.predict(test_x)

print("f1 scores by category")
print(all_categories)
print(f1_score(test_container.get_y(), y_pred, average=None, labels=all_categories))

Overall Accuracy: 0.6097777777777778
f1 scores by category
['Electronics', 'Automotive', 'Digital_Music', 'Patio_Lawn_Garden', 'Grocery', 'Beauty', 'Pet_Supplies', 'Clothing', 'Books']
[0.46918919 0.41161401 0.69604317 0.37086093 0.6416309  0.80545455
 0.61470911 0.66188525 0.7983454 ]


### Word-Vectors Approach

In [72]:
import spacy

In [11]:
#!python -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md') # English pipeline optimized for cpu

In [12]:
train_x_raw = train_container.get_text()

docs_train = [nlp(text) for text in train_x_raw] # both text and vectors

train_x_word_vectors = [x.vector for x in docs_train] #only vectors

clf = svm.SVC(kernel='linear')
clf.fit(train_x_word_vectors, train_container.get_y())

test_x_raw = test_container.get_text()
docs_test = [nlp(text) for text in test_x_raw] # both text and vectors
test_x_word_vectors = [x.vector for x in docs_test] #only vectors

In [13]:
# Evaluating performance
print("Overall Accuracy:", clf.score(test_x_word_vectors, test_container.get_y()))

y_pred = clf.predict(test_x_word_vectors)

print("f1 scores by category")
print(all_categories)
print(f1_score(test_container.get_y(), y_pred, average=None, labels=all_categories))

Overall Accuracy: 0.7131111111111111
f1 scores by category
['Electronics', 'Automotive', 'Digital_Music', 'Patio_Lawn_Garden', 'Grocery', 'Beauty', 'Pet_Supplies', 'Clothing', 'Books']
[0.65784114 0.54347826 0.76080692 0.55747711 0.78958785 0.74114441
 0.75339367 0.77603143 0.86217617]


#### Word vectors are quite powerful tool, as it tries to infer relationship between words, thus performing better with previously unencountered combination of words: this approach encodes the meaning of the word such that the words that are closer in the vector space are expected to be similar in meaning. The result is obvious, accuracy of our svc model went up

## Stemming & Lemmatization

In [14]:
import nltk

#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('punkt')

In [15]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [16]:
stemmer = PorterStemmer()

# test it

phrase = "doing all the projects."
tokenized_words = word_tokenize(phrase)

stemmed_words = [stemmer.stem(x) for x in tokenized_words]
print(stemmed_words)
' '.join(stemmed_words)

['do', 'all', 'the', 'project', '.']


'do all the project .'

#### The drawback of this approach is that it uses an algorithm to stem the words, not a real-world dictionary-based approach which makes it vulnerable to lexical ambiguities and punctual mistakes, for instance counting commas and periods as stand-alone words.

In [17]:
# since stemming does not ignore special characters, we have to clean the review data first
# experimenting with regexes

ph = "just trying, to/ ^ remove*) special. characters & what? did it!! work?% >/$ "

import re
s = re.sub(r"[^a-zA-Z0-9]"," ",ph)
print(s)

tokenized_words = word_tokenize(s)
stemmed_words = [stemmer.stem(x) for x in tokenized_words]
print(stemmed_words)
' '.join(stemmed_words)

# yup, this looks just right

just trying  to    remove   special  characters   what  did it   work       
['just', 'tri', 'to', 'remov', 'special', 'charact', 'what', 'did', 'it', 'work']


'just tri to remov special charact what did it work'

In [18]:
# recreating word vector model with stemming

# step 1 - cleaning special characters
# step 2 - stemming train and test x data
# step 3 - re-implementing word-vector SVC model

train_x_raw = train_container.get_text()
test_x_raw = test_container.get_text()

train_x_raw2 = [re.sub(r"[^a-zA-Z0-9]"," ", x) for x in train_x_raw]
test_x_raw2 = [re.sub(r"[^a-zA-Z0-9]"," ", x) for x in test_x_raw]

print(test_x_raw2[0])
train_x_raw2[0]

I love this song 


'so far  so good '

In [19]:
tokenized_train = [word_tokenize(x) for x in train_x_raw2]
tokenized_test = [word_tokenize(x) for x in test_x_raw2]

#train_x_stemmed = [stemmer.stem(x) for x in tokenized_train]
#test_x_stemmed = [stemmer.stem(x) for x in tokenized_test]

train_x_stemmed = []
test_x_stemmed = []

for review in tokenized_train:
    train_x_stemmed.append(' '.join([stemmer.stem(x) for x in review]))
    
for review in tokenized_test:
    test_x_stemmed.append(' '.join([stemmer.stem(x) for x in review]))
    

In [20]:
docs_train = [nlp(text) for text in train_x_stemmed] # both text and vectors

train_x_word_vectors = [x.vector for x in docs_train] #only vectors

clf = svm.SVC(kernel='linear')
clf.fit(train_x_word_vectors, train_container.get_y())

docs_test = [nlp(text) for text in test_x_stemmed] # both text and vectors
test_x_word_vectors = [x.vector for x in docs_test] #only vectors

In [21]:
# Evaluating performance
print("Overall Accuracy:", clf.score(test_x_word_vectors, test_container.get_y()))

y_pred = clf.predict(test_x_word_vectors)

print("f1 scores by category")
print(all_categories)
print(f1_score(test_container.get_y(), y_pred, average=None, labels=all_categories))

Overall Accuracy: 0.6795555555555556
f1 scores by category
['Electronics', 'Automotive', 'Digital_Music', 'Patio_Lawn_Garden', 'Grocery', 'Beauty', 'Pet_Supplies', 'Clothing', 'Books']
[0.62741313 0.52173913 0.73904762 0.50362694 0.73005464 0.71103008
 0.70046083 0.74658869 0.85031185]


The performance does not seem to have improved. Let's now try lemmatizing instead

### Lemmatization

In [22]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [23]:
train_x_lemmatized = []
test_x_lemmatized = []

for review in tokenized_train:
    train_x_lemmatized.append(' '.join([lemmatizer.lemmatize(x, pos='v') for x in review]))
    
for review in tokenized_test:
    test_x_lemmatized.append(' '.join([lemmatizer.lemmatize(x, pos='v') for x in review]))

In [24]:
docs_train = [nlp(text) for text in train_x_lemmatized] # both text and vectors

train_x_word_vectors = [x.vector for x in docs_train] #only vectors

clf = svm.SVC(kernel='linear')
clf.fit(train_x_word_vectors, train_container.get_y())

docs_test = [nlp(text) for text in test_x_lemmatized] # both text and vectors
test_x_word_vectors = [x.vector for x in docs_test] #only vectors

In [25]:
# Evaluating performance
print("Overall Accuracy:", clf.score(test_x_word_vectors, test_container.get_y()))

y_pred = clf.predict(test_x_word_vectors)

print("f1 scores by category")
print(all_categories)
print(f1_score(test_container.get_y(), y_pred, average=None, labels=all_categories))

Overall Accuracy: 0.7128888888888889
f1 scores by category
['Electronics', 'Automotive', 'Digital_Music', 'Patio_Lawn_Garden', 'Grocery', 'Beauty', 'Pet_Supplies', 'Clothing', 'Books']
[0.66260163 0.54721977 0.77312561 0.56352459 0.77105832 0.74074074
 0.73496659 0.76424361 0.88517745]


#### Word Embedding + Regex filtering + lemmatization + stopword removal

In [44]:

train_x_raw = train_container.get_text()
test_x_raw = test_container.get_text()


In [45]:
train_x_raw2 = [re.sub(r"[^a-zA-Z0-9]"," ", x) for x in train_x_raw]
test_x_raw2 = [re.sub(r"[^a-zA-Z0-9]"," ", x) for x in test_x_raw]

tokenized_train = [word_tokenize(x) for x in train_x_raw2]
tokenized_test = [word_tokenize(x) for x in test_x_raw2]

tokenized_train[0]

['so', 'far', 'so', 'good']

In [46]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

stripped_train = []
stripped_word = []

for lst in tokenized_train:
    for word in lst:
        if word not in stop_words:
            stripped_word.append(word)
    stripped_train.append(' '.join(stripped_word))
    stripped_word = [] 
    

stripped_test = []
stripped_word = []

for lst in tokenized_test:
    for word in lst:
        if word not in stop_words:
            stripped_word.append(word)
    stripped_test.append(' '.join(stripped_word))
    stripped_word = []
    
stripped_test[0]

'I love song'

In [47]:
train_x_lemmatized = []
test_x_lemmatized = []

for review in stripped_train:
    train_x_lemmatized.append(''.join([lemmatizer.lemmatize(x, pos='v') for x in review]))
    
for review in stripped_test:
    test_x_lemmatized.append(''.join([lemmatizer.lemmatize(x, pos='v') for x in review]))
    
test_x_lemmatized[0]

'I love song'

In [48]:

docs_train = [nlp(text) for text in train_x_lemmatized]
train_x_word_vectors = [x.vector for x in docs_train] 

docs_test = [nlp(text) for text in test_x_lemmatized]
test_x_word_vectors = [x.vector for x in docs_test]

clf = svm.SVC(kernel='linear')
clf.fit(train_x_word_vectors, train_container.get_y())


SVC(kernel='linear')

In [51]:
# Evaluating performance
print("Overall Accuracy:", clf.score(test_x_word_vectors, test_container.get_y()))

y_pred = clf.predict(test_x_word_vectors)

print("f1 scores by category")
print(all_categories)
print(f1_score(test_container.get_y(), y_pred, average=None, labels=all_categories))

Overall Accuracy: 0.716
f1 scores by category
['Electronics', 'Automotive', 'Digital_Music', 'Patio_Lawn_Garden', 'Grocery', 'Beauty', 'Pet_Supplies', 'Clothing', 'Books']
[0.66188525 0.5443787  0.76453765 0.5738576  0.77705628 0.74861368
 0.75968992 0.77675841 0.86992716]


### Trying Random Forest instead of SVC

In [52]:
from sklearn.ensemble import RandomForestClassifier
cls = RandomForestClassifier()

In [54]:
cls.fit(train_x_word_vectors, train_container.get_y())

RandomForestClassifier()

In [55]:
print("Overall Accuracy:", cls.score(test_x_word_vectors, test_container.get_y()))

y_pred = cls.predict(test_x_word_vectors)

print("f1 scores by category")
print(all_categories)
print(f1_score(test_container.get_y(), y_pred, average=None, labels=all_categories))

Overall Accuracy: 0.6908888888888889
f1 scores by category
['Electronics', 'Automotive', 'Digital_Music', 'Patio_Lawn_Garden', 'Grocery', 'Beauty', 'Pet_Supplies', 'Clothing', 'Books']
[0.61132075 0.51171875 0.77145612 0.4420218  0.7742616  0.86407767
 0.66592428 0.73209028 0.85333333]


In [70]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'random_forest' : {
            'model': RandomForestClassifier(),
            'params': {
                'max_features': ['auto', 'sqrt']
            }
        },
        'svc': {
            'model': svm.SVC(),
            'params': {
                'kernel':['linear']
            }
        },
        'decision_tree': {
            'model': DecisionTreeClassifier(),
            'params': {
                'splitter': ['best','random']
            }
        },
        'logistic_regression': {
            'model': LogisticRegression(),
            'params': {
                'penalty':['none', 'l1', 'elasticnet']
            }
        
    }
    }

    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(test_x_word_vectors,test_container.get_y())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,model,best_score,best_params
0,random_forest,0.639259,{'max_features': 'sqrt'}
1,svc,0.672296,{'kernel': 'linear'}
2,decision_tree,0.424148,{'splitter': 'best'}
3,logistic_regression,0.664,{'penalty': 'none'}


seems like SVC is, indeed the best conventional choice for this task, however it is still not accurate enough. in the bert_model file I try to solve same classification problem, but this time with spacy bert transformer .