<a href="https://colab.research.google.com/github/mattfredericksen/CSCE-4205-ML-Project/blob/main/mnb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [14]:
import os
import json
import re
from time import time
from contextlib import suppress
import gc

import gzip
import pickle
import gdown
from urllib.request import urlopen

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import f1_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from scipy import stats

import spacy
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

In [2]:
# Progress Bar
# https://colab.research.google.com/drive/1I2o3Ie34vJ3G4M6eE54-OyrmzJNBwhOp#scrollTo=EbF9oPhzOqZj

from IPython.display import HTML, display

def progress(value, max=100):
    return HTML(f"""
        <progress
            value='{value}'
            max='{max}',
            style='width: 50%'
        >
            {value}
        </progress>
    """)

# Feature Engineering

**Dataset links**
- [Books (\~30 million, too large!)](http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Books_5.json.gz)  
- [Clothing, Shoes, and Jewelry (\~11 million)](http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Clothing_Shoes_and_Jewelry_5.json.gz)    
- [Electronics (\~7 million)](http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Electronics_5.json.gz  )  
- [Home and Kitchen (\~7 million)](http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Home_and_Kitchen_5.json.gz)  
- [Movies and TV (\~3.5 million)](http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Movies_and_TV_5.json.gz)  

**List of features**  
`reviewerID` - ID of the reviewer, e.g. A2SUAM1J3GNN3B  
`asin` - ID of the product, e.g. 0000013714  
`reviewerName` - name of the reviewer  
`vote` - helpful votes of the review  
`style` - a disctionary of the product metadata, e.g., "Format" is "Hardcover"  
`reviewText` - text of the review  
`overall` - rating of the product  
`summary` - summary of the review  
`unixReviewTime` - time of the review (unix time)  
`reviewTime` - time of the review (raw)  
`image` - images that users post after they have received the product  

### Download All Data
This section shows how we originally retrieved the data. We recommend using the later cells to load our pre-processed data.

In [None]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Movies_and_TV_5.json.gz

In [None]:
def drop_features(d):
  kept_features = ("overall", "reviewText")
  return {f: d[f] for f in kept_features}

data = []

with gzip.open("Movies_and_TV_5.json.gz") as file:
  for line in file:
    with suppress(KeyError):
      data.append(drop_features(json.loads(line.strip())))

print(f'{len(data)} reviews loaded.')
data = pd.DataFrame.from_dict(data)
data

#### Download Partial Data
We used this smaller dataset when initially figuring out how to use Lemmatizing and Vectorizing.

In [None]:
# Faster, partial data download

# !wget https://drive.google.com/uc?export=download&id=1_sieKFN89ry-owWOUWzWwmgbjeJb1xYu
!wget -O cut_data.json https://www.dropbox.com/s/ostscnaq8eukdwy/cutData.json?dl=1

In [None]:
with open("cut_data.json") as file:
    data = pd.read_json(file)

print(f'{len(data)} reviews loaded.')

## Text Preprocessing

### From Raw Documents
This section shows how we originally processed the data. We recommend using the later cells to load our pre-processed data.

https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

https://scikit-learn.org/stable/modules/feature_extraction.html#customizing-the-vectorizer-classes

In [None]:
class LemmaTokenizer:
    def __init__(self, data):
        # used to trim sequences of the 
        # same character to no more than 2
        self.shorten_re = re.compile(r'(.)\1{2,}')
        # remove non-alphabetical characters
        self.words_re = re.compile(r'[^a-z]')

        self.call_count = 0
        self.pb_len = len(data)
        self.pb = display(progress(0, self.pb_len), display_id=True)

    def __call__(self, doc):
        self.call_count += 1
        if self.call_count % 1024 == 0:
            self.pb.update(progress(self.call_count % self.pb_len, self.pb_len))
        # apply conver to lower case and apply our regex patterns
        doc = self.shorten_re.sub(r'\1\1', self.words_re.sub(' ', doc.lower()))
        # remove stop words and apply lemmatization
        return tuple(t.lemma_ for t in nlp(doc) if not t.is_stop)

In [None]:
tokenizer = LemmaTokenizer(data)
start = time()
data['reviewText'] = data['reviewText'].apply(tokenizer)
print(f'execution time: {((time() - start) / 60):.1f} minutes')

### From Pre-Lemmatized Documents
Use this section to quickly load the results of previous sections.

In [3]:
!wget -O processed.pkl https://www.dropbox.com/s/t3nbo9hyteir0s6/all_lemmas.pkl?dl=1

--2020-12-06 20:33:53--  https://www.dropbox.com/s/t3nbo9hyteir0s6/all_lemmas.pkl?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.1, 2620:100:601d:1::a27d:501
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/dl/t3nbo9hyteir0s6/all_lemmas.pkl [following]
--2020-12-06 20:33:53--  https://www.dropbox.com/s/dl/t3nbo9hyteir0s6/all_lemmas.pkl
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc36b776da27187c178aac58e465.dl.dropboxusercontent.com/cd/0/get/BEkufqvEblm7bvNXdE14Xea1hmyknfhIQATSfTL7ybmcQkslW0QLrjq2jMCSGlR0MGBObpUoRs471BKiUiE6BaTtG3SEzzmWaMZz5-uwn1iNwnHAAjTP15JAojX6cySXgVg/file?dl=1# [following]
--2020-12-06 20:33:54--  https://uc36b776da27187c178aac58e465.dl.dropboxusercontent.com/cd/0/get/BEkufqvEblm7bvNXdE14Xea1hmyknfhIQATSfTL7ybmcQkslW0QLrjq2jMCSGlR0MGBObpUoRs471BKiUiE6BaTtG3SEzzmWaMZz5-uwn1

We convert the star ratings to a binary classification here because the low classes (1-3) occur with significantly less frequency than the high classes (4-5).

In [4]:
data = pd.read_pickle("processed.pkl")

data['overall'] = data['overall'].apply(lambda x: 1 if x > 3 else 0)

print(f'{len(data)} lemmatized reviews loaded.')
data

3408438 lemmatized reviews loaded.


Unnamed: 0,overall,reviewText
0,1,sorry didn t purchase year ago come good ent...
1,1,believe tell receive blessing watch video cath...
2,1,see x live time early day recent reunion sho...
3,1,excited finally live concert video x ve ...
4,1,x good punk band don t like call punk band ...
...,...,...
3408433,1,singing part good expect sonya yoncheva zeljik...
3408434,1,recording production metropolitan opera verd...
3408435,1,wish write review release like point voice m...
3408436,1,gift


# Training/Testing Split
In the next cell, we randomly split the data into training and testing sets. We will use training set with 5x2 cross validation to tune hyperparameters.

In [5]:
targets = data['overall']
features = data['reviewText']
train_features, test_features, train_targets, test_targets = (
    train_test_split(features, targets, test_size=0.2, stratify=targets, random_state=123456)
)
# train_features, tune_features, train_targets, tune_targets = (
#     train_test_split(train_features, train_targets, test_size=0.2, stratify=train_targets, random_state=654321)
# )

# stratification maintains an even class distribution
print([
       train_targets.sum() / len(train_targets),
       test_targets.sum() / len(test_targets)
])

[0.7901795177409003, 0.7901796716386382]


# Create Count and TF-IDF Vectors

### Vectorizers from lemmatized samples
We recommend using the pickled vectorizers in the next section.


In [None]:
vectorizer = TfidfVectorizer(min_df=1e-6, ngram_range=(1, 2))

start = time()
train_matrix = vectorizer.fit_transform(train_features)

print(f'execution time: {((time() - start) / 60):.1f} minutes')
print(f'{len(vectorizer.get_feature_names())} features (unique words)')
train_matrix

In [None]:
# with open("vectorizer.pkl", 'wb') as file:
#     pickle.dump(vectorizer, file)

In [None]:
print(vectorizer.get_feature_names()[:50])
print(vectorizer.get_feature_names()[5000:5050])
print(vectorizer.get_feature_names()[-50:])

### From pickled vectorizers
This function can be called to download and unpickle vectorizers that were previously created from our training data. This process took about an hour, so using these saves a lot of time.

The names represent the class/parameters used to create the vectorizers:
- "min_df{n}" means the parameter "min_df" was set to "1e-{n}".
- "ngrams" means the parameter "ngram_range" was set to "(1, 2)", meaning that uni-grams and bi-grams were processed. 

In [6]:
def load_vectorizer(name):
    url_template = 'https://drive.google.com/uc?id={}'
    vectorizer_names = {
        'CountVectorizer-min_df5-ngrams': '107O5GW0aIqla6pYJrE1YfGiaANRNypXg',
        'CountVectorizer-min_df5': '1K9Sd-nme_N3iHzKYRPv46UsYSMJf3-fx',
        'CountVectorizer-min_df6-ngrams': '101TIMpeg3O-31q1zvePZYhUf6pTwEPzM',
        'CountVectorizer-min_df6': '10-pwX_w74rguvLGqRMAjHU-KA1XAwGvh',
        'CountVectorizer-min_df7': '10HPukhM8ZF91BeU5ZgtK2QIx9oK3gU6h',
        'TfidfVectorizer-min_df5-ngrams': '1-jkMtmwz6HXms3Y26bUWv2mt3OqHJqLc',
        'TfidfVectorizer-min_df5': '1-sWbjPJup-v31smQZP9IczDtIaH0prS2',
        'TfidfVectorizer-min_df6-ngrams': '10X7sWhMVPDhfdKrvqQerRq03sknBscLA',
        'TfidfVectorizer-min_df6': '103qmnyHPlRfJ2E6ZMevRCFFhTSP-sfee',
        'TfidfVectorizer-min_df7': '1-uDkBESf7ySa2ixNtA4cDtocjeMUlRvO',
    }
    if name not in vectorizer_names:
        raise ValueError()
    if not os.path.exists(f'{name}.pkl'):
        gdown.download(url_template.format(vectorizer_names[name]), f'{name}.pkl', False)
    with open(f'{name}.pkl', 'rb') as file:
        return pickle.load(file)

## Create the vectors

In [None]:
vectorizer = load_vectorizer('CountVectorizer-min_df6')
train_vectors = vectorizer.transform(train_features)

# Create the Classifiers

Our goal is to perform a valid statistical test comparing models from two algorithms:
- Multinomial Naive Bayes with uni-gram CountVectorizer
- Multinomial Naive Bayes with uni-gram + bi-gram TfidfVectorizer.

Before performing these tests, will will perform 5x2 cross validation using our training data to tune a few hyperparameters:
1. The "min_df" Vectorizer parameter, which sets a minimum threshold for how frequently a term must occur to be retained by the vectorizer.
2. The "alpha" MultinomialNB parameter, which determines how much smoothing is applied (smoothing is used essentially so that no errors occur when previously unseen tokens are encountered).

In [19]:
rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=654321)
scoring = ['f1', 'accuracy']

best_f1 = 0
best_count_vec_name = ''
best_count_alpha = -1

for min_df in (5, 6, 7):
    vec_name = f'CountVectorizer-min_df{min_df}'
    print(f'loading vectorizer: {vec_name}')
    vectorizer = load_vectorizer(vec_name)
    print('creating vectors...')
    train_vectors = vectorizer.transform(train_features)
    for alpha in (1, 1e-3, 1e-10):
        print(f'\talpha: {alpha}')
        clf = MultinomialNB(alpha=alpha)
        scores = cross_validate(clf, train_vectors, train_targets, scoring=scoring, cv=rkf)
        for metric in scoring:
            print(f'\t\t{metric}:', scores[f'test_{metric}'].mean())
        
        if scores['test_f1'].mean() > best_f1:
            best_f1 = scores['test_f1'].mean()
            best_count_vec_name = vec_name
            best_count_alpha = alpha

print('best count vectorizer:', best_count_vec_name)
print('best MNBayes alpha:', best_count_alpha)

loading vectorizer: CountVectorizer-min_df5
creating vectors...
	alpha: 1
		f1: 0.9080594183567499
		accuracy: 0.8519818098468873
	alpha: 0.001
		f1: 0.9080503971490369
		accuracy: 0.8519722013385899
	alpha: 1e-10
		f1: 0.9080366751714909
		accuracy: 0.8519319336206106
loading vectorizer: CountVectorizer-min_df6
creating vectors...
	alpha: 1
		f1: 0.9082408459375919
		accuracy: 0.8521923902081232
	alpha: 0.001
		f1: 0.9073507275853275
		accuracy: 0.8506047125699092
	alpha: 1e-10
		f1: 0.9062130702029669
		accuracy: 0.8483555881543963
loading vectorizer: CountVectorizer-min_df7
creating vectors...
	alpha: 1
		f1: 0.9100363120638251
		accuracy: 0.8541965343357477
	alpha: 0.001
		f1: 0.9063157058244601
		accuracy: 0.8492630420830659
	alpha: 1e-10
		f1: 0.9050539750999536
		accuracy: 0.8467457596039243
best count vectorizer: CountVectorizer-min_df7
best MNBayes alpha: 1e-10


In [20]:
rkf = RepeatedKFold(n_splits=2, n_repeats=5, random_state=654321)
scoring = ['f1', 'accuracy']

best_f1 = 0
best_tfidf_vec_name = ''
best_tfidf_alpha = -1

for min_df in (5, 6):
    vec_name = f'TfidfVectorizer-min_df{min_df}-ngrams'
    print(f'loading vectorizer: {vec_name}')
    vectorizer = load_vectorizer(vec_name)
    print('creating vectors...')
    train_vectors = vectorizer.transform(train_features)
    for alpha in (1, 1e-3, 1e-10):
        print(f'\talpha: {alpha}')
        clf = MultinomialNB(alpha=alpha)
        scores = cross_validate(clf, train_vectors, train_targets, scoring=scoring, cv=rkf)
        for metric in scoring:
            print(f'\t\t{metric}:', scores[f'test_{metric}'].mean())
        
        if scores['test_f1'].mean() > best_f1:
            best_f1 = scores['test_f1'].mean()
            best_tfidf_vec_name = vec_name
            best_tfidf_alpha = alpha

print('best TF-IDF vectorizer:', best_tfidf_vec_name)
print('best MNBayes alpha:', best_tfidf_alpha)

loading vectorizer: TfidfVectorizer-min_df5-ngrams
creating vectors...
	alpha: 1
		f1: 0.9213327470672825
		accuracy: 0.8692978087466765
	alpha: 0.001
		f1: 0.9231617509175358
		accuracy: 0.8739723847070688
	alpha: 1e-10
		f1: 0.9228786917782139
		accuracy: 0.8732486659943154
loading vectorizer: TfidfVectorizer-min_df6-ngrams
creating vectors...
	alpha: 1
		f1: 0.8934064084306508
		accuracy: 0.8116769047400751
	alpha: 0.001
		f1: 0.9184109650737675
		accuracy: 0.8650039790959934
	alpha: 1e-10
		f1: 0.9054866801979988
		accuracy: 0.8413683689373797
best TF-IDF vectorizer: TfidfVectorizer-min_df5-ngrams
best MNBayes alpha: 0.001


# Statistical Tests

In [22]:
rkf = RepeatedKFold(n_splits=10 ,n_repeats=1, random_state=123321)

clf = MultinomialNB(alpha=best_count_alpha)
test_vectors = load_vectorizer(best_count_vec_name).transform(test_features)
count_scores = cross_validate(clf, test_vectors, test_targets, scoring='f1', cv=rkf)

clf = MultinomialNB(alpha=best_tfidf_alpha)
test_vectors = load_vectorizer(best_tfidf_vec_name).transform(test_features)
tfidf_scores = cross_validate(clf, test_vectors, test_targets, scoring='f1', cv=rkf)

In [24]:
stats.ttest_ind(count_scores['test_score'], tfidf_scores['test_score'])

Ttest_indResult(statistic=-26.198402658751434, pvalue=8.722671215651108e-16)