In [27]:
from constants import *
from helpers import *
from training import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import scipy.sparse
import pickle

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Load the raw reviews CSV from the configured data folder and preview the first rows.
# `load_dataset` and `DATA_FOLDER` are not defined in this notebook; they presumably
# come from the star imports in the first cell (constants / helpers) -- TODO confirm.
reviews = load_dataset(DATA_FOLDER + 'Reviews.csv')
# Bare last expression so the notebook renders the preview table.
reviews.head()

Unnamed: 0_level_0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Logistic regression

In [12]:
# Model inputs: the free-text review body and the score column used as the target.
text, scores = reviews['Text'], reviews['Score']

#### Training with the full TF-IDF vocabulary

In [13]:
# Baseline features: unigram TF-IDF with English stop words removed and no
# document-frequency cut-off, so the whole vocabulary is kept.
vectorizer = TfidfVectorizer(
    stop_words='english',
)

# `train_regression` comes from the project's star imports; the two string arguments
# look like a feature-matrix file name and a model output path -- TODO confirm.
train_regression(
    vectorizer,
    text,
    scores,
    'fulltfidf.npz',
    '../models/full_linear_regression',
)


X_train shape: (511608, 119939)
Mean squared error: 0.97
Recall:
[0.28562969 0.28287256 0.43325635 0.54206999 0.64890463]
Precision:
[0.77421081 0.27108239 0.26463535 0.24157956 0.8848832 ]
0.7045606879427562
F1 score:
[0.41730353 0.276852   0.32857518 0.33421324 0.74874085]


#### Training with TF-IDF limited to words that appear in at least 10 documents

In [14]:
# Same unigram TF-IDF as the baseline, but terms found in fewer than 10 documents
# are dropped from the vocabulary (min_df=10).
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=10,
)

# `train_regression` comes from the project's star imports; the two string arguments
# look like a feature-matrix file name and a model output path -- TODO confirm.
train_regression(
    vectorizer,
    text,
    scores,
    'limtfidf.npz',
    '../models/lim_linear_regression',
)


X_train shape: (511608, 29166)
Mean squared error: 0.79
Recall:
[0.21792341 0.25724882 0.39976905 0.53747828 0.62795805]
Precision:
[0.8313253  0.25239828 0.24142259 0.22235342 0.88497698]
0.704242382415444
F1 score:
[0.34532374 0.25480047 0.30104348 0.31457002 0.7346362 ]


#### Training with TF-IDF over words and 2-grams that appear in at least 10 documents

In [15]:
# Unigrams and bigrams (ngram_range=(1, 2)); the min_df=10 cut-off applies to
# both, so any term seen in fewer than 10 documents is discarded.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=10,
    ngram_range=(1, 2),
)

# `train_regression` comes from the project's star imports; the two string arguments
# look like a feature-matrix file name and a model output path -- TODO confirm.
train_regression(
    vectorizer,
    text,
    scores,
    'ngramtfidf.npz',
    '../models/ngram_linear_regression',
)


X_train shape: (511608, 341646)
Mean squared error: 7.84
Recall:
[0.62495065 0.48010789 0.48198614 0.49391909 0.71849778]
Precision:
[0.43699103 0.33024119 0.34179496 0.44992087 0.8627089 ]
0.698796061953995
F1 score:
[0.51433677 0.3913163  0.39996167 0.47089446 0.78402708]


#### Training with TF-IDF over words and 2-grams that appear in at least 20 documents

In [16]:
# Unigrams and bigrams again, with a stricter document-frequency cut-off
# (min_df=20) to shrink the vocabulary.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=20,
    ngram_range=(1, 2),
)

# `train_regression` comes from the project's star imports; the two string arguments
# look like a feature-matrix file name and a model output path -- TODO confirm.
train_regression(
    vectorizer,
    text,
    scores,
    'ngramreducedtfidf.npz',
    '../models/ngram_reduced_linear_regression',
)


X_train shape: (511608, 155233)
Mean squared error: 0.83
Recall:
[0.43821555 0.45279838 0.51454965 0.56180194 0.71728985]
Precision:
[0.81229418 0.3588031  0.3295858  0.3103661  0.90022051]
0.7370573319486018
F1 score:
[0.56930376 0.40035773 0.40180343 0.39984102 0.798411  ]


#### Training with the upsampled dataset

In [23]:
# Load the upsampled copy of the reviews and pull out the same two columns
# (review text and score) that the earlier training cells use.
reviews_upsampled = load_dataset('../data_processed/Reviews_upsampled.csv')
text_upsampled, scores_upsampled = (
    reviews_upsampled['Text'],
    reviews_upsampled['Score'],
)

In [25]:
# Unigram TF-IDF with the min_df=10 cut-off, fitted on the upsampled reviews.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=10,
)

# `train_regression` comes from the project's star imports; the two string arguments
# look like a feature-matrix file and a model output path -- TODO confirm.
train_regression(
    vectorizer,
    text_upsampled,
    scores_upsampled,
    '../data_processed/limtfidf_upsampled.npz',
    '../models/lim_linear_regression_upsampled',
)


X_train shape: (1634049, 53173)
Mean squared error: 0.71
Recall:
[0.46271495 0.63547409 0.67884447 0.58261229 0.38052607]
Precision:
[0.86103277 0.51296028 0.45616675 0.48059919 0.73732546]
0.6091887394676043
F1 score:
[0.60194662 0.56768236 0.54566205 0.52671176 0.50198358]


In [None]:
# Unigrams and bigrams on the upsampled reviews; instead of a document-frequency
# cut-off, the vocabulary is capped at 50000 terms (max_features).
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=50000,
    ngram_range=(1, 2),
)

# `train_regression` comes from the project's star imports; the two string arguments
# look like a feature-matrix file name and a model output path -- TODO confirm.
train_regression(
    vectorizer,
    text_upsampled,
    scores_upsampled,
    'ngramreducedtfidf_upsampled.npz',
    '../models/ngram_reduced_linear_regression_upsampled',
)


X_train shape: (1634049, 50000)


### Most useful n-grams when it comes to helpfulness

In [None]:
# TODO: add the helpfulness rate

# NOTE(review): `vect` and `model` are not assigned in any visible cell of this notebook
# (the training cells bind `vectorizer` and do not keep a model object). They presumably
# need to be the fitted vectorizer and the fitted classifier trained on a helpfulness
# target -- bind them explicitly before running this cell.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; prefer the
# replacement and fall back only on older installations.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = np.array(vect.get_feature_names_out())
else:
    feature_names = np.array(vect.get_feature_names())

# argsort() is ascending: the first indices are the most negative coefficients and the
# last indices are the most positive ones.
sorted_coef_index = model.coef_[0].argsort()

# Largest coefficients push the prediction up the most, smallest push it down the most
# (assumes a larger / positive target means "more helpful" -- confirm against the model).
print(f'Most helpful words or 2-grams {feature_names[sorted_coef_index[:-11:-1]]}')
print(f'Least helpful words or 2-grams: {feature_names[sorted_coef_index[:10]]}')