In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import gzip
import string
import spacy
from collections import Counter

%matplotlib inline
sns.set_style('white')

In [2]:
def parse(path):
    """Lazily yield one parsed record per line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Location of the gzip archive. Every line must hold a single
        Python-literal expression (the review dump stores dict literals).

    Yields
    ------
    object
        The value produced by evaluating each line, in file order.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed; the previous version left the file open.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() executes whatever expression the line contains.
            # Only use this on archives from a trusted source; for untrusted
            # input switch to ast.literal_eval or a real JSON parser.
            yield eval(l)
        
def getDF(path):
    """Load every record yielded by ``parse(path)`` into a DataFrame.

    Records are keyed by their position in the file (0, 1, 2, ...), and that
    position becomes the row index of the returned frame.
    """
    records_by_row = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_row, orient='index')

In [3]:
# Read the gzipped review file into a DataFrame (one row per parsed record).
raw_data = getDF('reviews_Musical_Instruments_5.json.gz')
# Show the (rows, columns) shape as the cell output.
raw_data.shape

(10261, 9)

In [4]:
# Preview the first rows to see which columns the reviews provide.
raw_data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [5]:
# Summary statistics of the 'overall' rating column (recorded output: min 1, max 5, median 5).
raw_data['overall'].describe()

count    10261.000000
mean         4.488744
std          0.894642
min          1.000000
25%          4.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: overall, dtype: float64

In [6]:
# Count the reviews rated below 4 -- the same cut-off later used to label a review as negative.
(raw_data['overall']<4).sum()

1239

Well, it looks as though these classes are going to be imbalanced. We may have to resample the negative reviews to get enough data. First let's worry about feature selection. Let's use the words that are most common in the positive or negative reviews which aren't duplicated in the other list.

In [16]:
def word_frequencies(text, include_stop=True):
    """Tally the surface form (``token.text``) of the tokens in ``text``.

    Parameters
    ----------
    text : iterable of tokens
        Each token must expose ``is_punct``, ``is_stop`` and ``text``.
    include_stop : bool, default True
        When False, tokens flagged as stop words are left out of the tally.

    Returns
    -------
    collections.Counter
        Mapping of token text to number of occurrences. Punctuation tokens
        are never counted.
    """
    def keep(tok):
        # Punctuation is always dropped; stop words only on request.
        if tok.is_punct:
            return False
        return include_stop or not tok.is_stop

    return Counter(tok.text for tok in text if keep(tok))

In [41]:
def lemma_frequencies(text, include_stop=True):
    """Tally the lemma (``token.lemma_``) of the tokens in ``text``.

    Parameters
    ----------
    text : iterable of tokens
        Each token must expose ``is_punct``, ``is_stop`` and ``lemma_``.
    include_stop : bool, default True
        When False, tokens flagged as stop words are skipped.

    Returns
    -------
    collections.Counter
        Mapping of lemma to number of occurrences. Punctuation tokens are
        never counted.
    """
    tally = Counter()
    for tok in text:
        # Punctuation never contributes to the counts.
        if tok.is_punct:
            continue
        # Stop words are skipped only when the caller opted out of them.
        if tok.is_stop and not include_stop:
            continue
        tally[tok.lemma_] += 1
    return tally

In [174]:
# Function to break up a series into a manageable size and count the lemmas.

def count_lemma(word_series, batch_size=500):
    """Count lemma frequencies over a collection of texts, in batches.

    Parameters
    ----------
    word_series : iterable of str
        The texts to analyse (a list or a pandas Series both work).
    batch_size : int, default 500
        Number of texts handed to spaCy at a time. Lower it if memory is
        tight.

    Returns
    -------
    collections.Counter
        Lemma -> total number of occurrences across all texts, as counted by
        ``lemma_frequencies`` (punctuation excluded, stop words included).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 (the loop could never advance).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    # NOTE(review): the 'en' shortcut is what this notebook already uses;
    # newer spaCy releases may require a full model name instead -- confirm.
    nlp = spacy.load('en')

    # Materialise the input once, outside the loop, so it can be sliced.
    texts = [str(x) for x in word_series]
    lemma_freq = Counter()

    for start in range(0, len(texts), batch_size):
        stop = min(start + batch_size, len(texts))
        # Parse each text as its own document. Parsing str(list_of_texts)
        # would feed the list repr's brackets, quotes and backslash escapes
        # to the tokenizer and pollute the lemma counts.
        for doc in nlp.pipe(texts[start:stop]):
            lemma_freq.update(lemma_frequencies(doc))
        # Progress: number of texts processed so far.
        print(stop)

    return lemma_freq

In [103]:
# Binary target: ratings below 4 are negative (0); everything else, i.e. 4s and 5s, is positive (1).
# This adds the 'outcome_var' column to raw_data in place.

raw_data['outcome_var'] = np.where(raw_data['overall']<4, 0, 1)

In [93]:
# These take a few minutes to run. Reduce batch_size in count_lemma if they crash the kernel.
# The class label lives in the 'outcome_var' column (0 = negative, 1 = positive);
# there is no 'sentiment' column, so filtering on it fails on a fresh kernel.

nlp = spacy.load('en')
neg_lemma = count_lemma(raw_data.loc[raw_data['outcome_var'] == 0, 'reviewText'].tolist())
pos_lemma = count_lemma(raw_data.loc[raw_data['outcome_var'] == 1, 'reviewText'].tolist())

In [193]:
# How many of the most frequent lemmas to consider per class.
n_common = 500

# pos_lemma / neg_lemma are the per-class lemma Counters returned by count_lemma,
# so the most common lemmas can be read straight from them (no undefined
# pos_reviews / neg_reviews documents are needed).
pos_lemma_freq = pos_lemma.most_common(n_common)
neg_lemma_freq = neg_lemma.most_common(n_common)

pos_common = [pair[0] for pair in pos_lemma_freq]
neg_common = [pair[0] for pair in neg_lemma_freq]

# Keep only lemmas that are common in exactly one class. Each half is sorted so
# the keyword (and therefore feature-column) order is the same on every run,
# instead of depending on set iteration order.
pos_only = set(pos_common) - set(neg_common)
neg_only = set(neg_common) - set(pos_common)
keywords = sorted(pos_only) + sorted(neg_only)
print('{} keywords selected.'.format(len(keywords)))

240 keywords selected.


In [194]:
# Build the feature matrix: one row per review, one column per keyword, holding
# how many times that keyword's lemma occurs in the review.

# Set membership is O(1); testing each lemma against the keyword list was not.
keyword_set = set(keywords)

# Collect one {keyword: count} record per review and build the frame once,
# rather than incrementing individual cells with .loc inside the loop.
keyword_rows = []
for n_done, doc in enumerate(nlp.pipe(raw_data['reviewText']), start=1):
    counts = lemma_frequencies(doc)
    keyword_rows.append({lemma: count for lemma, count in counts.items() if lemma in keyword_set})

    # Progress indicator every 500 reviews.
    if n_done % 500 == 0:
        print(n_done)

# Keywords absent from a review come out as NaN; turn them into zero counts.
key_df = pd.DataFrame(keyword_rows, columns=keywords).fillna(0).astype(int)

# Rows were produced in raw_data's row order, so attach the label by position
# (not by index label) to keep it aligned with the counts.
key_df['outcome_var'] = raw_data['outcome_var'].values

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000


In [195]:
# Number of samples in the test set
n_test = 2000

key_df = key_df.sample(frac=1)
data_test = key_df[:n_test]
data_train = key_df[n_test:]

X_train = data_train.loc[:, ~data_train.columns.isin(['outcome_var'])]
Y_train = data_train['outcome_var']
X_test = data_test.loc[:, ~data_train.columns.isin(['outcome_var'])]
Y_test = data_test['outcome_var']

In [196]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample

In [197]:
# Baseline random forest: 20 trees, each limited to depth 40.
# NOTE(review): no random_state is passed, so scores will vary from run to run.
rfc = RandomForestClassifier(
            n_estimators=20,
            max_depth=40
)

# 5-fold cross-validation on the training split; the cell output is the array of per-fold scores.
cross_val_score(rfc, X_train, Y_train, cv=5)

array([ 0.87477314,  0.88317191,  0.88135593,  0.88075061,  0.88498789])

In [198]:
# Fit the forest on the whole training split and predict the held-out test split.
rfc.fit(X_train, Y_train)
Y_pred = rfc.predict(X_test)
# Confusion matrix: rows are true classes (0 = negative, 1 = positive), columns are predicted classes.
print(confusion_matrix(Y_test, Y_pred))

[[  40  219]
 [  22 1719]]


Well, that didn't work that well: only 40 of the 259 negative reviews in the test set were identified. Let's try it with resampling.

In [199]:
# Split the training data by class so the minority class can be oversampled.
data_pos = data_train[data_train.outcome_var==1]
data_neg = data_train[data_train.outcome_var==0]

# We'll only resample the low class (negative reviews)
n_resamp = len(data_pos)

# Draw negatives until both classes are the same size.
# A fixed seed makes the resampled training set reproducible.
data_neg_resamp = resample(data_neg, n_samples=n_resamp, random_state=42)

# Combine both classes and shuffle (seeded) so they are interleaved.
data_train_resamp = pd.concat([data_pos, data_neg_resamp])
data_train_resamp = data_train_resamp.sample(frac=1, random_state=42)

# NOTE: this rebinds X_train / Y_train; every model below trains on the
# resampled data, while X_test / Y_test stay untouched.
X_train = data_train_resamp.drop('outcome_var', axis=1)
Y_train = data_train_resamp['outcome_var']

X_train.shape, Y_train.shape

((14562, 240), (14562,))

In [208]:
# Random forest on the resampled training data: 50 trees, depth capped at 25.
# NOTE(review): no random_state is passed, so the matrix will vary between runs.
rfc = RandomForestClassifier(
            n_estimators=50,
            max_depth=25
)

# Train on the current X_train / Y_train and evaluate on the test split.
rfc.fit(X_train, Y_train)
Y_pred = rfc.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))

[[  95  164]
 [ 195 1546]]


Not doing great, even with 240 keywords. Let's try some other models.

In [184]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

In [201]:
# Single decision tree: considers 4 features per split, depth capped at 20.
# NOTE(review): no random_state is passed, so the matrix will vary between runs.
dtc = DecisionTreeClassifier(
            max_features=4,
            max_depth=20
)

# Train on the current X_train / Y_train and evaluate on the test split.
dtc.fit(X_train, Y_train)
Y_pred = dtc.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))

[[  69  190]
 [ 176 1565]]


In [202]:
# Bernoulli naive Bayes with default settings.
# NOTE(review): BernoulliNB models binary features; with the default settings the keyword
# counts are presumably reduced to present/absent -- confirm against the scikit-learn docs.
bnb = BernoulliNB()

# Train on the current X_train / Y_train and evaluate on the test split.
bnb.fit(X_train, Y_train)
Y_pred = bnb.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))

[[ 134  125]
 [ 376 1365]]


In [203]:
# Logistic regression with default settings.
lgr = LogisticRegression()

# Train on the current X_train / Y_train and evaluate on the test split.
lgr.fit(X_train, Y_train)
Y_pred = lgr.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))

[[ 146  113]
 [ 428 1313]]


Well, logistic regression captures about one and a half times as many negative reviews as the resampled random forest (146 versus 95 of 259), but at the expense of misclassifying more than twice as many positive reviews (428 versus 195). 

In [213]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [212]:
# Gradient boosting: 200 boosting stages, each tree limited to depth 4.
# NOTE(review): no random_state is passed, so the matrix may vary between runs.
gbc = GradientBoostingClassifier(
                n_estimators=200,
                max_depth=4
)

# Train on the current X_train / Y_train and evaluate on the test split.
gbc.fit(X_train, Y_train)
Y_pred = gbc.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))

[[ 108  151]
 [ 245 1496]]


In [216]:
# k-nearest neighbours using the 3 closest training samples.
knc = KNeighborsClassifier(n_neighbors=3)

# Train on the current X_train / Y_train and evaluate on the test split.
knc.fit(X_train, Y_train)
Y_pred = knc.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))

[[ 111  148]
 [ 624 1117]]


In [218]:
# Support vector classifier with default settings.
svc = SVC()

# Train on the current X_train / Y_train and evaluate on the test split.
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))

[[ 106  153]
 [ 212 1529]]


In [219]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA

In [222]:
# Recursive feature elimination driven by the logistic regression model, keeping 40 features.
rfe = RFE(estimator=lgr, n_features_to_select=40)

# Select features using the training data only, then apply the same selection to both splits.
rfe.fit(X_train, Y_train)
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

In [223]:
# Univariate selection of the 40 best-scoring features (default scoring function).
# NOTE(review): the recorded output contains a warning fragment ("f = msb / msw"),
# presumably a divide warning raised while scoring some columns -- confirm and
# consider dropping constant columns first.
skb = SelectKBest(k=40)

# Score features on the training data only, then apply the same selection to both splits.
skb.fit(X_train, Y_train)
X_train_skb = skb.transform(X_train)
X_test_skb = skb.transform(X_test)

  f = msb / msw


In [225]:
# Project the keyword counts onto 40 principal components.
pca = PCA(n_components=40)

# Fit the projection on the training data only, then apply it to both splits.
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [226]:
# Baseline for the comparison: re-fit the existing rfc instance on all keyword features.
rfc.fit(X_train, Y_train)
Y_pred = rfc.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))

[[  94  165]
 [ 193 1548]]


In [227]:
# Same forest, re-fitted on the 40 features kept by recursive feature elimination.
rfc.fit(X_train_rfe, Y_train)
Y_pred = rfc.predict(X_test_rfe)
print(confusion_matrix(Y_test, Y_pred))

[[  83  176]
 [ 210 1531]]


In [228]:
# Same forest, re-fitted on the 40 features kept by SelectKBest.
rfc.fit(X_train_skb, Y_train)
Y_pred = rfc.predict(X_test_skb)
print(confusion_matrix(Y_test, Y_pred))

[[  87  172]
 [ 213 1528]]


In [229]:
# Same forest, re-fitted on the 40 principal components.
rfc.fit(X_train_pca, Y_train)
Y_pred = rfc.predict(X_test_pca)
print(confusion_matrix(Y_test, Y_pred))

[[  31  228]
 [ 103 1638]]


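Feature selection (keeping 40 features) costs us a little: with RFE and SelectKBest the model catches slightly fewer negative reviews (83 and 87 of 259, versus 94 with all features) and misclassifies slightly more positive ones. PCA is much worse at the task we care about, catching only 31 negative reviews, even though its overall accuracy looks slightly higher because it predicts the positive class more often. But this is a big reduction -- from 240 features to 40. It may be worth it, especially if we run the model with many more keywords. 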