In [19]:
import ast
import gzip
import json
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

%matplotlib inline
sns.set_style('white')

In [9]:
def parse(path):
    """Yield one parsed record per non-blank line of a gzip-compressed text file.

    Each line is decoded as JSON first. Lines that are not valid JSON (for
    example Python-literal dicts written with single quotes) fall back to
    ``ast.literal_eval``, which only accepts literals and never executes code,
    unlike ``eval``.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip file, one record per line.

    Yields
    ------
    object
        The decoded record for each line (a dict for dict-shaped lines).

    Raises
    ------
    ValueError, SyntaxError
        If a line is neither valid JSON nor a valid Python literal.
    """
    # The context manager closes the file once the generator is exhausted or closed.
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines, e.g. a trailing newline
            try:
                record = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass.
                record = ast.literal_eval(line)
            yield record
        
def getDF(path):
    """Read every record yielded by ``parse(path)`` into a pandas DataFrame.

    Records are keyed by their position in the file (0, 1, 2, ...), and that
    position becomes the row index of the returned frame.
    """
    records_by_row = {row: record for row, record in enumerate(parse(path))}
    return pd.DataFrame.from_dict(records_by_row, orient='index')

In [12]:
# Load the gzip-compressed review file into a DataFrame and show (rows, columns).
reviews_path = 'reviews_Musical_Instruments_5.json.gz'
raw_data = getDF(reviews_path)
raw_data.shape

(10261, 9)

In [13]:
# Preview the first five reviews to see which columns are available.
raw_data.head(n=5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [14]:
# Summary statistics of the star rating column.
raw_data.loc[:, 'overall'].describe()

count    10261.000000
mean         4.488744
std          0.894642
min          1.000000
25%          4.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: overall, dtype: float64

In [15]:
# Count the reviews rated strictly below 4.
raw_data['overall'].lt(4).sum()

1239

The classes are clearly imbalanced: only 1,239 of the 10,261 reviews (about 12%) are rated below 4. We may have to resample the negative reviews to get enough data. First, let's deal with feature selection: we'll use the words that are most common in either the positive or the negative reviews but that do not also appear among the other class's most common words.

In [16]:
# Binary target with the threshold at 4: ratings below 4 are negative (0),
# everything else (4's and 5's) is positive (1).
is_negative = raw_data['overall'].lt(4)
raw_data['sentiment'] = np.where(is_negative, 0, 1)

In [157]:
# Let's find which words are most typical for good and bad reviews.
n_reviews = 500 # Number of reviews per class used to build the word lists.

translator = str.maketrans('', '', string.punctuation)


def _review_words(reviews):
    """Return every word found in ``reviews`` as a single Series.

    Each review is split on whitespace; every token is stripped of ASCII
    punctuation and lower-cased. Tokens that end up empty (e.g. pure
    punctuation) are dropped so they cannot compete for a top-word slot.

    Words are accumulated in a plain list and wrapped in a Series once at the
    end: appending to a Series inside the loop copies the whole Series on each
    iteration, and ``Series.append`` no longer exists in pandas 2.x.
    """
    words = []
    for text in reviews:
        if not isinstance(text, str):
            continue  # skip missing review text instead of crashing on .split
        for token in text.split():
            word = token.translate(translator).lower()
            if word:
                words.append(word)
    return pd.Series(words, dtype=object)


pos_words = _review_words(raw_data.loc[raw_data.sentiment == 1, 'reviewText'].head(n_reviews))
neg_words = _review_words(raw_data.loc[raw_data.sentiment == 0, 'reviewText'].head(n_reviews))

In [158]:
# And now exclude the words in common between the two sets.
# n_words will set how many of the top words from each set to look at.

n_words = 400

# Count each word list once, up front, rather than recounting on every loop iteration.
top_pos = pos_words.value_counts().head(n_words).index
top_neg = neg_words.value_counts().head(n_words).index
top_pos_set = set(top_pos)
top_neg_set = set(top_neg)

# Positive-only words first, then negative-only words, each in value_counts order.
keywords = [str(word) for word in top_pos if word not in top_neg_set]
keywords += [str(word) for word in top_neg if word not in top_pos_set]

print('{} keywords selected.'.format(len(keywords)))

178 keywords selected.


In [159]:
# Determine whether each comment contains each keyword (1) or not (0).
# NOTE(review): this is a case-insensitive *substring* match, so a short keyword
# also fires inside longer words -- confirm that is intended.
review_text = raw_data['reviewText']

# Build every indicator column first and assemble the frame in one step;
# inserting columns one at a time fragments the DataFrame.
keyword_flags = {
    str(key): review_text.str.contains(str(key), case=False, regex=False, na=False).astype(int)
    for key in keywords
    if str(key) != 'sentiment'  # a keyword with this name must not clobber the label column
}

key_df = pd.concat(
    [raw_data[['sentiment']], pd.DataFrame(keyword_flags, index=raw_data.index)],
    axis=1,
)

In [160]:
# Number of samples in the test set
n_test = 2000

# Shuffle with a fixed seed so the split is identical on every run. The result
# goes into a new name so that re-running this cell does not reshuffle an
# already-shuffled frame.
# NOTE(review): the keywords were chosen from reviews in raw_data before this
# split, so some test rows may have influenced feature selection.
key_df_shuffled = key_df.sample(frac=1, random_state=42)
data_test = key_df_shuffled.iloc[:n_test]
data_train = key_df_shuffled.iloc[n_test:]

# Features are every column except the label.
X_train = data_train.drop(columns='sentiment')
Y_train = data_train['sentiment']
X_test = data_test.drop(columns='sentiment')
Y_test = data_test['sentiment']

In [139]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample

In [140]:
# A fixed random_state makes the forest (and therefore the scores) repeatable.
rfc = RandomForestClassifier(
            n_estimators=20,
            max_depth=40,
            random_state=42
)

# 5-fold cross-validation with the classifier's default score (accuracy).
# Only 1239 of the 10261 reviews are negative, so a model that always predicts
# "positive" already scores about 0.88 -- read these numbers against that baseline.
cross_val_score(rfc, X_train, Y_train, cv=5)

array([ 0.87900786,  0.87416818,  0.87719298,  0.88007268,  0.88007268])

In [161]:
# Train the forest, predict the held-out reviews, and tabulate the result
# (rows = true class, columns = predicted class, class 0 first).
Y_pred = rfc.fit(X_train, Y_train).predict(X_test)
print(confusion_matrix(Y_test, Y_pred))

[[  14  216]
 [  20 1750]]


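Well, that didn't work very well: the model labels almost everything as positive and catches only 14 of the 230 negative reviews in the test set. Let's try it with resampling.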

In [162]:
data_pos = data_train[data_train.sentiment==1]
data_neg = data_train[data_train.sentiment==0]

# We'll only resample the low class (negative reviews), drawing as many rows
# as there are positive reviews so that both classes end up the same size.
n_resamp = len(data_pos)

# Seeded so the resampled training set is the same on every run.
data_neg_resamp = resample(data_neg, n_samples=n_resamp, random_state=42)

# Combine both classes and shuffle so they are interleaved.
data_train_resamp = pd.concat([data_pos, data_neg_resamp]).sample(frac=1, random_state=42)

# Only the training data is rebalanced; X_test / Y_test are left as they were.
X_train = data_train_resamp.drop(columns='sentiment')
Y_train = data_train_resamp['sentiment']

X_train.shape, Y_train.shape

((14504, 178), (14504,))

In [163]:
# Refit the same forest on the current (resampled) training set and score it
# against the untouched test set.
Y_pred = rfc.fit(X_train, Y_train).predict(X_test)
print(confusion_matrix(Y_test, Y_pred))

[[  53  177]
 [ 187 1583]]


Not doing great, even with 178 keywords. Let's try some other models.

In [152]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

In [164]:
# With max_features=4 the tree considers a random subset of features at each
# split, so a fixed random_state is needed for repeatable results.
dtc = DecisionTreeClassifier(
            max_features=4,
            max_depth=20,
            random_state=42
)

dtc.fit(X_train, Y_train)
Y_pred = dtc.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))

[[ 101  129]
 [ 490 1280]]


In [165]:
# Bernoulli naive Bayes on the 0/1 keyword indicators, default settings.
bnb = BernoulliNB()

Y_pred = bnb.fit(X_train, Y_train).predict(X_test)
print(confusion_matrix(Y_test, Y_pred))

[[ 137   93]
 [ 473 1297]]


In [166]:
# Logistic regression with default settings on the same keyword features.
lgr = LogisticRegression()

Y_pred = lgr.fit(X_train, Y_train).predict(X_test)
print(confusion_matrix(Y_test, Y_pred))

[[ 131   99]
 [ 516 1254]]
