In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gzip

%matplotlib inline
sns.set_style('white')

In [2]:
def parse(path): 
    g = gzip.open(path, 'rb') 
    for l in g: 
        yield eval(l) 
        
def getDF(path): 
    i = 0 
    df = {} 
    for d in parse(path): 
        df[i] = d 
        i += 1 
    return pd.DataFrame.from_dict(df, orient='index') 

In [51]:
raw_data = getDF('reviews_Digital_Music_5.json.gz')
raw_data.shape

(64706, 9)

In [52]:
raw_data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A3EBHHCZO6V2A4,5555991584,"Amaranth ""music fan""","[3, 3]","It's hard to believe ""Memory of Trees"" came ou...",5.0,Enya's last great album,1158019200,"09 12, 2006"
1,AZPWAXJG9OJXV,5555991584,bethtexas,"[0, 0]","A clasically-styled and introverted album, Mem...",5.0,Enya at her most elegant,991526400,"06 3, 2001"
2,A38IRL0X2T4DPF,5555991584,bob turnley,"[2, 2]",I never thought Enya would reach the sublime h...,5.0,The best so far,1058140800,"07 14, 2003"
3,A22IK3I6U76GX0,5555991584,Calle,"[1, 1]",This is the third review of an irish album I w...,5.0,Ireland produces good music.,957312000,"05 3, 2000"
4,A1AISPOIIHTHXX,5555991584,"Cloud ""...""","[1, 1]","Enya, despite being a successful recording art...",4.0,4.5; music to dream to,1200528000,"01 17, 2008"


In [53]:
raw_data['overall'].describe()

count    64706.000000
mean         4.222514
std          1.086081
min          1.000000
25%          4.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: overall, dtype: float64

In [54]:
(raw_data['overall']<4).sum()

12590

In [55]:
# We'll set our classification threshold at 4: 4's and 5's will be positive, lower will be negative.

raw_data['outcome_var'] = np.where(raw_data['overall']<4, 0, 1)
reviews = [x for x in raw_data['reviewText']]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [65]:
tfidf = TfidfVectorizer(lowercase=True, 
                        stop_words='english',
                        ngram_range=(1, 1), 
                        analyzer='word', 
                        min_df=5,
                        max_features=None, 
                        vocabulary=None, 
                        binary=False, 
                        use_idf=True)

In [66]:
tfidf.fit(reviews)
word_counts = tfidf.transform(reviews)
print('Original Shape:')
print(word_counts.get_shape())

svd = TruncatedSVD(n_components=200)
word_svd = svd.fit_transform(word_counts)
print('SVD Shape:')
print(word_svd.shape)

pos_idx = np.where(raw_data['outcome_var']==1)[0]
neg_idx = np.where(raw_data['outcome_var']==0)[0]
pos_counts = word_svd[pos_idx]
neg_counts = word_svd[neg_idx]

Original Shape:
(64706, 32184)
SVD Shape:
(64706, 200)


In [67]:
n_test = 0.2

X_train_pos, X_test_pos = train_test_split(pos_counts, test_size=n_test, random_state=0)
X_train_neg, X_test_neg = train_test_split(neg_counts, test_size=n_test, random_state=0)
Y_train_pos, Y_test_pos = train_test_split(raw_data['outcome_var'][pos_idx], test_size=n_test, random_state=0)
Y_train_neg, Y_test_neg = train_test_split(raw_data['outcome_var'][neg_idx], test_size=n_test, random_state=0)

X_train_neg_resamp = resample(X_train_neg, n_samples=len(X_train_pos))
print('X_train neg and pos shapes:')
print(X_train_neg_resamp.shape, X_train_pos.shape)

Y_train_neg_resamp = resample(Y_train_neg, n_samples=len(Y_train_pos))
print('Y_train neg and pos shapes:')
print(Y_train_neg_resamp.shape, Y_train_pos.shape)

print('X_test neg and pos shapes:')
print(X_test_neg.shape, X_test_pos.shape)

print('Y_train neg and pos shapes:')
print(Y_test_neg.shape, Y_test_pos.shape)

X_train = np.concatenate([X_train_pos, X_train_neg_resamp])
X_test = np.concatenate([X_test_pos, X_test_neg])
Y_train = np.concatenate([Y_train_pos, Y_train_neg_resamp])
Y_test = np.concatenate([Y_test_pos, Y_test_neg])

X_train neg and pos shapes:
(41692, 200) (41692, 200)
Y_train neg and pos shapes:
(41692,) (41692,)
X_test neg and pos shapes:
(2518, 200) (10424, 200)
Y_train neg and pos shapes:
(2518,) (10424,)


In [17]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix, auc, precision_recall_curve

In [18]:
def auprc(Y, Y_pred):
    precision, recall, thresholds = precision_recall_curve(Y, Y_pred)
    return auc(precision, recall)

In [68]:
dtc = DecisionTreeClassifier(max_features=4, max_depth=20)
dtc.fit(X_train, Y_train)
Y_pred = dtc.predict(X_test)

print(confusion_matrix(Y_test, Y_pred))
print('AUPRC: ' + str(auprc(Y_test, Y_pred)))

dtc.fit(X_train, Y_train)
Y_pred = dtc.predict(X_test)

print(confusion_matrix(Y_test, Y_pred))
print('AUPRC: ' + str(auprc(Y_test, Y_pred)))

dtc.fit(X_train, Y_train)
Y_pred = dtc.predict(X_test)

print(confusion_matrix(Y_test, Y_pred))
print('AUPRC: ' + str(auprc(Y_test, Y_pred)))

[[1107 1411]
 [3061 7363]]
AUPRC: 0.0855860705542
[[ 898 1620]
 [2430 7994]]
AUPRC: 0.0876306600745
[[ 977 1541]
 [2610 7814]]
AUPRC: 0.0878406005207


In [69]:
rfc = RandomForestClassifier(n_estimators=30, max_depth=20)
rfc.fit(X_train, Y_train)
Y_pred = rfc.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))
print('AUPRC: ' + str(auprc(Y_test, Y_pred)))

[[ 669 1849]
 [ 508 9916]]
AUPRC: 0.111238988731


In [70]:
lgr = LogisticRegression()
lgr.fit(X_train, Y_train)
Y_pred = lgr.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))
print('AUPRC: ' + str(auprc(Y_test, Y_pred)))

[[1955  563]
 [2502 7922]]
AUPRC: 0.13803467158


In [43]:
# Beware -- slow to run.
knc = KNeighborsClassifier(n_neighbors=3)
knc.fit(X_train, Y_train)
Y_pred = knc.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))
print('AUPRC: ' + str(auprc(Y_test, Y_pred)))

[[  74  298]
 [ 390 2317]]
AUPRC: 0.0551362671706


In [62]:
bnb = BernoulliNB()
bnb.fit(X_train, Y_train)
Y_pred = bnb.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))
print('AUPRC: ' + str(auprc(Y_test, Y_pred)))

[[1764  754]
 [2719 7705]]
AUPRC: 0.124617834148


In [24]:
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))
print('AUPRC: ' + str(auprc(Y_test, Y_pred)))

[[ 356   16]
 [1899  808]]
AUPRC: 0.0687317577271


In [25]:
gbc = GradientBoostingClassifier(n_estimators=100, max_depth=4)
gbc.fit(X_train, Y_train)
Y_pred = gbc.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))
print('AUPRC: ' + str(auprc(Y_test, Y_pred)))

[[ 179  193]
 [ 414 2293]]
AUPRC: 0.0727622750216
