For my data mining assignment, I used https://nijianmo.github.io/amazon/index.html. I used the smaller subset of the "Magazines Subscriptions" dataset. I used code given in the link above, to import the dataset and convert it into a Pandas DataFrame.

In [1]:
import requests
import json
import pandas as pd
import numpy as np
import gzip

In [2]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('Magazine_Subscriptions_5.json.gz')

In [3]:
df

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,4.0,True,"02 26, 2014",A5QQOOZJOVPSF,B00005N7P0,John L. Mehlmauer,"I'm old, and so is my computer. Any advice th...",Cheapskates guide,1393372800,,,
1,5.0,False,"03 6, 2004",A5RHZE7B8SV5Q,B00005N7PS,gorillazfan249,"There's nothing to say, but if you want a REAL...",The best mature Men's magazine.,1078531200,3,,
2,1.0,False,"07 15, 2003",A1RPTVW5VEOSI,B00005N7PS,Michael J. Edelman,If you're the kind of man who looks at himself...,THE Magazine for the Self-Centered Male,1058227200,17,,
3,1.0,True,"01 31, 2015",A1SFRBCMW8XVBW,B00005N7PS,Hoyett L. Barnett,Nothing to it. Just an advertisement. Little...,Nothing to it. Just an advertisement. Little a...,1422662400,,,
4,5.0,True,"10 5, 2010",A1IU9VPCBKZPE8,B00005N7P0,Randolph Eck,When PC Magazine ceased publication of their p...,Excellent Computer Magazine,1286236800,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2370,1.0,True,"02 7, 2018",A18X3E6V8DGIDZ,B00X6LREJU,gatormum64,Not what I expected. Found it boring and lacki...,Cancelled subscripton,1517961600,,,
2371,4.0,True,"03 3, 2017",A1Y98LVYJ0YZJ0,B00X6LREHM,Michele,I have been a Family Circle readers for years ...,You can find a good variety of articles,1488499200,,,
2372,5.0,True,"01 15, 2017",A1VTKYW3YQEHTN,B00X6LREHM,Linda,"Even though I have only received 3 issues, I r...",I really enjoy reading Family Circle,1484438400,,,
2373,5.0,True,"05 30, 2017",ASU7EOBD3Y4BV,B01HI8V10E,Miss Betty,Great magazine at a great price through Amazon!,Great magazine at a great price through Amazon!,1496102400,,,


In [4]:
reviews = df[['overall', 'reviewText']]

In [5]:
reviews

Unnamed: 0,overall,reviewText
0,4.0,"I'm old, and so is my computer. Any advice th..."
1,5.0,"There's nothing to say, but if you want a REAL..."
2,1.0,If you're the kind of man who looks at himself...
3,1.0,Nothing to it. Just an advertisement. Little...
4,5.0,When PC Magazine ceased publication of their p...
...,...,...
2370,1.0,Not what I expected. Found it boring and lacki...
2371,4.0,I have been a Family Circle readers for years ...
2372,5.0,"Even though I have only received 3 issues, I r..."
2373,5.0,Great magazine at a great price through Amazon!


In [6]:
reviews.isnull().sum()

overall       0
reviewText    1
dtype: int64

In [7]:
reviews = reviews.dropna()
reviews.isnull().sum()

overall       0
reviewText    0
dtype: int64

In [8]:
X = reviews[['reviewText']]
y = reviews['overall']

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [10]:
X_train.shape

(1780, 1)

In [11]:
X_test.shape

(594, 1)

### CountVectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [13]:
cv = CountVectorizer(stop_words='english')

In [14]:
X_train_cv = cv.fit_transform(X_train['reviewText'])
X_train_cv

<1780x7252 sparse matrix of type '<class 'numpy.int64'>'
	with 39024 stored elements in Compressed Sparse Row format>

In [15]:
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [16]:
X_test_cv = cv.transform(X_test['reviewText'])
X_test_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### CountVectorizer and SVC

In [17]:
from sklearn.svm import SVC
cv_svc = SVC(gamma='scale')
cv_svc.fit(X_train_cv, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [18]:
cv_svc.predict(X_test_cv)

array([5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 2., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       4., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 3., 5., 5., 5., 5., 5., 5., 4., 5.,
       5., 5., 5., 5., 5., 5., 1., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 4., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5.

### Evaluation

In [19]:
cv_svc_score= cv_svc.score(X_test_cv, y_test)
cv_svc_score

0.6936026936026936

### TFID Vectorizer

In [20]:
tfid = TfidfVectorizer(stop_words='english')

In [21]:
X_train_tfid = tfid.fit_transform(X_train['reviewText'])
X_train_tfid
X_test_tfid = tfid.transform(X_test['reviewText'])
X_test_tfid.toarray()
X_train_tfid.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
from sklearn.svm import SVC
tfid_svc = SVC(gamma='scale')
tfid_svc.fit(X_train_tfid, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [23]:
y_pred_tfid_svc = tfid_svc.predict(X_test_tfid)

In [24]:
tfid_svc_score = tfid_svc.score(X_test_tfid, y_test)
tfid_svc_score

0.7491582491582491

### GridSearchCV for CV

In [25]:
from sklearn.model_selection import GridSearchCV

In [64]:
cv_grid = {'C':[0.1,0.5, 1,2,3,4,5,6,7,10, 20,50,100,1000],
        'gamma': ['scale'],
        'kernel':['linear','rbf']}

In [65]:
gs_cv = GridSearchCV(cv_svc, cv_grid, n_jobs=-1)

In [66]:
gs_cv.fit(X_train_cv, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.1, 0.5, 1, 2, 3, 4, 5, 6, 7, 10, 20, 50, 100,
                               1000],
                         'gamma': ['scale'], 'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [67]:
gs_cv.best_params_

{'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}

In [30]:
pd.DataFrame(gs_cv.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
9,0.382479,0.009914,0.116348,0.001737,5,scale,rbf,"{'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}",0.677311,0.684654,0.690878,0.68427,0.005546,1
7,0.382252,0.007757,0.115523,0.000532,4,scale,rbf,"{'C': 4, 'gamma': 'scale', 'kernel': 'rbf'}",0.678992,0.686341,0.685811,0.683708,0.003349,2
15,0.409667,0.028755,0.132191,0.023461,10,scale,rbf,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.665546,0.686341,0.697635,0.683146,0.013295,3
13,0.392189,0.002345,0.11625,0.002034,7,scale,rbf,"{'C': 7, 'gamma': 'scale', 'kernel': 'rbf'}",0.663866,0.682968,0.694257,0.680337,0.012547,4
11,0.391856,0.002591,0.117051,0.001722,6,scale,rbf,"{'C': 6, 'gamma': 'scale', 'kernel': 'rbf'}",0.658824,0.681282,0.694257,0.67809,0.014642,5
5,0.379543,0.001879,0.118308,0.002646,3,scale,rbf,"{'C': 3, 'gamma': 'scale', 'kernel': 'rbf'}",0.665546,0.681282,0.680743,0.675843,0.007299,6
3,0.36993,0.006679,0.114769,0.001872,2,scale,rbf,"{'C': 2, 'gamma': 'scale', 'kernel': 'rbf'}",0.658824,0.67285,0.668919,0.666854,0.005912,7
17,0.435668,0.029091,0.136728,0.020146,50,scale,rbf,"{'C': 50, 'gamma': 'scale', 'kernel': 'rbf'}",0.65042,0.655987,0.663851,0.656742,0.00551,8
0,0.287773,0.001169,0.075941,0.000643,1,scale,linear,"{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}",0.630252,0.657673,0.673986,0.653933,0.018052,9
1,0.366894,0.001169,0.114039,0.003778,1,scale,rbf,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}",0.648739,0.657673,0.655405,0.653933,0.003794,9


### GridSearchCV for TFID

In [60]:
tfid_grid = {'C':[0.1, 0.5, 1,2,3,4,5,6,7,10,20,50,100,1000],
        'gamma': ['scale'],
        'kernel':['linear','rbf']}

In [61]:
gs_tfid = GridSearchCV(tfid_svc, tfid_grid, n_jobs=-1)

In [62]:
gs_tfid.fit(X_train_tfid, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.1, 0.5, 1, 2, 3, 4, 5, 6, 7, 10, 20, 50, 100,
                               1000],
                         'gamma': ['scale'], 'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [63]:
gs_tfid.best_params_

{'C': 2, 'gamma': 'scale', 'kernel': 'linear'}

In [35]:
gs_tfid.predict(X_test_tfid)

array([5., 5., 5., 5., 5., 5., 5., 4., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 3., 5., 5., 2., 5.,
       1., 5., 3., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 4., 3., 4., 5.,
       4., 3., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 3.,
       3., 5., 5., 5., 3., 3., 2., 5., 5., 5., 5., 5., 5., 4., 5., 4., 5.,
       3., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 4.,
       5., 5., 5., 5., 4., 4., 5., 5., 4., 4., 5., 4., 5., 1., 4., 5., 5.,
       5., 5., 5., 3., 5., 5., 5., 5., 5., 5., 3., 5., 3., 5., 5., 5., 5.,
       4., 5., 5., 5., 4., 1., 5., 5., 5., 4., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 3., 5., 5., 4., 5., 3., 5., 5., 5., 5., 5., 5., 4., 5.,
       5., 5., 5., 5., 5., 5., 1., 5., 5., 5., 5., 3., 5., 4., 5., 4., 2.,
       5., 4., 5., 5., 5., 3., 5., 5., 5., 5., 4., 5., 5., 5., 3., 5., 5.,
       5., 5., 5., 5., 3., 3., 5., 5., 5., 4., 5., 5., 5., 5., 5., 5., 5.,
       4., 5., 5., 5., 5.

In [36]:
pd.DataFrame(gs_tfid.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.386042,0.027725,0.090803,0.002228,2,scale,linear,"{'C': 2, 'gamma': 'scale', 'kernel': 'linear'}",0.670588,0.703204,0.692568,0.688764,0.013591,1
3,0.516824,0.023042,0.14813,0.01053,2,scale,rbf,"{'C': 2, 'gamma': 'scale', 'kernel': 'rbf'}",0.680672,0.689713,0.684122,0.684831,0.003727,2
5,0.53084,0.008517,0.127872,0.008375,3,scale,rbf,"{'C': 3, 'gamma': 'scale', 'kernel': 'rbf'}",0.680672,0.688027,0.685811,0.684831,0.003083,2
7,0.538354,0.023811,0.12054,0.001578,4,scale,rbf,"{'C': 4, 'gamma': 'scale', 'kernel': 'rbf'}",0.677311,0.686341,0.685811,0.683146,0.00414,4
21,0.424029,0.023579,0.079743,0.00122,1000,scale,rbf,"{'C': 1000, 'gamma': 'scale', 'kernel': 'rbf'}",0.67395,0.686341,0.684122,0.681461,0.005399,5
4,0.376485,0.01541,0.095741,0.014761,3,scale,linear,"{'C': 3, 'gamma': 'scale', 'kernel': 'linear'}",0.665546,0.689713,0.689189,0.681461,0.011279,5
19,0.479306,0.006253,0.117684,0.005799,100,scale,rbf,"{'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}",0.67395,0.686341,0.684122,0.681461,0.005399,5
9,0.506563,0.027909,0.137711,0.015167,5,scale,rbf,"{'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}",0.67395,0.686341,0.684122,0.681461,0.005399,5
17,0.483566,0.00419,0.123532,0.002008,50,scale,rbf,"{'C': 50, 'gamma': 'scale', 'kernel': 'rbf'}",0.67395,0.686341,0.684122,0.681461,0.005399,5
11,0.526757,0.027897,0.123435,0.000471,6,scale,rbf,"{'C': 6, 'gamma': 'scale', 'kernel': 'rbf'}",0.67395,0.686341,0.684122,0.681461,0.005399,5


### New Model with Parameters from GridSearchCV

In [56]:
from sklearn.svm import SVC
cv_svc2 = SVC(C=5,kernel='rbf',gamma='scale')
cv_svc2.fit(X_train_cv, y_train)

SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [57]:
cv_svc2.score(X_test_cv, y_test)

0.734006734006734

In [58]:
from sklearn.svm import SVC
tfid_svc2 = SVC(C=2,kernel='rbf',gamma='scale')
tfid_svc2.fit(X_train_tfid, y_train)

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [59]:
tfid_svc2.score(X_test_tfid, y_test)

0.765993265993266