## Data Prep

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('Amazon_Unlocked_Mobile.csv')
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [2]:
df.dropna(inplace=True)
df = df[df['Rating'] != 3]
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0,1
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0,1
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0,1
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0,1
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0,1
5,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,I already had a phone with problems... I know ...,1.0,0
6,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,The charging port was loose. I got that solder...,0.0,0
7,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,"Phone looks good but wouldn't stay charged, ha...",0.0,0
8,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I originally was using the Samsung S2 Galaxy f...,0.0,1
11,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,This is a great product it came after two days...,0.0,1


In [3]:
df['Positively Rated'].mean()

0.7482686025879323

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Positively Rated'], random_state=0)

In [5]:
print('X_train first entry:\n\n', X_train[0])
print('\n\nX_train shape: ', X_train.shape)

X_train first entry:

 I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!


X_train shape:  (231207,)


 ## CountVectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train)

In [13]:
vect.get_feature_names()[::2000]

['00',
 '4less',
 'adr6275',
 'assignment',
 'blazingly',
 'cassettes',
 'condishion',
 'debi',
 'dollarsshipping',
 'esteem',
 'flashy',
 'gorila',
 'human',
 'irullu',
 'like',
 'microsaudered',
 'nightmarish',
 'p770',
 'poori',
 'quirky',
 'responseive',
 'send',
 'sos',
 'synch',
 'trace',
 'utiles',
 'withstanding']

In [14]:
len(vect.get_feature_names())

53216

In [15]:
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<231207x53216 sparse matrix of type '<class 'numpy.int64'>'
	with 6117776 stored elements in Compressed Sparse Row format>

In [16]:
print(X_train_vectorized)

  (0, 4986)	1
  (0, 7259)	2
  (0, 7676)	1
  (0, 7878)	1
  (0, 8476)	1
  (0, 9637)	1
  (0, 14420)	1
  (0, 26003)	2
  (0, 31892)	1
  (0, 32284)	1
  (0, 33437)	1
  (0, 37356)	1
  (0, 38473)	1
  (0, 42146)	1
  (0, 46946)	1
  (0, 47462)	1
  (0, 47639)	1
  (0, 51169)	1
  (0, 51673)	1
  (1, 142)	1
  (1, 807)	1
  (1, 860)	1
  (1, 1960)	2
  (1, 2372)	1
  (1, 3696)	1
  :	:
  (231205, 32284)	1
  (231205, 46946)	1
  (231206, 4986)	1
  (231206, 5188)	1
  (231206, 5868)	1
  (231206, 8751)	1
  (231206, 9956)	1
  (231206, 21497)	2
  (231206, 26106)	3
  (231206, 30767)	1
  (231206, 31617)	1
  (231206, 32100)	1
  (231206, 32422)	1
  (231206, 33069)	1
  (231206, 33080)	1
  (231206, 35827)	1
  (231206, 37940)	1
  (231206, 41563)	1
  (231206, 47193)	1
  (231206, 47233)	1
  (231206, 47473)	1
  (231206, 47639)	2
  (231206, 49855)	1
  (231206, 51606)	1
  (231206, 52818)	1


In [17]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
from sklearn.metrics import roc_auc_score

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.930570850202


In [20]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['mony' 'worst' 'false' 'worthless' 'horribly' 'messing' 'unsatisfied'
 'junk' 'blacklist' 'superthin']

Largest Coefs:
['excelent' 'excelente' '4eeeks' 'exelente' 'efficient' 'excellent'
 'loving' 'pleasantly' 'loves' 'mn8k2ll']



## Tfidf

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names())

17951

In [22]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.926610066675


In [24]:
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['commenter' 'pthalo' 'warmness' 'storageso' 'aggregration' '1300'
 '625nits' 'a10' 'submarket' 'brawns']

Largest tfidf:
['defective' 'batteries' 'gooood' 'epic' 'luis' 'goood' 'basico'
 'aceptable' 'problems' 'excellant']



In [26]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs:\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['not' 'worst' 'useless' 'disappointed' 'terrible' 'return' 'waste' 'poor'
 'horrible' 'doesn']

Largest Coefs:
['love' 'great' 'excellent' 'perfect' 'amazing' 'awesome' 'perfectly'
 'easy' 'best' 'loves']


In [27]:
print(model.predict(vect.transform(['not an issue, phone is working', 'an issue, phone is not working'])))

[0 0]


## n-grams

In [28]:
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

len(vect.get_feature_names())

198917

In [30]:
model = LogisticRegression()
model.fit(vect.transform(X_train), y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.967135073021


In [34]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['no good' 'worst' 'junk' 'not good' 'not happy' 'horrible' 'garbage'
 'terrible' 'looks ok' 'nope']

Largest Coefs:
['not bad' 'excelent' 'excelente' 'excellent' 'perfect' 'no problems'
 'exelente' 'awesome' 'no issues' 'great']



In [35]:
print(model.predict(vect.transform(['not an issue, phone is working', 'an issue, phone is not working'])))

[1 0]
