In [1]:
import pandas as pd
import numpy as np

df=pd.read_csv("C:\\Users\\Venky\\Desktop\\Amazon_Unlocked_Mobile.csv")
df = df.sample(frac=0.1, random_state=10)
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0


In [2]:
df.dropna(inplace=True)

#Remove the neutral ratings
df=df[df['Rating']!=3]

#Encode 4's and 5's as 1
#Encode 2's and 1's as 0
df['Positive Ratings']=np.where(df['Rating']> 3,1,0)
df.head(7)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positive Ratings
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0,1
100311,Blackberry Torch 2 9810 Unlocked Phone with 1....,BlackBerry,77.49,5,I am pleased with this Blackberry phone! The p...,0.0,1
251669,Motorola Moto E (1st Generation) - Black - 4 G...,Motorola,89.99,5,"Great product, best value for money smartphone...",0.0,1


In [3]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df['Reviews'],df['Positive Ratings'],random_state=0)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
vect=CountVectorizer().fit(X_train)

#we can get feature names by using get_feature_names
vect.get_feature_names()[::2000]

['00',
 'arroja',
 'comapañias',
 'dvds',
 'golden',
 'lands',
 'oil',
 'razonable',
 'smallsliver',
 'tweak']

In [5]:
X_train_vectorized=vect.transform(X_train)
X_train_vectorized

<23052x19601 sparse matrix of type '<class 'numpy.int64'>'
	with 613289 stored elements in Compressed Sparse Row format>

In [6]:
from sklearn.linear_model import LogisticRegression

logreg=LogisticRegression().fit(X_train_vectorized,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [7]:
X_test_vectorized=vect.transform(X_test)
logreg.score(X_test_vectorized,y_test)

0.9319453480806766

In [8]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,logreg.predict(vect.transform(X_test)))

0.8971767259522229

In [9]:
# These reviews are treated the same by our current model
print(logreg.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[0 0]


In [11]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names()

# Sort the coefficients from the model
sorted_coef_index = logreg.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

SyntaxError: invalid syntax (<ipython-input-11-eb1dbf0daa04>, line 5)

## TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer().fit(X_train)

In [13]:
tfidf_log_reg=LogisticRegression().fit(tfidf.transform(X_train),y_train)
print("AUC: ",roc_auc_score(y_test,tfidf_log_reg.predict(tfidf.transform(X_test))))

AUC:  0.8890901979167192


In [14]:
# These reviews are treated the same by our current model
print(tfidf_log_reg.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[0 0]


## ngrams

In [15]:
# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams and 2-grams
ngrams_vect=CountVectorizer(min_df=5,ngram_range=(1,2)).fit(X_train)
X_train_ngram_vectorized=ngrams_vect.transform(X_train)

In [16]:
ngram_log_reg=LogisticRegression().fit(X_train_ngram_vectorized,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [17]:
print("AUC :",roc_auc_score(y_test,ngram_log_reg.predict(ngrams_vect.transform(X_test))))

AUC : 0.9104640361714084


In [18]:
# These reviews are treated the same by our current model
print(ngram_log_reg.predict(ngrams_vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[1 0]


In [19]:
# These reviews are now correctly identified
print(ngram_log_reg.predict(ngrams_vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[1 0]


In [27]:
ngram_log_reg.predict_proba(ngrams_vect.transform(X_test))

array([[0.02782631, 0.97217369],
       [0.00140865, 0.99859135],
       [0.01208278, 0.98791722],
       ...,
       [0.00985677, 0.99014323],
       [0.24862111, 0.75137889],
       [0.84533296, 0.15466704]])

In [28]:
ngram_log_reg.predict(ngrams_vect.transform(X_test))

array([1, 1, 1, ..., 1, 1, 0])

In [29]:
X_test[0]

"I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!"

In [30]:
df[df['Reviews']==X_test[0]]['Review Votes']

0    1.0
Name: Review Votes, dtype: float64