In [2]:
import pandas as pd
import numpy as np

## Loading the dataset

In [23]:
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')

df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


## Dealing with dataset

In [24]:
#drop missing values
df.dropna(inplace = True)

#removing any neutral ratings i.e. equal to 3
df = df[df['Rating']!=3]

#making positive reviews(5s and 4s) as 1
#making negative reviews(1s and 2s) as 0
df['Positive review'] = np.where(df['Rating']>3, 1, 0)
df['Positive review'].head(10)

0     1
1     1
2     1
3     1
4     1
5     0
6     0
7     0
8     1
11    1
Name: Positive review, dtype: int32

In [25]:
df['Positive review'].mean()

0.7482686025879323

## Spliting dataset into train and test set

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Positive review'], random_state=0)
print('X_train first entry:\n\n', X_train.iloc[0])
print('\n\nX_train shape: ', X_train.shape)

X_train first entry:

 I bought a BB Black and was deliveried a White BB.Really is not a serious provider...Next time is better to cancel the order.


X_train shape:  (231207,)


## CountVectorizer

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams and 2-grams
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

198917

In [30]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['no good' 'not happy' 'not worth' 'worst' 'junk' 'not satisfied'
 'garbage' 'not good' 'terrible' 'defective']

Largest Coefs: 
['excelent' 'excelente' 'not bad' 'excellent' 'exelente' 'perfect'
 'awesome' 'no problems' 'no issues' 'perfecto']


## Training the model

In [28]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression().fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


AUC:  0.9599601154447429


## Predicting the sentiment of new reviews

In [36]:
#output of '1' indicates positive review
#output of '0' indicates negative review
print(model.predict(vect.transform(['This phone is fabulous'])))
print(model.predict(vect.transform(['Worst phone ever in the world'])))

[1]
[0]


In [37]:
#handling n-gram
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[1 0]
