In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('Amazon_Unlocked_Mobile.csv', engine='python')

df.head()

In [None]:
df.dropna(inplace=True)

df = df[df['Rating'] != 3]

# Encode 4s and 5s as 1 (rated positively)
# Encode 1s and 2s as 0 (rated poorly)

df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head(10)

In [3]:
df['Positively Rated'].mean()

0.7084798749511528

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], 
                                                    df['Positively Rated'], 
                                                    random_state=0)

# CountVectorizer

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train)

In [6]:
vect.get_feature_names()[::2000]

['00', 'cheesiness', 'flimsy', 'material', 'reply', 'two']

In [7]:
len(vect.get_feature_names())

10861

In [8]:
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<19192x10861 sparse matrix of type '<class 'numpy.int64'>'
	with 426894 stored elements in Compressed Sparse Row format>

In [9]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
from sklearn.metrics import roc_auc_score

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.95082056892779


In [11]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['mony' 'terrible' 'dieing' 'poor' 'worst' 'not' 'broken' 'dies' 'junk'
 'wasn']

Largest Coefs: 
['excellent' 'love' 'perfect' 'awesome' 'great' 'loves' 'excelente'
 'satisfied' 'amazing' 'loved']


In [12]:
print(model.predict(vect.transform(['the phone works perfectly',
                                    'an issue, phone is not working'])))

[1 0]
