In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('Reviews.csv')
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [2]:
df.dropna(inplace=True)
df[df['Score'] != 3]
df['Positivity'] = np.where(df['Score'] > 3, 1, 0)
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Positivity
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,0
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,0
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,1


In [6]:
from sklearn.model_selection import train_test_split
X=df['Text']
y = df['Positivity']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

print('X_train first entry: \n\n', X_train[0])
print('\n\nX_train shape: ', X_train.shape)

X_train first entry: 

 I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.


X_train shape:  (426308,)


In [13]:

#Bag of words - generating count of each word

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train)
vect.get_feature_names()[::8000]


['00',
 'antes',
 'b0054phg2q',
 'causal',
 'deadman',
 'exerice',
 'guility',
 'kary',
 'miscarriage',
 'paternal',
 'refigerate',
 'slurping',
 'thwart',
 'windex']

In [19]:
from sklearn.linear_model import LogisticRegression

X_train_vectorized = vect.transform(X_train)


# Logistics Regression works well for high dimensional sparse data.
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [22]:
from sklearn.metrics import roc_auc_score

#This type of graph is called a Receiver Operating Characteristic curve (or ROC curve.)
#It is a plot of the true positive rate against the false positive rate for the different possible cutpoints of a diagnostic test.


predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.8402824658587869


In [23]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs: 
['downhill' 'weiner' 'amendment' 'cardboardy' 'ick' 'unappealing' 'worst'
 'tastless' 'limpest' 'redeeming']

Largest Coefs: 
['blowout' 'concur' 'hears' 'pleasantly' 'emeraldforest' 'tribute'
 'bertie' 'addicting' 'hooked' 'skeptical']



# Tf-idf allows us to weight terms based on how important they are to a document.

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(min_df = 5).fit(X_train)
len(vect.get_feature_names())

36638

In [26]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))



AUC:  0.8286462422705612


 # #AUC score dropped by almost 4%.

In [27]:
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest Tfidf: \n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest Tfidf: \n{}\n'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest Tfidf: 
['furnishings' '0373' 'amortization' '0004' '1534' '0377' '368' 'proline'
 'serine' 'isoleucine2']

Largest Tfidf: 
['carmel' '98' 'aaa' 'good' 'filler' 'word' 'love' 'mmm' 'awesome' 'yum']



In [None]:
lst=[]
for i in range(10):
    review = input("Say something aboout your hostel food - ")
    lst.append(review)
print(model.predict(vect.transform(lst)))



In [None]:
Until now the reviews are judged only one wrd at a time. So, reviews like 'not liked the food' will be considered poitive because of the word liked.

# n-grams

In [None]:
vect = CountVectorizer(min_df = 5, ngram_range = (1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
len(vect.get_feature_names())

In [None]:
val_predictions = model.predict(X_train_vectorized)

from sklearn.metrics import mean_absolute_error

val_mae = mean_absolute_error(val_predictions, y_test)



In [None]:
model.fit(X_train_vectorized, y_train)

#Checking AOC for X-test

predictions = model.predict(vect.transform(X_test))


print('AUC: ', roc_auc_score(y_test, predictions) , 'MAE   = ', val_mae)

In [None]:
lst=[]
for i in range(10):
    review = input("Say something aboout your hostel food - ")
    lst.append(review)
print(model.predict(vect.transform(lst)))