In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pickle
from google.colab import drive
import pandas as pd

# Mount Google Drive (forcing a fresh mount) so the dataset folders are reachable.
drive.mount("/content/drive", force_remount=True)
INPUT_FILEBASE = "/content/drive/MyDrive/yelp_dataset_in/"
OUTPUT_FILEBASE = "/content/drive/MyDrive/yelp_dataset_out/"

# Read the pickled review frame from the output folder.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# that were produced by a trusted step of this project.
pickle_name = "yelp_reviews_Electronics_categories_final.pickle"
df1 = pd.read_pickle(f"{OUTPUT_FILEBASE}{pickle_name}")

# Sanity checks: frame dimensions, first rows, and the distinct labels.
print(df1.shape)
print(df1.head())
print(df1["sentiment_"].unique())

# Model input is the `review_` column, the target is the `sentiment_` column.
# The row index is carried through the split so that test rows can be traced
# back to their rows in df1.
X = df1["review_"]
y = df1["sentiment_"]
indices = df1.index

# 80/20 train/test split; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test, i_train, i_test = train_test_split(
    X,
    y,
    indices,
    train_size=0.8,
    random_state=7,
)

Mounted at /content/drive
(10000, 17613)
   usual                                            review_  polarity  \
0    0.0  I usually love going to this t-mobile. The rep...  0.207273   
1    0.0  The store gave me misleading information.  One...  0.019444   
2    0.0  Nice, stand-alone T-Mobile in the parking lot ...  0.433333   
3    0.0  I get awesome service every time I come here. ...  0.221875   
4    0.0  Very busy I waited for 45 min before even bein...  0.300000   

          sentiment_  love   go  tmobil  repres  alway  nice  ...  200mbps  \
0  Slightly Positive   0.0  0.0     0.0     0.0    0.0   0.0  ...      0.0   
1  Slightly Negative   0.0  0.0     0.0     0.0    0.0   0.0  ...      0.0   
2           Positive   0.0  0.0     0.0     0.0    0.0   0.0  ...      0.0   
3  Slightly Positive   0.0  0.0     0.0     0.0    0.0   0.0  ...      0.0   
4  Slightly Positive   0.0  0.0     0.0     0.0    0.0   0.0  ...      0.0   

   50mbps  starlink  sparklightcar  10100  17044  d

**Naive Bayes with CountVectorizer**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Two-stage pipeline: unigram + bigram counts (English stop words removed)
# feeding a multinomial Naive Bayes classifier.
steps = [
    ('vec', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('nb', MultinomialNB()),
]
pipeline = Pipeline(steps=steps)

# Hyper-parameter grid searched below.
# NOTE(review): per the scikit-learn docs a float `min_df` is a proportion of
# documents while an int is an absolute document count, so this list mixes
# both interpretations -- confirm that is intended.
parameters = {
    'vec__min_df': [0.01, 0.1, 1, 10, 100],
    'nb__alpha': [0.01, 0.1, 1, 10, 100],
}

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split, selecting by accuracy.
clf = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=5, scoring="accuracy")
clf.fit(X_train, y_train)

print(clf.best_params_)

# Evaluate the fitted search object on the held-out test split.
results = clf.predict(X_test)
test_accuracy = clf.score(X_test, y_test)
# Column 1 of the predicted class probabilities; not used again in this cell.
# NOTE(review): `sentiment_` has more than two labels, so this is the
# probability of a single class only -- confirm it is still needed.
probs = clf.predict_proba(X_test)[:, 1]

# F1 under the three averaging schemes, computed in macro/micro/weighted order.
f1_accuracy, f1_accuracym, f1_accuracyw = (
    f1_score(y_test, results, average=mode)
    for mode in ('macro', 'micro', 'weighted')
)

print("Accuracy on test data: ", test_accuracy)
print('F1 Score (macro): ', f1_accuracy)
print('F1 Score (micro): ', f1_accuracym)
print('F1 Score (weighted): ', f1_accuracyw)

{'nb__alpha': 0.01, 'vec__min_df': 10}
Accuracy on test data:  0.6135
F1 Score (macro):  0.5980484065627263
F1 Score (micro):  0.6135
F1 Score (weighted):  0.6117488859877542


**Naive Bayes with TF-IDF (TfidfVectorizer)**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Two-stage pipeline: TF-IDF weighted unigrams + bigrams (English stop words
# removed) feeding a multinomial Naive Bayes classifier.
steps = [
    ('vec', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('nb', MultinomialNB()),
]
pipeline = Pipeline(steps=steps)

# Hyper-parameter grid searched below.
# NOTE(review): per the scikit-learn docs a float `min_df` is a proportion of
# documents while an int is an absolute document count, so this list mixes
# both interpretations -- confirm that is intended.
parameters = {
    'vec__min_df': [0.01, 0.1, 1, 10, 100],
    'nb__alpha': [0.01, 0.1, 1, 10, 100],
}

# Exhaustive search over the grid with 10-fold cross-validation on the
# training split, selecting by accuracy.
# NOTE(review): the CountVectorizer cell uses cv = 5; confirm the different
# fold count here is deliberate before comparing the two models.
clf = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=10, scoring="accuracy")
clf.fit(X_train, y_train)

print(clf.best_params_)

# Evaluate the fitted search object on the held-out test split.
results = clf.predict(X_test)
test_accuracy = clf.score(X_test, y_test)
# Column 1 of the predicted class probabilities; not used again in this cell.
# NOTE(review): `sentiment_` has more than two labels, so this is the
# probability of a single class only -- confirm it is still needed.
probs = clf.predict_proba(X_test)[:, 1]

# F1 under the three averaging schemes, computed in macro/micro/weighted order.
f1_accuracy, f1_accuracym, f1_accuracyw = (
    f1_score(y_test, results, average=mode)
    for mode in ('macro', 'micro', 'weighted')
)

print("Accuracy on test data: ", test_accuracy)
print('F1 Score (macro): ', f1_accuracy)
print('F1 Score (micro): ', f1_accuracym)
print('F1 Score (weighted): ', f1_accuracyw)

{'nb__alpha': 0.1, 'vec__min_df': 10}
Accuracy on test data:  0.6085
F1 Score (macro):  0.5553626505573198
F1 Score (micro):  0.6085
F1 Score (weighted):  0.5913411714973592
