In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix,make_scorer, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pickle
from google.colab import drive
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Mount Google Drive (Colab only) so the pickled review/feature table can be read.
drive.mount("/content/drive", force_remount=True)
INPUT_FILEBASE = "/content/drive/MyDrive/yelp_dataset_in/"
OUTPUT_FILEBASE = "/content/drive/MyDrive/yelp_dataset_out/"

# NOTE(security): unpickling executes arbitrary code from the file; only load
# pickles that this project produced itself.
pickle_name = "yelp_reviews_Electronics_categories_final.pickle"
df1 = pd.read_pickle("%s%s" % (OUTPUT_FILEBASE, pickle_name))
print(df1.shape)
print(df1.head())
print(df1["sentiment_"].unique())
# NOTE(review): in the printed head, token columns such as `love` are 0.0 even for
# rows whose review text contains that word -- confirm that the feature rows are
# aligned with the review rows upstream.

# Select the features by column NAME rather than by position. The first four
# columns of this frame are `usual`, `review_`, `polarity`, `sentiment_`, so a
# positional slice `iloc[:, 4:]` silently discards the token feature `usual`.
# `review_` is raw text, `sentiment_` is the label, and `polarity` is kept out of
# the features as before (presumably the label is derived from it -- confirm).
NON_FEATURE_COLUMNS = ["review_", "polarity", "sentiment_"]
X = df1.drop(columns=NON_FEATURE_COLUMNS)
y = df1.sentiment_
indices = df1.index
assert "sentiment_" not in X.columns, "label column leaked into the feature matrix"

# 80/20 split with a fixed seed; the row index is split alongside X and y so
# predictions can be traced back to the original reviews via itrain / itest.
X_train, X_test, y_train, y_test, itrain, itest = train_test_split(
    X, y, indices, train_size=0.8, random_state=7
)


Mounted at /content/drive
(10000, 17613)
   usual                                            review_  polarity  \
0    0.0  I usually love going to this t-mobile. The rep...  0.207273   
1    0.0  The store gave me misleading information.  One...  0.019444   
2    0.0  Nice, stand-alone T-Mobile in the parking lot ...  0.433333   
3    0.0  I get awesome service every time I come here. ...  0.221875   
4    0.0  Very busy I waited for 45 min before even bein...  0.300000   

          sentiment_  love   go  tmobil  repres  alway  nice  ...  200mbps  \
0  Slightly Positive   0.0  0.0     0.0     0.0    0.0   0.0  ...      0.0   
1  Slightly Negative   0.0  0.0     0.0     0.0    0.0   0.0  ...      0.0   
2           Positive   0.0  0.0     0.0     0.0    0.0   0.0  ...      0.0   
3  Slightly Positive   0.0  0.0     0.0     0.0    0.0   0.0  ...      0.0   
4  Slightly Positive   0.0  0.0     0.0     0.0    0.0   0.0  ...      0.0   

   50mbps  starlink  sparklightcar  10100  17044  d

In [None]:
# Logistic-regression baseline: standardize the features, then tune the inverse
# regularization strength C with 5-fold cross-validated accuracy.
steps = [
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(solver='lbfgs', max_iter=1000)),
]
pipeline = Pipeline(steps)
parameters = {'lr__C': [0.01, 0.1, 1, 10, 100]}

clf = GridSearchCV(pipeline, parameters, cv=5, scoring="accuracy")
clf.fit(X_train, y_train)
print(clf.best_params_)

# Evaluate the tuned model on the held-out split.
results = clf.predict(X_test)
test_accuracy = clf.score(X_test, y_test)
f1_accuracy, f1_accuracym, f1_accuracyw = (
    f1_score(y_test, results, average=avg)
    for avg in ('macro', 'micro', 'weighted')
)

for _label, _value in (
    ("Accuracy on test data: ", test_accuracy),
    ('F1 Score (macro): ', f1_accuracy),
    ('F1 Score (micro): ', f1_accuracym),
    ('F1 Score (weighted): ', f1_accuracyw),
):
    print(_label, _value)

{'lr__C': 0.01}
Accuracy on test data:  0.597
F1 Score (macro):  0.5910366394608606
F1 Score (micro):  0.597
F1 Score (weighted):  0.5964990122086132


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline tuned with 5-fold cross-validated accuracy.
# The forest is seeded so that the selected parameters and the reported scores
# are reproducible across kernel restarts.
steps = [('scaler', StandardScaler()), ('rf', RandomForestClassifier(random_state=7))]
pipeline = Pipeline(steps)

# `max_features='auto'` has been removed from scikit-learn and, for classifiers,
# was only ever an alias of 'sqrt' -- so the old grid compared 'sqrt' with itself
# and fails outright on current releases. Compare two genuinely different
# settings instead.
parameters = {
    'rf__n_estimators': [10, 20, 30, 40, 50],
    'rf__max_features': ['sqrt', 'log2'],
}
clf = GridSearchCV(pipeline, parameters, cv=5, scoring="accuracy")
clf.fit(X_train, y_train)

print(clf.best_params_)

# Evaluate the refit best pipeline on the held-out split.
results = clf.predict(X_test)
test_accuracy = clf.score(X_test, y_test)
f1_accuracy = f1_score(y_test, results, average='macro')
f1_accuracym = f1_score(y_test, results, average='micro')
f1_accuracyw = f1_score(y_test, results, average='weighted')
print("Accuracy on test data: ", test_accuracy)
print('F1 Score (macro): ', f1_accuracy)
print('F1 Score (micro): ', f1_accuracym)
print('F1 Score (weighted): ', f1_accuracyw)

Accuracy on test data:  0.6225
F1 Score (macro):  0.5939848272105531
F1 Score (micro):  0.6225
F1 Score (weighted):  0.6140657913939286


In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM on standardized features; C is tuned with 3-fold
# cross-validated accuracy.
steps = [
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='linear', gamma='auto', probability=False)),
]
pipeline = Pipeline(steps)
parameters = {'svc__C': [0.01, 0.1, 1]}

clf = GridSearchCV(pipeline, parameters, scoring="accuracy", cv=3)
clf.fit(X_train, y_train)

print(clf.best_params_)

# Held-out evaluation: accuracy plus F1 under three averaging schemes.
results = clf.predict(X_test)
test_accuracy = clf.score(X_test, y_test)
_f1_by_average = {
    avg: f1_score(y_test, results, average=avg)
    for avg in ('macro', 'micro', 'weighted')
}
f1_accuracy = _f1_by_average['macro']
f1_accuracym = _f1_by_average['micro']
f1_accuracyw = _f1_by_average['weighted']

print("Accuracy on test data: ", test_accuracy)
print('F1 Score (macro): ', f1_accuracy)
print('F1 Score (micro): ', f1_accuracym)
print('F1 Score (weighted): ', f1_accuracyw)

{'svc__C': 0.01}
Accuracy on test data:  0.619
F1 Score (macro):  0.6133826082582902
F1 Score (micro):  0.619
F1 Score (weighted):  0.6193411529481063


In [None]:
from sklearn.svm import SVC

# Default SVC fitted directly on the unscaled training features.
svmClassifier = SVC()
svmClassifier.fit(X_train, y_train)

predictions = svmClassifier.predict(X_test)

# Take the class labels from the fitted model and pass them to confusion_matrix
# explicitly, so the row/column names are guaranteed to match the matrix order
# (a hand-written label list is only right if it happens to equal the sorted
# classes, and breaks if the set of classes changes).
class_labels = svmClassifier.classes_
mat = confusion_matrix(y_test, predictions, labels=class_labels)
cm_df = pd.DataFrame(mat, index=class_labels, columns=class_labels)

# Draw on an explicit Axes so the heatmap always lands in the figure created here.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='g', ax=ax)
ax.set_xlabel('Predicted sentiment')
ax.set_ylabel('True sentiment')
ax.set_title('SVC confusion matrix (test split)')

# accuracy_score takes (y_true, y_pred).
print('Accuracy Score: ', accuracy_score(y_test, predictions))
plt.show()



Accuracy Score:  0.7165


<Figure size 720x720 with 0 Axes>

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting model tuned over the number of trees and the learning rate
# with 5-fold cross-validated accuracy. The estimator is seeded because
# max_features='sqrt' makes each split consider a random feature subset.
steps = [
    ('scaler', StandardScaler()),
    ('gbc', GradientBoostingClassifier(max_features='sqrt', random_state=7)),
]
pipeline = Pipeline(steps)
parameters = {
    'gbc__n_estimators': [10, 50, 100, 200, 500],
    'gbc__learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25],
}

gbc_search = GridSearchCV(pipeline, parameters, cv=5, scoring="accuracy")
gbc_search.fit(X_train, y_train)
print(gbc_search.best_params_)

# Use the pipeline that the search already refit on all of X_train with the
# winning parameters, instead of hard-coding values copied from an earlier run
# and training the model a second time. `clf` is still a fitted Pipeline.
clf = gbc_search.best_estimator_

results = clf.predict(X_test)
test_accuracy = clf.score(X_test, y_test)
# NOTE(review): column 1 is the probability of clf.classes_[1] only. The labels in
# this notebook have more than two sentiment classes, so this is not a
# "positive class" score -- confirm what downstream code expects from `probs`.
probs = clf.predict_proba(X_test)[:, 1]
f1_accuracy = f1_score(y_test, results, average='macro')
f1_accuracym = f1_score(y_test, results, average='micro')
f1_accuracyw = f1_score(y_test, results, average='weighted')
print("Accuracy on test data: ", test_accuracy)
print('F1 Score (macro): ', f1_accuracy)
print('F1 Score (micro): ', f1_accuracym)
print('F1 Score (weighted): ', f1_accuracyw)