## MNB and SVM for Fake Review Detection



In [None]:
import pandas as pd
# Read the tab-separated review data set from the Drive path below.
# NOTE(review): `input` shadows the Python builtin of the same name; it is kept
# because the following cells refer to the frame by this name.
input = pd.read_csv(
    "/content/drive/MyDrive/deception_data_converted_final(1).tsv",
    sep='\t',
)
# Show the first rows as the cell output.
input.head()

Unnamed: 0,lie,sentiment,review
0,f,n,"'Mike\'s Pizza High Point, NY Service was very..."
1,f,n,'i really like this buffet restaurant in Marsh...
2,f,n,"'After I went shopping with some of my friend,..."
3,f,n,'Olive Oil Garden was very disappointing. I ex...
4,f,n,'The Seven Heaven restaurant was never known f...


In [None]:
import re

# Character class listing every punctuation mark to delete. Compiled once so
# that applying the cleaner to a whole column does not look the pattern up
# again for each row.
_PUNCTUATION_RE = re.compile(r"[!\"#\$%&\'\(\)\*\+,-\./:;<=>\?@\[\\\]\^_`{\|}~]")


def remove_punctuations(text):
  """Return ``text`` with every punctuation character removed.

  The characters dropped are ``!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~``. Nothing is
  inserted in their place, so ``"Mike's"`` becomes ``"Mikes"``; whitespace,
  letters and digits are left as they are.

  Parameters:
    text: the string to clean.

  Returns:
    A new string without punctuation.

  Raises:
    TypeError: if ``text`` is not a string (e.g. a missing value).
  """
  # The compiled pattern's .sub() replaces all matches; this avoids passing
  # `count` / `flags` to re.sub positionally, which newer Python versions
  # deprecate. No flags are needed because the pattern contains no anchors.
  return _PUNCTUATION_RE.sub("", text)

# Replace each review with its punctuation-free version (overwrites the column).
input['review'] = input['review'].map(remove_punctuations)

In [None]:
# Preview the first five rows after punctuation removal.
input.head(n=5)

Unnamed: 0,lie,sentiment,review
0,f,n,Mikes Pizza High Point NY Service was very slo...
1,f,n,i really like this buffet restaurant in Marsha...
2,f,n,After I went shopping with some of my friend w...
3,f,n,Olive Oil Garden was very disappointing I expe...
4,f,n,The Seven Heaven restaurant was never known fo...


# Sentiment Analysis

In [None]:
from sklearn.model_selection import train_test_split

# X: cleaned review texts, y: sentiment labels, both as plain arrays.
X, y = (input[column].values for column in ('review', 'sentiment'))

# Hold out 40% of the reviews for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [None]:
import numpy as np

# Print the label distribution of the training split, then of the test split.
# Each printout is a 2-row array: labels on the first row, their counts on the second.
for split_labels in (y_train, y_test):
    unique, counts = np.unique(split_labels, return_counts=True)
    print(np.asarray((unique, counts)))

[['n' 'p']
 [28 27]]
[['n' 'p']
 [18 19]]


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Settings shared by all four vectorizers: latin-1 decoding, keep only terms
# that occur in at least 5 documents, and drop English stop words.
_shared_vectorizer_args = dict(encoding='latin-1', min_df=5, stop_words='english')

# unigram presence/absence (boolean) features
unigram_bool_vectorizer = CountVectorizer(binary=True, **_shared_vectorizer_args)

# unigram raw term-frequency features
unigram_count_vectorizer = CountVectorizer(binary=False, **_shared_vectorizer_args)

# unigram + bigram raw term-frequency features
gram12_count_vectorizer = CountVectorizer(ngram_range=(1, 2), **_shared_vectorizer_args)

# unigram TF-IDF weighted features
unigram_tfidf_vectorizer = TfidfVectorizer(use_idf=True, **_shared_vectorizer_args)

# Unigram Boolean Vectorizer

## MNB

In [None]:
# Learn the vocabulary on the training reviews only, then encode the test
# reviews with that same vocabulary.
X_train_vec = unigram_bool_vectorizer.fit_transform(X_train)
X_test_vec = unigram_bool_vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB

# initialize the MNB model
nb_clf = MultinomialNB()

# use the training data to train the MNB model (fitted once, reused below)
nb_clf.fit(X_train_vec, y_train)

print("Accuracy score on test data set:", nb_clf.score(X_test_vec, y_test))

# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
# The model is already fitted above, so predict directly instead of fitting
# it a second time on the same data.
y_pred = nb_clf.predict(X_test_vec)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# print per-class precision and recall, then the full classification report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))

from sklearn.metrics import classification_report
# Display names for the report rows; the labels in this data set are 'n' and 'p'.
target_names = ['n', 'p']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.7567567567567568
[[15  3]
 [ 6 13]]
[0.71428571 0.8125    ]
[0.83333333 0.68421053]
              precision    recall  f1-score   support

           n       0.71      0.83      0.77        18
           p       0.81      0.68      0.74        19

    accuracy                           0.76        37
   macro avg       0.76      0.76      0.76        37
weighted avg       0.76      0.76      0.76        37



In [None]:
# cross validation

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation on the full data set; the pipeline re-fits the
# vectorizer together with the classifier on each fold's training part.
# NOTE(review): this vectorizer uses the default min_df / stop_words, unlike
# unigram_bool_vectorizer defined earlier - confirm that this is intended.
pipeline_steps = [
    ('vect', CountVectorizer(encoding='latin-1', binary=True)),
    ('nb', MultinomialNB()),
]
nb_clf_pipe = Pipeline(pipeline_steps)
scores = cross_val_score(nb_clf_pipe, X, y, cv=5)

# Report the mean score over the folds.
avg = sum(scores) / len(scores)
print(avg)

0.8035087719298245


In [None]:
# Error analysis: list the misclassified test reviews, first those labelled 'n'
# but predicted 'p', then those labelled 'p' but predicted 'n'. Each group is
# followed by its count. y_pred holds the predictions of the classifier cell
# that was run most recently.
for actual_label, predicted_label in (('n', 'p'), ('p', 'n')):
    err_cnt = 0
    for review, truth, guess in zip(X_test, y_test, y_pred):
        if truth == actual_label and guess == predicted_label:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

This place used to be great I cant believe its current state Instead of the cool dimlylit lounge that I was used to I was in a cheap smelly bar The music has no soul the bartender is mean This place no longer exudes a welcoming spirit The crowd is awkward and old I want my old hangout back
the staff at this restaurant is very unfriendly the waitress for our table is extremely rude we need to wait for one hour for our order to come the place is noisy and the food isnt that good
Olive Oil Garden was very disappointing I expect good food and good service at least when I go out to eat The meal was cold when we got it and the waitor had no manners whatsoever Dont go to the Olive Oil Garden 
errors: 3
This place was one of the best restaurant I have been The price is little expensive but the food and the service is best around the area I went here with my family and we ordered 4 dishes They were all well cooked and their taste were nicely balanced Waiters came when we needed them without hav

In [None]:
# Rank the vocabulary by the difference between the MNB log-probabilities of
# row 0 and row 1 of feature_log_prob_. Row 0 is read as the negative class and
# row 1 as the positive class.
# NOTE(review): this presumes nb_clf.classes_ is ['n', 'p'] - confirm.
features = unigram_bool_vectorizer.get_feature_names_out()
negative_cond_prob = nb_clf.feature_log_prob_[0]
positive_cond_prob = nb_clf.feature_log_prob_[1]

# Element-wise difference: negative values lean positive, positive values lean negative.
log_ratios = list(negative_cond_prob - positive_cond_prob)

# Ascending sort: the most "positive" terms come first, the most "negative" last.
feature_ranks = sorted(zip(log_ratios, features))
positive_10 = feature_ranks[:10]
negative_10 = feature_ranks[-10:]

for heading, ranked_terms in (("Positive words", positive_10), ("Negative words", negative_10)):
    print(heading)
    for entry in ranked_terms:
        print(entry)
    print()

Positive words
(-2.3653211431988264, 'amazing')
(-1.9598560350906618, 'need')
(-1.5543909269824971, 'best')
(-1.5543909269824971, 'fresh')
(-1.4208595343579749, 'friendly')
(-1.4208595343579749, 'nice')
(-1.084387297736762, 'prices')
(-1.084387297736762, 'sauce')
(-0.8612437464225517, 'great')
(-0.7277123537980295, 'dining')

Negative words
(0.7481941660115483, 'wasnt')
(0.9305157228055023, 'table')
(0.9305157228055028, 'went')
(1.0846664026327613, 'bad')
(1.2181977952572836, 'minutes')
(1.6236629033654477, 'came')
(1.7778135831927067, 'terrible')
(1.7778135831927067, 'took')
(1.911344975817229, 'said')
(2.0291280114736128, 'asked')



## SVM

In [None]:
from sklearn.svm import LinearSVC

# initialize the LinearSVC model
# random_state is pinned because the solver draws on a pseudo-random number
# generator; without a seed, repeated runs can produce slightly different weights.
svm_clf = LinearSVC(C=1, random_state=0)

# use the training data to train the model
# (X_train_vec / X_test_vec come from the vectorization cell run just before this one)
svm_clf.fit(X_train_vec, y_train)

# test the classifier on the test data set, print accuracy score
print("Accuracy score on test data set:", svm_clf.score(X_test_vec, y_test))

# print confusion matrix (row: ground truth; col: prediction) and classification report

from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print()

from sklearn.metrics import classification_report
# Display names for the report rows; the labels in this data set are 'n' and 'p'.
target_names = ['n', 'p']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.6756756756756757
[[13  5]
 [ 7 12]]

              precision    recall  f1-score   support

           n       0.65      0.72      0.68        18
           p       0.71      0.63      0.67        19

    accuracy                           0.68        37
   macro avg       0.68      0.68      0.68        37
weighted avg       0.68      0.68      0.68        37



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# Metrics collected for every fold (on both the training and the test part).
scoring = ['accuracy', 'f1_macro', 'f1_micro', 'precision_macro', 'recall_macro']

# Vectorizer + LinearSVC are re-fitted together inside each of the 5 folds.
# random_state is pinned so the cross-validation scores are repeatable.
# NOTE(review): this vectorizer uses the default min_df / stop_words, unlike
# unigram_bool_vectorizer defined earlier - confirm that this is intended.
svm_clf_pipe = Pipeline([
    ('vect', CountVectorizer(encoding='latin-1', binary=True)),
    ('svm', LinearSVC(C=1, random_state=0)),
])
scores = cross_validate(svm_clf_pipe, X, y, cv=5, scoring=scoring, return_train_score=True)

# retrieve scores from a metric
def get_metric_scores(scores, metric, train_or_test, verbose=False):
    """Print one cross-validation metric: its key, optionally the per-fold
    values, and the mean over the folds.

    Parameters
    ----------
    scores : mapping
        Looked up with the key ``"<train_or_test>_<metric>"``; the value must
        be a non-empty sequence of numbers (e.g. the result of ``cross_validate``).
    metric : str
        Metric name, e.g. ``'accuracy'``.
    train_or_test : str
        Key prefix, e.g. ``'train'`` or ``'test'``.
    verbose : bool, default False
        When true, the individual per-fold scores are printed as well.

    Returns
    -------
    None
        The report is written with ``print``; the mean is shown with three decimals.

    Raises
    ------
    KeyError
        If ``scores`` has no entry for the composed key.
    """
    metric_name = f"{train_or_test}_{metric}"
    print(metric_name)

    metric_scores = scores[metric_name]
    if verbose:
        print(metric_scores)
    avg = sum(metric_scores) / len(metric_scores)
    print('average')
    print(f"{avg:.3f}")

# retrieve test accuracy scores
# get_metric_scores prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
get_metric_scores(scores, 'accuracy', 'test', verbose=True)

test_accuracy
[0.84210526 0.84210526 0.77777778 0.83333333 0.83333333]
average
0.826
None


In [None]:
# Error analysis: list the misclassified test reviews, first those labelled 'n'
# but predicted 'p', then those labelled 'p' but predicted 'n'. Each group is
# followed by its count. y_pred holds the predictions of the classifier cell
# that was run most recently.
for actual_label, predicted_label in (('n', 'p'), ('p', 'n')):
    err_cnt = 0
    for review, truth, guess in zip(X_test, y_test, y_pred):
        if truth == actual_label and guess == predicted_label:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

Carlos Plate Shack was the worst dining experience of my life Although my Southern Comfort Plate sounded to die for the staff was extremely unhelpful at every turn We started off with drinks I had a sick Loganberry milkshake and my friends had fresh brewed but bland iced tea the ice likely melted and diluted Eventually our server returned a half hour later to take our orders I had the aforementioned Southern Comfort Plate while my friends ordered the Buffalo Chicken Plate and the Hawaiian Plate Lunch The Southern Comfort Plate came out first a good 15 minutes before the others and was extremely greasy The other 2 ended up being nearly room temperature when they came out Our server failed to return again to check on us until she brought our check rather abruptly We want to give this place a chance but its rather difficult to subject ourselves to such brutal service and pay money
the staff at this restaurant is very unfriendly the waitress for our table is extremely rude we need to wait 

In [None]:
# Top 10 features
# Pair every vocabulary term with its LinearSVC weight and sort ascending, so
# the most negative weights come first and the most positive ones last.
# NOTE(review): reading negative weights as 'n' and positive as 'p' presumes
# svm_clf.classes_ is ['n', 'p'] - confirm.
weights = svm_clf.coef_[0]
vocabulary = unigram_bool_vectorizer.get_feature_names_out()
feature_ranks = sorted(zip(weights, vocabulary))

negative_10 = feature_ranks[:10]   # ten lowest weights
positive_10 = feature_ranks[-10:]  # ten highest weights

for heading, ranked_terms in (("Negative words", negative_10), ("Positive words", positive_10)):
    print(heading)
    for entry in ranked_terms:
        print(entry)
    print()

Negative words
(-0.7384272170916907, 'said')
(-0.6951709541180632, 'asked')
(-0.6675277516753709, 'service')
(-0.5941878360098103, 'went')
(-0.5454756660166099, 'bad')
(-0.48024927398552053, 'restaurant')
(-0.4432394110496981, 'salad')
(-0.43210125490870066, 'dining')
(-0.40313206674786795, 'dine')
(-0.33664336745914003, 'served')

Positive words
(0.29523827386623713, 'life')
(0.3153637579778321, 'nice')
(0.3256870066232356, 'time')
(0.37196779864809654, 'prices')
(0.4627389246943684, 'amazing')
(0.49089923723518547, 'waiters')
(0.7054881439793739, 'high')
(0.9943319447218588, 'need')
(1.0257639354556223, 'fresh')
(1.0584907189356194, 'best')



# Unigram Term Frequency Vectorizer

## MNB

In [None]:
# Learn the vocabulary on the training reviews only, then encode the test
# reviews with that same vocabulary.
X_train_vec = unigram_count_vectorizer.fit_transform(X_train)
X_test_vec = unigram_count_vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB

# initialize the MNB model
nb_clf = MultinomialNB()

# use the training data to train the MNB model (fitted once, reused below)
nb_clf.fit(X_train_vec, y_train)

print("Accuracy score on test data set:", nb_clf.score(X_test_vec, y_test))

# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
# The model is already fitted above, so predict directly instead of fitting
# it a second time on the same data.
y_pred = nb_clf.predict(X_test_vec)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# print per-class precision and recall, then the full classification report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))

from sklearn.metrics import classification_report
# Display names for the report rows; the labels in this data set are 'n' and 'p'.
target_names = ['n', 'p']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.8108108108108109
[[14  4]
 [ 3 16]]
[0.82352941 0.8       ]
[0.77777778 0.84210526]
              precision    recall  f1-score   support

           n       0.82      0.78      0.80        18
           p       0.80      0.84      0.82        19

    accuracy                           0.81        37
   macro avg       0.81      0.81      0.81        37
weighted avg       0.81      0.81      0.81        37



In [None]:
# cross validation

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation on the full data set; the pipeline re-fits the
# vectorizer together with the classifier on each fold's training part.
# NOTE(review): this vectorizer uses the default min_df / stop_words, unlike
# unigram_count_vectorizer defined earlier - confirm that this is intended.
pipeline_steps = [
    ('vect', CountVectorizer(encoding='latin-1', binary=False)),
    ('nb', MultinomialNB()),
]
nb_clf_pipe = Pipeline(pipeline_steps)
scores = cross_val_score(nb_clf_pipe, X, y, cv=5)

# Report the mean score over the folds.
avg = sum(scores) / len(scores)
print(avg)

0.8690058479532163


In [None]:
# Error analysis: list the misclassified test reviews, first those labelled 'n'
# but predicted 'p', then those labelled 'p' but predicted 'n'. Each group is
# followed by its count. y_pred holds the predictions of the classifier cell
# that was run most recently.
for actual_label, predicted_label in (('n', 'p'), ('p', 'n')):
    err_cnt = 0
    for review, truth, guess in zip(X_test, y_test, y_pred):
        if truth == actual_label and guess == predicted_label:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

I went there with two friends at 6pm Long queue was there But it didnt take us long to wait The waiter was nice but worked in a hurry We ordered Todays Special some drinks and two icecreams I had a steak a little bit too salty but acceptable My friends didnt like their lamb chop and cod filet that much It costed us almost 100 Not worth it Will not visit there any more
the staff at this restaurant is very unfriendly the waitress for our table is extremely rude we need to wait for one hour for our order to come the place is noisy and the food isnt that good
This diner was not at all up to par Ive been to many diners and get eggs benedict sometimes There was nacho cheese on my eggs and a plateful of watery runny eggs And it smelled like smoke And there was no heat in the dead of winter Their prices are not ANYWHERE near what is reasonable Cool mom  pop place but terrible food smell and prices
Olive Oil Garden was very disappointing I expect good food and good service at least when I go ou

In [None]:
# Rank the vocabulary by the difference between the MNB log-probabilities of
# row 0 and row 1 of feature_log_prob_. Row 0 is read as the negative class and
# row 1 as the positive class.
# NOTE(review): this presumes nb_clf.classes_ is ['n', 'p'] - confirm.
features = unigram_count_vectorizer.get_feature_names_out()
negative_cond_prob = nb_clf.feature_log_prob_[0]
positive_cond_prob = nb_clf.feature_log_prob_[1]

# Element-wise difference: negative values lean positive, positive values lean negative.
log_ratios = list(negative_cond_prob - positive_cond_prob)

# Ascending sort: the most "positive" terms come first, the most "negative" last.
feature_ranks = sorted(zip(log_ratios, features))
positive_10 = feature_ranks[:10]
negative_10 = feature_ranks[-10:]

for heading, ranked_terms in (("Positive words", positive_10), ("Negative words", negative_10)):
    print(heading)
    for entry in ranked_terms:
        print(entry)
    print()

Positive words
(-2.6969096578540186, 'amazing')
(-2.0037624772940736, 'need')
(-1.8702310846695505, 'best')
(-1.8214409205001187, 'fresh')
(-1.4647659765613863, 'friendly')
(-1.4647659765613863, 'nice')
(-1.3106152967341282, 'prices')
(-1.0593008684532217, 'sauce')
(-0.9051501886259636, 'definitely')
(-0.7716187960014409, 'dining')

Negative words
(0.8866092806020918, 'table')
(1.0872799760642424, 'salad')
(1.1742913530538721, 'bad')
(1.1742913530538721, 'wasnt')
(1.492745084172407, 'minutes')
(1.7339071409892952, 'came')
(1.8674385336138175, 'said')
(1.8674385336138175, 'terrible')
(1.9852215692702013, 'took')
(2.0905820849280277, 'asked')



## SVM

In [None]:
from sklearn.svm import LinearSVC

# initialize the LinearSVC model
# random_state is pinned because the solver draws on a pseudo-random number
# generator; without a seed, repeated runs can produce slightly different weights.
svm_clf = LinearSVC(C=1, random_state=0)

# use the training data to train the model
# (X_train_vec / X_test_vec come from the vectorization cell run just before this one)
svm_clf.fit(X_train_vec, y_train)

# test the classifier on the test data set, print accuracy score
print("Accuracy score on test data set:", svm_clf.score(X_test_vec, y_test))

# print confusion matrix (row: ground truth; col: prediction) and classification report

from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print()

from sklearn.metrics import classification_report
# Display names for the report rows; the labels in this data set are 'n' and 'p'.
target_names = ['n', 'p']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.7837837837837838
[[15  3]
 [ 5 14]]

              precision    recall  f1-score   support

           n       0.75      0.83      0.79        18
           p       0.82      0.74      0.78        19

    accuracy                           0.78        37
   macro avg       0.79      0.79      0.78        37
weighted avg       0.79      0.78      0.78        37



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# Metrics collected for every fold (on both the training and the test part).
scoring = ['accuracy', 'f1_macro', 'f1_micro', 'precision_macro', 'recall_macro']

# Vectorizer + LinearSVC are re-fitted together inside each of the 5 folds.
# random_state is pinned so the cross-validation scores are repeatable.
# NOTE(review): this vectorizer uses the default min_df / stop_words, unlike
# unigram_count_vectorizer defined earlier - confirm that this is intended.
svm_clf_pipe = Pipeline([
    ('vect', CountVectorizer(encoding='latin-1', binary=False)),
    ('svm', LinearSVC(C=1, random_state=0)),
])
scores = cross_validate(svm_clf_pipe, X, y, cv=5, scoring=scoring, return_train_score=True)

# retrieve scores from a metric
# NOTE(review): a function with this name is also defined in an earlier SVM
# cross-validation cell; consider keeping a single definition.
def get_metric_scores(scores, metric, train_or_test, verbose=False):
    """Print one cross-validation metric: its key, optionally the per-fold
    values, and the mean over the folds.

    Parameters
    ----------
    scores : mapping
        Looked up with the key ``"<train_or_test>_<metric>"``; the value must
        be a non-empty sequence of numbers (e.g. the result of ``cross_validate``).
    metric : str
        Metric name, e.g. ``'accuracy'``.
    train_or_test : str
        Key prefix, e.g. ``'train'`` or ``'test'``.
    verbose : bool, default False
        When true, the individual per-fold scores are printed as well.

    Returns
    -------
    None
        The report is written with ``print``; the mean is shown with three decimals.

    Raises
    ------
    KeyError
        If ``scores`` has no entry for the composed key.
    """
    metric_name = f"{train_or_test}_{metric}"
    print(metric_name)

    metric_scores = scores[metric_name]
    if verbose:
        print(metric_scores)
    avg = sum(metric_scores) / len(metric_scores)
    print('average')
    print(f"{avg:.3f}")

# retrieve test accuracy scores
# get_metric_scores prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
get_metric_scores(scores, 'accuracy', 'test', verbose=True)

test_accuracy
[0.78947368 0.78947368 0.72222222 0.77777778 0.83333333]
average
0.782
None


In [None]:
# Error analysis: list the misclassified test reviews, first those labelled 'n'
# but predicted 'p', then those labelled 'p' but predicted 'n'. Each group is
# followed by its count. y_pred holds the predictions of the classifier cell
# that was run most recently.
for actual_label, predicted_label in (('n', 'p'), ('p', 'n')):
    err_cnt = 0
    for review, truth, guess in zip(X_test, y_test, y_pred):
        if truth == actual_label and guess == predicted_label:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

the staff at this restaurant is very unfriendly the waitress for our table is extremely rude we need to wait for one hour for our order to come the place is noisy and the food isnt that good
Olive Oil Garden was very disappointing I expect good food and good service at least when I go out to eat The meal was cold when we got it and the waitor had no manners whatsoever Dont go to the Olive Oil Garden 
The service was way below average and we had to wait 10 minutes before we could be seated Then had to wait 20 minutes before the menu was shown to us Let me tell you that the food at Kitty Hoynes Irish Pub  Restaurant was terrible The vegies were under cooked and the lamb was over cooked the bread was dry and the beer was flat Need I say more
errors: 3
This place was one of the best restaurant I have been The price is little expensive but the food and the service is best around the area I went here with my family and we ordered 4 dishes They were all well cooked and their taste were nicely

In [None]:
# Top 10 features
# Pair every vocabulary term with its LinearSVC weight and sort ascending, so
# the most negative weights come first and the most positive ones last.
# NOTE(review): reading negative weights as 'n' and positive as 'p' presumes
# svm_clf.classes_ is ['n', 'p'] - confirm.
weights = svm_clf.coef_[0]
vocabulary = unigram_count_vectorizer.get_feature_names_out()
feature_ranks = sorted(zip(weights, vocabulary))

negative_10 = feature_ranks[:10]   # ten lowest weights
positive_10 = feature_ranks[-10:]  # ten highest weights

for heading, ranked_terms in (("Negative words", negative_10), ("Positive words", positive_10)):
    print(heading)
    for entry in ranked_terms:
        print(entry)
    print()

Negative words
(-0.6414137953146043, 'went')
(-0.612896264023363, 'said')
(-0.5309190478286097, 'service')
(-0.4958628520728307, 'like')
(-0.4626822653488024, 'dishes')
(-0.4324623661418195, 'salad')
(-0.42384150185409464, 'place')
(-0.4203410506860616, 'asked')
(-0.3585760145062391, 'long')
(-0.3488530298600209, 'terrible')

Positive words
(0.3648355580557142, 'dont')
(0.3745064021322023, 'amazing')
(0.3925445709302675, 'high')
(0.40133509919044297, 'friendly')
(0.43659998921783605, 'waiters')
(0.5486712601703337, 'nice')
(0.6475654690390791, 'quality')
(0.6671221207599436, 'fresh')
(0.7633462544255637, 'best')
(0.8813073373968876, 'need')



# Unigram and Bigram Term Frequency Vectorizer

## MNB

In [None]:
# Learn the vocabulary on the training reviews only, then encode the test
# reviews with that same vocabulary.
X_train_vec = gram12_count_vectorizer.fit_transform(X_train)
X_test_vec = gram12_count_vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB

# initialize the MNB model
nb_clf = MultinomialNB()

# use the training data to train the MNB model (fitted once, reused below)
nb_clf.fit(X_train_vec, y_train)

print("Accuracy score on test data set:", nb_clf.score(X_test_vec, y_test))

# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
# The model is already fitted above, so predict directly instead of fitting
# it a second time on the same data.
y_pred = nb_clf.predict(X_test_vec)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# print per-class precision and recall, then the full classification report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))

from sklearn.metrics import classification_report
# Display names for the report rows; the labels in this data set are 'n' and 'p'.
target_names = ['n', 'p']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.8108108108108109
[[14  4]
 [ 3 16]]
[0.82352941 0.8       ]
[0.77777778 0.84210526]
              precision    recall  f1-score   support

           n       0.82      0.78      0.80        18
           p       0.80      0.84      0.82        19

    accuracy                           0.81        37
   macro avg       0.81      0.81      0.81        37
weighted avg       0.81      0.81      0.81        37



In [None]:
# cross validation

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation on the full data set; the pipeline re-fits the
# vectorizer together with the classifier on each fold's training part.
# NOTE(review): this vectorizer uses the default min_df / stop_words, unlike
# gram12_count_vectorizer defined earlier - confirm that this is intended.
pipeline_steps = [
    ('vect', CountVectorizer(encoding='latin-1', ngram_range=(1,2))),
    ('nb', MultinomialNB()),
]
nb_clf_pipe = Pipeline(pipeline_steps)
scores = cross_val_score(nb_clf_pipe, X, y, cv=5)

# Report the mean score over the folds.
avg = sum(scores) / len(scores)
print(avg)

0.87953216374269


In [None]:
# Error analysis: list the misclassified test reviews, first those labelled 'n'
# but predicted 'p', then those labelled 'p' but predicted 'n'. Each group is
# followed by its count. y_pred holds the predictions of the classifier cell
# that was run most recently.
for actual_label, predicted_label in (('n', 'p'), ('p', 'n')):
    err_cnt = 0
    for review, truth, guess in zip(X_test, y_test, y_pred):
        if truth == actual_label and guess == predicted_label:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

I went there with two friends at 6pm Long queue was there But it didnt take us long to wait The waiter was nice but worked in a hurry We ordered Todays Special some drinks and two icecreams I had a steak a little bit too salty but acceptable My friends didnt like their lamb chop and cod filet that much It costed us almost 100 Not worth it Will not visit there any more
the staff at this restaurant is very unfriendly the waitress for our table is extremely rude we need to wait for one hour for our order to come the place is noisy and the food isnt that good
This diner was not at all up to par Ive been to many diners and get eggs benedict sometimes There was nacho cheese on my eggs and a plateful of watery runny eggs And it smelled like smoke And there was no heat in the dead of winter Their prices are not ANYWHERE near what is reasonable Cool mom  pop place but terrible food smell and prices
Olive Oil Garden was very disappointing I expect good food and good service at least when I go ou

In [None]:
# Rank the vocabulary by the difference between the MNB log-probabilities of
# row 0 and row 1 of feature_log_prob_. Row 0 is read as the negative class and
# row 1 as the positive class.
# NOTE(review): this presumes nb_clf.classes_ is ['n', 'p'] - confirm.
features = gram12_count_vectorizer.get_feature_names_out()
negative_cond_prob = nb_clf.feature_log_prob_[0]
positive_cond_prob = nb_clf.feature_log_prob_[1]

# Element-wise difference: negative values lean positive, positive values lean negative.
log_ratios = list(negative_cond_prob - positive_cond_prob)

# Ascending sort: the most "positive" terms come first, the most "negative" last.
feature_ranks = sorted(zip(log_ratios, features))
positive_10 = feature_ranks[:10]
negative_10 = feature_ranks[-10:]

for heading, ranked_terms in (("Positive words", positive_10), ("Negative words", negative_10)):
    print(heading)
    for entry in ranked_terms:
        print(entry)
    print()

Positive words
(-2.6969096578540186, 'amazing')
(-2.0037624772940736, 'need')
(-1.8702310846695505, 'best')
(-1.8214409205001187, 'fresh')
(-1.4647659765613863, 'friendly')
(-1.4647659765613863, 'nice')
(-1.3106152967341282, 'prices')
(-1.0593008684532217, 'sauce')
(-0.9051501886259636, 'definitely')
(-0.7716187960014409, 'dining')

Negative words
(0.8866092806020918, 'table')
(1.0872799760642424, 'salad')
(1.1742913530538721, 'bad')
(1.1742913530538721, 'wasnt')
(1.492745084172407, 'minutes')
(1.7339071409892952, 'came')
(1.8674385336138175, 'said')
(1.8674385336138175, 'terrible')
(1.9852215692702013, 'took')
(2.0905820849280277, 'asked')



## SVM

In [None]:
from sklearn.svm import LinearSVC

# initialize the LinearSVC model
# random_state is pinned because the solver draws on a pseudo-random number
# generator; without a seed, repeated runs can produce slightly different weights.
svm_clf = LinearSVC(C=1, random_state=0)

# use the training data to train the model
# (X_train_vec / X_test_vec come from the vectorization cell run just before this one)
svm_clf.fit(X_train_vec, y_train)

# test the classifier on the test data set, print accuracy score
print("Accuracy score on test data set:", svm_clf.score(X_test_vec, y_test))

# print confusion matrix (row: ground truth; col: prediction) and classification report

from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print()

from sklearn.metrics import classification_report
# Display names for the report rows; the labels in this data set are 'n' and 'p'.
target_names = ['n', 'p']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.7837837837837838
[[15  3]
 [ 5 14]]

              precision    recall  f1-score   support

           n       0.75      0.83      0.79        18
           p       0.82      0.74      0.78        19

    accuracy                           0.78        37
   macro avg       0.79      0.79      0.78        37
weighted avg       0.79      0.78      0.78        37



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# Metrics collected for every fold (on both the training and the test part).
scoring = ['accuracy', 'f1_macro', 'f1_micro', 'precision_macro', 'recall_macro']

# Vectorizer + LinearSVC are re-fitted together inside each of the 5 folds.
# random_state is pinned so the cross-validation scores are repeatable.
# NOTE(review): this vectorizer uses the default min_df / stop_words, unlike
# gram12_count_vectorizer defined earlier - confirm that this is intended.
svm_clf_pipe = Pipeline([
    ('vect', CountVectorizer(encoding='latin-1', ngram_range=(1,2))),
    ('svm', LinearSVC(C=1, random_state=0)),
])
scores = cross_validate(svm_clf_pipe, X, y, cv=5, scoring=scoring, return_train_score=True)

# retrieve scores from a metric
# NOTE(review): a function with this name is also defined in earlier SVM
# cross-validation cells; consider keeping a single definition.
def get_metric_scores(scores, metric, train_or_test, verbose=False):
    """Print one cross-validation metric: its key, optionally the per-fold
    values, and the mean over the folds.

    Parameters
    ----------
    scores : mapping
        Looked up with the key ``"<train_or_test>_<metric>"``; the value must
        be a non-empty sequence of numbers (e.g. the result of ``cross_validate``).
    metric : str
        Metric name, e.g. ``'accuracy'``.
    train_or_test : str
        Key prefix, e.g. ``'train'`` or ``'test'``.
    verbose : bool, default False
        When true, the individual per-fold scores are printed as well.

    Returns
    -------
    None
        The report is written with ``print``; the mean is shown with three decimals.

    Raises
    ------
    KeyError
        If ``scores`` has no entry for the composed key.
    """
    metric_name = f"{train_or_test}_{metric}"
    print(metric_name)

    metric_scores = scores[metric_name]
    if verbose:
        print(metric_scores)
    avg = sum(metric_scores) / len(metric_scores)
    print('average')
    print(f"{avg:.3f}")

# retrieve test accuracy scores
# get_metric_scores prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
get_metric_scores(scores, 'accuracy', 'test', verbose=True)

test_accuracy
[0.78947368 0.84210526 0.77777778 0.77777778 0.83333333]
average
0.804
None


In [None]:
# Error analysis: list the misclassified test reviews, first those labelled 'n'
# but predicted 'p', then those labelled 'p' but predicted 'n'. Each group is
# followed by its count. y_pred holds the predictions of the classifier cell
# that was run most recently.
for actual_label, predicted_label in (('n', 'p'), ('p', 'n')):
    err_cnt = 0
    for review, truth, guess in zip(X_test, y_test, y_pred):
        if truth == actual_label and guess == predicted_label:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

the staff at this restaurant is very unfriendly the waitress for our table is extremely rude we need to wait for one hour for our order to come the place is noisy and the food isnt that good
Olive Oil Garden was very disappointing I expect good food and good service at least when I go out to eat The meal was cold when we got it and the waitor had no manners whatsoever Dont go to the Olive Oil Garden 
The service was way below average and we had to wait 10 minutes before we could be seated Then had to wait 20 minutes before the menu was shown to us Let me tell you that the food at Kitty Hoynes Irish Pub  Restaurant was terrible The vegies were under cooked and the lamb was over cooked the bread was dry and the beer was flat Need I say more
errors: 3
This place was one of the best restaurant I have been The price is little expensive but the food and the service is best around the area I went here with my family and we ordered 4 dishes They were all well cooked and their taste were nicely

In [None]:
# Top 10 features
# Pair every vocabulary term with its LinearSVC weight and sort ascending, so
# the most negative weights come first and the most positive ones last.
# NOTE(review): reading negative weights as 'n' and positive as 'p' presumes
# svm_clf.classes_ is ['n', 'p'] - confirm.
weights = svm_clf.coef_[0]
vocabulary = gram12_count_vectorizer.get_feature_names_out()
feature_ranks = sorted(zip(weights, vocabulary))

negative_10 = feature_ranks[:10]   # ten lowest weights
positive_10 = feature_ranks[-10:]  # ten highest weights

for heading, ranked_terms in (("Negative words", negative_10), ("Positive words", positive_10)):
    print(heading)
    for entry in ranked_terms:
        print(entry)
    print()

Negative words
(-0.6414153792814327, 'went')
(-0.6128947107351101, 'said')
(-0.5309206948000039, 'service')
(-0.49586478362878467, 'like')
(-0.46268203813371184, 'dishes')
(-0.43246587110820617, 'salad')
(-0.4238428000989887, 'place')
(-0.42034163722331586, 'asked')
(-0.3585766327294618, 'long')
(-0.3488523942304764, 'terrible')

Positive words
(0.36483586561583325, 'dont')
(0.3745076161929085, 'amazing')
(0.3925419507095628, 'high')
(0.40133206980566344, 'friendly')
(0.43659851103729985, 'waiters')
(0.5486715594556089, 'nice')
(0.6475649930785303, 'quality')
(0.6671231446387473, 'fresh')
(0.7633467961720526, 'best')
(0.8813062884334484, 'need')



# Unigram TF-IDF Vectorizer

## MNB

In [None]:
# Learn the vocabulary and IDF weights on the training reviews only, then
# encode the test reviews with them.
X_train_vec = unigram_tfidf_vectorizer.fit_transform(X_train)
X_test_vec = unigram_tfidf_vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB

# initialize the MNB model
nb_clf = MultinomialNB()

# use the training data to train the MNB model (fitted once, reused below)
nb_clf.fit(X_train_vec, y_train)

print("Accuracy score on test data set:", nb_clf.score(X_test_vec, y_test))

# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
# The model is already fitted above, so predict directly instead of fitting
# it a second time on the same data.
y_pred = nb_clf.predict(X_test_vec)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# print per-class precision and recall, then the full classification report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))

from sklearn.metrics import classification_report
# Display names for the report rows; the labels in this data set are 'n' and 'p'.
target_names = ['n', 'p']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.8378378378378378
[[15  3]
 [ 3 16]]
[0.83333333 0.84210526]
[0.83333333 0.84210526]
              precision    recall  f1-score   support

           n       0.83      0.83      0.83        18
           p       0.84      0.84      0.84        19

    accuracy                           0.84        37
   macro avg       0.84      0.84      0.84        37
weighted avg       0.84      0.84      0.84        37



In [None]:
# cross validation

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation on the full data set; the pipeline re-fits the
# vectorizer together with the classifier on each fold's training part.
# NOTE(review): this vectorizer uses the default min_df / stop_words, unlike
# unigram_tfidf_vectorizer defined earlier - confirm that this is intended.
pipeline_steps = [
    ('vect', TfidfVectorizer(encoding='latin-1', use_idf=True)),
    ('nb', MultinomialNB()),
]
nb_clf_pipe = Pipeline(pipeline_steps)
scores = cross_val_score(nb_clf_pipe, X, y, cv=5)

# Report the mean score over the folds.
avg = sum(scores) / len(scores)
print(avg)

0.8695906432748538


In [None]:
# Error analysis: list the misclassified test reviews, first those labelled 'n'
# but predicted 'p', then those labelled 'p' but predicted 'n'. Each group is
# followed by its count. y_pred holds the predictions of the classifier cell
# that was run most recently.
for actual_label, predicted_label in (('n', 'p'), ('p', 'n')):
    err_cnt = 0
    for review, truth, guess in zip(X_test, y_test, y_pred):
        if truth == actual_label and guess == predicted_label:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

the staff at this restaurant is very unfriendly the waitress for our table is extremely rude we need to wait for one hour for our order to come the place is noisy and the food isnt that good
This diner was not at all up to par Ive been to many diners and get eggs benedict sometimes There was nacho cheese on my eggs and a plateful of watery runny eggs And it smelled like smoke And there was no heat in the dead of winter Their prices are not ANYWHERE near what is reasonable Cool mom  pop place but terrible food smell and prices
Olive Oil Garden was very disappointing I expect good food and good service at least when I go out to eat The meal was cold when we got it and the waitor had no manners whatsoever Dont go to the Olive Oil Garden 
errors: 3
I went to this ultraluxurious restaurant in Downtown New York which is known for its exotic and expensive cuisine I had a glass of champagne along with very expensive Caviar I had a delicious Chicken Pasta cooked in white sauce This was followed

In [None]:
#based on error analysis considering "0" category as negative and "1" category as positive

features = unigram_tfidf_vectorizer.get_feature_names_out()
negative_cond_prob = nb_clf.feature_log_prob_[0]
positive_cond_prob = nb_clf.feature_log_prob_[1]

# per-feature difference of class-conditional log probabilities (row 0 minus row 1):
# strongly negative values favour row 1, strongly positive values favour row 0
log_ratios = [neg - pos for neg, pos in zip(negative_cond_prob, positive_cond_prob)]

# ascending sort: row-1 indicators first, row-0 indicators last
feature_ranks = sorted(zip(log_ratios, features))

positive_10 = feature_ranks[:10]
print("Positive words")
for ranked in positive_10:
    print(ranked)
print()

negative_10 = feature_ranks[-10:]
print("Negative words")
for ranked in negative_10:
    print(ranked)
print()

Positive words
(-1.3534882757718734, 'amazing')
(-1.2720874094463768, 'best')
(-1.2304761017456043, 'fresh')
(-1.0405658175602834, 'need')
(-0.9238795757989893, 'nice')
(-0.7772747053769047, 'prices')
(-0.7577149973440553, 'friendly')
(-0.6701667336927399, 'great')
(-0.4700767621555584, 'definitely')
(-0.45652134421311086, 'sauce')

Negative words
(0.6192376174420913, 'went')
(0.6716702360737918, 'salad')
(0.6919270608393173, 'wasnt')
(0.7579303857540398, 'bad')
(0.7773278774124339, 'came')
(0.8783903450307302, 'minutes')
(1.030980217409697, 'said')
(1.0399151954913135, 'took')
(1.0800998928856593, 'terrible')
(1.1056261347945728, 'asked')



## SVM

In [None]:
from sklearn.svm import LinearSVC

# initialize the LinearSVC model; random_state pins the solver's internal
# shuffling so the fitted weights (and the rankings derived from them) are
# reproducible from run to run
svm_clf = LinearSVC(C=1, random_state=0)

# use the training data to train the model
svm_clf.fit(X_train_vec,y_train)

# test the classifier on the test data set, print accuracy score

print("Accuracy score on test data set:",svm_clf.score(X_test_vec,y_test))

# print confusion matrix (row: ground truth; col: prediction) and classification report

from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred)
print(cm)
print()

from sklearn.metrics import classification_report
# display names for the labels, listed in sorted label order
target_names = ['n', 'p']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.8378378378378378
[[15  3]
 [ 3 16]]

              precision    recall  f1-score   support

           n       0.83      0.83      0.83        18
           p       0.84      0.84      0.84        19

    accuracy                           0.84        37
   macro avg       0.84      0.84      0.84        37
weighted avg       0.84      0.84      0.84        37



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# metrics collected on every fold (train and test, see return_train_score)
scoring = ['accuracy', 'f1_macro', 'f1_micro', 'precision_macro', 'recall_macro']

# random_state pins LinearSVC's internal shuffling so fold scores are reproducible.
# NOTE(review): this TfidfVectorizer uses default min_df/stop_words -- confirm that
# matches unigram_tfidf_vectorizer, which is used for the held-out evaluation.
svm_clf_pipe = Pipeline([
    ('vect', TfidfVectorizer(encoding='latin-1', use_idf=True)),
    ('svm', LinearSVC(C=1, random_state=0)),
])
scores = cross_validate(svm_clf_pipe, X, y, cv=5, scoring=scoring, return_train_score=True)

# retrieve scores from a metric
def get_metric_scores (scores, metric, train_or_test, verbose=False):
    """Print and return the average of one metric from a cross_validate result.

    Parameters:
        scores: mapping of '<train|test>_<metric>' to a sequence of per-fold scores.
        metric: metric name, e.g. 'accuracy'.
        train_or_test: key prefix, 'train' or 'test'.
        verbose: when truthy, also print the individual fold scores.

    Returns:
        The unrounded mean of the fold scores (the printed value is rounded
        to three decimals).

    Raises:
        KeyError: if '<train_or_test>_<metric>' is not present in scores.
    """
    metric_name = train_or_test + '_' + metric
    print(metric_name)

    metric_scores = scores[metric_name]
    if verbose:
        print(metric_scores)
    avg = sum(metric_scores) / len(metric_scores)
    print('average')
    print("{:.3f}".format(avg))
    # return the value so callers can reuse it (previously the function returned None)
    return avg

#retrieve test accuracy scores
# get_metric_scores prints its own report, so it is not wrapped in print()
get_metric_scores(scores, 'accuracy', 'test', verbose=True)

test_accuracy
[0.89473684 0.89473684 0.94444444 0.94444444 0.77777778]
average
0.891
None


In [None]:
#error analysis
# First pass lists reviews labelled 'n' that were predicted 'p'; second pass
# lists reviews labelled 'p' that were predicted 'n'. Each pass prints its count.
for wanted_true, wanted_pred in (('n', 'p'), ('p', 'n')):
    err_cnt = 0
    for actual, predicted, review in zip(y_test, y_pred, X_test):
        if actual == wanted_true and predicted == wanted_pred:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

the staff at this restaurant is very unfriendly the waitress for our table is extremely rude we need to wait for one hour for our order to come the place is noisy and the food isnt that good
Olive Oil Garden was very disappointing I expect good food and good service at least when I go out to eat The meal was cold when we got it and the waitor had no manners whatsoever Dont go to the Olive Oil Garden 
The service was way below average and we had to wait 10 minutes before we could be seated Then had to wait 20 minutes before the menu was shown to us Let me tell you that the food at Kitty Hoynes Irish Pub  Restaurant was terrible The vegies were under cooked and the lamb was over cooked the bread was dry and the beer was flat Need I say more
errors: 3
I went to this ultraluxurious restaurant in Downtown New York which is known for its exotic and expensive cuisine I had a glass of champagne along with very expensive Caviar I had a delicious Chicken Pasta cooked in white sauce This was foll

In [None]:
#Top 10 features

# pair every learned weight with its vocabulary term and sort ascending by weight
feature_ranks = sorted(
    zip(svm_clf.coef_[0], unigram_tfidf_vectorizer.get_feature_names_out())
)

##Top 10 features for negative words
# the ten most negative weights
negative_10 = feature_ranks[:10]
print("Negative words")
for ranked in negative_10:
    print(ranked)
print()

##Top 10 features for positive words
# the ten most positive weights
positive_10 = feature_ranks[-10:]
print("Positive words")
for ranked in positive_10:
    print(ranked)
print()

Negative words
(-0.9941388689518573, 'went')
(-0.8460276357016443, 'said')
(-0.7811260137950655, 'asked')
(-0.7714609384417883, 'terrible')
(-0.7600535358388077, 'place')
(-0.7575635999838614, 'salad')
(-0.7489866424561434, 'took')
(-0.7181159444368663, 'bad')
(-0.6961469841801705, 'dishes')
(-0.6788066552718461, 'dish')

Positive words
(0.42570648143495454, 'prices')
(0.5044421574140608, 'friendly')
(0.548697206341155, 'dont')
(0.6357305108908016, 'high')
(0.6634149247099183, 'quality')
(0.7775354738114069, 'nice')
(0.9311286709221271, 'amazing')
(1.1584332622101625, 'fresh')
(1.227919458033131, 'need')
(1.6409342389060588, 'best')



# Authenticity Analysis


In [None]:
# Same review text as before, but the target is now the 'lie' column.
X = input['review'].values
y = input['lie'].values

from sklearn.model_selection import train_test_split

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [None]:
import numpy as np

# class distribution of the training split, then of the test split
for split_labels in (y_train, y_test):
    unique, counts = np.unique(split_labels, return_counts=True)
    print(np.asarray((unique, counts)))

[['f' 't']
 [29 26]]
[['f' 't']
 [17 20]]


# Unigram Boolean Vectorizer

## MNB



In [None]:
# learn the vocabulary on the training split only, then reuse it for the test split
X_train_vec = unigram_bool_vectorizer.fit_transform(X_train)
X_test_vec = unigram_bool_vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB

# initialize the MNB model
nb_clf= MultinomialNB()

# use the training data to train the MNB model
nb_clf.fit(X_train_vec,y_train)

print("Accuracy score on test data set:", nb_clf.score(X_test_vec,y_test))

# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
# nb_clf is already fitted above, so predict directly instead of refitting it
y_pred = nb_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred)
print(cm)

# print classification report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# average=None gives one score per class instead of a single aggregate
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))

from sklearn.metrics import classification_report
# display names for the labels, listed in sorted label order
target_names = ['f', 't']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.5135135135135135
[[ 6 11]
 [ 7 13]]
[0.46153846 0.54166667]
[0.35294118 0.65      ]
              precision    recall  f1-score   support

           f       0.46      0.35      0.40        17
           t       0.54      0.65      0.59        20

    accuracy                           0.51        37
   macro avg       0.50      0.50      0.50        37
weighted avg       0.50      0.51      0.50        37



In [None]:
# cross validation

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Use the same vectorizer settings as unigram_bool_vectorizer (min_df=5, English
# stop words) so the cross-validated score evaluates the same feature
# representation as the held-out test above. The pipeline gets its own
# vectorizer instance, so the fitted unigram_bool_vectorizer is not disturbed.
nb_clf_pipe = Pipeline([
    ('vect', CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')),
    ('nb', MultinomialNB()),
])
scores = cross_val_score(nb_clf_pipe, X, y, cv=5)

# mean accuracy over the 5 folds
avg=sum(scores)/len(scores)
print(avg)

0.5771929824561404


In [None]:
#error analysis
# First pass lists reviews labelled 'f' that were predicted 't'; second pass
# lists reviews labelled 't' that were predicted 'f'. Each pass prints its count.
for wanted_true, wanted_pred in (('f', 't'), ('t', 'f')):
    err_cnt = 0
    for actual, predicted, review in zip(y_test, y_pred, X_test):
        if actual == wanted_true and predicted == wanted_pred:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

After I went shopping with some of my friend we went to DODO restaurant for dinner I found worm in one of the dishes 
I recently ate at a restaurant called White Castle and it was a dine in I had to wait 20 minutes before the waiter came to my table to take my order even though it was not busy I had to wait another 30 minutes for my order to come I had ordered a veggie burger with fries and Iced tea The veggie patty was not properly cooked the lettuce had black patches and the tomatoes looked horrible Overall the burger was a disaster When I turned to the fries in the hopes that it would taste goodto my disappointment they were also not fried properly and had a raw taste To top it all when picked up my iced tea took take a sip I found a fly swimming in my iced tea Overall it was a really bad experience and I would not recommend White Castle to anybody 
I went there with two friends at 6pm Long queue was there But it didnt take us long to wait The waiter was nice but worked in a hurry W

In [None]:
#based on error analysis considering "0" category as fake and "1" category as truth

features = unigram_bool_vectorizer.get_feature_names_out()
fake_cond_prob = nb_clf.feature_log_prob_[0]
true_cond_prob = nb_clf.feature_log_prob_[1]

# per-feature difference of class-conditional log probabilities (row 0 minus row 1):
# strongly negative values favour row 1, strongly positive values favour row 0
log_ratios = [fake_lp - true_lp for fake_lp, true_lp in zip(fake_cond_prob, true_cond_prob)]

# ascending sort: row-1 indicators first, row-0 indicators last
feature_ranks = sorted(zip(log_ratios, features))

true_10 = feature_ranks[:10]
print("True words")
for ranked in true_10:
    print(ranked)
print()

fake_10 = feature_ranks[-10:]
print("Fake words")
for ranked in fake_10:
    print(ranked)
print()

True words
(-0.7353667993468385, 'terrible')
(-0.6945448048265832, 'time')
(-0.6765262993239052, 'good')
(-0.5122232480326288, 'came')
(-0.5122232480326288, 'dish')
(-0.5122232480326288, 'life')
(-0.5122232480326288, 'ny')
(-0.4758556038617541, 'restaurant')
(-0.44768472689505767, 'bad')
(-0.44768472689505767, 'friendly')

Fake words
(0.6917495562933071, 'high')
(0.6917495562933071, 'prices')
(0.6917495562933071, 'sauce')
(0.6917495562933071, 'staff')
(0.6917495562933071, 'waiters')
(0.6917495562933071, 'wasnt')
(0.6917495562933071, 'worth')
(0.8740711130872612, 'definitely')
(1.02822179291452, 'said')
(1.5672182936472066, 'want')



## SVM

In [None]:
from sklearn.svm import LinearSVC

# initialize the LinearSVC model; random_state pins the solver's internal
# shuffling so the fitted weights (and the rankings derived from them) are
# reproducible from run to run
svm_clf = LinearSVC(C=1, random_state=0)

# use the training data to train the model
svm_clf.fit(X_train_vec,y_train)

# test the classifier on the test data set, print accuracy score

print("Accuracy score on test data set:",svm_clf.score(X_test_vec,y_test))

# print confusion matrix (row: ground truth; col: prediction) and classification report

from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred)
print(cm)
print()

from sklearn.metrics import classification_report
# display names for the labels, listed in sorted label order
target_names = ['f', 't']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.4594594594594595
[[ 5 12]
 [ 8 12]]

              precision    recall  f1-score   support

           f       0.38      0.29      0.33        17
           t       0.50      0.60      0.55        20

    accuracy                           0.46        37
   macro avg       0.44      0.45      0.44        37
weighted avg       0.45      0.46      0.45        37



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# metrics collected on every fold (train and test, see return_train_score)
scoring = ['accuracy', 'f1_macro', 'f1_micro', 'precision_macro', 'recall_macro']

# Vectorizer settings mirror unigram_bool_vectorizer (min_df=5, English stop words)
# so cross-validation scores the same feature representation as the held-out test;
# random_state pins LinearSVC's internal shuffling so fold scores are reproducible.
svm_clf_pipe = Pipeline([
    ('vect', CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')),
    ('svm', LinearSVC(C=1, random_state=0)),
])
scores = cross_validate(svm_clf_pipe, X, y, cv=5, scoring=scoring, return_train_score=True)

# retrieve scores from a metric
def get_metric_scores (scores, metric, train_or_test, verbose=False):
    """Print and return the average of one metric from a cross_validate result.

    Parameters:
        scores: mapping of '<train|test>_<metric>' to a sequence of per-fold scores.
        metric: metric name, e.g. 'accuracy'.
        train_or_test: key prefix, 'train' or 'test'.
        verbose: when truthy, also print the individual fold scores.

    Returns:
        The unrounded mean of the fold scores (the printed value is rounded
        to three decimals).

    Raises:
        KeyError: if '<train_or_test>_<metric>' is not present in scores.
    """
    metric_name = train_or_test + '_' + metric
    print(metric_name)

    metric_scores = scores[metric_name]
    if verbose:
        print(metric_scores)
    avg = sum(metric_scores) / len(metric_scores)
    print('average')
    print("{:.3f}".format(avg))
    # return the value so callers can reuse it (previously the function returned None)
    return avg

#retrieve test accuracy scores
# get_metric_scores prints its own report, so it is not wrapped in print()
get_metric_scores(scores, 'accuracy', 'test', verbose=True)

test_accuracy
[0.57894737 0.47368421 0.38888889 0.38888889 0.72222222]
average
0.511
None


In [None]:
#error analysis
# First pass lists reviews labelled 'f' that were predicted 't'; second pass
# lists reviews labelled 't' that were predicted 'f'. Each pass prints its count.
for wanted_true, wanted_pred in (('f', 't'), ('t', 'f')):
    err_cnt = 0
    for actual, predicted, review in zip(y_test, y_pred, X_test):
        if actual == wanted_true and predicted == wanted_pred:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

After I went shopping with some of my friend we went to DODO restaurant for dinner I found worm in one of the dishes 
I recently ate at a restaurant called White Castle and it was a dine in I had to wait 20 minutes before the waiter came to my table to take my order even though it was not busy I had to wait another 30 minutes for my order to come I had ordered a veggie burger with fries and Iced tea The veggie patty was not properly cooked the lettuce had black patches and the tomatoes looked horrible Overall the burger was a disaster When I turned to the fries in the hopes that it would taste goodto my disappointment they were also not fried properly and had a raw taste To top it all when picked up my iced tea took take a sip I found a fly swimming in my iced tea Overall it was a really bad experience and I would not recommend White Castle to anybody 
I ate at this restaurant called Banana Leaf As I entered the restaurant I really liked the ambiance I ordered noodle soup and fried ric

In [None]:
#Top 10 features

# Pair every learned weight with its vocabulary term and sort ascending by weight.
# For a binary LinearSVC a positive weight pushes the decision towards
# svm_clf.classes_[1]. The labels sort as ['f', 't'], so the most NEGATIVE weights
# indicate fake ('f') and the most POSITIVE weights indicate true ('t').
feature_ranks = sorted(zip(svm_clf.coef_[0], unigram_bool_vectorizer.get_feature_names_out()))

##Top 10 features for fake words
fake_10 = feature_ranks[:10]
print("Fake words")
for i in range(0, len(fake_10)):
    print(fake_10[i])
print()

##Top 10 features for true words
true_10 = feature_ranks[-10:]
print("True words")
for i in range(0, len(true_10)):
    print(true_10[i])
print()

Fake words
(0.497985105991149, 'wait')
(0.5506437386440068, 'time')
(0.5718216532921692, 'good')
(0.5776009651554882, 'dinner')
(0.6068998513397714, 'ny')
(0.6721278241520858, 'food')
(0.7110268628762259, 'bad')
(0.7277222310931813, 'came')
(0.8202677365525408, 'friendly')
(1.0896627804944337, 'overall')

True words
(-1.6943097014357344, 'want')
(-1.2757043013017149, 'said')
(-1.0561241921024938, 'definitely')
(-1.0249021749470348, 'high')
(-0.7280920187859549, 'prices')
(-0.7002115194776946, 'waiters')
(-0.6843507775256299, 'took')
(-0.6402214212541258, 'experience')
(-0.6372558879520928, 'worth')
(-0.6355579359052305, 'dining')



# Unigram Term Frequency Vectorizer
## MNB

In [None]:
# learn the vocabulary on the training split only, then reuse it for the test split
X_train_vec = unigram_count_vectorizer.fit_transform(X_train)
X_test_vec = unigram_count_vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB

# initialize the MNB model
nb_clf= MultinomialNB()

# use the training data to train the MNB model
nb_clf.fit(X_train_vec,y_train)

print("Accuracy score on test data set:", nb_clf.score(X_test_vec,y_test))

# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
# nb_clf is already fitted above, so predict directly instead of refitting it
y_pred = nb_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred)
print(cm)

# print classification report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# average=None gives one score per class instead of a single aggregate
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))

from sklearn.metrics import classification_report
# display names for the labels, listed in sorted label order
target_names = ['f', 't']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.4594594594594595
[[ 8  9]
 [11  9]]
[0.42105263 0.5       ]
[0.47058824 0.45      ]
              precision    recall  f1-score   support

           f       0.42      0.47      0.44        17
           t       0.50      0.45      0.47        20

    accuracy                           0.46        37
   macro avg       0.46      0.46      0.46        37
weighted avg       0.46      0.46      0.46        37



In [None]:
# cross validation

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Use the same vectorizer settings as unigram_count_vectorizer (min_df=5, English
# stop words) so the cross-validated score evaluates the same feature
# representation as the held-out test above. The pipeline gets its own
# vectorizer instance, so the fitted unigram_count_vectorizer is not disturbed.
nb_clf_pipe = Pipeline([
    ('vect', CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')),
    ('nb', MultinomialNB()),
])
scores = cross_val_score(nb_clf_pipe, X, y, cv=5)

# mean accuracy over the 5 folds
avg=sum(scores)/len(scores)
print(avg)

0.5649122807017544


In [None]:
#error analysis
# First pass lists reviews labelled 'f' that were predicted 't'; second pass
# lists reviews labelled 't' that were predicted 'f'. Each pass prints its count.
for wanted_true, wanted_pred in (('f', 't'), ('t', 'f')):
    err_cnt = 0
    for actual, predicted, review in zip(y_test, y_pred, X_test):
        if actual == wanted_true and predicted == wanted_pred:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

After I went shopping with some of my friend we went to DODO restaurant for dinner I found worm in one of the dishes 
I ate at this restaurant called Banana Leaf As I entered the restaurant I really liked the ambiance I ordered noodle soup and fried rice with spicy black bean curry The service was pretty fast and the food tasted amazing There was a lot flavor in the food which I truly enjoyed Two thumbs up for Banana Leaf and I would totally recommend this restaurant
OMG This restaurant is horrible The receptionist did not greet us we just stood there and waited for five minutes The food came late and served not warm Me and my pet ordered a bowl of salad and a cheese pizza The salad was not fresh the crust of a pizza was so hard like plastics My dog didnt even eat that pizza I hate this place
I went to this ultraluxurious restaurant in Downtown New York which is known for its exotic and expensive cuisine I had a glass of champagne along with very expensive Caviar I had a delicious Chic

In [None]:
#based on error analysis considering "0" category as fake and "1" category as truth

features = unigram_count_vectorizer.get_feature_names_out()
fake_cond_prob = nb_clf.feature_log_prob_[0]
true_cond_prob = nb_clf.feature_log_prob_[1]

# per-feature difference of class-conditional log probabilities (row 0 minus row 1):
# strongly negative values favour row 1, strongly positive values favour row 0
log_ratios = [fake_lp - true_lp for fake_lp, true_lp in zip(fake_cond_prob, true_cond_prob)]

# ascending sort: row-1 indicators first, row-0 indicators last
feature_ranks = sorted(zip(log_ratios, features))

true_10 = feature_ranks[:10]
print("True words")
for ranked in true_10:
    print(ranked)
print()

fake_10 = feature_ranks[-10:]
print("Fake words")
for ranked in fake_10:
    print(ranked)
print()

True words
(-0.8893412391846018, 'life')
(-0.7839807235267755, 'time')
(-0.6817018744063574, 'good')
(-0.601659166732821, 'bad')
(-0.601659166732821, 'going')
(-0.5528690025633889, 'ordered')
(-0.5326662952458694, 'waitress')
(-0.4838761310764377, 'salad')
(-0.48387613107643723, 'ny')
(-0.48387613107643723, 'restaurant')

Fake words
(0.651103801762547, 'sauce')
(0.651103801762547, 'wasnt')
(0.7200966732494987, 'high')
(0.7200966732494987, 'staff')
(0.7200966732494987, 'waiters')
(0.7200966732494987, 'worth')
(0.9024182300434527, 'dine')
(1.0565689098707116, 'definitely')
(1.0565689098707116, 'said')
(1.8832474830551793, 'want')



## SVM

In [None]:
from sklearn.svm import LinearSVC

# initialize the LinearSVC model; random_state pins the solver's internal
# shuffling so the fitted weights (and the rankings derived from them) are
# reproducible from run to run
svm_clf = LinearSVC(C=1, random_state=0)

# use the training data to train the model
svm_clf.fit(X_train_vec,y_train)

# test the classifier on the test data set, print accuracy score

print("Accuracy score on test data set:",svm_clf.score(X_test_vec,y_test))

# print confusion matrix (row: ground truth; col: prediction) and classification report

from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred)
print(cm)
print()

from sklearn.metrics import classification_report
# display names for the labels, listed in sorted label order
target_names = ['f', 't']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.4594594594594595
[[ 4 13]
 [ 7 13]]

              precision    recall  f1-score   support

           f       0.36      0.24      0.29        17
           t       0.50      0.65      0.57        20

    accuracy                           0.46        37
   macro avg       0.43      0.44      0.43        37
weighted avg       0.44      0.46      0.44        37



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# metrics collected on every fold (train and test, see return_train_score)
scoring = ['accuracy', 'f1_macro', 'f1_micro', 'precision_macro', 'recall_macro']

# Vectorizer settings mirror unigram_count_vectorizer (min_df=5, English stop words)
# so cross-validation scores the same feature representation as the held-out test;
# random_state pins LinearSVC's internal shuffling so fold scores are reproducible.
svm_clf_pipe = Pipeline([
    ('vect', CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')),
    ('svm', LinearSVC(C=1, random_state=0)),
])
scores = cross_validate(svm_clf_pipe, X, y, cv=5, scoring=scoring, return_train_score=True)

# retrieve scores from a metric
def get_metric_scores (scores, metric, train_or_test, verbose=False):
    """Print and return the average of one metric from a cross_validate result.

    Parameters:
        scores: mapping of '<train|test>_<metric>' to a sequence of per-fold scores.
        metric: metric name, e.g. 'accuracy'.
        train_or_test: key prefix, 'train' or 'test'.
        verbose: when truthy, also print the individual fold scores.

    Returns:
        The unrounded mean of the fold scores (the printed value is rounded
        to three decimals).

    Raises:
        KeyError: if '<train_or_test>_<metric>' is not present in scores.
    """
    metric_name = train_or_test + '_' + metric
    print(metric_name)

    metric_scores = scores[metric_name]
    if verbose:
        print(metric_scores)
    avg = sum(metric_scores) / len(metric_scores)
    print('average')
    print("{:.3f}".format(avg))
    # return the value so callers can reuse it (previously the function returned None)
    return avg

#retrieve test accuracy scores
# get_metric_scores prints its own report, so it is not wrapped in print()
get_metric_scores(scores, 'accuracy', 'test', verbose=True)

test_accuracy
[0.63157895 0.52631579 0.44444444 0.5        0.5       ]
average
0.520
None


In [None]:
#error analysis
# First pass lists reviews labelled 'f' that were predicted 't'; second pass
# lists reviews labelled 't' that were predicted 'f'. Each pass prints its count.
for wanted_true, wanted_pred in (('f', 't'), ('t', 'f')):
    err_cnt = 0
    for actual, predicted, review in zip(y_test, y_pred, X_test):
        if actual == wanted_true and predicted == wanted_pred:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

After I went shopping with some of my friend we went to DODO restaurant for dinner I found worm in one of the dishes 
I recently ate at a restaurant called White Castle and it was a dine in I had to wait 20 minutes before the waiter came to my table to take my order even though it was not busy I had to wait another 30 minutes for my order to come I had ordered a veggie burger with fries and Iced tea The veggie patty was not properly cooked the lettuce had black patches and the tomatoes looked horrible Overall the burger was a disaster When I turned to the fries in the hopes that it would taste goodto my disappointment they were also not fried properly and had a raw taste To top it all when picked up my iced tea took take a sip I found a fly swimming in my iced tea Overall it was a really bad experience and I would not recommend White Castle to anybody 
I ate at this restaurant called Banana Leaf As I entered the restaurant I really liked the ambiance I ordered noodle soup and fried ric

In [None]:
#Top 10 features

# Pair every learned weight with its vocabulary term and sort ascending by weight.
# For a binary LinearSVC a positive weight pushes the decision towards
# svm_clf.classes_[1]. The labels sort as ['f', 't'], so the most NEGATIVE weights
# indicate fake ('f') and the most POSITIVE weights indicate true ('t').
feature_ranks = sorted(zip(svm_clf.coef_[0], unigram_count_vectorizer.get_feature_names_out()))

##Top 10 features for fake words
fake_10 = feature_ranks[:10]
print("Fake words")
for i in range(0, len(fake_10)):
    print(fake_10[i])
print()

##Top 10 features for true words
true_10 = feature_ranks[-10:]
print("True words")
for i in range(0, len(true_10)):
    print(true_10[i])
print()

Fake words
(0.38852874064326015, 'didnt')
(0.4010278745485775, 'good')
(0.4337657258881227, 'food')
(0.4498916090927833, 'time')
(0.4734313574336397, 'great')
(0.50524056114873, 'bad')
(0.6882022004698609, 'ny')
(0.7518640541390356, 'overall')
(0.8187492968922648, 'dinner')
(0.8808013037917823, 'friendly')

True words
(-1.1809768637224178, 'definitely')
(-0.9720659618157339, 'experience')
(-0.8772387096333821, 'said')
(-0.8256647977177729, 'want')
(-0.7411672233642352, 'like')
(-0.6913211192448717, 'served')
(-0.6674909811533987, 'quality')
(-0.654108374912961, 'worth')
(-0.5305466064886586, 'high')
(-0.5290435929085762, 'took')



# Unigram and Bigram Term Frequency Vectorizer
## MNB

In [None]:
# learn the vocabulary on the training split only, then reuse it for the test split
X_train_vec = gram12_count_vectorizer.fit_transform(X_train)
X_test_vec = gram12_count_vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB

# initialize the MNB model
nb_clf= MultinomialNB()

# use the training data to train the MNB model
nb_clf.fit(X_train_vec,y_train)

print("Accuracy score on test data set:", nb_clf.score(X_test_vec,y_test))

# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
# nb_clf is already fitted above, so predict directly instead of refitting it
y_pred = nb_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred)
print(cm)

# print classification report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# average=None gives one score per class instead of a single aggregate
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))

from sklearn.metrics import classification_report
# display names for the labels, listed in sorted label order
target_names = ['f', 't']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.4594594594594595
[[ 8  9]
 [11  9]]
[0.42105263 0.5       ]
[0.47058824 0.45      ]
              precision    recall  f1-score   support

           f       0.42      0.47      0.44        17
           t       0.50      0.45      0.47        20

    accuracy                           0.46        37
   macro avg       0.46      0.46      0.46        37
weighted avg       0.46      0.46      0.46        37



In [None]:
# cross validation

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a unigram+bigram count + Multinomial Naive Bayes
# pipeline on the whole data set (X, y); the vectorizer is refit inside every fold.
# NOTE(review): this vectorizer uses default min_df/stop_words -- confirm that
# matches gram12_count_vectorizer, which is used for the held-out evaluation.
nb_clf_pipe = Pipeline([
    ('vect', CountVectorizer(encoding='latin-1', ngram_range=(1,2))),
    ('nb', MultinomialNB()),
])
scores = cross_val_score(nb_clf_pipe, X, y, cv=5)

# mean accuracy over the folds
avg = sum(scores) / len(scores)
print(avg)

0.5432748538011696


In [None]:
#error analysis
# First pass lists reviews labelled 'f' that were predicted 't'; second pass
# lists reviews labelled 't' that were predicted 'f'. Each pass prints its count.
for wanted_true, wanted_pred in (('f', 't'), ('t', 'f')):
    err_cnt = 0
    for actual, predicted, review in zip(y_test, y_pred, X_test):
        if actual == wanted_true and predicted == wanted_pred:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

After I went shopping with some of my friend we went to DODO restaurant for dinner I found worm in one of the dishes 
I ate at this restaurant called Banana Leaf As I entered the restaurant I really liked the ambiance I ordered noodle soup and fried rice with spicy black bean curry The service was pretty fast and the food tasted amazing There was a lot flavor in the food which I truly enjoyed Two thumbs up for Banana Leaf and I would totally recommend this restaurant
OMG This restaurant is horrible The receptionist did not greet us we just stood there and waited for five minutes The food came late and served not warm Me and my pet ordered a bowl of salad and a cheese pizza The salad was not fresh the crust of a pizza was so hard like plastics My dog didnt even eat that pizza I hate this place
I went to this ultraluxurious restaurant in Downtown New York which is known for its exotic and expensive cuisine I had a glass of champagne along with very expensive Caviar I had a delicious Chic

In [None]:
#based on error analysis considering "0" category as fake and "1" category as truth

features = gram12_count_vectorizer.get_feature_names_out()
fake_cond_prob = nb_clf.feature_log_prob_[0]
true_cond_prob = nb_clf.feature_log_prob_[1]

# per-feature difference of class-conditional log probabilities (row 0 minus row 1):
# strongly negative values favour row 1, strongly positive values favour row 0
log_ratios = [fake_lp - true_lp for fake_lp, true_lp in zip(fake_cond_prob, true_cond_prob)]

# ascending sort: row-1 indicators first, row-0 indicators last
feature_ranks = sorted(zip(log_ratios, features))

true_10 = feature_ranks[:10]
print("True words")
for ranked in true_10:
    print(ranked)
print()

fake_10 = feature_ranks[-10:]
print("Fake words")
for ranked in fake_10:
    print(ranked)
print()

True words
(-0.8893412391846018, 'life')
(-0.7839807235267755, 'time')
(-0.6817018744063574, 'good')
(-0.601659166732821, 'bad')
(-0.601659166732821, 'going')
(-0.5528690025633889, 'ordered')
(-0.5326662952458694, 'waitress')
(-0.4838761310764377, 'salad')
(-0.48387613107643723, 'ny')
(-0.48387613107643723, 'restaurant')

Fake words
(0.651103801762547, 'sauce')
(0.651103801762547, 'wasnt')
(0.7200966732494987, 'high')
(0.7200966732494987, 'staff')
(0.7200966732494987, 'waiters')
(0.7200966732494987, 'worth')
(0.9024182300434527, 'dine')
(1.0565689098707116, 'definitely')
(1.0565689098707116, 'said')
(1.8832474830551793, 'want')



## SVM

In [None]:
from sklearn.svm import LinearSVC

# initialize the LinearSVC model; random_state pins the solver's internal
# shuffling so the fitted weights (and the rankings derived from them) are
# reproducible from run to run
svm_clf = LinearSVC(C=1, random_state=0)

# use the training data to train the model
svm_clf.fit(X_train_vec,y_train)

# test the classifier on the test data set, print accuracy score

print("Accuracy score on test data set:",svm_clf.score(X_test_vec,y_test))

# print confusion matrix (row: ground truth; col: prediction) and classification report

from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred)
print(cm)
print()

from sklearn.metrics import classification_report
# display names for the labels, listed in sorted label order
target_names = ['f', 't']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.4594594594594595
[[ 4 13]
 [ 7 13]]

              precision    recall  f1-score   support

           f       0.36      0.24      0.29        17
           t       0.50      0.65      0.57        20

    accuracy                           0.46        37
   macro avg       0.43      0.44      0.43        37
weighted avg       0.44      0.46      0.44        37



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# metrics collected on every fold (train and test, see return_train_score)
scoring = ['accuracy', 'f1_macro', 'f1_micro', 'precision_macro', 'recall_macro']

# random_state pins LinearSVC's internal shuffling so fold scores are reproducible.
# NOTE(review): this CountVectorizer uses default min_df/stop_words -- confirm that
# matches gram12_count_vectorizer, which is used for the held-out evaluation.
svm_clf_pipe = Pipeline([
    ('vect', CountVectorizer(encoding='latin-1', ngram_range=(1,2))),
    ('svm', LinearSVC(C=1, random_state=0)),
])
scores = cross_validate(svm_clf_pipe, X, y, cv=5, scoring=scoring, return_train_score=True)

# retrieve scores from a metric
def get_metric_scores (scores, metric, train_or_test, verbose=False):
    """Print and return the average of one metric from a cross_validate result.

    Parameters:
        scores: mapping of '<train|test>_<metric>' to a sequence of per-fold scores.
        metric: metric name, e.g. 'accuracy'.
        train_or_test: key prefix, 'train' or 'test'.
        verbose: when truthy, also print the individual fold scores.

    Returns:
        The unrounded mean of the fold scores (the printed value is rounded
        to three decimals).

    Raises:
        KeyError: if '<train_or_test>_<metric>' is not present in scores.
    """
    metric_name = train_or_test + '_' + metric
    print(metric_name)

    metric_scores = scores[metric_name]
    if verbose:
        print(metric_scores)
    avg = sum(metric_scores) / len(metric_scores)
    print('average')
    print("{:.3f}".format(avg))
    # return the value so callers can reuse it (previously the function returned None)
    return avg

#retrieve test accuracy scores
# get_metric_scores prints its own report, so it is not wrapped in print()
get_metric_scores(scores, 'accuracy', 'test', verbose=True)

test_accuracy
[0.57894737 0.52631579 0.5        0.5        0.5       ]
average
0.521
None


In [None]:
#error analysis
# First pass lists reviews labelled 'f' that were predicted 't'; second pass
# lists reviews labelled 't' that were predicted 'f'. Each pass prints its count.
for wanted_true, wanted_pred in (('f', 't'), ('t', 'f')):
    err_cnt = 0
    for actual, predicted, review in zip(y_test, y_pred, X_test):
        if actual == wanted_true and predicted == wanted_pred:
            print(review)
            err_cnt += 1
    print("errors:", err_cnt)

After I went shopping with some of my friend we went to DODO restaurant for dinner I found worm in one of the dishes 
I recently ate at a restaurant called White Castle and it was a dine in I had to wait 20 minutes before the waiter came to my table to take my order even though it was not busy I had to wait another 30 minutes for my order to come I had ordered a veggie burger with fries and Iced tea The veggie patty was not properly cooked the lettuce had black patches and the tomatoes looked horrible Overall the burger was a disaster When I turned to the fries in the hopes that it would taste goodto my disappointment they were also not fried properly and had a raw taste To top it all when picked up my iced tea took take a sip I found a fly swimming in my iced tea Overall it was a really bad experience and I would not recommend White Castle to anybody 
I ate at this restaurant called Banana Leaf As I entered the restaurant I really liked the ambiance I ordered noodle soup and fried ric

In [None]:
#Top 10 features

# For a binary LinearSVC, coef_[0] is the single weight vector: positive
# weights push a prediction toward classes_[1] ('t') and negative weights
# toward classes_[0] ('f').  Sorting ascending therefore puts the strongest
# 'f' indicators first and the strongest 't' indicators last.
# NOTE(review): this notebook reads label 'f' as fake and 't' as true (the
# convention stated in its MNB log-ratio cell); confirm that against the
# meaning of the dataset's `lie` column.
feature_ranks = sorted(zip(svm_clf.coef_[0], gram12_count_vectorizer.get_feature_names_out()))

##Top 10 features for fake words (most negative weights -> class 'f')
fake_10 = feature_ranks[:10]
print("Fake words")
for ranked in fake_10:
    print(ranked)
print()

##Top 10 features for true words (most positive weights -> class 't')
true_10 = feature_ranks[-10:]
print("True words")
for ranked in true_10:
    print(ranked)
print()

Fake words
(0.3885294710449072, 'didnt')
(0.40102518068210047, 'good')
(0.4337647391676839, 'food')
(0.449890514229654, 'time')
(0.47343295189751966, 'great')
(0.5052398818456328, 'bad')
(0.6882033287072165, 'ny')
(0.7518656310867319, 'overall')
(0.8187505266525416, 'dinner')
(0.8808006454626588, 'friendly')

True words
(-1.1809749972842973, 'definitely')
(-0.9720677025926572, 'experience')
(-0.8772383027242512, 'said')
(-0.8256620799543126, 'want')
(-0.7411674178017926, 'like')
(-0.6913212104593095, 'served')
(-0.6674893246867495, 'quality')
(-0.6541087208103876, 'worth')
(-0.5305457740907414, 'high')
(-0.529043421878272, 'took')



# Unigram TF-IDF Vectorizer
## MNB

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

# Learn the vocabulary/weights on the training reviews only, then apply the
# same fitted vectorizer to the test reviews.
X_train_vec = unigram_tfidf_vectorizer.fit_transform(X_train)
X_test_vec = unigram_tfidf_vectorizer.transform(X_test)

# initialize the MNB model
nb_clf = MultinomialNB()

# use the training data to train the MNB model
nb_clf.fit(X_train_vec, y_train)

print("Accuracy score on test data set:", nb_clf.score(X_test_vec, y_test))

# print confusion matrix (row: ground truth; col: prediction)
# nb_clf is already fitted above, so predict directly; a second fit on the
# same training data would only repeat the work.
y_pred = nb_clf.predict(X_test_vec)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# print per-class precision and recall, then the full classification report
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))

target_names = ['f', 't']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.40540540540540543
[[ 8  9]
 [13  7]]
[0.38095238 0.4375    ]
[0.47058824 0.35      ]
              precision    recall  f1-score   support

           f       0.38      0.47      0.42        17
           t       0.44      0.35      0.39        20

    accuracy                           0.41        37
   macro avg       0.41      0.41      0.40        37
weighted avg       0.41      0.41      0.40        37



In [None]:
# cross validation

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Vectorizer and classifier are chained in one pipeline, which is what
# cross_val_score evaluates over 5 folds of the full data set.
# NOTE(review): this TfidfVectorizer uses the default min_df / stop_words;
# confirm whether it is meant to mirror unigram_tfidf_vectorizer's settings.
nb_clf_pipe = Pipeline(steps=[
    ('vect', TfidfVectorizer(encoding='latin-1', use_idf=True)),
    ('nb', MultinomialNB()),
])
scores = cross_val_score(nb_clf_pipe, X, y, cv=5)

# Mean accuracy across the folds.
avg = sum(scores) / len(scores)
print(avg)

0.5766081871345029


In [None]:
#error analysis
# Test reviews whose actual label is 'f' but which were predicted as 't'.
f_as_t = [X_test[k] for k in range(len(y_test))
          if y_test[k] == 'f' and y_pred[k] == 't']
for review in f_as_t:
    print(review)
err_cnt = len(f_as_t)
print("errors:", err_cnt)

# Test reviews whose actual label is 't' but which were predicted as 'f'.
t_as_f = [X_test[k] for k in range(len(y_test))
          if y_test[k] == 't' and y_pred[k] == 'f']
for review in t_as_f:
    print(review)
err_cnt = len(t_as_f)
print("errors:", err_cnt)

After I went shopping with some of my friend we went to DODO restaurant for dinner I found worm in one of the dishes 
I recently ate at a restaurant called White Castle and it was a dine in I had to wait 20 minutes before the waiter came to my table to take my order even though it was not busy I had to wait another 30 minutes for my order to come I had ordered a veggie burger with fries and Iced tea The veggie patty was not properly cooked the lettuce had black patches and the tomatoes looked horrible Overall the burger was a disaster When I turned to the fries in the hopes that it would taste goodto my disappointment they were also not fried properly and had a raw taste To top it all when picked up my iced tea took take a sip I found a fly swimming in my iced tea Overall it was a really bad experience and I would not recommend White Castle to anybody 
I went there with two friends at 6pm Long queue was there But it didnt take us long to wait The waiter was nice but worked in a hurry W

In [None]:
#based on error analysis considering "0" category as fake and "1" category as truth

features = unigram_tfidf_vectorizer.get_feature_names_out()
fake_cond_prob = nb_clf.feature_log_prob_[0]
true_cond_prob = nb_clf.feature_log_prob_[1]

# Per-feature difference of the two classes' log probabilities: a positive
# value means the feature is more probable under class 0, a negative value
# means it is more probable under class 1.
log_ratios = [fake_cond_prob[k] - true_cond_prob[k] for k in range(len(features))]

# Ascending sort: the most class-1-leaning features come first, the most
# class-0-leaning features come last.
feature_ranks = sorted(zip(log_ratios, features))

true_10 = feature_ranks[:10]
print("True words")
for ranked in true_10:
    print(ranked)
print()

fake_10 = feature_ranks[-10:]
print("Fake words")
for ranked in fake_10:
    print(ranked)
print()

True words
(-0.501688134990478, 'terrible')
(-0.46461375427651586, 'bad')
(-0.43486919919156275, 'time')
(-0.41899512630091706, 'life')
(-0.4136543444913534, 'ordered')
(-0.3800421835560366, 'overall')
(-0.3757575659020649, 'good')
(-0.3513581732494506, 'friendly')
(-0.3332140336030891, 'really')
(-0.2925326109939075, 'ny')

Fake words
(0.3466109453394375, 'experience')
(0.35310471285063727, 'dine')
(0.35337929940709767, 'sauce')
(0.35554017542564287, 'like')
(0.37338977381608096, 'waiters')
(0.4191727130485976, 'wasnt')
(0.4348201552581763, 'high')
(0.7036314782925546, 'said')
(0.7084999778564756, 'definitely')
(0.9440189403907806, 'want')



## SVM

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# initialize the LinearSVC model
# random_state pins the solver's internal random number generator so that
# re-running the cell gives repeatable results (0 is the same seed value the
# notebook passes to train_test_split).
svm_clf = LinearSVC(C=1, random_state=0)

# use the training data to train the model
svm_clf.fit(X_train_vec, y_train)

# test the classifier on the test data set, print accuracy score
print("Accuracy score on test data set:", svm_clf.score(X_test_vec, y_test))

# print confusion matrix and classification report
y_pred = svm_clf.predict(X_test_vec)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print()

target_names = ['f', 't']
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy score on test data set: 0.40540540540540543
[[ 4 13]
 [ 9 11]]

              precision    recall  f1-score   support

           f       0.31      0.24      0.27        17
           t       0.46      0.55      0.50        20

    accuracy                           0.41        37
   macro avg       0.38      0.39      0.38        37
weighted avg       0.39      0.41      0.39        37



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# Metrics collected for every fold.
scoring = ['accuracy', 'f1_macro', 'f1_micro', 'precision_macro', 'recall_macro']

# Vectorizer and classifier are chained in one pipeline, which is what
# cross_validate evaluates.  random_state pins LinearSVC's internal random
# number generator so the cross-validation scores are repeatable across runs.
svm_clf_pipe = Pipeline([
    ('vect', TfidfVectorizer(encoding='latin-1', use_idf=True)),
    ('svm', LinearSVC(C=1, random_state=0)),
])

# 5-fold cross-validation on the full data set, keeping train and test scores.
scores = cross_validate(svm_clf_pipe, X, y, cv=5, scoring=scoring, return_train_score=True)

# retrieve scores from a metric
def get_metric_scores(scores, metric, train_or_test, verbose=False):
    """Print a short report for one cross-validation metric and return its mean.

    Parameters
    ----------
    scores : mapping
        Looked up with the key ``"<train_or_test>_<metric>"``; the value must
        be a non-empty sequence of numbers (one entry per fold).
    metric : str
        Metric part of the key, e.g. ``'accuracy'``.
    train_or_test : str
        Prefix part of the key, e.g. ``'test'`` or ``'train'``.
    verbose : bool, default False
        When truthy, the individual scores are printed as well.

    Returns
    -------
    number
        The unrounded mean of the selected scores. The printed report shows
        the same value formatted to three decimals.

    Raises
    ------
    KeyError
        If ``scores`` has no entry for the requested key.
    ZeroDivisionError
        If the selected score sequence is empty.

    Notes
    -----
    This helper is defined in more than one cell of the notebook; keeping a
    single definition would avoid the copies drifting apart.
    """
    metric_name = train_or_test + '_' + metric
    print(metric_name)

    metric_scores = scores[metric_name]
    if verbose:
        print(metric_scores)

    avg = sum(metric_scores) / len(metric_scores)
    print('average')
    print("{:.3f}".format(avg))

    # Return the mean so callers can reuse it instead of receiving None.
    return avg

#retrieve test accuracy scores
# get_metric_scores prints its own report, so it is called directly; wrapping
# the call in print() would only echo the helper's return value after the report.
get_metric_scores(scores, 'accuracy', 'test', verbose=True)

test_accuracy
[0.52631579 0.63157895 0.44444444 0.55555556 0.55555556]
average
0.543
None


In [None]:
#error analysis
# Test reviews whose actual label is 'f' but which were predicted as 't'.
f_as_t = [X_test[k] for k in range(len(y_test))
          if y_test[k] == 'f' and y_pred[k] == 't']
for review in f_as_t:
    print(review)
err_cnt = len(f_as_t)
print("errors:", err_cnt)

# Test reviews whose actual label is 't' but which were predicted as 'f'.
t_as_f = [X_test[k] for k in range(len(y_test))
          if y_test[k] == 't' and y_pred[k] == 'f']
for review in t_as_f:
    print(review)
err_cnt = len(t_as_f)
print("errors:", err_cnt)

After I went shopping with some of my friend we went to DODO restaurant for dinner I found worm in one of the dishes 
I recently ate at a restaurant called White Castle and it was a dine in I had to wait 20 minutes before the waiter came to my table to take my order even though it was not busy I had to wait another 30 minutes for my order to come I had ordered a veggie burger with fries and Iced tea The veggie patty was not properly cooked the lettuce had black patches and the tomatoes looked horrible Overall the burger was a disaster When I turned to the fries in the hopes that it would taste goodto my disappointment they were also not fried properly and had a raw taste To top it all when picked up my iced tea took take a sip I found a fly swimming in my iced tea Overall it was a really bad experience and I would not recommend White Castle to anybody 
I went there with two friends at 6pm Long queue was there But it didnt take us long to wait The waiter was nice but worked in a hurry W

In [None]:
#Top 10 features

# For a binary LinearSVC, coef_[0] is the single weight vector: positive
# weights push a prediction toward classes_[1] ('t') and negative weights
# toward classes_[0] ('f').  Sorting ascending therefore puts the strongest
# 'f' indicators first and the strongest 't' indicators last.
# NOTE(review): this notebook reads label 'f' as fake and 't' as true (the
# convention stated in its MNB log-ratio cell); confirm that against the
# meaning of the dataset's `lie` column.
feature_ranks = sorted(zip(svm_clf.coef_[0], unigram_tfidf_vectorizer.get_feature_names_out()))

##Top 10 features for fake words (most negative weights -> class 'f')
fake_10 = feature_ranks[:10]
print("Fake words")
for ranked in fake_10:
    print(ranked)
print()

##Top 10 features for true words (most positive weights -> class 't')
true_10 = feature_ranks[-10:]
print("True words")
for ranked in true_10:
    print(ranked)
print()

Fake words
(0.5381776524280878, 'really')
(0.5796752490930338, 'time')
(0.5815708411883416, 'bad')
(0.6617410848988401, 'ordered')
(0.6632484657165811, 'overall')
(0.6876959452904681, 'ny')
(0.6934343843805914, 'good')
(0.7698068997030711, 'dinner')
(0.7768648855439806, 'friendly')
(0.7917840991252714, 'food')

True words
(-1.442684419752063, 'want')
(-1.433935729163597, 'definitely')
(-1.3110224187598694, 'said')
(-1.163941050081389, 'experience')
(-0.9351105902117167, 'like')
(-0.8574747548031666, 'dining')
(-0.7760741498656102, 'high')
(-0.7089199586943215, 'dine')
(-0.7032597176527685, 'worth')
(-0.6768855156247641, 'amazing')

