In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from wordcloud import WordCloud, STOPWORDS
import nltk
nltk.download('vader_lexicon')
nltk.download('brown')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
import scipy as sp

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn import svm

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve  
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import datetime

## FLOW:

- This study aims to detect whether a review is fake or genuine, using the Yelp review data available on Kaggle. The goal is to construct a pipeline and to stretch the muscles for analytical thinking and a sensible business approach. Therefore, I did not focus on model trials and related operations, since those can be enhanced technically later on with hyperparameter optimization, cross-validation and so on. Instead, I approached the problem in a way that would help me construct a pipeline.

- To go one step further, the outcome of this study might be used as a fake-review detector that raises an alert, so that Yelp can ask the user to go through an identity check; accumulating a certain number of fake alerts might result in account suspension.

- A beneficial side of this study is that the original dataset comes with almost no ready-made features, which requires us to think outside the box, generate ideas, use NLP operations and so forth.

### Data Retrieval & Examination:

In [None]:
# Read the labelled Yelp reviews and discard incomplete rows right away.
data = pd.read_csv("./labeled_yelp/yelp.csv")
data = data.dropna()

rows = data.shape[0]
print(f"Number of rows: {rows}")

# Sanity check: after dropna() every per-column NaN count should be zero.
data.isna().sum()

##### Let's change the columns names into meaningful ones:

In [None]:
# Positional rename: the columns are replaced, in order, by descriptive names.
column_names = ["user_id", "product_id", "rating", "date", "review", "label"]
data.columns = column_names

##### The target variable `label` is encoded as -1 / 1. Let's switch it to binary, so that -1 (treated as fake in the rest of this notebook) becomes 1 and every other value becomes 0:

In [None]:
# -1 -> 1, every other value -> 0 (vectorised comparison instead of a per-row lambda).
data["label"] = data["label"].eq(-1).astype("int64")

##### The date column is stored as a plain month/day/year string. Since the dataset comes with a limited number of variables, we might need the date to extract features, so let's convert it into proper date objects:

In [None]:
# Raw values look like "month/day/year"; split each one once and keep the
# integer parts as [month, day, year].
date_series = data["date"].tolist()
date_list = []
for raw_date in date_series:
    parts = raw_date.split('/')
    date_list.append([int(parts[0]), int(parts[1]), int(parts[2])])

# datetime.date expects (year, month, day), hence the reordering.
data['date'] = [datetime.date(year, month, day) for month, day, year in date_list]

##### Let's save the corrected data as pickle in case we need it later:

In [None]:
# Checkpoint: renamed columns, binary label and parsed dates, before any text processing.
data.to_pickle("yelp_initial.pkl")

### Detailed Data Examination & Feature Generation:

- Since the dataset comes with almost no features, we will derive potentially explanatory features for later use.
- Whilst doing that, we will examine the relation between the derived feature and the target. 

##### Let's start with feature generation brain storming, below are some ideas that I suggest as they might be explanatory to target variable:

- length of the review: Fake reviews might tend to be shorter than the true ones.
- average rating of the restaurant 
- average rating of the users
- number of reviews that the restaurant received
- number of unique users that the restaurant received review from
- number of reviews that the user made
- number of unique restaurants that the user made review to
- time between the first and the last time a restaurant got a review
- time between the first and the last time a user made a review

##### In addition to all those, we might have a gold mine for fake review detection: the review text itself. It is a free-text field that needs preprocessing before it can be used for prediction:

- Text Correction
- Word Cloud Analysis
- Sentiment Analysis & Its Harmony with the rating
- Tagging the text & Analysis of Sentence Structure

### Text Correction:

In [None]:
def correct_contractions(text, mapping=None):
    """Expand English contractions in ``text``.

    Each whitespace-delimited token is looked up (case-insensitively) in
    ``mapping`` and, when found, replaced by its expansion. All other tokens
    and the original whitespace are left untouched.

    The replacement is done token by token. A plain ``str.replace`` over the
    whole text would also rewrite the token wherever it occurs as a substring,
    so expanding "we'd" would corrupt a later "we'd've" into "we had've".

    Parameters
    ----------
    text : str
        Raw review text.
    mapping : dict, optional
        Lower-cased contraction -> expansion. Defaults to the notebook-level
        ``contractions`` table.

    Returns
    -------
    str
        The text with known contractions expanded.
    """
    if mapping is None:
        # Resolved at call time, so the table may be defined in a later cell.
        mapping = contractions

    def _expand(match):
        token = match.group(0)
        return mapping.get(token.lower(), token)

    # \S+ matches exactly the tokens str.split() would produce.
    return re.sub(r"\S+", _expand, text)

In [None]:
# Lookup table used by correct_contractions(): lower-cased contraction -> expansion.
# NOTE: "'d" is ambiguous in English ("had" / "would"); the table picks one
# expansion per entry.
contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"i'd": "I had",
"i'd've": "I would have",
"i'll": "I will",
"i'll've": "I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there had",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}


In [None]:
# 1) Expand contractions so that e.g. "don't" becomes "do not".
data["review1"] = data["review"].apply(correct_contractions)

# 2) Split into word tokens. The tokenizer is built once and reused for every
#    row instead of being re-created (and its regex re-compiled) per review.
word_tokenizer = nltk.RegexpTokenizer(r'\w+')
data["tokens"] = data["review1"].apply(word_tokenizer.tokenize)

# 3) Drop stray 'â' tokens (presumably residue of mis-decoded characters -
#    TODO confirm against the raw file).
data["cleaned_tokens"] = data["tokens"].apply(
    lambda tokens: [token for token in tokens if token.lower() != 'â']
)

# 4) Re-assemble the cleaned tokens into a single space-separated string.
data["review_corrected"] = data["cleaned_tokens"].apply(" ".join)

### Feature Generation:

In [None]:
## Length of review:

# Character count of the raw (uncorrected) review text.
data["len_review"] = data["review"].map(len)

In [None]:
## Average rating of the restaurant
per_prod = data.groupby("product_id")["rating"].mean()

# Naming the aggregate before the merge avoids the rating_x / rating_y suffix round trip.
data_1 = data.merge(per_prod.rename("avg_rating_prod"), how="left", on="product_id")

In [None]:
## Average rating from the user
per_user = data_1.groupby("user_id")["rating"].mean()

# Attach each user's mean rating to all of that user's reviews.
data_1 = data_1.merge(per_user.rename("avg_rating_user"), how="left", on="user_id")

In [None]:
## Number of ratings for the restaurant
num_prod = data_1.groupby("product_id")["rating"].count()

# Attach each restaurant's review count to all of its reviews.
data_1 = data_1.merge(num_prod.rename("num_review_prod"), how="left", on="product_id")

In [None]:
## number of unique users that the restaurant received review from
prod_unique = data_1.groupby("product_id")["user_id"].nunique().to_frame().reset_index()

# Both frames carry a "user_id" column, so the merge suffixes them: _x is the
# original reviewer id, _y is the per-restaurant distinct-user count.
data_1 = data_1.merge(prod_unique, on="product_id", how="left").rename(
    columns={"user_id_x": "user_id", "user_id_y": "prod_unique_review"}
)

In [None]:
## Number of ratings from the user
num_user = data_1.groupby("user_id")["rating"].count()

# Attach each user's review count to all of that user's reviews.
data_1 = data_1.merge(num_user.rename("num_review_user"), how="left", on="user_id")

In [None]:
## Number of distinct restaurants that the user rated
user_unique = data_1.groupby("user_id")["product_id"].nunique().to_frame().reset_index()

# Both frames carry a "product_id" column, so the merge suffixes them: _x is the
# reviewed restaurant, _y is the per-user distinct-restaurant count.
data_1 = data_1.merge(user_unique, on="user_id", how="left").rename(
    columns={"product_id_x": "product_id", "product_id_y": "user_unique_review"}
)

In [None]:
## User review Density
# Reviews per distinct restaurant for each user (1.0 = never reviewed a place twice).
data_1["user_review_density"] = data_1["num_review_user"].div(data_1["user_unique_review"])

In [None]:
## Restaurant review density
# Reviews per distinct reviewer for each restaurant.
data_1["prod_review_density"] = data_1["num_review_prod"].div(data_1["prod_unique_review"])

In [None]:
## Max time range between user's reviews
user_dates = data_1.groupby("user_id")["date"]
user_max_df = user_dates.max().to_frame().reset_index()
user_min_df = user_dates.min().to_frame().reset_index()

# After the merge, date_x is the user's latest review date and date_y the earliest.
user_diff_df = user_max_df.merge(user_min_df, on="user_id", how='inner')
user_diff_df["diff"] = [
    (latest - earliest).days
    for latest, earliest in zip(user_diff_df["date_x"], user_diff_df["date_y"])
]

# Attach the span (in days) to every review and discard the helper date columns.
data_1 = (
    data_1.merge(user_diff_df, on="user_id", how="left")
    .drop(["date_x", "date_y"], axis=1)
    .rename(columns={"diff": "user_time_max"})
)

In [None]:
## Max time range between restaurant's reviews
prod_dates = data_1.groupby("product_id")["date"]
prod_max_df = prod_dates.max().to_frame().reset_index()
prod_min_df = prod_dates.min().to_frame().reset_index()

# After the merge, date_x is the restaurant's latest review date and date_y the earliest.
prod_diff_df = prod_max_df.merge(prod_min_df, on="product_id", how='inner')
prod_diff_df["diff"] = [
    (latest - earliest).days
    for latest, earliest in zip(prod_diff_df["date_x"], prod_diff_df["date_y"])
]

# Attach the span (in days) to every review and discard the helper date columns.
data_1 = (
    data_1.merge(prod_diff_df, on="product_id", how="left")
    .drop(["date_x", "date_y"], axis=1)
    .rename(columns={"diff": "prod_time_max"})
)

In [None]:
# Checkpoint: cleaned text plus the rating / count / time-span features.
data_1.to_pickle("yelp_text_cleaned.pkl")

### Feature Generation with Sentiment Analysis:

In [None]:
# The raw and intermediate text columns are no longer needed; the cleaned
# tokens and the corrected review string are kept.
data_1 = data_1.drop(columns=["tokens", "review", "review1"])

In [None]:
# Built lazily on first use and then shared: the original code constructed a
# new analyzer for every single review.
_SENTIMENT_ANALYZER = None


def nltk_sentiment(sentence):
    """Score ``sentence`` with NLTK's VADER sentiment analyzer.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    dict
        The result of ``SentimentIntensityAnalyzer.polarity_scores`` (the
        notebook later reads its 'neg', 'neu', 'pos' and 'compound' entries).
    """
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        from nltk.sentiment.vader import SentimentIntensityAnalyzer
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER.polarity_scores(sentence)

In [None]:
review_sentiment = data_1['review_corrected']

# One VADER score dict per review, in row order.
nltk_results = [nltk_sentiment(row) for row in review_sentiment]
results_df = pd.DataFrame(nltk_results)
nltk_df = review_sentiment.to_frame().join(results_df)

# Merge the scores back into data_1 (NOT the earlier `data` frame): `data` has
# none of the engineered features and still carries the gappy post-dropna
# index, so merging on it would both lose columns and misalign rows.
# Both sides contain "review_corrected", hence the _x / _y suffixes that the
# next cell cleans up.
data_1 = data_1.merge(nltk_df, left_index=True, right_index=True, how="inner")

In [None]:
# The merge duplicated the review text; keep one copy under its original name.
data_1 = data_1.drop(columns="review_corrected_y").rename(
    columns={"review_corrected_x": "review_corrected"}
)

In [None]:
## Consistency between the text review sentiment & the rating
# Min-max scale the compound score onto the 0-5 range, then take the ratio of
# the star rating to that scaled sentiment.
compound_min = data_1["compound"].min()
compound_span = data_1["compound"].max() - compound_min
sentiment_on_rating_scale = 5 * (data_1["compound"] - compound_min) / compound_span

# The review(s) with the minimum compound score scale to exactly 0, so the
# ratio would be +inf. Store NaN instead so that the later dropna() removes
# those rows rather than passing infinities on to the models.
data_1["consistency"] = (data_1["rating"] / sentiment_on_rating_scale).replace(
    [np.inf, -np.inf], np.nan
)

In [None]:
# Sentiment analysis takes a long time on a dataset this large, so the result
# is saved as a pickle checkpoint.
data_1.to_pickle("yelp_sentimented.pkl")

##### Examples:

In [None]:
# A few strongly positive reviews (compound score above 0.9).
data_1.loc[data_1["compound"] > 0.9, ["review_corrected", "compound"]].head(5)

In [None]:
# A few strongly negative reviews (compound score below -0.9).
data_1.loc[data_1["compound"] < -0.9, ["review_corrected", "compound"]].head(5)

### Text Tagging:

- Fake reviewers might use more adjectives, adverbs, etc. to put more emphasis on their comments, so we extract part-of-speech information from the review text. Let's examine the data from this perspective to add another dimension to our analysis:

In [None]:
nltk.download('averaged_perceptron_tagger')

# (token, tag) pairs for each review.
data_1["tag_tuples"] = data_1["cleaned_tokens"].apply(nltk.pos_tag)
# Keep only the tag of every pair.
data_1["pos_tags"] = data_1["tag_tuples"].apply(lambda pairs: [pair[1] for pair in pairs])

# Tag -> occurrence count for each review.
data_1["pos_tag_dict"] = data_1["pos_tags"].apply(nltk.FreqDist)

In [None]:
def create_cols(df, liste):
    """Add one count column per tag in ``liste`` to ``df``.

    For every tag, the new column holds ``row["pos_tag_dict"][tag]`` for each
    row. ``df`` is modified in place and also returned.
    """
    tag_counts = df["pos_tag_dict"]
    for tag in liste:
        # Bind the current tag as a default so each lookup uses its own tag.
        df[tag] = tag_counts.apply(lambda counts, tag=tag: counts[tag])
    return df

In [None]:
# Part-of-speech tags that get their own count column.
tag_list = [
    '$', "CC", "CD", "DT", "EX", "FW", "IN",
    "JJ", "JJR", "JJS",
    "LS", "MD",
    "NN", "NNP", "NNPS", "NNS",
    "PDT", "POS", "PRP", "PRP$",
    "RB", "RBR", "RBS", "RP",
    "SYM", "TO", "UH",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
    "WDT", "WP", "WP$", "WRB",
]

data_1 = create_cols(data_1, tag_list)

In [None]:
# Total number of tagged tokens per review: the row-wise sum over exactly the
# tag count columns created from tag_list.
data_1["all_types_sum"] = data_1[tag_list].sum(axis=1)

In [None]:
# Share of each broad word class among all tagged tokens of a review.
# Reviews with no tagged tokens yield NaN (0 / 0).
pos_groups = {
    "noun_perc": ['NN', 'NNP', 'NNPS', 'NNS'],
    "adjective_perc": ['JJ', 'JJR', 'JJS'],
    "adverb_perc": ['RB', 'RBR', 'RBS', 'RP'],
    "verb_perc": ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
}

for share_col, group_tags in pos_groups.items():
    data_1[share_col] = data_1[group_tags].sum(axis=1) / data_1["all_types_sum"]

In [None]:
# The per-review tag structures were only needed to derive the count columns.
data_1 = data_1.drop(columns=["tag_tuples", "pos_tags", "pos_tag_dict"])

In [None]:
# Checkpoint: all engineered features including the POS tag counts and shares.
data_1.to_pickle("yelp_tagged.pkl")

### Word Cloud Analysis:

- We are already deriving features from the review text, but simply looking at a word cloud may generate additional ideas:

In [None]:
from nltk.corpus import stopwords
from wordcloud import WordCloud

In [None]:
## TRUE Review Word Cloud

# stopwords.words('english') needs the NLTK "stopwords" corpus, which the
# setup cell does not download; fetch it here (no-op when already present).
nltk.download('stopwords')

# All genuine reviews (label 0) concatenated into one corpus string.
true_corpus = " ".join(data_1.loc[data_1['label'] == 0, "review_corrected"])

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords.words('english'),
    min_font_size=10,
).generate(true_corpus)

# plot the word cloud for true review data
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
## FAKE Review Word Cloud

# stopwords.words('english') needs the NLTK "stopwords" corpus, which the
# setup cell does not download; fetch it here (no-op when already present).
nltk.download('stopwords')

# All fake reviews (label 1) concatenated into one corpus string.
fake_corpus = " ".join(data_1.loc[data_1['label'] == 1, "review_corrected"])

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords.words('english'),
    min_font_size=10,
).generate(fake_corpus)

# plot the word cloud for fake review data
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

### Feature Examination:

- Let's see the effects we are looking for with all these recently created variables:

In [None]:
# Separate frames per class (1 = fake, 0 = true), each with a fresh 0..n-1 index.
fake_data = data_1.loc[data_1["label"] == 1].reset_index(drop=True)
true_data = data_1.loc[data_1["label"] == 0].reset_index(drop=True)

In [None]:
# Histogram colours for the two classes. The previous empty tuples were
# placeholders and are not valid matplotlib colours.
true_color = "tab:blue"
fake_color = "tab:red"

In [None]:
def bar_plot(data, feature, ax):
    """Draw a horizontal bar chart of the per-label mean of ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "label" column and the ``feature`` column.
    feature : str
        Name of the column to average within each label.
    ax : matplotlib Axes
        Axes to draw on.

    Returns
    -------
    None
    """
    grouped_data = data.groupby("label")[feature].mean()
    # The original referenced an undefined name here (group_data), which
    # raised NameError on every call.
    grouped_data.plot(kind="barh", ax=ax)

    ax.set_facecolor('white')
    ax.set_xlabel(feature, rotation=0)
    ax.set_ylabel("Target", rotation=90)

    return None

In [None]:
def hist_plot(data, feature, ax, bins, color):
    """Draw a histogram of ``data[feature]`` on ``ax``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``feature`` column.
    feature : str
        Column to plot.
    ax : matplotlib Axes
        Axes to draw on.
    bins : int
        Number of histogram bins.
    color
        Bar colour. ``None`` or an empty tuple means "use the default colour".

    Returns
    -------
    None
    """
    hist_kwargs = {}
    # `color` used to be accepted but silently ignored. Forward it when it is
    # usable; an empty tuple (the notebook's placeholder) is skipped.
    if color is not None and not (isinstance(color, tuple) and len(color) == 0):
        hist_kwargs["color"] = color

    data[feature].hist(ax=ax, bins=bins, **hist_kwargs)

    ax.set_facecolor('white')
    ax.set_xlabel(feature, rotation=0)

    return None

In [None]:
# Features to compare between fake and true reviews, one subplot row each.
features_to_look = [
    # review length and sentiment
    'len_review', 'neg', 'neu', 'pos', 'compound', 'consistency',
    # rating and review-count aggregates
    'avg_rating_prod', 'avg_rating_user', 'num_review_prod', 'num_review_user',
    # activity time spans
    'user_time_max', 'prod_time_max',
    # distinct counterparts and densities
    'user_unique_review', 'prod_unique_review',
    'user_review_density', 'prod_review_density',
    # part-of-speech shares
    "adjective_perc", "adverb_perc", "noun_perc", "verb_perc",
]

n_of_rows = len(features_to_look)

In [None]:
# One row per feature: [mean per label | distribution for true | distribution for fake].
# The figure height scales with the number of rows so the panels stay readable.
fig, axes = plt.subplots(n_of_rows, 3, figsize=(15, 3 * n_of_rows))
bins = 10

for i, feature in enumerate(features_to_look):

    bar_plot(data_1, feature, axes[i, 0])
    # The original passed an undefined name (`color`) here, raising NameError;
    # use the per-class colours defined earlier in the notebook.
    hist_plot(true_data, feature, axes[i, 1], bins, true_color)
    hist_plot(fake_data, feature, axes[i, 2], bins, fake_color)

plt.tight_layout()
plt.show()

### Model Phase:

- Let's start with conventional models, and following these, we will also try models based on text areas:

In [None]:
# Modelling table: the target plus every numeric feature. The explicit .copy()
# makes final_data an independent frame, so the column assignments and row
# drops in the following cells do not operate on a slice of data_1
# (SettingWithCopyWarning / chained assignment).
final_data = data_1[['label', 'len_review','neg', 'neu', 'pos', 'compound',
       'consistency', 'avg_rating_prod', 'avg_rating_user', 'num_review_prod',
       'num_review_user', '$', 'CC',
       'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN',
       'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS',
       'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT',
       'WP', 'WP$', 'WRB', 'user_time_max', 'prod_time_max',
       'user_unique_review', 'prod_unique_review', 'user_review_density',
       'prod_review_density',"adjective_perc","adverb_perc","noun_perc","verb_perc"]].copy()

In [None]:
# Min-max scale each sentiment score from ITS OWN values. The original
# copy-pasted the "neg" expression four times, so "neu", "pos" and "compound"
# were all overwritten with the scaled "neg" column.
for sentiment_col in ["neg", "neu", "pos", "compound"]:
    col_min = final_data[sentiment_col].min()
    col_max = final_data[sentiment_col].max()
    final_data[sentiment_col] = (final_data[sentiment_col] - col_min) / (col_max - col_min)

In [None]:
# Ratio features can be +/-inf (division by zero) as well as NaN (0 / 0).
# dropna() alone keeps the infinities, which scikit-learn estimators reject,
# so convert them to NaN first. Re-binding instead of inplace=True also avoids
# mutating a slice of data_1.
final_data = final_data.replace([np.inf, -np.inf], np.nan).dropna()
final_data.to_pickle("yelp_final_data.pkl")

#### Distribution of target variable:

- In line with this, our study might require up/down sampling techniques...

In [None]:
# Let's see how label is distributed:
sns.set_style("darkgrid")
# Pass the Series as x= explicitly: recent seaborn releases read the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=final_data["label"])

#### Train - Test Split:

In [None]:
# Target vector and feature matrix (everything except the target).
y = final_data["label"]
x = final_data.drop(columns="label")

In [None]:
# 70 / 30 hold-out split with a fixed seed.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=123,
)

#### Resampling:

##### Upsampling:

In [None]:
from sklearn.utils import resample

# Re-attach the target so that features and label are resampled together.
X = pd.concat([train_x, train_y], axis=1)

# true (label 0) and fake (label 1) training rows
true = X.loc[X["label"] == 0]
fake = X.loc[X["label"] == 1]

# Draw fake rows with replacement until there are half as many as true rows.
fake_upsampled = resample(
    fake,
    replace=True,
    n_samples=len(true) // 2,
    random_state=123,
)

# all true rows + the resampled fake rows
upsampled = pd.concat([true, fake_upsampled])

train_yu = upsampled["label"]
train_xu = upsampled.drop(columns='label')

##### Downsampling 1:

In [None]:
from sklearn.utils import resample

# Re-attach the target so that features and label are resampled together.
X = pd.concat([train_x, train_y], axis=1)

# true (label 0) and fake (label 1) training rows
true = X[X.label==0]
fake = X[X.label==1]

# Downsample the true rows to the number of fake rows (1:1). Sampling WITHOUT
# replacement: with replace=True the reduced set contained duplicated rows
# and threw away more distinct true reviews than necessary.
true_downsampled = resample(true,
                            replace=False,
                            n_samples=len(fake),
                            random_state=123) # reproducible results

# all fake rows + the downsampled true rows
downsampled = pd.concat([fake, true_downsampled])

train_yd = downsampled.label
train_xd = downsampled.drop('label', axis=1)

##### Downsampling 2:

In [None]:
from sklearn.utils import resample

# Re-attach the target so that features and label are resampled together.
X = pd.concat([train_x, train_y], axis=1)

# true (label 0) and fake (label 1) training rows
true = X[X.label==0]
fake = X[X.label==1]

# Downsample the true rows to 1.5x the number of fake rows. Sampling WITHOUT
# replacement avoids duplicated rows; the size is capped at len(true) because
# more rows than exist cannot be drawn without replacement.
true_downsampled = resample(true,
                            replace=False,
                            n_samples=min(int(len(fake)*1.5), len(true)),
                            random_state=123) # reproducible results

# all fake rows + the downsampled true rows
downsampled = pd.concat([fake, true_downsampled])

train_yd2 = downsampled.label
train_xd2 = downsampled.drop('label', axis=1)

#### For confusion matrix:

In [None]:
from mlxtend.plotting import plot_confusion_matrix

def consufion_plot(y_test,prediction):
    """Plot the confusion matrix of ``prediction`` against ``y_test``.

    Cells are annotated with both absolute counts and normalised values, and a
    colour bar is shown.
    """
    matrix = confusion_matrix(y_test, prediction)
    _fig, _ax = plot_confusion_matrix(
        conf_mat=matrix,
        show_absolute=True,
        show_normed=True,
        colorbar=True,
    )
    plt.show()

### MODELS:

### Decision Tree

#### Decision Tree - original:

In [None]:
# Decision tree on the original (imbalanced) training split. The seed is
# pinned (same value as the split and resampling cells) so the reported
# metrics are reproducible between runs.
clf = DecisionTreeClassifier(random_state=123)

# Train Decision Tree Classifer
clf = clf.fit(train_x,train_y)

#Predict the response for test dataset
y_pred1 = clf.predict(test_x)

# Predicted probability of label 1 (fake).
y_pred_prob = clf.predict_proba(test_x)[:,1]

consufion_plot(test_y, y_pred1)
print("Accuracy:",metrics.accuracy_score(test_y, y_pred1))
print ("AUC Score:", roc_auc_score(test_y, y_pred1))
print ("AUC Score prob:", roc_auc_score(test_y, y_pred_prob))
print ("Precision:", precision_score(test_y, y_pred1))
print ("Recall:", recall_score(test_y, y_pred1))
print ("F1 Score:", f1_score(test_y, y_pred1))

#### Decision Tree - downsampled 1:

In [None]:
# Decision tree on the 1:1 downsampled training set. The seed is pinned (same
# value as the split and resampling cells) so the metrics are reproducible.
clf = DecisionTreeClassifier(random_state=123)

# Train Decision Tree Classifer
clf = clf.fit(train_xd,train_yd)

#Predict the response for test dataset
y_pred1 = clf.predict(test_x)

# Predicted probability of label 1 (fake).
y_pred_prob = clf.predict_proba(test_x)[:,1]

consufion_plot(test_y, y_pred1)
print("Accuracy:",metrics.accuracy_score(test_y, y_pred1))
print ("AUC Score:", roc_auc_score(test_y, y_pred1))
print ("AUC Score prob:", roc_auc_score(test_y, y_pred_prob))
print ("Precision:", precision_score(test_y, y_pred1))
print ("Recall:", recall_score(test_y, y_pred1))
print ("F1 Score:", f1_score(test_y, y_pred1))


#### Decision Tree - downsampled 2:

In [None]:
# Decision tree on the second downsampled training set. The seed is pinned
# (same value as the split and resampling cells) so the metrics are reproducible.
clf = DecisionTreeClassifier(random_state=123)

# Train Decision Tree Classifer
clf = clf.fit(train_xd2,train_yd2)

#Predict the response for test dataset
y_pred1 = clf.predict(test_x)

# Predicted probability of label 1 (fake).
y_pred_prob = clf.predict_proba(test_x)[:,1]

consufion_plot(test_y, y_pred1)
print("Accuracy:",metrics.accuracy_score(test_y, y_pred1))
print ("AUC Score:", roc_auc_score(test_y, y_pred1))
print ("AUC Score prob:", roc_auc_score(test_y, y_pred_prob))
print ("Precision:", precision_score(test_y, y_pred1))
print ("Recall:", recall_score(test_y, y_pred1))
print ("F1 Score:", f1_score(test_y, y_pred1))


#### Decision Tree - upsampled:

In [None]:
# Decision tree on the upsampled training set. The seed is pinned (same value
# as the split and resampling cells) so the metrics are reproducible.
clf = DecisionTreeClassifier(random_state=123)

# Train Decision Tree Classifer
clf = clf.fit(train_xu,train_yu)

#Predict the response for test dataset
y_pred1 = clf.predict(test_x)

# Predicted probability of label 1 (fake).
y_pred_prob = clf.predict_proba(test_x)[:,1]

consufion_plot(test_y, y_pred1)
print("Accuracy:",metrics.accuracy_score(test_y, y_pred1))
print ("AUC Score:", roc_auc_score(test_y, y_pred1))
print ("AUC Score prob:", roc_auc_score(test_y, y_pred_prob))
print ("Precision:", precision_score(test_y, y_pred1))
print ("Recall:", recall_score(test_y, y_pred1))
print ("F1 Score:", f1_score(test_y, y_pred1))


### Random Forest

#### Random Forest - original:

In [None]:
# Random forest on the original (imbalanced) training split; seeded so the
# metrics are reproducible.
rfc = RandomForestClassifier(random_state=123)

rfc = rfc.fit(train_x,train_y)

#Predict the response for test dataset
y_pred3 = rfc.predict(test_x)
y_pred_proba = rfc.predict_proba(test_x)

consufion_plot(test_y, y_pred3)
print("Accuracy:",metrics.accuracy_score(test_y, y_pred3))
print ("AUC Score:", roc_auc_score(test_y, y_pred3))
# Use THIS model's probability of label 1. The original scored `y_pred_prob`,
# a leftover from the last decision-tree cell, so it reported the wrong AUC.
print ("AUC Score prob:", roc_auc_score(test_y, y_pred_proba[:, 1]))
print ("Precision:", precision_score(test_y, y_pred3))
print ("Recall:", recall_score(test_y, y_pred3))
print ("F1 Score:", f1_score(test_y, y_pred3))

#### Random Forest - downsampled 1:

In [None]:
# Random forest on the 1:1 downsampled training set; seeded so the metrics
# are reproducible.
rfc = RandomForestClassifier(random_state=123)

rfc = rfc.fit(train_xd,train_yd)

#Predict the response for test dataset
y_pred3 = rfc.predict(test_x)
y_pred_proba = rfc.predict_proba(test_x)

consufion_plot(test_y, y_pred3)
print(classification_report(test_y, y_pred3))
print("Accuracy:",metrics.accuracy_score(test_y, y_pred3))
print ("AUC Score:", roc_auc_score(test_y, y_pred3))
# Use THIS model's probability of label 1. The original scored `y_pred_prob`,
# a leftover from the last decision-tree cell, so it reported the wrong AUC.
print ("AUC Score prob:", roc_auc_score(test_y, y_pred_proba[:, 1]))
print ("Precision:", precision_score(test_y, y_pred3))
print ("Recall:", recall_score(test_y, y_pred3))
print ("F1 Score:", f1_score(test_y, y_pred3))

#### Random Forest - downsampled 2:

In [None]:
# Random forest on the second downsampled training set; seeded so the
# metrics are reproducible.
rfc = RandomForestClassifier(random_state=123)

rfc = rfc.fit(train_xd2,train_yd2)

#Predict the response for test dataset
y_pred3 = rfc.predict(test_x)
y_pred_proba = rfc.predict_proba(test_x)

consufion_plot(test_y, y_pred3)
print(classification_report(test_y, y_pred3))
print("Accuracy:",metrics.accuracy_score(test_y, y_pred3))
print ("AUC Score:", roc_auc_score(test_y, y_pred3))
# Use THIS model's probability of label 1. The original scored `y_pred_prob`,
# a leftover from the last decision-tree cell, so it reported the wrong AUC.
print ("AUC Score prob:", roc_auc_score(test_y, y_pred_proba[:, 1]))
print ("Precision:", precision_score(test_y, y_pred3))
print ("Recall:", recall_score(test_y, y_pred3))
print ("F1 Score:", f1_score(test_y, y_pred3))

#### Random Forest - upsampled:

In [None]:
# Random forest on the upsampled training set; seeded so the metrics are
# reproducible.
rfc = RandomForestClassifier(random_state=123)

rfc = rfc.fit(train_xu,train_yu)

#Predict the response for test dataset
y_pred3 = rfc.predict(test_x)
y_pred_proba = rfc.predict_proba(test_x)

consufion_plot(test_y, y_pred3)
print(classification_report(test_y, y_pred3))
print("Accuracy:",metrics.accuracy_score(test_y, y_pred3))
print ("AUC Score:", roc_auc_score(test_y, y_pred3))
# Use THIS model's probability of label 1. The original scored `y_pred_prob`,
# a leftover from the last decision-tree cell, so it reported the wrong AUC.
print ("AUC Score prob:", roc_auc_score(test_y, y_pred_proba[:, 1]))
print ("Precision:", precision_score(test_y, y_pred3))
print ("Recall:", recall_score(test_y, y_pred3))
print ("F1 Score:", f1_score(test_y, y_pred3))

### Naive Bayes

#### Naive Bayes - original:

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [None]:
# Multinomial Naive Bayes fitted on the original training split and scored on
# the held-out test split.
clf = MultinomialNB()
clf.fit(train_x, train_y)
predicted = clf.predict(test_x)

consufion_plot(test_y, predicted)
nb_accuracy = metrics.accuracy_score(test_y, predicted)
print("MultinomialNB Accuracy:", nb_accuracy)

#### Naive Bayes - downsampled 1:

In [None]:
# Multinomial Naive Bayes fitted on the 1:1 downsampled training set and
# scored on the held-out test split.
clf = MultinomialNB()
clf.fit(train_xd, train_yd)
predicted = clf.predict(test_x)

consufion_plot(test_y, predicted)
nb_accuracy = metrics.accuracy_score(test_y, predicted)
print("MultinomialNB Accuracy:", nb_accuracy)

#### Naive Bayes - downsampled 2:

In [None]:
# Multinomial Naive Bayes fitted on the second downsampled training set and
# scored on the held-out test split.
clf = MultinomialNB()
clf.fit(train_xd2, train_yd2)
predicted = clf.predict(test_x)

consufion_plot(test_y, predicted)
nb_accuracy = metrics.accuracy_score(test_y, predicted)
print("MultinomialNB Accuracy:", nb_accuracy)

#### Naive Bayes - upsampled:

In [None]:
# Multinomial Naive Bayes fitted on the upsampled training set and scored on
# the held-out test split.
clf = MultinomialNB()
clf.fit(train_xu, train_yu)
predicted = clf.predict(test_x)

consufion_plot(test_y, predicted)
nb_accuracy = metrics.accuracy_score(test_y, predicted)
print("MultinomialNB Accuracy:", nb_accuracy)

### XGBoost

In [None]:
import xgboost
from xgboost import XGBClassifier

#### XGBoost - original:

In [None]:
# XGBoost on the original (imbalanced) training split.
xgb = XGBClassifier()

xgb = xgb.fit(train_x,train_y)

#Predict the response for test dataset
y_pred3 = xgb.predict(test_x)
y_pred_proba = xgb.predict_proba(test_x)

consufion_plot(test_y, y_pred3)
print("Accuracy:",metrics.accuracy_score(test_y, y_pred3))
print ("AUC Score:", roc_auc_score(test_y, y_pred3))
# Use THIS model's probability of label 1. The original scored `y_pred_prob`,
# a leftover from the last decision-tree cell, so it reported the wrong AUC.
print ("AUC Score prob:", roc_auc_score(test_y, y_pred_proba[:, 1]))
print ("Precision:", precision_score(test_y, y_pred3))
print ("Recall:", recall_score(test_y, y_pred3))
print ("F1 Score:", f1_score(test_y, y_pred3))

#### XGBoost - downsampled 1:

In [None]:
# XGBoost on the 1:1 downsampled training set.
xgb = XGBClassifier()

xgb = xgb.fit(train_xd,train_yd)

#Predict the response for test dataset
y_pred3 = xgb.predict(test_x)
y_pred_proba = xgb.predict_proba(test_x)

consufion_plot(test_y, y_pred3)
print("Accuracy:",metrics.accuracy_score(test_y, y_pred3))
print ("AUC Score:", roc_auc_score(test_y, y_pred3))
# Use THIS model's probability of label 1. The original scored `y_pred_prob`,
# a leftover from the last decision-tree cell, so it reported the wrong AUC.
print ("AUC Score prob:", roc_auc_score(test_y, y_pred_proba[:, 1]))
print ("Precision:", precision_score(test_y, y_pred3))
print ("Recall:", recall_score(test_y, y_pred3))
print ("F1 Score:", f1_score(test_y, y_pred3))

#### XGBoost - downsampled 2:

In [None]:
# XGBoost on the second downsampled training set.
xgb = XGBClassifier()

xgb = xgb.fit(train_xd2,train_yd2)

#Predict the response for test dataset
y_pred3 = xgb.predict(test_x)
y_pred_proba = xgb.predict_proba(test_x)

consufion_plot(test_y, y_pred3)
print("Accuracy:",metrics.accuracy_score(test_y, y_pred3))
print ("AUC Score:", roc_auc_score(test_y, y_pred3))
# Use THIS model's probability of label 1. The original scored `y_pred_prob`,
# a leftover from the last decision-tree cell, so it reported the wrong AUC.
print ("AUC Score prob:", roc_auc_score(test_y, y_pred_proba[:, 1]))
print ("Precision:", precision_score(test_y, y_pred3))
print ("Recall:", recall_score(test_y, y_pred3))
print ("F1 Score:", f1_score(test_y, y_pred3))

#### XGBoost - upsampled:

In [None]:
# XGBoost on the upsampled training set.
xgb = XGBClassifier()

xgb = xgb.fit(train_xu,train_yu)

#Predict the response for test dataset
y_pred3 = xgb.predict(test_x)
y_pred_proba = xgb.predict_proba(test_x)

consufion_plot(test_y, y_pred3)
print("Accuracy:",metrics.accuracy_score(test_y, y_pred3))
print ("AUC Score:", roc_auc_score(test_y, y_pred3))
# Use THIS model's probability of label 1. The original scored `y_pred_prob`,
# a leftover from the last decision-tree cell, so it reported the wrong AUC.
print ("AUC Score prob:", roc_auc_score(test_y, y_pred_proba[:, 1]))
print ("Precision:", precision_score(test_y, y_pred3))
print ("Recall:", recall_score(test_y, y_pred3))
print ("F1 Score:", f1_score(test_y, y_pred3))

### MODEL Trials with Text Area:

#### Bag of Words Model:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

# Tokenizer that keeps runs of ASCII letters and digits only, i.e. it drops
# punctuation and other symbols (digits are kept).
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

# Lower-cased bigram counts with English stop words removed.
cv = CountVectorizer(
    tokenizer=token.tokenize,
    lowercase=True,
    stop_words='english',
    ngram_range=(2, 2),
)
text_counts = cv.fit_transform(data_1['review_corrected'])

In [None]:
# 70 / 30 hold-out split of the bigram count matrix, same seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    text_counts,
    data_1['label'],
    test_size=0.3,
    random_state=123,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the bag-of-words (bigram count) features.
clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

print(confusion_matrix(y_test, predicted))
bow_accuracy = metrics.accuracy_score(y_test, predicted)
print("MultinomialNB Accuracy:", bow_accuracy)

#### TFIDF Model:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights with the vectorizer's default settings.
tf = TfidfVectorizer()
corrected_reviews = data_1['review_corrected']
text_tf = tf.fit_transform(corrected_reviews)

In [None]:
# 70 / 30 hold-out split of the TF-IDF matrix, same seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    text_tf,
    data_1['label'],
    test_size=0.3,
    random_state=123,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Multinomial Naive Bayes on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

consufion_plot(y_test, predicted)
tfidf_accuracy = metrics.accuracy_score(y_test, predicted)
print("MultinomialNB Accuracy:", tfidf_accuracy)

### TFIDF combined with our features:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Trigram TF-IDF, capped at 15000 features.
tf = TfidfVectorizer(ngram_range=(3,3),max_features= 15000)

# The original referenced an undefined frame (`data1`); the data lives in
# `data_1`. Only the rows that survived into final_data are vectorised: the
# next cell stacks these text features next to train_x / test_x (which come
# from final_data), so both must cover exactly the same rows. With the same
# number of rows, test_size and random_state, train_test_split reproduces the
# same row assignment as the feature split.
aligned_rows = data_1.loc[final_data.index]
text_tf = tf.fit_transform(aligned_rows['review_corrected'])

X_train, X_test, y_train, y_test = train_test_split(text_tf, aligned_rows['label'], test_size=0.3, random_state=123)

In [None]:
# Place the engineered numeric features to the right of the sparse text
# features; the result is kept in CSR format.
train_blocks = (X_train, train_x.values)
test_blocks = (X_test, test_x.values)

final_train = sp.sparse.hstack(train_blocks, format='csr')
final_test = sp.sparse.hstack(test_blocks, format='csr')

In [None]:
# Multinomial Naive Bayes on the combined (text + engineered) feature matrix.
clf = MultinomialNB()
clf.fit(final_train, y_train)
predicted = clf.predict(final_test)

consufion_plot(y_test, predicted)
combined_accuracy = metrics.accuracy_score(y_test, predicted)
print("MultinomialNB Accuracy:", combined_accuracy)