# **Fake Reviews - Modeling**
---
## **Import Libraries**

In [None]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import balanced_accuracy_score, auc, roc_auc_score, roc_curve, cohen_kappa_score

#algorithms
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from catboost import CatBoostClassifier
from sklearn.linear_model import RidgeClassifier, PassiveAggressiveClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif, SelectPercentile
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier

import time
import warnings
# NOTE: this silences every Python warning for the rest of the session,
# so any warning emitted by the libraries used below will not be shown.
warnings.filterwarnings('ignore')

# shared cross-validation splitter: 10 stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# set this to True to run hyperparameter tuning for each model
run_hyperparameter_tuning=False

# DataFrame to store the results of each model
# (one row per model, filled in by train_model and indexed by the model name)
results = pd.DataFrame(columns=['accuracy', 'balanced-accuracy', 'roc-auc', 'precision', 'recall', 'f1', 'kappa', 'time', 'model'])

In [None]:
# function to build the per-cell annotation text of a 2x2 confusion matrix
def format_confusion_matrix(conf):
    """Return a 2x2 array of annotation strings for a binary confusion matrix.

    Each cell holds three lines: the cell name (True Neg, False Pos,
    False Neg, True Pos in row-major order), the raw count, and the
    cell's share of all samples formatted as a percentage.

    Parameters
    ----------
    conf : numpy.ndarray
        2x2 confusion matrix of counts.

    Returns
    -------
    numpy.ndarray
        2x2 array of strings, aligned cell-for-cell with ``conf``.
    """
    cell_names = ('True Neg', 'False Pos', 'False Neg', 'True Pos')
    counts = conf.flatten()
    shares = counts / np.sum(conf)

    annotations = []
    for name, count, share in zip(cell_names, counts, shares):
        annotations.append(f"{name}\n{count:0.0f}\n{share:.2%}")

    return np.asarray(annotations).reshape(2, 2)

# function to draw the annotated confusion-matrix heatmap for one model
def create_conf_heatmap(conf, acc, cross, report, auc, kappa, title):
    """Plot a confusion matrix as a seaborn heatmap and return its Axes.

    The title shows the model name together with accuracy, the
    cross-validation score, AUC and kappa; the x-axis label carries a small
    precision / recall / f1 table read from ``report``.

    Parameters
    ----------
    conf : numpy.ndarray
        2x2 confusion matrix.
    acc, cross, auc, kappa : float
        Scalar metrics printed in the title with four decimals.
    report : dict
        Nested dict indexed as ``report[label][metric]`` for the labels
        ``"0"``, ``"1"`` and ``"weighted avg"``.
    title : str
        Model name used as the title prefix.

    Returns
    -------
    The Axes object returned by ``sns.heatmap``.
    """
    plt.rcParams.update({'font.size': 10})
    ax = sns.heatmap(conf, annot=format_confusion_matrix(conf), fmt='', cmap='Blues')

    heading_template = '{0} Confusion Matrix\n\naccuracy: {1:0.4f};\ncross-validation (k=10): {2:0.4f};\nauc: {3:0.4f};\nkappa: {4:0.4f}'
    ax.set_title(heading_template.format(title, acc, cross, auc, kappa), fontsize=10)

    # Flatten the report into the nine numbers of the metrics table,
    # row by row: class 0, class 1, weighted average.
    table_values = [
        report[row][metric]
        for row in ("0", "1", "weighted avg")
        for metric in ("precision", "recall", "f1-score")
    ]
    footer_template = '\nPredicted Values\n\nMetrics:\nprecision    recall    f1-score\n0 - {:0.4f}   {:0.4f}   {:0.4f}\n1 - {:0.4f}   {:0.4f}   {:0.4f}\navg - {:0.4f}   {:0.4f}   {:0.4f}'
    ax.set_xlabel(footer_template.format(*table_values))

    ax.set_ylabel('Actual Values ')
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_ticklabels(['Deceptive','Truthful'])
    return ax

# helper: continuous positive-class scores used for the ROC curve
def _positive_class_scores(model, features):
    """Return one continuous score per row for the positive class, or None.

    ``predict_proba`` is preferred (second column); models that only expose
    ``decision_function`` are scored with that instead. ``None`` is returned
    when the model offers neither.
    """
    for method_name in ('predict_proba', 'decision_function'):
        # getattr with a default also covers estimators whose method is
        # conditionally unavailable and raises AttributeError on access.
        method = getattr(model, method_name, None)
        if method is None:
            continue
        try:
            scores = np.asarray(method(features))
        except (AttributeError, NotImplementedError):
            continue
        # predict_proba yields one column per class; decision_function is 1-D
        return scores[:, 1] if scores.ndim == 2 else scores
    return None


# function to train and evaluate a model
def train_model(base_model, model_name):
    """Fit a model, evaluate it on the hold-out split and record the metrics.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The function shows a confusion-matrix heatmap and writes one row into the
    module-level ``results`` DataFrame under the index ``model_name``.

    Parameters
    ----------
    base_model : estimator
        Object exposing ``fit`` and ``predict``. It is fitted in place.
    model_name : str
        Label used in the plot title and as the row index in ``results``.

    Returns
    -------
    None

    Notes
    -----
    ROC-AUC is computed from ``predict_proba`` when available, otherwise from
    ``decision_function``. If the model exposes neither, NaN is recorded
    rather than 0, because 0 would read as a perfectly inverted ranking.
    """
    # time only the fit on the training split
    start_time = time.perf_counter()
    model = base_model.fit(X_train, y_train)
    model_time = time.perf_counter() - start_time

    predict = model.predict(X_test)
    conf = confusion_matrix(y_test, predict)
    acc = accuracy_score(y_test, predict)
    bacc = balanced_accuracy_score(y_test, predict)
    kappa = cohen_kappa_score(y_test, predict)
    # 10-fold stratified cross-validated accuracy on the training split only
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    cross = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1).mean()
    report = classification_report(y_test, predict, output_dict=True)

    #ROC - AUC
    scores = _positive_class_scores(model, X_test)
    if scores is None:
        roc_auc = np.nan
    else:
        fpr, tpr, _ = roc_curve(y_test, scores)
        roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(3, 3))
    create_conf_heatmap(conf, acc, cross, report, roc_auc, kappa, title = model_name)
    plt.show()

    # precision / recall / f1 are the values reported for class "0"
    new_row = [acc, bacc, roc_auc, report["0"]["precision"], report["0"]["recall"], report["0"]["f1-score"], kappa, model_time, base_model]
    # accuracy, balanced accuracy, precision, recall and f1 are stored as
    # percentages; roc-auc and kappa stay on their native scale
    new_row[:2] = [round(value * 100, 2) for value in new_row[:2]]
    new_row[3:6] = [round(value * 100, 2) for value in new_row[3:6]]
    # round every numeric entry (everything except the estimator object)
    new_row[:8] = [round(value, 2) for value in new_row[:8]]
    results.loc[model_name] = new_row

---
## **Load Dataset**

In [None]:
# load the dataset
# NOTE(review): unpickling executes arbitrary code from the file; only load
# 't10.pkl' if it comes from a trusted source. The path is relative to the
# current working directory.
df = pd.read_pickle('t10.pkl')

# split the dataset into features and target
# ('class' is the target column; every other column is used as a feature)
X = df.drop(['class'], axis=1)
y = df['class']

# split the dataset into training and testing sets
# (80/20 split, stratified on the target, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7, stratify=y)

### Logistic Regression

In [None]:
# Logistic regression with a fixed C and the liblinear solver
# (presumably the C value comes from the grid search in the next cell — confirm).
lr = LogisticRegression(C=0.20565123083486536, solver='liblinear')
train_model(lr, 'Logistic Regression')

In [None]:
if run_hyperparameter_tuning:
    # Grid over the inverse regularisation strength C (500 log-spaced values
    # between 1e-4 and 1e4); the solver is pinned to liblinear.
    param_grid = [    
        {#'penalty' : ['l1', 'l2'],
        'C': np.logspace(-4, 4, 500),
        #'solver' : ['lbfgs','newton-cg','liblinear','sag','saga', 'newton-cholesky'],
        'solver' : ['liblinear'],
        #'max_iter' : [100, 1000, 2500, 5000, 10000]
        }
    ]

    # Use the shared 10-fold stratified splitter so the tuning score is
    # comparable with the cross-validation figure reported by train_model
    # and with the other grid searches that already pass cv=cv.
    clf = GridSearchCV(LogisticRegression(), param_grid = param_grid, scoring='accuracy', cv=cv, verbose=True, n_jobs=-1)
    clf.fit(X_train, y_train)
    print(clf.best_estimator_)

### MLP

In [None]:
# Multi-layer perceptron: three hidden layers (50, 100, 50), tanh activation,
# SGD solver with an adaptive learning rate.
# NOTE(review): no random_state is set, so results may vary between runs.
mlp = MLPClassifier(activation='tanh', hidden_layer_sizes=(50, 100, 50),
              learning_rate='adaptive', solver='sgd')
train_model(mlp, 'MLP')

In [None]:
if run_hyperparameter_tuning:
    # Candidate architectures, activations, solvers, L2 penalties (alpha)
    # and learning-rate schedules for the MLP.
    parameter_space = {
        'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['constant','adaptive'],
    }

    # Use the shared 10-fold stratified splitter so the tuning score is
    # comparable with the cross-validation figure reported by train_model
    # and with the other grid searches that already pass cv=cv.
    clf = GridSearchCV(MLPClassifier(), parameter_space, scoring='accuracy', cv=cv, n_jobs=-1, verbose=True)
    clf.fit(X_train, y_train)

    # Best parameter set
    print('Best parameters found:\n', clf.best_estimator_)

### Decision Tree

In [None]:
# Decision tree using the entropy criterion, depth capped at 75 and at least
# 21 samples per leaf.
# NOTE(review): no random_state is set, so results may vary between runs.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=75, min_samples_leaf=21)
train_model(dt, 'Decision Tree')

In [None]:
if run_hyperparameter_tuning:
    # Depth and leaf-size candidates run from 2 to 97 in steps of 5,
    # crossed with both split criteria.
    params = {
        'max_depth': np.arange(2, 100, 5),
        'min_samples_leaf': np.arange(2, 100, 5),
        'criterion': ["gini", "entropy"]
    }

    # Use the shared 10-fold stratified splitter so the tuning score is
    # comparable with the cross-validation figure reported by train_model
    # and with the other grid searches that already pass cv=cv.
    clf = GridSearchCV(estimator=DecisionTreeClassifier(), 
                               param_grid=params, 
                               cv=cv,
                               n_jobs=-1, verbose=1, scoring="accuracy")

    clf.fit(X_train, y_train)
    print(clf.best_estimator_)

### Naive Bayes

In [None]:
# Gaussian naive Bayes with library defaults (no tuning cell for this model).
nb = GaussianNB()
train_model(nb, 'Naive Bayes')

### SVM

In [None]:
# Degree-2 polynomial-kernel SVM. probability=True enables predict_proba,
# which train_model uses for ROC-AUC and which soft voting needs.
svm = SVC(C=100, degree=2, gamma=0.03, kernel='poly', probability=True)
train_model(svm, 'SVM')

In [None]:
# Linear SVM with C=0.1. LinearSVC has no predict_proba, so train_model's
# probability-based ROC-AUC path does not apply to this model.
linsvm = LinearSVC(C=0.1)
train_model(linsvm, 'Linear SVM')

In [None]:
if run_hyperparameter_tuning:
    # base classifier for the search; kernel, C, degree and gamma come from
    # the grid below, and probability=True matches the final SVM model
    svc = SVC(probability=True) 

    # declare parameters for hyperparameter tuning
    parameters = [ #{'C':[1, 10, 100, 1000], 'kernel':['linear']},
                   #{'C':[1, 10, 100, 1000], 'kernel':['rbf'], 'gamma':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
                   {'C':[1, 10, 100, 1000], 'kernel':['poly'],
                    'degree': [2,3,4,5],
                    'gamma':[0.01,0.02,0.03,0.04,0.05,0.06,0.07]} 
                  ]

    grid_search = GridSearchCV(estimator = svc,  
                               param_grid = parameters,
                               scoring = 'accuracy',
                               cv = cv,
                               n_jobs = -1,
                               verbose=1)

    # Tune on the training split only: fitting on the full X / y would let
    # the hold-out test rows influence the chosen hyperparameters and
    # inflate the test metrics reported afterwards.
    grid_search.fit(X_train, y_train)
    print(grid_search.best_estimator_)

### KNN

In [None]:
# k-nearest neighbours: 30 neighbours, Manhattan distance, votes weighted by
# inverse distance.
knn = KNeighborsClassifier(metric='manhattan', n_neighbors=30, weights='distance')
train_model(knn, 'KNN')

In [None]:
if run_hyperparameter_tuning:
    # Grid over neighbour count (1-49), vote weighting, search algorithm and
    # distance metric.
    # NOTE(review): 'algorithm' presumably only changes how neighbours are
    # looked up, not which ones are found, so it may multiply the grid
    # without changing scores — confirm before trimming.
    grid_params = { 'n_neighbors' : list(range(1, 50, 1)),
                    'weights' : ['uniform','distance'],
                    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                    'metric' : ['minkowski','euclidean','manhattan']
                  }

    # exhaustive search scored by accuracy on the shared 10-fold splitter
    clf = GridSearchCV(KNeighborsClassifier(), grid_params, verbose=1, cv=cv, scoring='accuracy', n_jobs=-1)
    clf.fit(X_train, y_train)
    print(clf.best_estimator_)

### Passive Aggressive Classifier

In [None]:
# Passive-aggressive linear classifier with a fixed seed; at most 1000
# iterations, tolerance 1e-3 (no tuning cell for this model).
pa = PassiveAggressiveClassifier(max_iter=1000, random_state=7, tol=1e-3)
train_model(pa, 'Passive Aggressive')

### Random Forest

In [None]:
# Random forest: 300 trees, depth capped at 12, log2(n_features) candidates
# per split.
# NOTE(review): no random_state is set, so results may vary between runs.
rf = RandomForestClassifier(max_depth=12, max_features='log2', n_estimators=300)
train_model(rf, 'Random Forest')

In [None]:
if run_hyperparameter_tuning:
    # Grid over forest size and maximum tree depth (1-29).
    grid_params = { 'n_estimators' : [100, 200, 300, 400, 500],
                   #'criterion' : ['gini','entropy', 'log_loss'],
                   'max_depth': list(range(1, 30, 1)),
                   #'max_features' : ['sqrt','log2'],
                   #'sampling_strategy' : ['majority','not minority','not majority','all'],
                  }

    # Use the shared 10-fold stratified splitter so the tuning score is
    # comparable with the cross-validation figure reported by train_model
    # and with the other grid searches that already pass cv=cv.
    clf = GridSearchCV(RandomForestClassifier(), grid_params, verbose=1, scoring='accuracy', cv=cv, n_jobs=-1)
    clf.fit(X_train, y_train)
    print(clf.best_estimator_)

### Gradient Boosting

In [None]:
# Gradient boosting with trees of depth 4; all other settings are defaults.
# NOTE(review): no random_state is set, so results may vary between runs.
gb = GradientBoostingClassifier(max_depth=4)
train_model(gb, 'Gradient Boosting')

In [None]:
if run_hyperparameter_tuning:
    # Grid over number of boosting stages, leaf count, learning rate and
    # tree depth (1, 3, 5, 7, 9).
    params = {
        "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 250, 500],
        "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
        "learning_rate": [0.001, 0.01, 0.1, 0.5, 1, 2, 3, 10],
        "max_depth": np.arange(1, 10, 2).tolist(),
    }

    # Use the shared 10-fold stratified splitter so the tuning score is
    # comparable with the cross-validation figure reported by train_model
    # and with the other grid searches that already pass cv=cv.
    search_cv = GridSearchCV(GradientBoostingClassifier(), param_grid=params, scoring="accuracy", cv=cv, n_jobs=-1)
    search_cv.fit(X_train, y_train)
    print(search_cv.best_estimator_)

### XGBoost

In [None]:
# XGBoost classifier with every hyperparameter spelled out (looks like a
# pasted best_estimator_ repr from the search in the next cell — confirm).
# This `xgb` object is reused later by the voting ensembles.
# NOTE(review): gpu_id, predictor and nthread may be deprecated or ignored in
# newer XGBoost releases — confirm against the installed version.
xgb = XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=1.5, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.5, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=5,
              missing=np.nan, monotone_constraints='()', n_estimators=200,
              n_jobs=-1, nthread=-1, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0)
train_model(xgb, 'XGBoost')

In [None]:
if run_hyperparameter_tuning:
    # A parameter grid for XGBoost
    params = {
            'min_child_weight': [1, 5, 10, 15, 20, 25, 30],
            'gamma': [0.1, 0.5, 1, 1.5, 2, 5, 10],
            'subsample': [0.4, 0.6, 0.8, 1.0],
            'colsample_bytree': [0.4, 0.6, 0.8, 1.0],
            'max_depth': np.arange(1, 10, 2).tolist(),
            # start at 50: a candidate with zero boosting rounds trains no
            # trees and only wastes search iterations
            'n_estimators': np.arange(50, 500, 50).tolist(),
            'learning_rate': [0.001, 0.01, 0.1, 0.5, 1, 2, 3, 10],
            }

    # Use a dedicated name for the search's base estimator. Rebinding `xgb`
    # here would replace the tuned model that the voting ensembles pick up
    # later with an untuned classifier.
    xgb_search_base = XGBClassifier(objective='binary:logistic', nthread=-1)
    #random_search = GridSearchCV(xgb_search_base, param_grid=params, scoring='accuracy', n_jobs=-1, cv=cv, verbose=1)
    random_search = RandomizedSearchCV(xgb_search_base, params, scoring='accuracy', n_jobs=-1, cv=cv, n_iter=200, verbose=1)

    # Sample 200 random combinations, scored by accuracy on the shared
    # 10-fold splitter, using the training split only.
    random_search.fit(X_train, y_train)
    print(random_search.best_estimator_)

### AdaBoost

In [None]:
# AdaBoost with library defaults.
# NOTE(review): no random_state is set, so results may vary between runs.
ada = AdaBoostClassifier()
train_model(ada, 'AdaBoost')

In [None]:
if run_hyperparameter_tuning:
    # Grid over the number of boosting rounds and the learning rate.
    params = {
        "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 250, 500],
        "learning_rate": [0.001, 0.01, 0.1, 0.5, 1, 2, 3, 10],
    }

    # exhaustive search scored by accuracy on the shared 10-fold splitter
    search_cv = GridSearchCV(AdaBoostClassifier(), param_grid=params, scoring="accuracy", cv=cv, n_jobs=-1)
    search_cv.fit(X_train, y_train)
    print(search_cv.best_estimator_)

### LightGBM

In [None]:
# LightGBM classifier: 400 gradient-boosted trees with 31 leaves each, each
# tree built on 20% of the columns (colsample_bytree=0.2); verbose=-1 keeps
# training output quiet.
# NOTE(review): random_state=None, so results may vary between runs.
lgb = LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.2,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=400, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, verbose=-1,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
train_model(lgb, 'LightGBM')

### CatBoost

In [None]:
# CatBoost classifier with depth-6 trees and silent training output.
# NOTE(review): no random seed is passed, so results may vary between runs.
cb = CatBoostClassifier(verbose=False, depth=6, learning_rate=0.1, rsm=0.5, l2_leaf_reg=10, min_data_in_leaf=20,
                       random_strength=0.175)
train_model(cb, 'CatBoost')

### Hard Voting

In [None]:
# Base learners for the voting ensembles, as (name, estimator) pairs.
# The commented-out entries are models left out of the ensemble.
# This `estimator` list is reused unchanged by the soft-voting cell.
estimator = []
estimator.append(('CB', cb))
estimator.append(('LGB', lgb))
#estimator.append(('RF', rf))
#estimator.append(('GB', gb))
estimator.append(('XGB', xgb))
#estimator.append(('ADA', ada))
#estimator.append(('PA', pa))
estimator.append(('KNN', knn))
estimator.append(('SVM', svm))
#estimator.append(('LinSVM', linsvm))
#estimator.append(('DT', dt))
estimator.append(('MLP', mlp))
#estimator.append(('NB', nb))
estimator.append(('LR', lr))

# hard voting: the ensemble predicts the majority class label of its members
hv = VotingClassifier(estimators = estimator, voting ='hard')
train_model(hv, 'Hard Voting')

### Soft Voting

In [None]:
# soft voting: averages the members' predicted class probabilities; uses the
# same `estimator` list that was built for the hard-voting ensemble
sv = VotingClassifier(estimators = estimator, voting ='soft')
train_model(sv, 'Soft Voting')

### Stacking

In [None]:
# Base learners for the stacking ensemble, as (name, estimator) pairs.
# The commented-out entries are models left out of the stack.
estimator_s = []
#estimator_s.append(('CB', cb))
#estimator_s.append(('LGB', lgb))
#estimator_s.append(('RF', rf))
#estimator_s.append(('GB', gb))
#estimator_s.append(('XGB', xgb))
#estimator_s.append(('ADA', ada))
#estimator_s.append(('PA', pa))
estimator_s.append(('KNN', knn))
#estimator_s.append(('SVM', svm))
#estimator_s.append(('LinSVM', linsvm))
estimator_s.append(('DT', dt))
#estimator_s.append(('MLP', mlp))
estimator_s.append(('NB', nb))
estimator_s.append(('LR', lr))

# The logistic-regression model is both a base learner and the final
# (meta) estimator. passthrough=True feeds the original features to the
# final estimator alongside the base learners' predictions, and the shared
# 10-fold splitter produces the out-of-fold predictions used to train it.
st = StackingClassifier(estimators=estimator_s,
                        final_estimator=lr,
                        cv=cv,
                        n_jobs=-1,
                        passthrough=True,
                        verbose=1)

train_model(st, 'Stacking')

## **Evaluation**

In [None]:
# display the collected metrics table (one row per model trained above)
results