### Imports

In [None]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier

from sklearn import feature_extraction, linear_model, model_selection, preprocessing

# from nltk.tokenize import WhitespaceTokenizer

## Important functions

In [None]:
# Function to evaluate: accuracy, precision, recall, f1-score

def calculate_results(y_true, y_pred):
    """
    Scores a classifier's predictions against the ground-truth labels.

    Args:
    -----
    y_true = true labels in the form of a 1D array
    y_pred = predicted labels in the form of a 1D array

    Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
    Accuracy is scaled to a percentage (multiplied by 100); precision, recall and
    f1 are the "weighted" averages exactly as scikit-learn returns them.
    """
    accuracy_pct = accuracy_score(y_true, y_pred) * 100
    # The fourth element of the tuple (support) is not used
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    return {
        "accuracy": accuracy_pct,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


# Helper that prints a side-by-side comparison of two result dictionaries
def compare_baseline_to_new_results(baseline_results, new_model_results):
    """
    Prints, for every metric in the baseline, the baseline value, the new
    model's value and the difference (new minus baseline), to two decimals.

    Args:
    -----
    baseline_results = dictionary mapping metric name -> numeric score
    new_model_results = dictionary holding (at least) the same metric names
    """
    for metric in baseline_results:
        old_score = baseline_results[metric]
        new_score = new_model_results[metric]
        delta = new_score - old_score
        print(f"Baseline {metric}: {old_score:.2f}, New {metric}: {new_score:.2f}, Difference: {delta:.2f}")

### Function for plotting confusion matrix

In [None]:
# Set the font scale
sns.set(font_scale = 1.5)

def plot_conf_mat(conf_mat):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Args:
    -----
    conf_mat = 2D array of integer counts (cells are annotated with the "d"
               integer format), e.g. the output of confusion_matrix()

    Draws on a newly created 4x4 inch figure and returns nothing.
    """
    fig, ax = plt.subplots(figsize=(4, 4))
    # Draw and label on the axes created above explicitly instead of relying
    # on pyplot's implicit "current axes" state
    sns.heatmap(conf_mat,
                fmt="d",
                annot=True, # Annotate the boxes
                cbar=False,
                ax=ax)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

### Function for plotting ROC curve

In [None]:
def plot_roc_curve(fpr, tpr):
    """
    Draws a classifier's ROC curve together with the chance diagonal.

    Args:
    -----
    fpr = false positive rates (x values of the curve)
    tpr = true positive rates (y values of the curve)

    Shows the figure with plt.show() and returns nothing.
    """
    diagonal = [0, 1]
    # Classifier curve first, then the no-skill reference line
    plt.plot(fpr, tpr, label='ROC', color='orange')
    plt.plot(diagonal, diagonal, label='Guessing', color='darkblue', linestyle='--')
    # Title, axis labels and legend
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

### Getting our data

In [None]:
# Load the labelled dataset; later cells read its "content" and "label" columns
dataset = pd.read_csv("../input/banfake-fractioned/labelled_combined.csv")

In [None]:
# 80/20 split: draw the training rows at random (seeded), keep every remaining
# row for testing, and put both frames back into index order
train_df = dataset.sample(frac=0.8, random_state=42).sort_index(axis=0)
test_df = dataset.drop(train_df.index).sort_index(axis=0)

print(f"No. of training examples: {len(train_df)}")
print(f"No. of testing examples: {len(test_df)}")

In [None]:
train_df

In [None]:
# Target vectors for the two splits, taken from the "label" column
y_train = train_df["label"]
y_test = test_df["label"]

In [None]:
y_train

In [None]:
y_test

## Machine Learning models

### Initializing

#### CountVectorizer

In [None]:
# Bag-of-words counts: the vocabulary is learned from the training text only
# and the test text is encoded with that same vocabulary
count_vectorizer = CountVectorizer()
train_count = count_vectorizer.fit_transform(train_df["content"])
test_count = count_vectorizer.transform(test_df["content"])

#### TfidfVectorizer

In [None]:
# tfid_transformer= feature_extraction.text.TfidfTransformer(analyzer = 'char')
# train_tfidf = tfid_transformer.fit_transform(train_count)
# test_tfidf = tfid_transformer.fit_transform(test_count)

In [None]:
# TF-IDF features over word unigrams and bigrams; fitted on the training text
# only and then applied unchanged to the test text
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1, 2), analyzer = 'word')
train_tfidf = tfidf_vectorizer.fit_transform(train_df["content"])
test_tfidf = tfidf_vectorizer.transform(test_df["content"])

### Logistic Regression

#### Logistic Regression with CountVectorizer

In [None]:
# Logistic regression on the CountVectorizer features, with balanced class
# weights and a fixed seed.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases --
# confirm against the installed version before upgrading.
count_LR = LogisticRegression(C = 30.0, class_weight = 'balanced',
                              solver = 'newton-cg', multi_class = 'multinomial',
                              n_jobs = -1, random_state = 42)

count_LR.fit(train_count, y_train)

# y_predicted_count is reassigned by the later CountVectorizer model cells
y_predicted_count = count_LR.predict(test_count)

In [None]:
# Score the count-feature logistic regression against the test labels
count_LR_results = calculate_results(y_test, y_predicted_count)

print("Results for Logistic Regression with CountVectorizer")
count_LR_results

In [None]:
confusion_matrix(y_test, y_predicted_count)

#### Logistic Regression with TfidfVectorizer

In [None]:
# Same logistic regression configuration as the CountVectorizer variant, but
# trained on the TF-IDF features.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases --
# confirm against the installed version before upgrading.
tfidf_LR = LogisticRegression(C = 30.0, class_weight = 'balanced',
                              solver = 'newton-cg', multi_class = 'multinomial', 
                              n_jobs = -1, random_state = 42)

tfidf_LR.fit(train_tfidf, y_train)

# Hard class predictions for the test split
LR_predicted_tfidf = tfidf_LR.predict(test_tfidf)

In [None]:
# Score the TF-IDF logistic regression against the test labels
tfidf_LR_results = calculate_results(y_test, LR_predicted_tfidf)

print("Results for Logistic Regression with TfidfVectorizer")
tfidf_LR_results

In [None]:
confusion_matrix(y_test, LR_predicted_tfidf)

In [None]:
# Build the confusion matrix of the TF-IDF logistic regression on the test
# split and draw it as an annotated heatmap
conf_mat = confusion_matrix(y_test, LR_predicted_tfidf)
plot_conf_mat(conf_mat)

In [None]:
# Predicted probability of the positive class (second predict_proba column)
y_probs_LR = tfidf_LR.predict_proba(test_tfidf)[:, 1]

# False/true positive rates and the thresholds they were computed at
fpr, tpr, thresholds = roc_curve(y_test, y_probs_LR)

plot_roc_curve(fpr, tpr)

In [None]:
roc_auc_score(y_test, y_probs_LR)

In [None]:
print(classification_report(y_test, LR_predicted_tfidf))

### Support Vector Machine

#### SVM with CountVectorizer

In [None]:
# RBF-kernel SVM on the CountVectorizer features (fit() returns the estimator)
count_SVM = svm.SVC(C=5, gamma=1, kernel='rbf').fit(train_count, y_train)

y_predicted_count = count_SVM.predict(test_count)

In [None]:
# Score the count-feature SVM against the test labels
count_SVM_results = calculate_results(y_test, y_predicted_count)

print("Results for Support Vector Machine with CountVectorizer")
count_SVM_results

#### SVM with TfidfVectorizer

In [None]:
# RBF-kernel SVM on the TF-IDF features (fit() returns the estimator)
tfidf_SVM = svm.SVC(C=5, gamma=1, kernel='rbf').fit(train_tfidf, y_train)

y_predicted_tfidf = tfidf_SVM.predict(test_tfidf)

In [None]:
# Score the TF-IDF SVM against the test labels
tfidf_SVM_results = calculate_results(y_test, y_predicted_tfidf)

print("Results for Support Vector Machine with TfidfVectorizer")
tfidf_SVM_results

### Multinomial Naive Bayes

#### MNB with CountVectorizer

In [None]:
# Multinomial naive Bayes with default settings on the CountVectorizer features
count_MNB = MultinomialNB().fit(train_count, y_train)

y_predicted_count = count_MNB.predict(test_count)

In [None]:
# Score the count-feature multinomial naive Bayes against the test labels
count_MNB_results = calculate_results(y_test, y_predicted_count)

print("Results for MNB with CountVectorizer")
count_MNB_results

### Multinomial Naive Bayes

#### MNB with TfidfVectorizer

In [None]:
# Multinomial naive Bayes with default settings on the TF-IDF features
tfidf_MNB = MultinomialNB().fit(train_tfidf, y_train)

y_predicted_tfidf = tfidf_MNB.predict(test_tfidf)

In [None]:
# Score the TF-IDF multinomial naive Bayes against the test labels
tfidf_MNB_results = calculate_results(y_test, y_predicted_tfidf)

print("Results for MNB with tfidfVectorizer")
tfidf_MNB_results

### Bernoulli Naive Bayes

#### BNB with CountVectorizer

In [None]:
# Bernoulli naive Bayes with default settings on the CountVectorizer features
count_BNB = BernoulliNB().fit(train_count, y_train)

y_predicted_count = count_BNB.predict(test_count)

In [None]:
# Score the count-feature Bernoulli naive Bayes against the test labels
count_BNB_results = calculate_results(y_test, y_predicted_count)

print("Results for BNB with CountVectorizer")
count_BNB_results

#### BNB with TfidfVectorizer

In [None]:
# Bernoulli naive Bayes with default settings on the TF-IDF features
tfidf_BNB = BernoulliNB().fit(train_tfidf, y_train)

y_predicted_tfidf = tfidf_BNB.predict(test_tfidf)

In [None]:
# Score the TF-IDF Bernoulli naive Bayes against the test labels
tfidf_BNB_results = calculate_results(y_test, y_predicted_tfidf)

print("Results for BNB with tfidfVectorizer")
tfidf_BNB_results

### Gradient Boost Classifier

#### Gradient Boost Classifier with TfidfVectorizer

In [None]:
# Gradient boosting with default settings on the TF-IDF features
tfidf_GBC = GradientBoostingClassifier().fit(train_tfidf, y_train)

y_predicted_tfidf = tfidf_GBC.predict(test_tfidf)

In [None]:
# Score the TF-IDF gradient boosting model against the test labels
tfidf_GBC_results = calculate_results(y_test, y_predicted_tfidf)

print("Results for GBC with tfidfVectorizer")
tfidf_GBC_results

### XGBoost Classifier

#### XGBoost Classifier with TfidfVectorizer

In [None]:
# XGBoost with default settings on the TF-IDF features
tfidf_XGB = XGBClassifier().fit(train_tfidf, y_train)

y_predicted_tfidf = tfidf_XGB.predict(test_tfidf)

In [None]:
# Score the TF-IDF XGBoost model against the test labels
tfidf_XGB_results = calculate_results(y_test, y_predicted_tfidf)

print("Results for XGB with tfidfVectorizer")
tfidf_XGB_results

### Stochastic Gradient Descent

#### SGD Classifier with TfidfVectorizer

In [None]:
# Linear model trained with stochastic gradient descent on the TF-IDF features.
# SGD shuffles the training data between epochs, so the seed is fixed (42, as
# in the logistic regression cells) to keep the reported metrics reproducible.
tfidf_SGD = SGDClassifier(random_state = 42)

tfidf_SGD.fit(train_tfidf, y_train)

y_predicted_tfidf = tfidf_SGD.predict(test_tfidf)

In [None]:
# Score the TF-IDF SGD classifier against the test labels
tfidf_SGD_results = calculate_results(y_test, y_predicted_tfidf)

print("Results for SGD with tfidfVectorizer")
tfidf_SGD_results

## Combine model results into a DataFrame

In [None]:
# Collect the TF-IDF result dictionaries, one entry per model. The
# CountVectorizer variants (count_*_results) are left out of the table.
tfidf_results_by_model = {
    "Logistic Regression": tfidf_LR_results,
    "Support Vector Machine": tfidf_SVM_results,
    "Multinomial Naive Bayes": tfidf_MNB_results,
    "Bernoulli Naive Bayes": tfidf_BNB_results,
    "Gradient Boost Classifier": tfidf_GBC_results,
    "XGBoost Classifier": tfidf_XGB_results,
    "Stochastic Gradient Descent": tfidf_SGD_results,
}

# Models start out as columns; transpose so each model becomes a row
all_ml_model_results = pd.DataFrame(tfidf_results_by_model).T
all_ml_model_results

## Model Stacking

In [None]:
# Candidate base learners for the stacked model. Every estimator that draws
# random numbers is seeded with 42 so cross-validation scores and the stacked
# model are reproducible between runs.
clf1 = LogisticRegression(random_state=42, penalty='elasticnet', solver='saga', l1_ratio=0.5 ,n_jobs=-1,
                          max_iter=1000, class_weight='balanced')
# probability=True makes the SVC provide predict_proba (used later for the
# ROC curve); the seed controls the shuffling done for that calibration
clf2 = svm.SVC(kernel = 'rbf', gamma = 1, C = 5, probability = True, random_state = 42)
clf3 = BernoulliNB()
clf4 = GradientBoostingClassifier(random_state = 42)
# clf5 = XGBClassifier()
clf6 = SGDClassifier(random_state = 42)

### Using TfidfVectorizer

In [None]:
from mlxtend.classifier import StackingCVClassifier
import random

In [None]:
# 3-fold cross-validated F1 of clf1 on the TF-IDF training features
scores = model_selection.cross_val_score(
    clf1,
    X=train_tfidf,
    y=train_df["label"],
    scoring="f1",
    cv=3,
    n_jobs=-1,
)
scores

In [None]:
# 3-fold cross-validated F1 of clf2 on the TF-IDF training features
scores = model_selection.cross_val_score(
    clf2,
    X=train_tfidf,
    y=train_df["label"],
    scoring="f1",
    cv=3,
    n_jobs=-1,
)
scores

In [None]:
# 3-fold cross-validated F1 of clf3 on the TF-IDF training features
scores = model_selection.cross_val_score(
    clf3,
    X=train_tfidf,
    y=train_df["label"],
    scoring="f1",
    cv=3,
    n_jobs=-1,
)
scores

In [None]:
# 3-fold cross-validated F1 of clf4 on the TF-IDF training features
scores = model_selection.cross_val_score(
    clf4,
    X=train_tfidf,
    y=train_df["label"],
    scoring="f1",
    cv=3,
    n_jobs=-1,
)
scores

In [None]:
# scores = model_selection.cross_val_score(clf5, train_tfidf, train_df["label"], cv = 3, scoring="f1",n_jobs=-1)
# scores

In [None]:
# 3-fold cross-validated F1 of clf6 on the TF-IDF training features
scores = model_selection.cross_val_score(
    clf6,
    X=train_tfidf,
    y=train_df["label"],
    scoring="f1",
    cv=3,
    n_jobs=-1,
)
scores

In [None]:
# Stack clf1, clf2 and clf6: their out-of-fold class predictions
# (use_probas=False) are the only inputs to the meta-classifier
# (use_features_in_secondary=False), which uses clf2's SVC configuration.
# random_state fixes the shuffled CV folds so the stacked model is
# reproducible, and verbose=0 is the documented "print nothing" setting.
# NOTE(review): clf2 is passed both as a base learner and as the meta
# classifier; this relies on mlxtend fitting clones (its default) -- confirm.
stackmodel_tfidf = StackingCVClassifier(classifiers = [clf1, clf2, clf6],
                             meta_classifier = clf2,
                             cv = 5,
                             use_probas = False,
                             use_features_in_secondary = False,
                             verbose = 0,
                             random_state = 42,
                             n_jobs = -1)

In [None]:
stackmodel_tfidf.fit(train_tfidf, train_df["label"])

In [None]:
# Hard class predictions of the stacked model for the test split
stack_preds = stackmodel_tfidf.predict(test_tfidf)

In [None]:
# Test labels as a NumPy array.
# NOTE(review): `labels` is not referenced by any other visible cell -- confirm
# whether it is still needed.
labels = np.array(test_df["label"])

In [None]:
# Score the stacked model against the test labels. Uses y_test (defined as
# test_df["label"]) like every other results cell in the notebook.
stacking_tfidf_result = calculate_results(y_true = y_test,
                                          y_pred = stack_preds)

In [None]:
stacking_tfidf_result

In [None]:
confusion_matrix(y_test, stack_preds)

In [None]:
# Build the confusion matrix of the stacked model on the test split and draw
# it as an annotated heatmap
conf_mat = confusion_matrix(y_test, stack_preds)
plot_conf_mat(conf_mat)

In [None]:
# Predicted probability of the positive class (second predict_proba column)
y_probs_stack = stackmodel_tfidf.predict_proba(test_tfidf)[:, 1]

# False/true positive rates and the thresholds they were computed at
fpr, tpr, thresholds = roc_curve(y_test, y_probs_stack)

plot_roc_curve(fpr, tpr)

In [None]:
roc_auc_score(y_test, y_probs_stack)

In [None]:
print(classification_report(y_test, stack_preds))

In [None]:
all_ml_model_results.loc["Stacked ML model"] = stacking_tfidf_result

In [None]:
# Single-step .loc[row, col] indexing: the chained form .loc[row][col] = ...
# assigns into a temporary object and is not guaranteed to update the frame.
# The value is written back unchanged, so the stacked model's accuracy stays
# on the same scale as the other rows.
all_ml_model_results.loc["Stacked ML model", "accuracy"] = all_ml_model_results.loc["Stacked ML model", "accuracy"]

In [None]:
all_ml_model_results