## 1. Imports

In [None]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#import tensorflow as tf

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler

from joblib import dump, load

In [None]:
# Suppress scikit-learn ConvergenceWarning messages for the rest of the session.
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category = ConvergenceWarning)

## 2. Functions

In [None]:
# Function to evaluate: accuracy, precision, recall, f1-score

def calculate_results(y_true, y_pred):
  """
  Summarises classification quality as accuracy, precision, recall and f1 score.

  Accuracy is reported as a percentage (0-100). Precision, recall and f1 are
  support-weighted averages over the classes and stay on the 0-1 scale.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
  """
  # Accuracy is scaled to a percentage; the other metrics are not
  accuracy_pct = accuracy_score(y_true, y_pred) * 100
  # The fourth value (support) is None when an average is requested, so it is ignored
  weighted_metrics = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  weighted_precision, weighted_recall, weighted_f1 = weighted_metrics[:3]
  return {
      "accuracy": accuracy_pct,
      "precision": weighted_precision,
      "recall": weighted_recall,
      "f1": weighted_f1,
  }


# Create a helper function to compare our baseline results to new model results
def compare_baseline_to_new_results(baseline_results, new_model_results):
  """
  Prints each baseline metric next to the new model's value and their difference.

  Args:
  -----
  baseline_results = dictionary of metric name -> numeric value for the baseline
  new_model_results = dictionary holding (at least) the same metric names for the new model

  Prints one line per metric in `baseline_results`; returns nothing.
  """
  for metric in baseline_results:
    baseline_value = baseline_results[metric]
    new_value = new_model_results[metric]
    print(f"Baseline {metric}: {baseline_value:.2f}, New {metric}: {new_value:.2f}, Difference: {new_value-baseline_value:.2f}")

In [None]:
# Set the seaborn font scale to 1.5x for all plots drawn after this point
# (sns.set also applies seaborn's default theme as a side effect)
sns.set(font_scale = 1.5)

def plot_conf_mat(conf_mat):
    """
    Draws a confusion matrix as an annotated Seaborn heatmap on a new 4x4 inch figure.

    Args:
    -----
    conf_mat = 2D confusion matrix of integer counts (cells are formatted with "d")

    Returns nothing; the figure is left as the current matplotlib figure.
    """
    fig, ax = plt.subplots(figsize=(4, 4))
    heatmap_options = {
        "annot": True,   # write the count inside every cell
        "fmt": "d",      # integer formatting for the annotations
        "cbar": False,   # no colour bar next to the matrix
    }
    # heatmap() draws on the current axes, i.e. the ones created just above
    ax = sns.heatmap(conf_mat, **heatmap_options)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

## 3. Working with Data

In [None]:
# Load the two pre-processed source tables.
# Presumably authentic vs. fake news articles, judging by the file names — TODO confirm.
df_true = pd.read_csv("../input/banfakepreprocessed/Authentic-48K.csv")
df_fake = pd.read_csv("../input/banfakepreprocessed/Fake-1K.csv")

In [None]:
# Stack both tables row-wise into one frame. The original row labels are kept
# (no ignore_index), so index values from the two frames can repeat.
dataset = pd.concat([df_true, df_fake], axis = 0)
dataset

In [None]:
# Replace missing article text with a single space so the vectorizers only see strings.
dataset["content"] = dataset["content"].fillna(' ')
dataset

In [None]:
# Split each class separately: a reproducible random 70% sample becomes the
# training part, and the rows that were not sampled become the test part.
# The split is taken from df_true / df_fake, not from `dataset`, so the NaN
# fill applied to `dataset` does not carry over; count() / tfidf() fill again.
df_true_train = df_true.sample(frac = 0.7, random_state = 42)
df_true_test = df_true.drop(df_true_train.index)

df_fake_train = df_fake.sample(frac = 0.7, random_state = 42)
df_fake_test = df_fake.drop(df_fake_train.index)

In [None]:
# Restore the original row order inside every split (sample() shuffles the rows).
df_true_train.sort_index(axis = 0, inplace = True)
df_true_test.sort_index(axis = 0, inplace = True)
df_fake_train.sort_index(axis = 0, inplace = True)
df_fake_test.sort_index(axis = 0, inplace = True)

# Row counts per split: the first two lines refer to df_true, the last two to
# df_fake (the printed labels themselves do not say which class they describe).
print(f"No. of training examples: {df_true_train.shape[0]}")
print(f"No. of testing examples: {df_true_test.shape[0]}")
print(f"No. of training examples: {df_fake_train.shape[0]}")
print(f"No. of testing examples: {df_fake_test.shape[0]}")

In [None]:
# Combine the training parts of both classes into the training set.
dataset_train = pd.concat([df_true_train, df_fake_train], axis = 0)
dataset_train

In [None]:
# Class balance of the training set (rows per label value).
dataset_train["label"].value_counts()

In [None]:
# Combine the held-out parts of both classes into the test set.
dataset_test = pd.concat([df_true_test, df_fake_test], axis = 0)
dataset_test

In [None]:
# Class balance of the test set (rows per label value).
dataset_test["label"].value_counts()

## 4. Initializing
### 4.1. Count Vectorization

In [None]:
def count(data):
    """
    Turns the "content" column of `data` into a bag-of-words count matrix.

    The vocabulary is learned from the "content" column of the global `dataset`
    on every call, not from `data`, so matrices produced for different splits
    share the same columns. Missing text is treated as a single space.
    NOTE(review): `dataset` also contains the test rows, so the vocabulary is
    built with knowledge of the test split — confirm this is acceptable.

    Args:
    -----
    data = DataFrame with a "content" (text) and a "label" column

    Returns a tuple (X, labels): the sparse count matrix for `data` and its
    labels as a plain list. Neither `data` nor `dataset` is modified.
    """
    # fillna() returns new Series, so the caller's frame and the global stay untouched
    documents = data["content"].fillna(' ').tolist()
    labels = data["label"].tolist()

    vectorizer = CountVectorizer()
    vectorizer.fit(dataset["content"].fillna(' '))

    return vectorizer.transform(documents), labels

### 4.2. Tfidf Vectorization

In [None]:
def tfidf(data, ngrams = 1):
    """
    Turns the "content" column of `data` into a TF-IDF matrix.

    Vocabulary and IDF weights are learned from the "content" column of the
    global `dataset` on every call, not from `data`, so matrices produced for
    different splits share the same columns. Missing text is treated as a
    single space.
    NOTE(review): `dataset` also contains the test rows, so the IDF weights are
    fitted with knowledge of the test split — confirm this is acceptable.

    Args:
    -----
    data = DataFrame with a "content" (text) and a "label" column
    ngrams = largest n-gram size; all sizes from 1 up to `ngrams` are used

    Returns a tuple (X, labels): the sparse TF-IDF matrix for `data` and its
    labels as a plain list. Neither `data` nor `dataset` is modified.
    """
    # fillna() returns new Series, so the caller's frame and the global stay untouched
    documents = data["content"].fillna(' ').tolist()
    labels = data["label"].tolist()

    vectorizer = TfidfVectorizer(ngram_range = (1, ngrams))
    vectorizer.fit(dataset["content"].fillna(' '))

    return vectorizer.transform(documents), labels

## 5. Defining Models
### 5.1. Logistic Regression

In [None]:
def logistic_regression(X_train, X_test, y_train, y_test, preprocessor):
    """
    Fits a cross-validated logistic regression and scores it on the test split.

    The regularisation strength is chosen by LogisticRegressionCV using a
    shuffled, stratified 3-fold split with a fixed seed.

    Args:
    -----
    X_train = training feature matrix
    X_test = test feature matrix
    y_train = training labels
    y_test = test labels (must contain at least two classes)
    preprocessor = description of the feature extraction, copied into the 'Preprocessing' column

    Returns a one-row DataFrame with the columns 'Preprocessing', 'Model',
    'Accuracy', 'F1-score-0' and 'F1-score-1'. The two F1 columns hold the
    per-class F1 of the first and second label in sorted order.
    """
    folds = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 42)

    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
    # confirm against the installed version before upgrading.
    clf = LogisticRegressionCV(cv = folds, solver = 'saga', multi_class = 'multinomial', n_jobs = -1)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Without `average`, score() returns one value per class; only F1 is reported
    _, _, fscore, _ = score(y_test, y_pred)

    # Build the row directly: DataFrame.append() no longer exists in pandas >= 2.0
    return pd.DataFrame([{'Preprocessing': preprocessor,
                          'Model': 'Logistic Regression', 'Accuracy': accuracy,
                          'F1-score-0': fscore[0], 'F1-score-1': fscore[1]}],
                        columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

### 5.2. Support Vector Machine

In [None]:
def support_vector_machine(X_train, X_test, y_train, y_test, preprocessor):
    """
    Fits an RBF-kernel support vector classifier (gamma=1, C=10) and scores it on the test split.

    Args:
    -----
    X_train = training feature matrix
    X_test = test feature matrix
    y_train = training labels
    y_test = test labels (must contain at least two classes)
    preprocessor = description of the feature extraction, copied into the 'Preprocessing' column

    Returns a one-row DataFrame with the columns 'Preprocessing', 'Model',
    'Accuracy', 'F1-score-0' and 'F1-score-1'. The two F1 columns hold the
    per-class F1 of the first and second label in sorted order.
    """
    clf = SVC(kernel = 'rbf', gamma = 1, C = 10)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Without `average`, score() returns one value per class; only F1 is reported
    _, _, fscore, _ = score(y_test, y_pred)

    # Build the row directly: DataFrame.append() no longer exists in pandas >= 2.0
    return pd.DataFrame([{'Preprocessing': preprocessor,
                          'Model': 'Support Vector Machine', 'Accuracy': accuracy,
                          'F1-score-0': fscore[0], 'F1-score-1': fscore[1]}],
                        columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

### 5.3. Multinomial Naive Bayes

In [None]:
def multinomial_nb(X_train, X_test, y_train, y_test, preprocessor):
    """
    Fits a multinomial naive Bayes classifier (alpha=0.01) and scores it on the test split.

    Args:
    -----
    X_train = training feature matrix
    X_test = test feature matrix
    y_train = training labels
    y_test = test labels (must contain at least two classes)
    preprocessor = description of the feature extraction, copied into the 'Preprocessing' column

    Returns a one-row DataFrame with the columns 'Preprocessing', 'Model',
    'Accuracy', 'F1-score-0' and 'F1-score-1'. The two F1 columns hold the
    per-class F1 of the first and second label in sorted order.
    """
    clf = MultinomialNB(alpha = 0.01)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Without `average`, score() returns one value per class; only F1 is reported
    _, _, fscore, _ = score(y_test, y_pred)

    # Build the row directly: DataFrame.append() no longer exists in pandas >= 2.0
    return pd.DataFrame([{'Preprocessing': preprocessor,
                          'Model': 'Multinomial Naive Bayes', 'Accuracy': accuracy,
                          'F1-score-0': fscore[0], 'F1-score-1': fscore[1]}],
                        columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

### 5.4. Bernoulli Naive Bayes

In [None]:
def bernoulli_nb(X_train, X_test, y_train, y_test, preprocessor):
    """
    Fits a Bernoulli naive Bayes classifier (alpha=0.01) and scores it on the test split.

    Args:
    -----
    X_train = training feature matrix
    X_test = test feature matrix
    y_train = training labels
    y_test = test labels (must contain at least two classes)
    preprocessor = description of the feature extraction, copied into the 'Preprocessing' column

    Returns a one-row DataFrame with the columns 'Preprocessing', 'Model',
    'Accuracy', 'F1-score-0' and 'F1-score-1'. The two F1 columns hold the
    per-class F1 of the first and second label in sorted order.
    """
    clf = BernoulliNB(alpha = 0.01)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Without `average`, score() returns one value per class; only F1 is reported
    _, _, fscore, _ = score(y_test, y_pred)

    # Build the row directly: DataFrame.append() no longer exists in pandas >= 2.0
    return pd.DataFrame([{'Preprocessing': preprocessor,
                          'Model': 'Bernoulli Naive Bayes', 'Accuracy': accuracy,
                          'F1-score-0': fscore[0], 'F1-score-1': fscore[1]}],
                        columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

### 5.5. Random Forest Classifier

In [None]:
def random_forest(X_train, X_test, y_train, y_test, preprocessor):
    """
    Fits a 400-tree random forest (sqrt feature sampling) and scores it on the test split.

    No random_state is passed to the forest, so results can differ between runs.

    Args:
    -----
    X_train = training feature matrix
    X_test = test feature matrix
    y_train = training labels
    y_test = test labels (must contain at least two classes)
    preprocessor = description of the feature extraction, copied into the 'Preprocessing' column

    Returns a one-row DataFrame with the columns 'Preprocessing', 'Model',
    'Accuracy', 'F1-score-0' and 'F1-score-1'. The two F1 columns hold the
    per-class F1 of the first and second label in sorted order.
    """
    clf = RandomForestClassifier(n_estimators= 400, max_features = 'sqrt')
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Without `average`, score() returns one value per class; only F1 is reported
    _, _, fscore, _ = score(y_test, y_pred)

    # Build the row directly: DataFrame.append() no longer exists in pandas >= 2.0
    return pd.DataFrame([{'Preprocessing': preprocessor,
                          'Model': 'Random Forest Classifier', 'Accuracy': accuracy,
                          'F1-score-0': fscore[0], 'F1-score-1': fscore[1]}],
                        columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

### 5.6. Decision Tree

In [None]:
def decision_tree(X_train, X_test, y_train, y_test, preprocessor):
    """
    Fits a Gini decision tree limited to depth 6 and scores it on the test split.

    No random_state is passed to the tree, so results can differ between runs.

    Args:
    -----
    X_train = training feature matrix
    X_test = test feature matrix
    y_train = training labels
    y_test = test labels (must contain at least two classes)
    preprocessor = description of the feature extraction, copied into the 'Preprocessing' column

    Returns a one-row DataFrame with the columns 'Preprocessing', 'Model',
    'Accuracy', 'F1-score-0' and 'F1-score-1'. The two F1 columns hold the
    per-class F1 of the first and second label in sorted order.
    """
    clf = DecisionTreeClassifier(criterion = 'gini', max_depth=6)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Without `average`, score() returns one value per class; only F1 is reported
    _, _, fscore, _ = score(y_test, y_pred)

    # Build the row directly: DataFrame.append() no longer exists in pandas >= 2.0
    return pd.DataFrame([{'Preprocessing': preprocessor,
                          'Model': 'Decision Tree Classifier', 'Accuracy': accuracy,
                          'F1-score-0': fscore[0], 'F1-score-1': fscore[1]}],
                        columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

## 6. Oversampling
### 6.1. Random Over Sampling

In [None]:
# Demonstration of random oversampling: report the row and class counts of the
# count-vectorized test split before and after resampling the minority class.
# No random_state is set, so the duplicated rows differ between runs.
naive_over_sample = RandomOverSampler(sampling_strategy = 'minority')
X, y = count(dataset_test)
print(f"Initial set observations {X.shape[0]}")
print(f"Initial set target classes {len(set(y))}")
# X and y are overwritten with their oversampled versions
X, y = naive_over_sample.fit_resample(X, y)
print(f"Modified set observations {X.shape[0]}")
print(f"Modified set target classes {len(set(y))}")

In [None]:
# Fresh random oversampler used by the cells below (no random_state, so not reproducible).
naive_over_sample = RandomOverSampler(sampling_strategy = 'minority')

In [None]:
# Empty results table; it is not filled anywhere in the visible code.
result_over_count = pd.DataFrame(columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

# Using Count
# Vectorize the training split, then randomly oversample its minority class.
X, y = count(dataset_train)
X_train, y_train = naive_over_sample.fit_resample(X, y)
# NOTE(review): the test split is oversampled as well, so X_test / y_test
# contain duplicated minority rows — confirm this is intended before using
# them for evaluation.
X, y = count(dataset_test)
X_test, y_test = naive_over_sample.fit_resample(X, y)

In [None]:
# clf1 = LogisticRegression(solver = 'saga', multi_class = 'multinomial')
# clf2 = SVC(kernel = 'rbf', gamma = 1, C = 5, probability = True)
# clf3 = MultinomialNB(alpha = 0.01)
# clf4 = BernoulliNB(alpha = 0.01)
# clf5 = RandomForestClassifier(n_estimators= 400, max_features = 'sqrt')
# clf6 = DecisionTreeClassifier(criterion = 'gini', max_depth=6)

In [None]:
# Train a Gini decision tree (depth <= 6) on the current X_train / y_train —
# the randomly oversampled count features if the cells are run in order.
clf = DecisionTreeClassifier(criterion = 'gini', max_depth=6)
clf.fit(X_train, y_train)

In [None]:
# Persist the fitted classifier to disk with joblib.
dump(clf, filename = "random-test-not-count-DTC.joblib")

In [None]:

# # Using Tfidf 1-gram
# X, y = tfidf(dataset_train)
# X_test, y_test = tfidf(dataset_test)
# X_train, y_train = naive_over_sample.fit_resample(X, y)
# result_over = result_over.append(logistic_regression(X_train, X_test, y_train, y_test, 'Tfidf 1-gram'), ignore_index = True)
# result_over = result_over.append(support_vector_machine(X_train, X_test, y_train, y_test, 'Tfidf 1-gram'), ignore_index = True)

In [None]:
# Empty results table; it is not filled anywhere in the visible code.
result_over_tfidf2 = pd.DataFrame(columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

# Using Tfidf 2-gram
# Vectorize the training split with 1- and 2-grams, then randomly oversample its minority class.
X, y = tfidf(dataset_train, ngrams = 2)
X_train, y_train = naive_over_sample.fit_resample(X, y)
# NOTE(review): the test split is oversampled as well, so X_test / y_test
# contain duplicated minority rows — confirm this is intended before using
# them for evaluation.
X, y = tfidf(dataset_test, ngrams = 2)
X_test, y_test = naive_over_sample.fit_resample(X, y)

In [None]:
# Train a Gini decision tree (depth <= 6) on the current X_train / y_train —
# the randomly oversampled TF-IDF 2-gram features if the cells are run in order.
clf = DecisionTreeClassifier(criterion = 'gini', max_depth=6)
clf.fit(X_train, y_train)

In [None]:
# Persist the fitted classifier to disk with joblib.
dump(clf, filename = "random-test-not-tfidf2-DTC.joblib")

### 6.2. SMOTE

In [None]:
# smote = SMOTE(sampling_strategy = 'minority')
# X, y = count(dataset)
# print(f"Initial set observations {X.shape[0]}")
# print(f"Initial set target classes {len(set(y))}")
# X, y = smote.fit_resample(X, y)
# print(f"Modified set observations {X.shape[0]}")
# print(f"Modified set target classes {len(set(y))}")

In [None]:
# SMOTE oversampler for the minority class (no random_state, so not reproducible).
smote = SMOTE(sampling_strategy = 'minority')

In [None]:
# Using Count
# Empty results table; it is not filled anywhere in the visible code.
result_smote_count = pd.DataFrame(columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

# Vectorize the training split, then add SMOTE-generated minority samples.
X, y = count(dataset_train)
X_train, y_train = smote.fit_resample(X, y)
# NOTE(review): SMOTE is applied to the test split as well, so X_test / y_test
# contain synthetic samples — confirm this is intended before using them for
# evaluation.
X, y = count(dataset_test)
X_test, y_test = smote.fit_resample(X, y)

In [None]:
# Train a Gini decision tree (depth <= 6) on the current X_train / y_train —
# the SMOTE-resampled count features if the cells are run in order.
clf = DecisionTreeClassifier(criterion = 'gini', max_depth=6)
clf.fit(X_train, y_train)

In [None]:
# Persist the fitted classifier to disk with joblib.
dump(clf, filename = "smote-test-not-count-DTC.joblib")

In [None]:
# clf1 = LogisticRegression(solver = 'saga', multi_class = 'multinomial')
# clf2 = SVC(kernel = 'rbf', gamma = 1, C = 5, probability = True)
# clf3 = MultinomialNB(alpha = 0.01)
# clf4 = BernoulliNB(alpha = 0.01)
# clf5 = RandomForestClassifier(n_estimators= 400, max_features = 'sqrt')
# clf6 = DecisionTreeClassifier(criterion = 'gini', max_depth=6)

In [None]:
# Empty results table; it is not filled anywhere in the visible code.
result_smote_tfidf2 = pd.DataFrame(columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

# Using Tfidf 2-gram
# Vectorize the training split with 1- and 2-grams, then add SMOTE-generated minority samples.
X, y = tfidf(dataset_train, ngrams = 2)
X_train, y_train = smote.fit_resample(X, y)
# NOTE(review): SMOTE is applied to the test split as well, so X_test / y_test
# contain synthetic samples — confirm this is intended before using them for
# evaluation.
X, y = tfidf(dataset_test, ngrams = 2)
X_test, y_test = smote.fit_resample(X, y)

In [None]:
# Train a Gini decision tree (depth <= 6) on the current X_train / y_train —
# the SMOTE-resampled TF-IDF 2-gram features if the cells are run in order.
clf = DecisionTreeClassifier(criterion = 'gini', max_depth=6)
clf.fit(X_train, y_train)

In [None]:
# Persist the fitted classifier to disk with joblib.
dump(clf, filename = "smote-test-not-tfidf2-DTC.joblib")

### 6.3. ADASYN

In [None]:
# adasyn = ADASYN(sampling_strategy = 'minority')
# X, y = count(dataset)
# print(f"Initial set observations {X.shape[0]}")
# print(f"Initial set target classes {len(set(y))}")
# X, y = adasyn.fit_resample(X, y)
# print(f"Modified set observations {X.shape[0]}")
# print(f"Modified set target classes {len(set(y))}")

In [None]:
# ADASYN oversampler for the minority class (no random_state, so not reproducible).
adasyn = ADASYN(sampling_strategy = 'minority')

In [None]:
# Empty results table; it is not filled anywhere in the visible code.
result_adasyn_count = pd.DataFrame(columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

# Using Count
# Vectorize the training split, then add ADASYN-generated minority samples.
X, y = count(dataset_train)
X_train, y_train = adasyn.fit_resample(X, y)
# NOTE(review): ADASYN is applied to the test split as well, so X_test / y_test
# contain synthetic samples — confirm this is intended before using them for
# evaluation.
X, y = count(dataset_test)
X_test, y_test = adasyn.fit_resample(X, y)

In [None]:
# Train a Gini decision tree (depth <= 6) on the current X_train / y_train —
# the ADASYN-resampled count features if the cells are run in order.
clf = DecisionTreeClassifier(criterion = 'gini', max_depth=6)
clf.fit(X_train, y_train)

In [None]:
# Persist the fitted classifier to disk with joblib.
dump(clf, filename = "adasyn-test-not-count-DTC.joblib")

In [None]:
# clf1 = LogisticRegression(solver = 'saga', multi_class = 'multinomial')
# clf2 = SVC(kernel = 'rbf', gamma = 1, C = 5, probability = True)
# clf3 = MultinomialNB(alpha = 0.01)
# clf4 = BernoulliNB(alpha = 0.01)
# clf5 = RandomForestClassifier(n_estimators= 400, max_features = 'sqrt')
# clf6 = DecisionTreeClassifier(criterion = 'gini', max_depth=6)

In [None]:
# # Using Tfidf 1-gram
# X, y = tfidf(dataset_train)
# X_test, y_test = tfidf(dataset_test)
# X_train, y_train = adasyn.fit_resample(X, y)
# result_adasyn = result_adasyn.append(logistic_regression(X_train, X_test, y_train, y_test, 'Tfidf 1-gram'), ignore_index = True)
# result_adasyn = result_adasyn.append(support_vector_machine(X_train, X_test, y_train, y_test, 'Tfidf 1-gram'), ignore_index = True)


In [None]:
# Empty results table; it is not filled anywhere in the visible code.
result_adasyn_tfidf2 = pd.DataFrame(columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

# Using Tfidf 2-gram
# Vectorize the training split with 1- and 2-grams, then add ADASYN-generated minority samples.
X, y = tfidf(dataset_train, ngrams = 2)
X_train, y_train = adasyn.fit_resample(X, y)
# NOTE(review): ADASYN is applied to the test split as well, so X_test / y_test
# contain synthetic samples — confirm this is intended before using them for
# evaluation.
X, y = tfidf(dataset_test, ngrams = 2)
X_test, y_test = adasyn.fit_resample(X, y)

In [None]:
# Train a Gini decision tree (depth <= 6) on the current X_train / y_train —
# the ADASYN-resampled TF-IDF 2-gram features if the cells are run in order.
clf = DecisionTreeClassifier(criterion = 'gini', max_depth=6)
clf.fit(X_train, y_train)

In [None]:
# Persist the fitted classifier to disk with joblib.
dump(clf, filename = "adasyn-test-not-tfidf2-DTC.joblib")