In [None]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#import tensorflow as tf

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN

from joblib import dump, load

In [None]:
# Ignore scikit-learn's ConvergenceWarning from this point on, so solvers that
# stop before converging do not flood the cell outputs.
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

## 2. Functions

In [None]:
# Function to evaluate: accuracy, precision, recall, f1-score

def calculate_results(y_true, y_pred):
  """
  Summarise classification quality as accuracy, precision, recall and f1.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Precision, recall and f1 are computed with average="weighted"; the fourth
  value returned by precision_recall_fscore_support is not used.

  Returns a dictionary with the keys "accuracy", "precision", "recall", "f1".
  """
  # Accuracy first, then the three weighted scores in the order sklearn returns them
  summary = {"accuracy": accuracy_score(y_true, y_pred)}
  weighted_scores = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  # zip stops after the three names, leaving out the trailing fourth value
  summary.update(zip(("precision", "recall", "f1"), weighted_scores))
  return summary


# Create a helper function to compare our baseline results to new model results
def compare_baseline_to_new_results(baseline_results, new_model_results):
  """
  Print one line per metric comparing a baseline with a new model.

  Args:
  -----
  baseline_results = dictionary mapping metric name to the baseline value
  new_model_results = dictionary with (at least) the same metric names

  Each line shows the baseline value, the new value and new minus baseline,
  all formatted with two decimals. Nothing is returned.
  """
  for metric in baseline_results:
    baseline_value = baseline_results[metric]
    new_value = new_model_results[metric]
    difference = new_value - baseline_value
    print(f"Baseline {metric}: {baseline_value:.2f}, New {metric}: {new_value:.2f}, Difference: {difference:.2f}")

In [None]:
# Set the font scale
sns.set(font_scale = 1.5)

def plot_conf_mat(conf_mat):
    """
    Draw `conf_mat` as an annotated Seaborn heatmap on a new 4x4 inch figure.

    Cell values are annotated with integer formatting ("d"), the colour bar is
    hidden, and the axes are labelled 'Predicted label' (x) and 'True label' (y).
    Nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(4, 4))
    # Draw on the axes created above rather than on the implicit current axes
    sns.heatmap(conf_mat, annot=True, fmt="d", cbar=False, ax=ax)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

## 3. Working with data

In [None]:
# Read the two input CSV files into separate frames. The paths are relative to
# the notebook's working directory.
# NOTE(review): judging by the file names these hold authentic and fake
# articles respectively — confirm against the data.
df_true = pd.read_csv("../input/banfakepreprocessed/Authentic-48K.csv")
df_fake = pd.read_csv("../input/banfakepreprocessed/Fake-1K.csv")

In [None]:
# Stack both frames row-wise into one working dataset. ignore_index=True gives
# the result a fresh 0..n-1 index; without it the row labels of df_fake repeat
# those of df_true, which makes label-based selection on `dataset` ambiguous.
dataset = pd.concat([df_true, df_fake], axis = 0, ignore_index = True)
dataset

In [None]:
# Show how many rows carry each value of the "label" column (class balance).
dataset["label"].value_counts()

In [None]:
# Unpack the per-label row counts into two names and display them.
# NOTE(review): the unpacking requires "label" to have exactly two distinct
# values, and value_counts() orders its result by frequency (largest first by
# default), so `count_true` is simply the larger count — confirm that the
# "true" label really is the more frequent one.
count_true, count_fake = dataset["label"].value_counts()
count_true, count_fake

## 4. Initializing

#### CountVectorization

In [None]:
def count(data):
    """
    Turn the "content" column of `data` into a bag-of-words count matrix.

    The CountVectorizer vocabulary is fitted on the notebook-level `dataset`
    frame, not on `data`, so this function needs `dataset` to exist before it
    is called. Missing "content" values are replaced by a single space in both
    frames first; neither `data` nor `dataset` is modified.

    Returns a tuple (X, list_labels): the matrix produced by
    CountVectorizer.transform for `data`, and the "label" column of `data` as
    a Python list.
    """
    # fillna on the column returns a new Series, so the inputs stay untouched
    texts = data["content"].fillna(' ').tolist()
    vocabulary_corpus = dataset["content"].fillna(' ')

    vectorizer = CountVectorizer()
    vectorizer.fit(vocabulary_corpus)

    labels = data["label"].tolist()

    return vectorizer.transform(texts), labels

#### TfidfVectorization

In [None]:
def tfidf(data, ngrams = 1):
    """
    Turn the "content" column of `data` into a TF-IDF matrix.

    The TfidfVectorizer uses ngram_range (1, ngrams) and is fitted on the
    notebook-level `dataset` frame, not on `data`, so this function needs
    `dataset` to exist before it is called. Missing "content" values are
    replaced by a single space in both frames first; neither `data` nor
    `dataset` is modified.

    Returns a tuple (X, list_labels): the matrix produced by
    TfidfVectorizer.transform for `data`, and the "label" column of `data` as
    a Python list.
    """
    # fillna on the column returns a new Series, so the inputs stay untouched
    texts = data["content"].fillna(' ').tolist()
    vocabulary_corpus = dataset['content'].fillna(' ')

    vectorizer = TfidfVectorizer(ngram_range = (1, ngrams))
    vectorizer.fit(vocabulary_corpus)

    labels = data["label"].tolist()

    return vectorizer.transform(texts), labels

## 5. Defining models

### 5.1. Logistic Regression

In [None]:
def logistic_regression(X_train, X_test, y_train, y_test, preprocessor, random_state = None):
    """
    Fit a cross-validated logistic regression and report its test-set scores.

    The model is LogisticRegressionCV with the 'saga' solver and a 3-fold
    shuffled StratifiedKFold (seeded with 40) for the internal cross-validation.

    Args:
        X_train, y_train: features and labels the classifier is fitted on.
        X_test, y_test: features and labels used for scoring.
        preprocessor: text stored in the 'Preprocessing' column of the result.
        random_state: optional seed forwarded to LogisticRegressionCV. The
            default None keeps the previous (unseeded) behaviour; pass an int
            to make repeated runs comparable.

    Returns:
        A one-row DataFrame with the columns 'Preprocessing', 'Model',
        'Accuracy', 'F1-score-0' and 'F1-score-1'. The two F1 columns are the
        first and second entries of the per-class f-score array, so at least
        two classes must be present.
    """
    folds = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 40)

    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
    # check the installed version before upgrading.
    clf = LogisticRegressionCV(cv = folds, solver = 'saga', multi_class = 'multinomial',
                               n_jobs = -1, random_state = random_state)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Without `average`, the scores come back as one value per class
    _, _, fscore, _ = score(y_test, y_pred)

    # Build the row directly: DataFrame.append was removed in pandas 2.0
    row = {'Preprocessing': preprocessor,
           'Model': 'Logistic Regression', 'Accuracy': accuracy,
           'F1-score-0': fscore[0], 'F1-score-1': fscore[1]}

    return pd.DataFrame([row], columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

### 5.2. Support Vector Machine

In [None]:
def support_vector_machine(X_train, X_test, y_train, y_test, preprocessor):
    """
    Fit an RBF-kernel SVC (gamma=1, C=10) and report its test-set scores.

    Args:
        X_train, y_train: features and labels the classifier is fitted on.
        X_test, y_test: features and labels used for scoring.
        preprocessor: text stored in the 'Preprocessing' column of the result.

    Returns:
        A one-row DataFrame with the columns 'Preprocessing', 'Model',
        'Accuracy', 'F1-score-0' and 'F1-score-1'. The two F1 columns are the
        first and second entries of the per-class f-score array, so at least
        two classes must be present.
    """
    clf = SVC(kernel = 'rbf', gamma = 1, C = 10)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Without `average`, the scores come back as one value per class
    _, _, fscore, _ = score(y_test, y_pred)

    # Build the row directly: DataFrame.append was removed in pandas 2.0
    row = {'Preprocessing': preprocessor,
           'Model': 'Support Vector Machine', 'Accuracy': accuracy,
           'F1-score-0': fscore[0], 'F1-score-1': fscore[1]}

    return pd.DataFrame([row], columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

### 5.3. Multinomial Naive Bayes

In [None]:
def multinomial_nb(X_train, X_test, y_train, y_test, preprocessor):
    """
    Fit a MultinomialNB classifier (alpha=0.01) and report its test-set scores.

    Args:
        X_train, y_train: features and labels the classifier is fitted on.
        X_test, y_test: features and labels used for scoring.
        preprocessor: text stored in the 'Preprocessing' column of the result.

    Returns:
        A one-row DataFrame with the columns 'Preprocessing', 'Model',
        'Accuracy', 'F1-score-0' and 'F1-score-1'. The two F1 columns are the
        first and second entries of the per-class f-score array, so at least
        two classes must be present.
    """
    clf = MultinomialNB(alpha = 0.01)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Without `average`, the scores come back as one value per class
    _, _, fscore, _ = score(y_test, y_pred)

    # Build the row directly: DataFrame.append was removed in pandas 2.0
    row = {'Preprocessing': preprocessor,
           'Model': 'Multinomial Naive Bayes', 'Accuracy': accuracy,
           'F1-score-0': fscore[0], 'F1-score-1': fscore[1]}

    return pd.DataFrame([row], columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

### 5.4. Bernoulli Naive Bayes

In [None]:
def bernoulli_nb(X_train, X_test, y_train, y_test, preprocessor):
    """
    Fit a BernoulliNB classifier (alpha=0.01) and report its test-set scores.

    Args:
        X_train, y_train: features and labels the classifier is fitted on.
        X_test, y_test: features and labels used for scoring.
        preprocessor: text stored in the 'Preprocessing' column of the result.

    Returns:
        A one-row DataFrame with the columns 'Preprocessing', 'Model',
        'Accuracy', 'F1-score-0' and 'F1-score-1'. The two F1 columns are the
        first and second entries of the per-class f-score array, so at least
        two classes must be present.
    """
    clf = BernoulliNB(alpha = 0.01)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Without `average`, the scores come back as one value per class
    _, _, fscore, _ = score(y_test, y_pred)

    # Build the row directly: DataFrame.append was removed in pandas 2.0
    row = {'Preprocessing': preprocessor,
           'Model': 'Bernoulli Naive Bayes', 'Accuracy': accuracy,
           'F1-score-0': fscore[0], 'F1-score-1': fscore[1]}

    return pd.DataFrame([row], columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

### 5.5. Random Forest Classifier

In [None]:
def random_forest(X_train, X_test, y_train, y_test, preprocessor, random_state = None):
    """
    Fit a 400-tree RandomForestClassifier and report its test-set scores.

    Args:
        X_train, y_train: features and labels the classifier is fitted on.
        X_test, y_test: features and labels used for scoring.
        preprocessor: text stored in the 'Preprocessing' column of the result.
        random_state: optional seed forwarded to RandomForestClassifier. The
            default None keeps the previous (unseeded) behaviour; pass an int
            to make repeated runs comparable.

    Returns:
        A one-row DataFrame with the columns 'Preprocessing', 'Model',
        'Accuracy', 'F1-score-0' and 'F1-score-1'. The two F1 columns are the
        first and second entries of the per-class f-score array, so at least
        two classes must be present.
    """
    clf = RandomForestClassifier(n_estimators= 400, max_features = 'sqrt', random_state = random_state)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Without `average`, the scores come back as one value per class
    _, _, fscore, _ = score(y_test, y_pred)

    # Build the row directly: DataFrame.append was removed in pandas 2.0
    row = {'Preprocessing': preprocessor,
           'Model': 'Random Forest Classifier', 'Accuracy': accuracy,
           'F1-score-0': fscore[0], 'F1-score-1': fscore[1]}

    return pd.DataFrame([row], columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

### 5.6. Decision Tree

In [None]:
def decision_tree(X_train, X_test, y_train, y_test, preprocessor, random_state = None):
    """
    Fit a gini DecisionTreeClassifier (max_depth=6) and report its test-set scores.

    Args:
        X_train, y_train: features and labels the classifier is fitted on.
        X_test, y_test: features and labels used for scoring.
        preprocessor: text stored in the 'Preprocessing' column of the result.
        random_state: optional seed forwarded to DecisionTreeClassifier. The
            default None keeps the previous (unseeded) behaviour; pass an int
            to make repeated runs comparable.

    Returns:
        A one-row DataFrame with the columns 'Preprocessing', 'Model',
        'Accuracy', 'F1-score-0' and 'F1-score-1'. The two F1 columns are the
        first and second entries of the per-class f-score array, so at least
        two classes must be present.
    """
    clf = DecisionTreeClassifier(criterion = 'gini', max_depth=6, random_state = random_state)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Without `average`, the scores come back as one value per class
    _, _, fscore, _ = score(y_test, y_pred)

    # Build the row directly: DataFrame.append was removed in pandas 2.0
    row = {'Preprocessing': preprocessor,
           'Model': 'Decision Tree Classifier', 'Accuracy': accuracy,
           'F1-score-0': fscore[0], 'F1-score-1': fscore[1]}

    return pd.DataFrame([row], columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

## 6. Undersampling
### 6.1. Random under sampling (Naive Approach)

In [None]:
# Random undersampler for the majority class. The seed makes the selected rows
# (and therefore every score computed from them) repeatable across runs; it
# matches the seed used for train_test_split in this notebook.
naive_under_sample = RandomUnderSampler(sampling_strategy = 'majority', random_state = 42)

In [None]:
# Empty results table; the model cells below add one row each.
result_under_count = pd.DataFrame(columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

# Using Count
# Vectorize the whole dataset, undersample it, then hold out 20% for testing.
# NOTE(review): resampling runs before the train/test split, so X_test is drawn
# from the resampled data rather than from the original class distribution —
# confirm this is intended.
X, y = count(dataset)
X, y = naive_under_sample.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_under_count = pd.concat(
    [result_under_count, logistic_regression(X_train, X_test, y_train, y_test, 'Count Vectorizer')],
    ignore_index = True)
result_under_count

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_under_count = pd.concat(
    [result_under_count, support_vector_machine(X_train, X_test, y_train, y_test, 'Count Vectorizer')],
    ignore_index = True)
result_under_count

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_under_count = pd.concat(
    [result_under_count, multinomial_nb(X_train, X_test, y_train, y_test, 'Count Vectorizer')],
    ignore_index = True)
result_under_count

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_under_count = pd.concat(
    [result_under_count, bernoulli_nb(X_train, X_test, y_train, y_test, 'Count Vectorizer')],
    ignore_index = True)
result_under_count

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_under_count = pd.concat(
    [result_under_count, random_forest(X_train, X_test, y_train, y_test, 'Count Vectorizer')],
    ignore_index = True)
result_under_count

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_under_count = pd.concat(
    [result_under_count, decision_tree(X_train, X_test, y_train, y_test, 'Count Vectorizer')],
    ignore_index = True)
result_under_count

In [None]:
# # Using Tfidf 1-gram
# X, y = tfidf(dataset)
# X, y = naive_under_sample.fit_resample(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# result_under = result_under.append(logistic_regression(X_train, X_test, y_train, y_test, 
#                                                              'Tfidf 1-gram'), ignore_index = True)

In [None]:
# Empty results table; the model cells below add one row each.
result_under_tfidf2 = pd.DataFrame(columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

# Using Tfidf 2-gram
# Vectorize the whole dataset with ngrams = 2, undersample it, then hold out
# 20% for testing. This overwrites X, y and the train/test variables set by
# the count-vectorizer cell.
# NOTE(review): resampling runs before the train/test split, so X_test is drawn
# from the resampled data rather than from the original class distribution —
# confirm this is intended.
X, y = tfidf(dataset, ngrams = 2)
X, y = naive_under_sample.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_under_tfidf2 = pd.concat(
    [result_under_tfidf2, logistic_regression(X_train, X_test, y_train, y_test, 'Tfidf 2-gram')],
    ignore_index = True)
result_under_tfidf2

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_under_tfidf2 = pd.concat(
    [result_under_tfidf2, support_vector_machine(X_train, X_test, y_train, y_test, 'Tfidf 2-gram')],
    ignore_index = True)
result_under_tfidf2

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_under_tfidf2 = pd.concat(
    [result_under_tfidf2, multinomial_nb(X_train, X_test, y_train, y_test, 'Tfidf 2-gram')],
    ignore_index = True)
result_under_tfidf2

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_under_tfidf2 = pd.concat(
    [result_under_tfidf2, bernoulli_nb(X_train, X_test, y_train, y_test, 'Tfidf 2-gram')],
    ignore_index = True)
result_under_tfidf2

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_under_tfidf2 = pd.concat(
    [result_under_tfidf2, random_forest(X_train, X_test, y_train, y_test, 'Tfidf 2-gram')],
    ignore_index = True)
result_under_tfidf2

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_under_tfidf2 = pd.concat(
    [result_under_tfidf2, decision_tree(X_train, X_test, y_train, y_test, 'Tfidf 2-gram')],
    ignore_index = True)
result_under_tfidf2

### 6.2. NearMiss

In [None]:
# NearMiss undersampler with sampling_strategy='majority'; all other settings
# are left at the imbalanced-learn defaults.
near_under_sample = NearMiss(sampling_strategy='majority')

In [None]:
# Empty results table; the model cells below add one row each.
result_near_count = pd.DataFrame(columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

# Using Count
# Vectorize the whole dataset, undersample it with NearMiss, then hold out 20%
# for testing. This overwrites X, y and the train/test variables set earlier.
# NOTE(review): resampling runs before the train/test split, so X_test is drawn
# from the resampled data rather than from the original class distribution —
# confirm this is intended.
X, y = count(dataset)
X, y = near_under_sample.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_near_count = pd.concat(
    [result_near_count, logistic_regression(X_train, X_test, y_train, y_test, 'Count Vectorizer')],
    ignore_index = True)
result_near_count

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_near_count = pd.concat(
    [result_near_count, support_vector_machine(X_train, X_test, y_train, y_test, 'Count Vectorizer')],
    ignore_index = True)
result_near_count

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_near_count = pd.concat(
    [result_near_count, multinomial_nb(X_train, X_test, y_train, y_test, 'Count Vectorizer')],
    ignore_index = True)
result_near_count

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_near_count = pd.concat(
    [result_near_count, bernoulli_nb(X_train, X_test, y_train, y_test, 'Count Vectorizer')],
    ignore_index = True)
result_near_count

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_near_count = pd.concat(
    [result_near_count, random_forest(X_train, X_test, y_train, y_test, 'Count Vectorizer')],
    ignore_index = True)
result_near_count

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_near_count = pd.concat(
    [result_near_count, decision_tree(X_train, X_test, y_train, y_test, 'Count Vectorizer')],
    ignore_index = True)
result_near_count

In [None]:
# # Using Tfidf 1-gram
# X, y = tfidf(dataset)
# X, y = near_under_sample.fit_resample(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# result_under = result_under.append(logistic_regression(X_train, X_test, y_train, y_test, 
#                                                              'Tfidf 1-gram'), ignore_index = True)

In [None]:
# Empty results table; the model cells below add one row each.
result_near_tfidf2 = pd.DataFrame(columns = ['Preprocessing', 'Model', 'Accuracy', 'F1-score-0', 'F1-score-1'])

# Using Tfidf 2-gram
# Vectorize the whole dataset with ngrams = 2, undersample it with NearMiss,
# then hold out 20% for testing. This overwrites X, y and the train/test
# variables set earlier.
# NOTE(review): resampling runs before the train/test split, so X_test is drawn
# from the resampled data rather than from the original class distribution —
# confirm this is intended.
X, y = tfidf(dataset, ngrams = 2)
X, y = near_under_sample.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_near_tfidf2 = pd.concat(
    [result_near_tfidf2, logistic_regression(X_train, X_test, y_train, y_test, 'Tfidf 2-gram')],
    ignore_index = True)
result_near_tfidf2

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_near_tfidf2 = pd.concat(
    [result_near_tfidf2, support_vector_machine(X_train, X_test, y_train, y_test, 'Tfidf 2-gram')],
    ignore_index = True)
result_near_tfidf2

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_near_tfidf2 = pd.concat(
    [result_near_tfidf2, multinomial_nb(X_train, X_test, y_train, y_test, 'Tfidf 2-gram')],
    ignore_index = True)
result_near_tfidf2

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_near_tfidf2 = pd.concat(
    [result_near_tfidf2, bernoulli_nb(X_train, X_test, y_train, y_test, 'Tfidf 2-gram')],
    ignore_index = True)
result_near_tfidf2

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_near_tfidf2 = pd.concat(
    [result_near_tfidf2, random_forest(X_train, X_test, y_train, y_test, 'Tfidf 2-gram')],
    ignore_index = True)
result_near_tfidf2

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same model's row again.
result_near_tfidf2 = pd.concat(
    [result_near_tfidf2, decision_tree(X_train, X_test, y_train, y_test, 'Tfidf 2-gram')],
    ignore_index = True)
result_near_tfidf2