## 1. Imports

In [None]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#import tensorflow as tf

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_class_weight

from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler

from joblib import dump, load

In [None]:
# Ignore scikit-learn's ConvergenceWarning from here on (presumably to keep
# the cell output readable while fitting models).
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

## 2. Functions

In [None]:
# Function to evaluate: accuracy, precision, recall, f1-score

def calculate_results(y_true, y_pred):
  """
  Scores a set of predictions against the true labels.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
  Accuracy is expressed as a percentage (multiplied by 100); precision, recall
  and f1 are the "weighted" averages over the classes and are left unscaled.
  """
  # Accuracy is the only metric that gets scaled to a percentage
  accuracy_pct = accuracy_score(y_true, y_pred) * 100
  # The call returns (precision, recall, f1, support); support is not needed
  weighted_scores = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  precision, recall, f1 = weighted_scores[:3]
  return {"accuracy": accuracy_pct,
          "precision": precision,
          "recall": recall,
          "f1": f1}


# Create a helper function to compare our baseline results to new model results
def compare_baseline_to_new_results(baseline_results, new_model_results):
  """
  Prints, for every metric in baseline_results, the baseline value, the new
  model's value and the difference (new - baseline), each with two decimals.

  Args:
  -----
  baseline_results = dictionary mapping metric name to a number
  new_model_results = dictionary holding (at least) the same metric names
  """
  for key in baseline_results:
    baseline_value = baseline_results[key]
    new_value = new_model_results[key]
    print(f"Baseline {key}: {baseline_value:.2f}, New {key}: {new_value:.2f}, Difference: {new_value-baseline_value:.2f}")

In [None]:
# Set the font scale: apply seaborn's theme with fonts scaled by 1.5.
# This is a global setting, so it affects every plot drawn afterwards.
sns.set(font_scale = 1.5)

def plot_conf_mat(conf_mat):
    """
    Draws a confusion matrix as an annotated Seaborn heatmap on a new
    4x4-inch figure, with predicted labels on the x-axis and true labels
    on the y-axis.

    conf_mat = 2D array of counts; cells are formatted with "d", so the
               values are expected to be integers
    """
    _, axes = plt.subplots(figsize=(4, 4))
    sns.heatmap(conf_mat,
                annot=True,  # write the count inside each cell
                fmt="d",
                cbar=False,
                ax=axes)
    axes.set_xlabel('Predicted label')
    axes.set_ylabel('True label')

## 3. Working with Data

In [None]:
# Load the two preprocessed CSV files: the authentic samples and the fake
# samples (as indicated by the file names).
df_true = pd.read_csv("../input/banfakepreprocessed/Authentic-48K.csv")
df_fake = pd.read_csv("../input/banfakepreprocessed/Fake-1K.csv")

In [None]:
# Stack both frames row-wise into a single dataset. ignore_index is not set,
# so each frame keeps its own row index and index labels can repeat.
dataset = pd.concat([df_true, df_fake], axis = 0)
dataset  # display the combined frame

In [None]:
# value_counts() orders its result by frequency (most common first), so this
# unpacking assumes exactly two label values, with the authentic class being
# the larger one -- NOTE(review): confirm against the actual label encoding.
count_true, count_fake = dataset["label"].value_counts()
count_true, count_fake

## 4. Preprocessing

### 4.1. Count Vectorization

In [None]:
def count(data):
    """
    Turns the "content" column of `data` into a bag-of-words count matrix.

    The vocabulary is learned from the module-level `dataset` (not from
    `data`), so every call produces the same feature columns no matter which
    subset is passed in. Missing content is replaced by a single space.

    NOTE(review): because the vectorizer is fitted on the whole dataset, rows
    that are later held out for testing also contribute to the vocabulary.

    Args:
    -----
    data = DataFrame with a "content" column (text) and a "label" column

    Returns (X, list_labels): the matrix returned by
    CountVectorizer.transform() with one row per row of `data`, and the
    labels of `data` as a plain list.
    """
    # fillna() already returns a new Series, so neither `data` nor the global
    # `dataset` has to be deep-copied to stay unmodified.
    texts = data["content"].fillna(' ')
    vocabulary_texts = dataset["content"].fillna(' ')

    count_vectorizer = CountVectorizer()
    count_vectorizer.fit(vocabulary_texts)

    list_corpus = texts.tolist()
    list_labels = data["label"].tolist()

    X = count_vectorizer.transform(list_corpus)

    return X, list_labels

### 4.2. TFIDF Vectorization

In [None]:
def tfidf(data, ngrams = 1):
    """
    Turns the "content" column of `data` into a TF-IDF feature matrix.

    The vectorizer is fitted on the module-level `dataset` (not on `data`),
    so every call with the same `ngrams` produces the same feature columns
    no matter which subset is passed in. Missing content is replaced by a
    single space.

    NOTE(review): because the vectorizer is fitted on the whole dataset, rows
    that are later held out for testing also contribute to the fitted
    vocabulary and weights.

    Args:
    -----
    data = DataFrame with a "content" column (text) and a "label" column
    ngrams = upper bound of the n-gram range; features cover 1..ngrams

    Returns (X, list_labels): the matrix returned by
    TfidfVectorizer.transform() with one row per row of `data`, and the
    labels of `data` as a plain list.
    """
    # fillna() already returns a new Series, so neither `data` nor the global
    # `dataset` has to be deep-copied to stay unmodified.
    texts = data["content"].fillna(' ')
    fit_texts = dataset["content"].fillna(' ')

    tfidf_vectorizer = TfidfVectorizer(ngram_range = (1, ngrams))
    tfidf_vectorizer.fit(fit_texts)

    list_corpus = texts.tolist()
    list_labels = data["label"].tolist()

    X = tfidf_vectorizer.transform(list_corpus)

    return X, list_labels

## 5. Modeling

### 5.1. Class-weight: Balanced

In [None]:
# Using Count
# Vectorize the whole dataset, then hold out 20% of the rows as a test set.
# The split is seeded (random_state = 42); no stratify argument is passed.
X, y = count(dataset)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# RBF-kernel SVM with class_weight = 'balanced', fitted on the current
# training split (the count features when the cells are run in order),
# then saved to disk with joblib.
clf = SVC(kernel = 'rbf', gamma = 1, C = 10, class_weight = 'balanced')
clf.fit(X_train, y_train)
dump(clf, filename = "weight-balanced-count-SVM.joblib")

In [None]:
# Same SVM configuration as the previous cell, but with probability = True so
# the fitted model can output class probabilities; saved under its own name.
clf = SVC(kernel = 'rbf', gamma = 1, C = 10, class_weight = 'balanced', probability = True)
clf.fit(X_train, y_train)
dump(clf, filename = "weight-balanced-count-SVM-proba.joblib")

In [None]:
# clf = LogisticRegressionCV(cv = folds, solver = 'saga', multi_class = 'multinomial', n_jobs = -1, class_weight = 'balanced')
# clf = SVC(kernel = 'rbf', gamma = 1, C = 10, class_weight = 'balanced')
# clf = RandomForestClassifier(n_estimators= 400, max_features = 'sqrt', class_weight = 'balanced')
# clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 6, class_weight = 'balanced')

In [None]:
# # Using Tfidf 1-gram
# X, y = tfidf(dataset)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# result_balanced_weight = result_balanced_weight.append(logistic_regression(X_train, X_test, y_train, y_test, 'Tfidf 1-gram', 'balanced'), ignore_index = True)
# result_balanced_weight = result_balanced_weight.append(support_vector_machine(X_train, X_test, y_train, y_test, 'Tfidf 1-gram', 'balanced'), ignore_index = True)

In [None]:
# Using Tfidf 2-gram
# Re-vectorize the whole dataset with unigram + bigram TF-IDF features and
# overwrite X / y and the train/test split (same 20% test size and seed).
X, y = tfidf(dataset, ngrams = 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# RBF-kernel SVM with class_weight = 'balanced', fitted on the current
# training split (the TF-IDF 2-gram features when the cells are run in
# order), then saved to disk with joblib.
clf = SVC(kernel = 'rbf', gamma = 1, C = 10, class_weight = 'balanced')
clf.fit(X_train, y_train)
dump(clf, filename = "weight-balanced-tfidf2-SVM.joblib")