In [None]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Function to evaluate: accuracy, precision, recall, f1-score

def calculate_results(y_true, y_pred):
  """
  Summarise a classifier's predictions as accuracy, precision, recall and f1 score.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
  Note the mixed scales: accuracy is multiplied by 100 (a percentage), while
  precision, recall and f1 are passed through exactly as
  precision_recall_fscore_support returns them with average="weighted".
  """
  # Accuracy is reported as a percentage.
  accuracy_pct = accuracy_score(y_true, y_pred) * 100

  # The fourth element (support) is not needed for the summary.
  weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
      y_true, y_pred, average="weighted")

  return {
      "accuracy": accuracy_pct,
      "precision": weighted_precision,
      "recall": weighted_recall,
      "f1": weighted_f1,
  }


# Create a helper function to compare our baseline results to new model results
def compare_baseline_to_new_results(baseline_results, new_model_results):
  """
  Print a side-by-side comparison of baseline metrics and a new model's metrics.

  Args:
  -----
  baseline_results = dict mapping metric name -> numeric value for the baseline
  new_model_results = dict with (at least) the same metric names for the new model

  One line is printed per metric in baseline_results, showing the baseline
  value, the new value and (new - baseline), each with two decimals.
  Returns None.
  """
  for metric_name in baseline_results:
    baseline_value = baseline_results[metric_name]
    new_value = new_model_results[metric_name]
    delta = new_value - baseline_value
    print(f"Baseline {metric_name}: {baseline_value:.2f}, New {metric_name}: {new_value:.2f}, Difference: {delta:.2f}")

In [None]:
# Load the labelled corpus.
dataset = pd.read_csv("../input/banfake-fractioned/labelled_combined.csv")

# Reproducible 80/20 split: sample 80% of the rows (fixed seed) for training and
# use every row that was not sampled for testing. Each split is returned to
# index order without mutating a frame in place.
train_df = dataset.sample(frac=0.8, random_state=42).sort_index(axis=0)
test_df = dataset.drop(train_df.index).sort_index(axis=0)

print(f"No. of training examples: {train_df.shape[0]}")
print(f"No. of testing examples: {test_df.shape[0]}")

In [None]:
# Target vectors: the "label" column of each split.
y_train, y_test = train_df["label"], test_df["label"]

In [None]:
# count_vectorizer = feature_extraction.text.CountVectorizer(ngram_range = (1, 3))
# train_count = count_vectorizer.fit_transform(train_df["content"])
# test_count = count_vectorizer.transform(test_df["content"])

In [None]:
# tfid_transformer= feature_extraction.text.TfidfTransformer()
# train_tfidf = tfid_transformer.fit_transform(train_count)
# test_tfidf = tfid_transformer.transform(test_count)  # transform only: IDF weights must come from the training split

In [None]:
# Word-level TF-IDF features. The vectorizer is fitted on the training text
# only; the test text is then transformed with that same fitted vectorizer so
# both matrices share one feature space.
tfidf_vectorizer = TfidfVectorizer(analyzer="word")
train_tfidf = tfidf_vectorizer.fit_transform(train_df.loc[:, "content"])
test_tfidf = tfidf_vectorizer.transform(test_df.loc[:, "content"])

In [None]:
def evaluate_preds(y_true, y_preds):
    """
    Print and return accuracy, precision, recall and F1 for a set of predictions.

    Precision, recall and F1 are computed with scikit-learn's default arguments
    (no `average` or `pos_label` is passed here).
    NOTE(review): those defaults target binary labels with positive class 1 --
    confirm that matches the "label" column before relying on this helper.

    Args:
        y_true: true labels.
        y_preds: predicted labels.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1". Values are
        fractions (not percentages) rounded to 2 decimal places; the printed
        lines show the same metrics as percentages.
    """
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds)

    metric_dict = {"accuracy": round(accuracy, 2),
                   "precision": round(precision, 2),
                   "recall": round(recall, 2),
                   "f1": round(f1, 2)}

    # Fixed the misspelled "Accucary" label in the printed report.
    print(f"Accuracy: {accuracy * 100:.02f}%")
    print(f"Precision: {precision * 100:.02f}%")
    print(f"Recall: {recall * 100:.02f}%")
    print(f"F1: {f1 * 100:.02f}%")

    return metric_dict

In [None]:
# Exhaustive hyper-parameter search for the SVM: every combination of
# C x gamma x kernel below is cross-validated (cv is left at the library default).
param_grid = {
    "C": [0.1, 1, 10, 100],
    "gamma": [1, 0.1, 0.01, 0.001],
    "kernel": ["rbf", "poly", "sigmoid"],
}

# refit=True is passed so the winning configuration is retrained after the
# search; verbose=2 logs each individual fit.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=2)
grid.fit(train_tfidf, y_train)

In [None]:
# Display the hyper-parameter combination selected by the grid search.
grid.best_params_

In [None]:
# Display the mean cross-validated score achieved by the selected combination.
grid.best_score_

In [None]:
# Fit the final SVM on the TF-IDF training features and score it on the held-out split.
# NOTE(review): kernel / gamma / C are hard-coded -- presumably copied from
# grid.best_params_ above; confirm they still match if the data or grid changes.
tfidf_SVM = SVC(C=10, gamma=1, kernel="rbf").fit(train_tfidf, y_train)
y_predicted_tfidf = tfidf_SVM.predict(test_tfidf)

tfidf_SVM_results = calculate_results(y_true=y_test, y_pred=y_predicted_tfidf)

print("Results for Support Vector Machine with TfidfVectorizer")
tfidf_SVM_results