In [None]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#import tensorflow as tf

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Function to evaluate: accuracy, precision, recall, f1-score

def calculate_results(y_true, y_pred):
    """
    Score a set of predictions against the ground-truth labels.

    Args:
    -----
    y_true = true labels in the form of a 1D array
    y_pred = predicted labels in the form of a 1D array

    Returns a dictionary with the keys "accuracy", "precision", "recall", "f1".
    Note the different scales: "accuracy" is multiplied by 100 (a percentage),
    while the other three are returned exactly as produced by
    precision_recall_fscore_support with average="weighted".
    """
    # Accuracy is reported as a percentage
    accuracy_pct = 100 * accuracy_score(y_true, y_pred)

    # Weighted-average precision / recall / f1; the trailing support entry is not needed
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    precision, recall, f1 = weighted_scores[:3]

    return {
        "accuracy": accuracy_pct,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


# Create a helper function to compare our baseline results to new model results
def compare_baseline_to_new_results(baseline_results, new_model_results):
    """
    Print every baseline metric next to the new model's value and their difference.

    Args:
    -----
    baseline_results = dictionary of metric name -> numeric score for the baseline
    new_model_results = dictionary holding a numeric score for every key of baseline_results

    Prints one line per baseline metric (values shown with 2 decimals) and returns None.
    A key missing from new_model_results raises KeyError.
    """
    for metric in baseline_results:
        old_score = baseline_results[metric]
        new_score = new_model_results[metric]
        # Positive difference means the new model scored higher on this metric
        delta = new_score - old_score
        print(f"Baseline {metric}: {old_score:.2f}, New {metric}: {new_score:.2f}, Difference: {delta:.2f}")

In [None]:
# Load the authentic and the fake article sets and stack them into one frame.
df_true = pd.read_csv("../input/banfakepreprocessed/LabeledAuthentic-7K.csv")
df_fake = pd.read_csv("../input/banfakepreprocessed/Fake-1K.csv")

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it both
# CSVs contribute their own 0-based row labels, so the labels are duplicated and
# any label-based lookup (.loc) on the result matches two rows.
dataset = pd.concat([df_true, df_fake], axis=0, ignore_index=True)

X = dataset["content"]
y = dataset["label"]

# Stratify on the label so the train and test splits keep the same class ratio.
# NOTE(review): the file names suggest roughly 7:1 authentic:fake - confirm; with
# that imbalance an unstratified split can noticeably skew the minority share.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Bag-of-words counts: the vocabulary is fitted on the training split only,
# and the test split is encoded with that same vocabulary.
count = CountVectorizer()
train_count = count.fit_transform(X_train)
test_count = count.transform(X_test)

In [None]:
# TF-IDF features over word unigrams and bigrams. The vectorizer is fitted on
# the training split only; the test split is transformed with the fitted state.
tfidf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2))
train_tfidf = tfidf.fit_transform(X_train)
test_tfidf = tfidf.transform(X_test)

In [None]:
def evaluate_preds(y_true, y_preds):
    """
    Print and return accuracy, precision, recall and F1 for a set of predictions.

    precision_score, recall_score and f1_score are called with only the two
    label arrays, i.e. with whatever defaults scikit-learn applies for
    averaging and the positive label.

    Args:
    -----
    y_true = true labels
    y_preds = predicted labels

    Prints each metric as a percentage with 2 decimals.
    Returns a dictionary with the keys "accuracy", "precision", "recall", "f1",
    each holding the raw (0-1) score rounded to 2 decimals.
    """
    # (dictionary key, printed label, raw score) - kept in the reporting order
    scores = [
        ("accuracy", "Accuracy", accuracy_score(y_true, y_preds)),
        ("precision", "Precision", precision_score(y_true, y_preds)),
        ("recall", "Recall", recall_score(y_true, y_preds)),
        ("f1", "F1", f1_score(y_true, y_preds)),
    ]

    for _, label, score in scores:
        print(f"{label}: {score * 100:.2f}%")

    return {key: round(score, 2) for key, _, score in scores}

In [None]:
# Hyper-parameter search space for the random forest.
# 'auto' is intentionally not listed under max_features: for classifiers it was
# an alias of 'sqrt' (a duplicate candidate) and newer scikit-learn releases
# no longer accept it.
param_grid = {'n_estimators': [int(x) for x in np.linspace(200, 2000, 200)],
              'max_features': ['sqrt', 'log2'],
              'max_depth': [int(x) for x in np.linspace(10, 1000, 50)],
              'min_samples_split': [2, 5, 10, 14],
              'min_samples_leaf': [1, 2, 4, 6, 8]}

# The space above holds 200 * 2 * 50 * 4 * 5 = 400,000 combinations, each fitted
# once per CV fold. An exhaustive grid search over it cannot finish in practice,
# so a fixed number of candidates is sampled instead. Raise N_SEARCH_ITER for a
# wider (slower) search.
N_SEARCH_ITER = 20

# Kept under the name `grid` so any later cell can keep using grid.best_* as-is.
# Both random_state values are fixed so a re-run samples the same candidates and
# grows the same forests.
grid = RandomizedSearchCV(RandomForestClassifier(random_state=42),
                          param_distributions=param_grid,
                          n_iter=N_SEARCH_ITER,
                          cv=5,
                          verbose=2,
                          random_state=42,
                          n_jobs=-1)
grid.fit(train_tfidf, y_train)

# f-strings apply the .3f formatting; print("... %.3f", value) would emit the
# placeholder literally followed by the unformatted value.
print(f"Train Accuracy: {grid.best_estimator_.score(train_tfidf, y_train):.3f}")
print(f"Test Accuracy: {grid.best_estimator_.score(test_tfidf, y_test):.3f}")
print(f"Best Score: {grid.best_score_:.3f}")
print(f"Best Parameters: {grid.best_params_}")