In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns
import string

#from eli5 import show_weights, TextExplainer
#from wordcloud import WordCloud, STOPWORDS

import nltk
nltk.download('stopwords')
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay


In [None]:
# Load each corpus and attach its class label in the same step:
# `fake` is 0 for rows read from True.csv and 1 for rows read from Fake.csv.
real = pd.read_csv('./data/True.csv').assign(fake=0)
fake = pd.read_csv('./data/Fake.csv').assign(fake=1)

In [None]:
# Stack both labelled corpora into a single frame. `ignore_index=True` rebuilds
# a unique 0..n-1 index; without it the row labels of `real` and `fake` collide
# (each frame is numbered from 0), so a label-based lookup such as
# `articles.loc[0]` would return two rows instead of one.
articles = pd.concat([real, fake], ignore_index=True)

In [None]:
def preprocess_stem(text):
    """Lower-case, tokenize, filter and stem a piece of text.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The Porter-stemmed tokens joined by single spaces, with English stop
        words and single-character punctuation tokens removed.

    Notes
    -----
    NOTE(review): `word_tokenize` needs NLTK tokenizer data (`punkt` /
    `punkt_tab`) in addition to the `stopwords` corpus -- confirm it is
    installed in the environment that runs this notebook.
    """
    #Tokenize text
    tokens = word_tokenize(text.lower())

    #Stop words and punctuation to drop; a single set gives constant-time
    #membership checks instead of scanning two lists for every token
    excluded = set(stopwords.words('english')) | set(string.punctuation)

    #Stem the tokens that survive the filter
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in excluded)

In [None]:
# Stem every headline first, then hold out 25% of the rows for evaluation.
stemmed_titles = articles['title'].apply(preprocess_stem)
fake_labels = articles['fake']

X_train, X_test, y_train, y_test = train_test_split(
    stemmed_titles, fake_labels, test_size=0.25, random_state=42
)

In [None]:
# TF-IDF features feeding a random forest classifier.
rf_pipe = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('model', RandomForestClassifier(max_depth=None,
                                    max_features=5,
                                    n_estimators=1000,
                                    # Seed the forest (same seed as the train/test split) so
                                    # the metrics and feature importances are reproducible
                                    # on every Restart & Run All.
                                    random_state=42,
                                    # The trees are independent, so build them on all cores.
                                    n_jobs=-1))
])

rf_pipe.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out titles.
y_pred = rf_pipe.predict(X_test)

rf_accuracy, rf_precision, rf_recall = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score)
)

print('Metrics of Random Forest model:')
for metric_name, metric_value in (('Accuracy', rf_accuracy),
                                  ('Precision', rf_precision),
                                  ('Recall', rf_recall)):
    print(f'{metric_name}: {metric_value}')

In [None]:
# Per-token importances from the fitted forest, plus the vectorizer's
# vocabulary that those importances are indexed against.
feature_importances = rf_pipe.named_steps['model'].feature_importances_
vocabulary = rf_pipe.named_steps['vectorizer'].get_feature_names_out()

# argpartition moves the 25 largest importances into the last 25 slots
# (not sorted among themselves), which is cheaper than a full sort.
top_ind = np.argpartition(feature_importances, -25)[-25:]

top_tokens, top_tokens_importances = vocabulary[top_ind], feature_importances[top_ind]

In [None]:
fig, ax = plt.subplots(figsize=(12,6))

# Bars are drawn from the most to the least important token.
ranked_tokens = [token for _importance, token in
                 sorted(zip(top_tokens_importances, top_tokens), reverse=True)]

# Draw on `ax` explicitly rather than relying on pyplot's current axes.
sns.barplot(x=top_tokens, y=top_tokens_importances, palette='cool',
            order=ranked_tokens, ax=ax)

# Rotate via tick_params so the existing tick labels are left untouched
# instead of being read back and re-set.
ax.tick_params(axis='x', labelrotation=90)

# Trailing semicolon suppresses the cell's repr output without rebinding `_`.
ax.set(xlabel='Token', ylabel='Importance',
       title='Top 25 tokens by Feature Importance in Random Forest Model');