In [None]:
# core packages
import pandas as pd
import numpy as np
import regex as re
import itertools
import pickle

# visual packages
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

# scikit-learn packages
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# NLTK modules
import nltk
from nltk import FreqDist, WordNetLemmatizer, pos_tag
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords, wordnet

# Silence FutureWarning and DeprecationWarning, and switch off pandas'
# chained-assignment (SettingWithCopy) check.
import warnings
for _ignored_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)
pd.set_option('mode.chained_assignment', None)

# Lift the column-count and width limits on displayed frames and allow
# up to 400 characters per cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 400)

In [None]:
# Load the real-news file and label every row with target 0.
realnews = pd.read_csv('data/True.csv').assign(target=0)
realnews

In [None]:
# Load the fake-news file and label every row with target 1.
fakenews = pd.read_csv('data/Fake.csv').assign(target=1)
fakenews

In [None]:
# Stack real and fake articles under a fresh 0..n-1 index, then discard the
# 'subject' and 'date' columns.
news = (
    pd.concat([realnews, fakenews], ignore_index=True)
    .drop(columns=['subject', 'date'])
)
news

In [None]:
# Share of each class in the combined frame (target 0 = real, 1 = fake).
news['target'].value_counts(normalize = True)

In [None]:
# clean data and eda
# Record each article's length in characters.
news['article_length'] = news['text'].str.len()
news.head(2)

In [None]:
# Histogram of article length split by class, cropped to 0-6000 characters.
fig, ax = plt.subplots()
sns.histplot(data=news, x='article_length', hue='target', ax=ax)

ax.set(title='Partial Histogram of Article Lengths',
       xlabel='Article Length (Characters)')
# NOTE(review): these labels are attached by artist order, not by hue value —
# confirm 'Fake' / 'Real' line up with target 1 / 0 in the rendered legend.
ax.legend(labels=['Fake', 'Real'])
ax.set_xlim(0, 6000);

In [None]:
news[news['article_length'] <= 1]

In [None]:
news[news['article_length'] <= 50]

In [None]:
shortarticles = news[news['article_length'] <= 50]
news.drop(index = shortarticles.index, inplace = True)
news

In [None]:
news.loc[news['text'].duplicated() == True]

In [None]:
# Keep only the first occurrence of each article body. The check in the
# previous cell looks for repeated 'text', so the drop uses the same key:
# comparing whole rows would keep copies of one body that differ in another
# column (such as the title), and those copies could then end up on both
# sides of the train/test split.
news.drop_duplicates(subset = ['text'], keep = 'first', inplace = True)
news

In [None]:
# Class balance after removing short and duplicated articles.
news['target'].value_counts(normalize = True)

In [None]:
# Ten randomly sampled rows; no random_state is passed, so the sample is not seeded here.
news.sample(n=10)

In [None]:
X = news.drop('target', axis = 1)
y = news['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [None]:
X_train["text"] = X_train["text"].str.lower()
X_train.head(2)

In [None]:
def replace_links(string):
    '''
    Strip links, Twitter handles and hashtags out of a string.

    Text beginning with http, https, pic.twitter or www. (matched
    case-insensitively), with @, or with # is removed up to the next
    whitespace character.

    Returns the string with every such match replaced by an empty string.
    '''
    link_handle_hashtag = re.compile(
        r'(?:http|https|pic\.twitter|www\.)\S+|@\S+|^@\S+|#\S+',
        flags=re.IGNORECASE,
    )
    return link_handle_hashtag.sub('', string)

In [None]:

# Strip links, handles and hashtags from every training article.
X_train['text'] = X_train['text'].apply(replace_links)
X_train.sample(n=10, random_state=270)

In [None]:
# Number of articles in each raw file whose text contains 'Reuters'
# (case-sensitive). Despite the `_mask` suffix these names hold counts.
real_reuters_mask = realnews['text'].str.contains('Reuters').sum()
fake_reuters_mask = fakenews['text'].str.contains('Reuters').sum()

for label, hits, total in (('real', real_reuters_mask, realnews.shape[0]),
                           ('fake', fake_reuters_mask, fakenews.shape[0])):
    print(f'The word Reuters appears in {hits} out of {total} total {label} news articles.')

In [None]:
# Delete the literal substring 'reuters' from the (already lowercased) training text.
X_train['text'] = X_train['text'].str.replace('reuters', '', regex=False)
X_train.sample(n=10, random_state=270)

In [None]:
# A token is a run of three or more ASCII letters between word boundaries.
basic_token_pattern = r"\b[a-zA-Z]{3,}\b"
tokenizer = RegexpTokenizer(basic_token_pattern)

# Tokenize every training article into a list of tokens.
X_train['tokenized_text'] = X_train['text'].map(tokenizer.tokenize)

X_train.head()

In [None]:
def visualize_top_10(freq_dist, title):
    """
    Draw a bar chart of the ten most common entries of a frequency
    distribution.

    freq_dist must provide most_common(n) returning (token, count) pairs;
    title is used as the chart title. Nothing is returned.
    """
    # Unzip [(token, count), ...] into one sequence of tokens and one of counts.
    unzipped = list(zip(*freq_dist.most_common(10)))
    tokens, counts = unzipped[0], unzipped[1]

    fig, ax = plt.subplots()
    ax.bar(tokens, counts)

    ax.set(title=title, ylabel="Count")
    # Counts are whole numbers, so keep the y ticks on integers.
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    ax.tick_params(axis="x", rotation=90)

In [None]:
train_freq_dist = FreqDist(X_train["tokenized_text"].explode())

# Plot 
visualize_top_10(train_freq_dist, "Top 10 Word Frequency for Training Set")

In [None]:
flattened_Xtrain = pd.Series(list(itertools.chain(*X_train['tokenized_text'])))
flattened_Xtrain

In [None]:
print(len(pd.Series(flattened_Xtrain.unique())))

In [None]:
# Convert token lists to strings
X_baseline_tokens = X_train["tokenized_text"].str.join(" ")

In [None]:
# instantiate, fit, and transform vectorizer
tfidf_baseline = TfidfVectorizer()
X_baseline_vec = tfidf_baseline.fit_transform(X_baseline_tokens)

In [None]:
baseline_model = MultinomialNB()
baseline_cv = cross_val_score(baseline_model, X_baseline_vec, y_train)
print(f'CV scores: {baseline_cv.round(4)}')
print(f'Mean CV score: {baseline_cv.mean().round(4)}')

In [None]:
# NLTK's English stop words plus two corpus-specific additions. `extend` adds
# each word as its own entry; `append` would insert the whole list as a single
# element, which no string token can ever equal.
stopwords_list = stopwords.words('english')
stopwords_list.extend(['also', 'reuters'])

def remove_stopwords(token_list):
    """
    Given a list of tokens, return a new list with the tokens that are
    present in stopwords_list removed. The order of the remaining tokens
    is preserved.
    """
    # Snapshot the list as a set so each membership test is a hash lookup;
    # it is rebuilt per call so later edits to stopwords_list are honoured.
    stopword_lookup = set(stopwords_list)
    return [token for token in token_list if token not in stopword_lookup]

In [None]:
# Drop stop words from every tokenized training article.
X_train['stopwords_removed'] = X_train["tokenized_text"].apply(remove_stopwords)

In [None]:
X_train.head(2)

In [None]:
# Token frequencies once stop words have been removed.
stopwords_freq_dist = FreqDist(X_train["stopwords_removed"].explode())

visualize_top_10(stopwords_freq_dist, "Top 10 Words - No Stop Words")

In [None]:
# Flatten the stop-word-free token lists and compare corpus size with the
# unfiltered tokens.
flattened_nostopwords = pd.Series(list(itertools.chain.from_iterable(X_train['stopwords_removed'])))
print(f'After removing stop words, there are {len(flattened_nostopwords)} tokens in the corpus,'
      f'      {len(flattened_nostopwords.unique())} of which are unique.')
print(f'Before removing stop words, there were {len(flattened_Xtrain)} tokens in the corpus,'
      f'      {len(flattened_Xtrain.unique())} of which were unique.')

In [None]:
X_nostopwords_tokens = X_train["stopwords_removed"].str.join(" ")

In [None]:
tfidf_nostopwords = TfidfVectorizer()
X_nostopwords_vec = tfidf_nostopwords.fit_transform(X_nostopwords_tokens)

In [None]:
nostopwords_model = MultinomialNB()
nostopwords_cv = cross_val_score(nostopwords_model, X_nostopwords_vec, y_train)
print(f'CV scores: {nostopwords_cv.round(4)}')
print(f'Mean CV score: {nostopwords_cv.mean().round(4)}')

In [None]:
# Shared helpers for preprocess_text, built once when this cell runs instead
# of once per article.
_LINK_HANDLE_HASHTAG = re.compile(
    r'(?:http|https|pic\.twitter|www\.)\S+|@\S+|^@\S+|#\S+', flags=re.IGNORECASE)
_WORD_TOKENIZER = RegexpTokenizer(r"\b[a-zA-Z]{3,}\b")
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def _wordnet_pos(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant, or None."""
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    if nltk_tag.startswith('V'):
        return wordnet.VERB
    if nltk_tag.startswith('N'):
        return wordnet.NOUN
    if nltk_tag.startswith('R'):
        return wordnet.ADV
    return None


def preprocess_text(text):
    """
    Turn one raw article into a list of lemmatized tokens.

    Steps, in order: lowercase the text; strip URLs, Twitter handles and
    hashtags; delete the substring 'reuters'; tokenize into words of three
    or more ASCII letters; drop NLTK English stop words; POS-tag the
    remaining tokens and lemmatize each one with WordNet.

    Tokens whose POS tag is not an adjective, verb, noun or adverb are
    discarded rather than passed through unlemmatized.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    # make all characters lowercase
    text = text.lower()

    # remove URLs, twitter names, etc. The result has to be assigned back to
    # `text`: binding it to another name leaves the links in the string that
    # is tokenized below.
    text = _LINK_HANDLE_HASHTAG.sub('', text)

    # remove Reuters
    text = text.replace('reuters', '')

    # tokenize and drop stop words (tokens are already lowercase here)
    tokens = [token for token in _WORD_TOKENIZER.tokenize(text)
              if token not in _ENGLISH_STOPWORDS]

    # lemmatize each token using its part of speech; skip untranslatable tags
    lemmas = []
    for token, nltk_tag in pos_tag(tokens):
        pos = _wordnet_pos(nltk_tag)
        if pos is not None:
            lemmas.append(_LEMMATIZER.lemmatize(token, pos))

    return lemmas

In [None]:
# Run the full preprocessing function over the training text. By this point
# X_train['text'] has already been lowercased and had links and 'reuters'
# removed by the earlier cells.
X_train['lemmatized'] = X_train['text'].apply(preprocess_text)

In [None]:
X_train.head()

In [None]:
# Token frequencies after lemmatization.
lemmatized_dist = FreqDist(X_train["lemmatized"].explode())
visualize_top_10(lemmatized_dist, "Top 10 Words - Lemmatized")

In [None]:
# Flatten the lemmatized token lists and compare corpus size across the
# three preprocessing stages.
flattened_lemmatization = pd.Series(list(itertools.chain.from_iterable(X_train['lemmatized'])))
print(f'After removing stop words and lemmatizing, there are {len(flattened_lemmatization)} tokens in the corpus,'
      f'      {len(flattened_lemmatization.unique())} of which are unique.')
print(f'After removing stop words but before lemmatizing, there were {len(flattened_nostopwords)} tokens in the corpus,'
      f'      {len(flattened_nostopwords.unique())} of which are unique.')
print(f'Before removing stop words, there were {len(flattened_Xtrain)} tokens in the corpus,'
      f'      {len(flattened_Xtrain.unique())} of which were unique.')

In [None]:
X_lemm_tokens = X_train["lemmatized"].str.join(" ")

In [None]:
tfidf_lemm = TfidfVectorizer()
X_lemm_vec = tfidf_lemm.fit_transform(X_lemm_tokens)

In [None]:
lemm_model = MultinomialNB()
lemm_cv = cross_val_score(lemm_model, X_lemm_vec, y_train)

print(f'CV scores: {lemm_cv.round(4)}')
print(f'Mean CV score: {lemm_cv.mean().round(4)}')

In [None]:
# Running table of mean cross-validation score per model, one row per model.
evaluation = pd.DataFrame({'Model': [],
                           'Mean CV Accuracy':[]})

evaluation.loc[0] = ['Naive Bayes', lemm_cv.mean().round(4)]
evaluation

In [None]:
# Logistic regression with default settings on the lemmatized TF-IDF features.
lr_model = LogisticRegression()
lr_model.fit(X_lemm_vec, y_train)
# The cross-validation mean, not the fit above, is what gets recorded.
lr_cv = cross_val_score(lr_model, X_lemm_vec, y_train)

evaluation.loc[1] = ['Logistic Reg.', lr_cv.mean().round(4)]
evaluation

In [None]:
# Linear SVC with default settings. The svc_model fitted here is reused later
# for test scoring, pickling, the confusion matrix and the coefficient table.
svc_model = LinearSVC()
svc_model.fit(X_lemm_vec, y_train)
svc_cv = cross_val_score(svc_model, X_lemm_vec, y_train)

evaluation.loc[2] = ['SVC', svc_cv.mean().round(4)]
evaluation

In [None]:
# Random forest on the lemmatized TF-IDF features. The forest is seeded (same
# seed as the train/test split) so the fitted model and the CV score recorded
# below are the same on every run.
rf_model = RandomForestClassifier(random_state = 42)
rf_model.fit(X_lemm_vec, y_train)
rf_cv = cross_val_score(rf_model, X_lemm_vec, y_train)

evaluation.loc[3] = ['Random Forest', rf_cv.mean().round(4)]
evaluation

In [None]:
# Linear SVC with explicit class weights, wrapped in a one-step pipeline so
# the grid can address its parameters through the 'model__' prefix.
svc_model_pipe = LinearSVC(class_weight={0: 0.55, 1: 0.45})
svc_pipe = Pipeline(steps=[('model', svc_model_pipe)])

# Candidate values searched for C, tol and max_iter.
svc_params = dict(
    model__C=[0.01, 0.1, 1],
    model__tol=[1e-6, 1e-4, 1e-2, 1],
    model__max_iter=[1000, 2000, 3000],
)

# 5-fold grid search scored on accuracy; training scores are kept as well.
svc_gs = GridSearchCV(svc_pipe, svc_params, cv=5,
                      scoring='accuracy', return_train_score=True)

In [None]:
# Run the grid search on the lemmatized TF-IDF training features.
svc_gs.fit(X_lemm_vec, y_train)

In [None]:
# Per-candidate results of the search.
pd.DataFrame(svc_gs.cv_results_)

In [None]:
svc_gs.best_estimator_

In [None]:
# Cross-validate the best pipeline found by the search and record its mean score.
best_svc_model = svc_gs.best_estimator_

best_svc_model.fit(X_lemm_vec, y_train)
svc_cv_gs = cross_val_score(best_svc_model, X_lemm_vec, y_train)

evaluation.loc[4] = ['Tuned SVC', svc_cv_gs.mean().round(4)]
evaluation

In [None]:
# Preprocess the held-out articles with the same function used for training.
X_test['lemmatized'] = X_test['text'].apply(preprocess_text)

In [None]:
# Vectorize with the TF-IDF vectorizer fitted on the training set (transform only, no refit).
X_test_lemm_tokens = X_test["lemmatized"].str.join(" ")
X_test_lemm_vec = tfidf_lemm.transform(X_test_lemm_tokens)
# NOTE(review): this scores svc_model (the default LinearSVC), not
# best_svc_model from the grid search — confirm that is intended.
svc_model.score(X_test_lemm_vec, y_test)

In [None]:
# NOTE(review): the file is named best_svc_model.pkl but the object written
# is svc_model — confirm which model is meant to be persisted.
with open('best_svc_model.pkl','wb') as f:
    pickle.dump(svc_model,f)

In [None]:
# Persist the fitted vectorizer so it can be reloaded together with the model.
with open('tfidf_lemm.pkl','wb') as e:
    pickle.dump(tfidf_lemm,e)

In [None]:
# Confusion matrix on the test set; the display labels follow target 0 = real, 1 = fake.
fig, ax = plt.subplots()

ConfusionMatrixDisplay.from_estimator(svc_model, X_test_lemm_vec, y_test, 
                                      display_labels=["Real", "Fake"],
                                      cmap = 'Blues', ax = ax)

ax.set_title('News Classification Predictions - Linear SVC');

In [None]:
# Per-class precision / recall / F1 report on the test set.
print(classification_report(y_test, svc_model.predict(X_test_lemm_vec)))

In [None]:
# Pair every vocabulary term with its SVC coefficient. coef_ is 2-D, so it is
# flattened to give the 'coef' column plain floats; zipping against the
# transposed array would store a one-element array in every cell.
svc_features = pd.DataFrame({'words': tfidf_lemm.get_feature_names_out(),
                             'coef': svc_model.coef_.ravel()})

# The 10 largest coefficients.
svc_features.sort_values('coef').tail(10)

In [None]:
# The 10 smallest coefficients.
svc_features.sort_values('coef').head(10)

In [None]:
# Preprocess every article in the cleaned corpus (the frame the train and test splits came from).
news['lemmatized'] = news['text'].apply(preprocess_text)
news

In [None]:
# Vectorize with the training-fitted TF-IDF vectorizer and predict with svc_model.
news_lemm_tokens = news["lemmatized"].str.join(" ")
news_lemm_vec = tfidf_lemm.transform(news_lemm_tokens)
news['predictions'] = svc_model.predict(news_lemm_vec)
news.head()

In [None]:
# Dense articles x vocabulary table of TF-IDF weights.
# NOTE(review): .toarray() materialises every zero of the sparse matrix; with
# a large corpus and vocabulary this may not fit in memory — confirm.
df = pd.DataFrame(news_lemm_vec.toarray(), columns=tfidf_lemm.get_feature_names_out(), index = news.index)
df.head()

In [None]:

# For each article, pick the vocabulary term with the highest TF-IDF weight.
# argmax works directly on the sparse matrix, so no dense copy and no
# per-row sort is needed. A row with no non-zero weight yields column 0.
feature_names = tfidf_lemm.get_feature_names_out()
top_feature_idx = np.asarray(news_lemm_vec.argmax(axis=1)).ravel()

series = feature_names[top_feature_idx].tolist()
news['top_word'] = series

In [None]:
news.head()

In [None]:
# How often each term is an article's top-weighted word.
news['top_word'].value_counts()

In [None]:
# divide df into true and false positives and negatives
realcorrect = news.loc[(news['target'] == 0) & (news['predictions'] == 0)]
fakecorrect = news.loc[(news['target'] == 1) & (news['predictions'] == 1)]
realwrong = news.loc[(news['target'] == 1) & (news['predictions'] == 0)]
fakewrong = news.loc[(news['target'] == 0) & (news['predictions'] == 1)]

In [None]:
# Real articles predicted real.
realcorrect.head(2)

In [None]:
# Fake articles predicted fake.
fakecorrect.head(2)

In [None]:
# Fake articles predicted real.
realwrong.head()

In [None]:
# Real articles predicted fake.
fakewrong.head()

In [None]:
# Most frequent top-weighted words within each outcome group.
realcorrect['top_word'].value_counts().head()

In [None]:
fakecorrect['top_word'].value_counts().head()

In [None]:
realwrong['top_word'].value_counts().head()

In [None]:
fakewrong['top_word'].value_counts().head()

In [None]:
from wordcloud import WordCloud

# Both "say" and "said" are excluded so the setting matches the other word clouds.
wordcloud1 = WordCloud(width=400, height=400, background_color='black',
                      stopwords=["say", "said"], max_words=35)

# Build the cloud from every lemma of every real article predicted real.
# str(Series) would only supply the column's truncated display text (index
# labels, "...", the Name/Length/dtype footer), not the articles themselves.
wordcloud1.generate(' '.join(itertools.chain.from_iterable(realcorrect['lemmatized'])))

wordcloud1.to_image()

In [None]:
# The text is lemmatized, so "say" is excluded alongside "said".
wordcloud2 = WordCloud(width=400, height=400, background_color='white',
                      stopwords=["say", "said"], max_words=35)

# Build the cloud from every lemma of every fake article predicted fake.
# str(Series) would only supply the column's truncated display text (index
# labels, "...", the Name/Length/dtype footer), not the articles themselves.
wordcloud2.generate(' '.join(itertools.chain.from_iterable(fakecorrect['lemmatized'])))

wordcloud2.to_image()

In [None]:
# The text is lemmatized, so "say" is excluded alongside "said".
wordcloud3 = WordCloud(width=400, height=400, background_color='orange',
                      stopwords=["say", "said"], max_words=35)

# Build the cloud from every lemma of every fake article predicted real.
# str(Series) would only supply the column's truncated display text (index
# labels, "...", the Name/Length/dtype footer), not the articles themselves.
wordcloud3.generate(' '.join(itertools.chain.from_iterable(realwrong['lemmatized'])))

wordcloud3.to_image()

In [None]:
# The text is lemmatized, so "say" is excluded alongside "said".
wordcloud4 = WordCloud(width=400, height=400, background_color='skyblue',
                      stopwords=["say", "said"], max_words=35)

# Build the cloud from every lemma of every real article predicted fake.
# str(Series) would only supply the column's truncated display text (index
# labels, "...", the Name/Length/dtype footer), not the articles themselves.
wordcloud4.generate(' '.join(itertools.chain.from_iterable(fakewrong['lemmatized'])))

wordcloud4.to_image()