# Fake News Detection
> Problem
> - create a model to detect fake news

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy, re, nltk, gensim

from spacy import displacy, tokenizer

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import gensim.corpora as corpora

from gensim.models import CoherenceModel, LsiModel, TfidfModel

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Global plotting defaults: one figure size and one accent colour shared by the charts.
plt.rc('figure', figsize=(12, 8))
default_plot_colour = "#00bfbf"

In [None]:
# Load the labelled articles; later cells read the 'text' and 'fake_or_factual' columns.
data = pd.read_csv("fake_news_data.csv")
data.head()

In [None]:
# Summary of the loaded frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Class balance check: number of articles carrying each label.
label_counts = data['fake_or_factual'].value_counts()
label_counts.plot(kind='bar', color=default_plot_colour)
plt.xlabel('Category')
plt.ylabel('Number of Articles')
plt.title('Fake vs Factual News')
plt.show()

In [None]:
# POS Tagging - Part of Speech Tagging
# Load spaCy's small English pipeline; the docs it produces are read below for
# per-token POS tags (pos_) and entity types (ent_type_).
nlp = spacy.load("en_core_web_sm")

In [None]:
# Partition the articles by label so each class can be analysed on its own.
is_fake = data['fake_or_factual'] == 'Fake News'
is_factual = data['fake_or_factual'] == 'Factual News'
fake_news = data[is_fake]
factual_news = data[is_factual]

In [None]:
# Run every article through the spaCy pipeline once per class; list() materialises
# the docs yielded by nlp.pipe so they can be iterated again later.
fake_spacy_doc = list(nlp.pipe(fake_news['text']))
factual_spacy_doc = list(nlp.pipe(factual_news['text']))

In [None]:
def extract_token_tags(doc: spacy.tokens.doc.Doc):
    """Return one ``(text, entity type, POS tag)`` tuple per token in *doc*."""
    rows = []
    for token in doc:
        rows.append((token.text, token.ent_type_, token.pos_))
    return rows

In [None]:
# Long-format table with one (token, NER tag, POS tag) row per token of every fake article.
# `columns` is also read by the cell that builds the factual-news table.
columns = ["token", "ner_tag", "pos_tag"]

# Build the per-document frames in one pass and concatenate once; the name is bound
# to the final DataFrame only, never to a temporary list.
fake_tags_df = pd.concat(
    [pd.DataFrame(extract_token_tags(doc), columns=columns) for doc in fake_spacy_doc]
)

In [None]:
# Long-format table with one (token, NER tag, POS tag) row per token of every factual
# article. `columns` is defined in the cell that builds the fake-news table.
factual_tags_df = pd.concat(
    [pd.DataFrame(extract_token_tags(doc), columns=columns) for doc in factual_spacy_doc]
)

In [None]:
# Preview the token/NER/POS rows extracted from the fake articles.
fake_tags_df.head()

In [None]:
# Preview the token/NER/POS rows extracted from the factual articles.
factual_tags_df.head()

In [None]:
# Occurrences of each (token, POS tag) pair in the fake articles, most frequent first.
pos_counts_fake = (
    fake_tags_df
    .groupby(['token', 'pos_tag'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
pos_counts_fake.head(10)

In [None]:
# Occurrences of each (token, POS tag) pair in the factual articles, most frequent first.
pos_counts_factual = (
    factual_tags_df
    .groupby(['token', 'pos_tag'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
pos_counts_factual.head(10)

In [None]:
# get frequency of individual pos tags (noun, verb, etc)
# pos_counts_fake holds one row per (token, pos_tag) pair, so counting its rows would
# give the number of distinct tokens per tag. Summing 'count' gives how often each
# tag actually occurs in the fake articles.
pos_counts_fake.groupby('pos_tag')['count'].sum().sort_values(ascending=False).head(10)


In [None]:
# get frequency of individual pos tags (noun, verb, etc)
# pos_counts_factual holds one row per (token, pos_tag) pair, so counting its rows would
# give the number of distinct tokens per tag. Summing 'count' gives how often each
# tag actually occurs in the factual articles.
pos_counts_factual.groupby('pos_tag')['count'].sum().sort_values(ascending=False).head(10)

In [None]:
# The 15 highest-count nouns in the fake articles, for comparison with factual news.
fake_nouns = pos_counts_fake.loc[pos_counts_fake['pos_tag'] == 'NOUN']
fake_nouns.head(15)

In [None]:
# The 15 highest-count nouns in the factual articles, for comparison with fake news.
factual_nouns = pos_counts_factual.loc[pos_counts_factual['pos_tag'] == 'NOUN']
factual_nouns.head(15)

# Named Entity Recognition

In [None]:
# Count each (token, entity label) pair in the fake articles; rows whose ner_tag is
# the empty string are dropped first.
fake_entity_rows = fake_tags_df[fake_tags_df['ner_tag'] != '']
top_entities_fake = (
    fake_entity_rows
    .groupby(['token', 'ner_tag'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
top_entities_fake.head(10)

In [None]:
# Count each (token, entity label) pair in the factual articles; rows whose ner_tag is
# the empty string are dropped first.
factual_entity_rows = factual_tags_df[factual_tags_df['ner_tag'] != '']
top_entities_factual = (
    factual_entity_rows
    .groupby(['token', 'ner_tag'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
top_entities_factual.head(10)

In [None]:
# Fixed colour per entity label, taken in order from seaborn's "Set2" palette, so the
# entity charts colour the same label the same way.
ner_labels = ["ORG", "GPE", "NORP", "PERSON", "DATE", "CARDINAL", "LOC", "PERCENT"]
set2_hex = sns.color_palette("Set2").as_hex()
ner_palette = {label: set2_hex[position] for position, label in enumerate(ner_labels)}

In [None]:
# Horizontal bars for the ten highest-count entities in fake news, coloured by entity label.
fake_entity_axes = sns.barplot(
    data=top_entities_fake.head(10),
    y="token",
    x="count",
    hue="ner_tag",
    palette=ner_palette,
    dodge=False,
    orient="h",
)
fake_entity_axes.set_title("Top Named Entities in Fake News")

In [None]:
# Horizontal bars for the ten highest-count entities in factual news, coloured by entity label.
factual_entity_axes = sns.barplot(
    data=top_entities_factual.head(10),
    y="token",
    x="count",
    hue="ner_tag",
    palette=ner_palette,
    dodge=False,
    orient="h",
)
factual_entity_axes.set_title("Top Named Entities in Factual News")

# Text Processing

In [None]:
# Preview the frame before the text-cleaning step adds 'text_clean'.
data.head()

In [None]:
# Text cleaning pipeline. Produces data["text_clean"]: a list of lemmatised,
# lower-cased, punctuation-free, stopword-free tokens per article.
patterns = {
    # Everything from the start of the text up to and including the first "- "
    # (only matches when the first hyphen in the text is followed by whitespace).
    "first_hyphen_in_text": r"^[^-]*-\s",
    # Any character that is neither a word character nor whitespace.
    "no_punctuation": r"[^\w\s]",
}

# Compile once instead of re-resolving the pattern for every article.
first_hyphen_re = re.compile(patterns["first_hyphen_in_text"])
punctuation_re = re.compile(patterns["no_punctuation"])

en_stopwords = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# NOTE(review): stopwords are matched after punctuation is stripped, so stopwords that
# contain an apostrophe can no longer match their stripped form in the text — confirm
# whether that is intended.

# Column-wise Series.apply replaces the row-wise DataFrame.apply(axis=1) calls: each
# step only needs the text itself, and building a row object per article is wasted work.
data["text_clean"] = (
    data["text"]
    .apply(lambda text: first_hyphen_re.sub("", text))
    .str.lower()
    .apply(lambda text: punctuation_re.sub("", text))
    .apply(lambda text: " ".join(word for word in text.split() if word not in en_stopwords))
    .apply(word_tokenize)
    .apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
)

In [None]:
# n-grams
# Flatten the per-article token lists into one token stream. A nested comprehension is
# linear; sum(lists, []) re-copies the growing accumulator on every addition.
# NOTE(review): because the stream is flattened, bigrams/trigrams can span the boundary
# between two consecutive articles.
tokens_clean = [token for tokens in data['text_clean'] for token in tokens]

# Keep only the ten most frequent n-grams; slicing the counts before reset_index()
# yields a fresh 10-row frame that later cells can add columns to.
unigram = pd.Series(nltk.ngrams(tokens_clean, 1)).value_counts().head(10).reset_index()
bigram = pd.Series(nltk.ngrams(tokens_clean, 2)).value_counts().head(10).reset_index()
trigram = pd.Series(nltk.ngrams(tokens_clean, 3)).value_counts().head(10).reset_index()

print(unigram, bigram, trigram)

In [None]:
# Unigram keys are 1-tuples; unwrap them into plain strings for the axis labels.
unigram['token'] = unigram['index'].apply(lambda x: x[0])

# A single bar colour is requested with `color`; passing a one-entry palette together
# with hue='token' makes seaborn warn that the palette is shorter than the hue levels.
sns.barplot(
    x='count',
    y='token',
    data=unigram,
    orient='h',
    color=default_plot_colour,
).set(title="Most Common Unigrams after Preprocessing")

In [None]:
# Bigram keys are 2-tuples of words; join both words so each bar is labelled with the
# whole bigram (taking only element 0 would label it with just its first word).
bigram['token'] = bigram['index'].apply(lambda x: ' '.join(x))

# A single bar colour is requested with `color`; passing a one-entry palette together
# with hue='token' makes seaborn warn that the palette is shorter than the hue levels.
sns.barplot(
    x='count',
    y='token',
    data=bigram,
    orient='h',
    color=default_plot_colour,
).set(title="Most Common bigrams after Preprocessing")

In [None]:
# Trigram keys are 3-tuples of words; join all three so each bar is labelled with the
# whole trigram (taking only element 0 would label it with just its first word).
trigram['token'] = trigram['index'].apply(lambda x: ' '.join(x))

# A single bar colour is requested with `color`; passing a one-entry palette together
# with hue='token' makes seaborn warn that the palette is shorter than the hue levels.
sns.barplot(
    x='count',
    y='token',
    data=trigram,
    orient='h',
    color=default_plot_colour,
).set(title="Most Common trigrams after Preprocessing")

# Sentiment Analysis

In [None]:
# One shared VADER analyser instance, reused for every article.
vader_sentiment = SentimentIntensityAnalyzer()

In [None]:
# Score the raw 'text' column (not the cleaned tokens) and keep VADER's 'compound' value.
def _compound_score(text):
    return vader_sentiment.polarity_scores(text)['compound']


data['vader_sentiment_score'] = data['text'].map(_compound_score)

In [None]:
# Preview the frame with the new 'vader_sentiment_score' column.
data.head()

In [None]:
# Labels for the sentiment buckets and the score cut points that separate them
# (four edges delimit the three labelled intervals).
names = ['negative', 'neutral', 'positive']
bins = [-1, -0.1, 0.1, 1]

In [None]:
# Bucket each compound score into negative / neutral / positive.
# pd.cut intervals are open on the left by default, so a score equal to the first bin
# edge (-1) would fall outside every bin and become NaN; include_lowest=True closes the
# first interval so that score is labelled 'negative'.
data['vader_sentiment_label'] = pd.cut(
    data['vader_sentiment_score'], bins, labels=names, include_lowest=True
)

In [None]:
# Preview the frame with the new 'vader_sentiment_label' column.
data.head()

In [None]:
# Overall count of articles per sentiment label.
sentiment_counts = data['vader_sentiment_label'].value_counts()
sentiment_counts.plot(kind='bar', color=default_plot_colour)
plt.title('Sentiment Distribution')

In [None]:
# Sentiment label counts split by article class.
sentiment_hues = sns.color_palette("hls")
sns.countplot(
    data=data,
    x='fake_or_factual',
    hue='vader_sentiment_label',
    palette=sentiment_hues,
).set(title="Sentiment Distribution by Fake vs Factual News")

# Topic Modeling

In [None]:
# Cleaned token lists of the fake articles only, re-indexed from 0.
fake_rows = data['fake_or_factual'] == 'Fake News'
fake_news_text = data.loc[fake_rows, 'text_clean'].reset_index(drop=True)

In [None]:
# gensim dictionary built from the fake-news token lists; used below for doc2bow and as id2word.
dictionary_fake = corpora.Dictionary(fake_news_text)

In [None]:
# Bag-of-words representation of each fake article, in the same order as fake_news_text.
doc_term_fake = list(map(dictionary_fake.doc2bow, fake_news_text))

In [None]:
# Accumulators and topic-count bounds for the LDA coherence sweep.
coherence_values, model_list = [], []
min_topics, max_topics = 2, 11

In [None]:
# Fit one LDA model per candidate topic count and record its c_v coherence.
# range() stops before max_topics, which matches the x-axis used when the scores are plotted.
# random_state pins LDA's random initialisation so reruns reproduce the same curve
# (the classifiers in this notebook are seeded with 0 as well).
for num_topics_i in range(min_topics, max_topics):
    model = gensim.models.LdaModel(
        doc_term_fake, num_topics=num_topics_i, id2word=dictionary_fake, random_state=0
    )
    model_list.append(model)
    coherence_model = CoherenceModel(model=model, texts=fake_news_text, dictionary=dictionary_fake, coherence='c_v')
    coherence_values.append(coherence_model.get_coherence())

In [None]:
# Coherence score against number of topics for the LDA sweep.
candidate_topic_counts = range(min_topics, max_topics)
plt.plot(candidate_topic_counts, coherence_values)
plt.title("Coherence Score by Number of Topics")
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.legend(["Coherence Values"], loc='best')
plt.show()

In [None]:
# choosing the num of topics based on the coherence score above
num_topics_lda = 4
# random_state makes the fitted topics reproducible across reruns; without it every
# run starts from a different random initialisation and can produce different topics
# for the same hard-coded topic count.
lda_model = gensim.models.LdaModel(
    doc_term_fake, num_topics=num_topics_lda, id2word=dictionary_fake, random_state=0
)

In [None]:
# Show every LDA topic with its ten highest-weighted words.
lda_model.print_topics(num_topics= num_topics_lda, num_words=10)

In [None]:
def tfidf_corpus(doc_term_matrix):
    """Fit a normalised TF-IDF model on *doc_term_matrix* and return that same
    matrix transformed by the fitted model."""
    weighting = TfidfModel(corpus=doc_term_matrix, normalize=True)
    return weighting[doc_term_matrix]


In [None]:
def get_coherent_scores(corpus, dictionary, text, min_topics, max_topics):
    """Plot the c_v coherence of LSI models across a range of topic counts.

    One ``LsiModel`` is fitted for every topic count from ``min_topics`` to
    ``max_topics`` inclusive, scored with a ``CoherenceModel`` and the scores
    are drawn as a line chart.

    Args:
        corpus: Corpus passed to ``LsiModel``.
        dictionary: Passed as ``id2word`` to the model and as ``dictionary``
            to the coherence model.
        text: Tokenised texts passed as ``texts`` to the coherence model.
        min_topics: Smallest number of topics to try.
        max_topics: Largest number of topics to try (inclusive).

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    topic_counts = range(min_topics, max_topics + 1)
    coherence_values = []

    # Each model is only needed long enough to be scored, so none are retained.
    for num_topics_i in topic_counts:
        model = gensim.models.LsiModel(corpus, num_topics=num_topics_i, id2word=dictionary)
        coherence_model = CoherenceModel(model=model, texts=text, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherence_model.get_coherence())

    plt.plot(topic_counts, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence Score")
    plt.title("Coherence Score by Number of Topics")
    plt.legend(["Coherence Values"], loc='best')
    plt.show()

In [None]:
# TF-IDF weighted version of the fake-news bag-of-words corpus, used for the LSA models.
corpus_tfidf_fake = tfidf_corpus(doc_term_fake)

In [None]:
# Plot LSI coherence for 2 through 11 topics (get_coherent_scores treats max_topics as inclusive).
get_coherent_scores(corpus_tfidf_fake, dictionary_fake, fake_news_text, min_topics=2, max_topics=11)

In [None]:
# Final LSA model; the topic count is fixed by hand from the coherence plot.
num_topics_lsa = 3
lsa_model = LsiModel(
    corpus=corpus_tfidf_fake,
    id2word=dictionary_fake,
    num_topics=num_topics_lsa,
)

# Show every LSA topic with its ten highest-weighted words.
lsa_model.print_topics(num_words=10, num_topics=num_topics_lsa)

# Creating the Classification Model

In [None]:
# Preview the frame that the classifier features and labels are taken from.
data.head()

In [None]:
# CountVectorizer is fed one string per article, so each token list is joined back
# into a single comma-separated string. Labels come straight from the class column.
Y = data['fake_or_factual']
X = [','.join(str(token) for token in tokens) for tokens in data['text_clean']]

In [None]:
# Learn the vocabulary from all articles and build the document-term count matrix in one step.
# NOTE(review): the vectoriser is fitted before the train/test split, so the vocabulary
# also covers the test articles — confirm this is acceptable.
count_vectorizer = CountVectorizer()
count_vectorizer_fit = count_vectorizer.fit_transform(X)

In [None]:
# Dense bag-of-words frame: one row per article, one column per vocabulary term.
vocabulary_terms = count_vectorizer.get_feature_names_out()
bag_of_words = pd.DataFrame(count_vectorizer_fit.toarray(), columns=vocabulary_terms)

In [None]:
# Hold out 30% of the articles for evaluation. random_state fixes the shuffle so the
# reported metrics are reproducible across reruns (the classifiers are seeded with 0 too).
X_train, X_test, Y_train, Y_test = train_test_split(
    bag_of_words, Y, test_size=0.3, random_state=0
)

In [None]:
# Fit a logistic-regression classifier on the training bag-of-words features.
logistic_regression_model = LogisticRegression(random_state=0).fit(X_train, Y_train)

In [None]:
# Predicted labels for the held-out articles.
Y_prediction_logistic_regression = logistic_regression_model.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred); accuracy is symmetric, but the true
# labels are passed first here to follow that order.
accuracy_score(Y_test, Y_prediction_logistic_regression)

In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped, precision
# and recall are exchanged and 'support' counts predictions instead of true labels.
print(classification_report(Y_test, Y_prediction_logistic_regression))

In [None]:
# Fit an SGDClassifier with its default settings on the same training features.
support_vector_machine_model = SGDClassifier(random_state=0).fit(X_train, Y_train)

In [None]:
# Predicted labels for the held-out articles.
Y_prediction_svm = support_vector_machine_model.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred); accuracy is symmetric, but the true
# labels are passed first here to follow that order.
accuracy_score(Y_test, Y_prediction_svm)

In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped, precision
# and recall are exchanged and 'support' counts predictions instead of true labels.
print(classification_report(Y_test, Y_prediction_svm))