# Fake News Detection

**Author**: Marcelo Scatena
***
March 2022

# Latent Dirichlet Allocation

LDA was first introduced in a 2003 paper titled [Latent Dirichlet Allocation](https://web.archive.org/web/20120207011313/http://jmlr.csail.mit.edu/papers/volume3/blei03a/blei03a.pdf). There the authors describe lda as 'a generative probabilistic model for collections of discrete data such as text corpora.'. LDA is an unsuperversid topic modeling technique. Given a corpus and a number of topics, it breaks down the corpus into said amount of topics, and calculates the probability of each word to be belonging of one topic or another. With it, we can better understand what the topics of the corpus are constructed from.

In [6]:
#Dependencies

import pandas as pd

import gensim #the library for Topic modelling
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim_models #LDA visualization library

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer
import regex as re

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

import warnings
warnings.simplefilter('ignore')
from itertools import chain

## Import, clean and join data

In [2]:
col_names = ['ID', 'label', 'statement', 'subject', 'speaker', 'speaker\'s title', 'state', 'party', 'barely true', 'false', 'half true', 'mostly true', 'pants on fire', 'context', 'justification']
raw_train = pd.read_csv('data/train2.tsv', sep='\t', names=col_names)
raw_test = pd.read_csv('data/test2.tsv', sep='\t', names=col_names)
raw_val = pd.read_csv('data/val2.tsv', sep='\t', names=col_names)

In [3]:
df_train = raw_train.copy()
df_val = raw_val.copy()
df_test = raw_test.copy()

In [29]:
def drop_na(df):
    df = df.dropna(subset=['statement'], axis=0)
    return df

def drop_duplicated(df):
    df = df.drop_duplicates()
    return df

def label(df):
    df['label'] = df['label'].map({'true': 1,
                                     'mostly-true': 1,
                                     'half-true': 1,
                                     'false': 0,
                                     'barely-true': 0,
                                     'pants-fire': 0})
    return df

def drop_features(df):
    df = df[['statement','label']]
    return df

def reset_index(df):
    df['statement'] = df['statement'].apply(lambda x: x.lower())
    return df

def clean(text):
    text=text.lower()
    stp=set(stopwords.words("english"))
    stp.update(['say', 'percent', 'state', 'year',
               'said', 'people', 'one'])
    placesp = re.compile('[/(){}\[\]\|@,;]')
    removech= re.compile('[^0-9a-z #+_]')
    st=WordNetLemmatizer()
    text=re.sub(placesp,' ',text)
    text=re.sub(removech,' ',text)
    text=text.split()
    text=[w for w in text if not w in stp]
    text=[st.lemmatize(w) for w in text]
    text=[w for w in text if not w in stp]
    text=" ".join(text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text

def clean_df(df):
    df['statement'] = df['statement'].apply(lambda x: clean(x))
    return df

def lower_case(df):
    df = df.reset_index(drop=True)
    return df

In [30]:
cleaning_pipeline = Pipeline(steps=[
    ('drop_na', FunctionTransformer(drop_na)),
    ('drop_duplicated', FunctionTransformer(drop_duplicated)),
    ('label', FunctionTransformer(label)),
    ('drop_features', FunctionTransformer(drop_features)),
    ('reset_index', FunctionTransformer(reset_index)),
    ('lower_case', FunctionTransformer(lower_case)),
    ('clean_df', FunctionTransformer(clean_df))
])

In [31]:
df_train_clean = cleaning_pipeline.fit_transform(df_train)
df_val_clean = cleaning_pipeline.transform(df_val)
df_test_clean = cleaning_pipeline.transform(df_test)

In [58]:
df_all = pd.concat([df_train_clean, df_val_clean, df_test_clean])

In [33]:
df_all.head()

Unnamed: 0,statement,label
0,annies list political group support third trim...,0
1,decline coal start started natural gas took st...,1
2,hillary clinton agrees john mccain voting give...,1
3,health care reform legislation likely mandate ...,0
4,economic turnaround started end term,1


##### Split text into tokens

In [34]:
df_all_split = df_all['statement'].apply(lambda x : x.split())

##### Create Dictionary from the statements

In [35]:
dictionary = corpora.Dictionary(df_all_split)
#Total number of non-zeroes in the BOW matrix (sum of the number of unique words per document over the entire corpus).
print(dictionary.num_nnz)

130152


##### Create statement term matrix

In [36]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_all_split]
print(len(doc_term_matrix))

12791


##### Instantiate LDA model

In [37]:
lda = gensim.models.ldamodel.LdaModel

##### Fit LDA model on the dataset

In [38]:
num_topics=2
ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

##### Print the topics identified by LDA model

In [39]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.009*"obama" + 0.007*"president" + 0.005*"clinton" + 0.005*"american" + 0.005*"united" + 0.005*"country" + 0.005*"u" + 0.005*"barack" + 0.005*"law" + 0.004*"right"'),
 (1,
  '0.015*"tax" + 0.012*"000" + 0.010*"job" + 0.010*"million" + 0.009*"health" + 0.008*"care" + 0.007*"1" + 0.007*"billion" + 0.006*"budget" + 0.006*"cut"')]

##### Visualize the LDA model results

In [40]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

#### Adding more stopwords:

In [41]:
new_stopwords = ['000', '1', '2', '10', 'u']

In [42]:
def remove_stopwords_row(text):
    text=[word for word in text if word not in new_stopwords]
    return text

In [49]:
df_all_split = df_all_split.apply(lambda x: remove_stopwords_row(x))

In [51]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_all_split]
print(len(doc_term_matrix))

lda = gensim.models.ldamodel.LdaModel

num_topics=2
ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

12791


In [52]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.014*"obama" + 0.012*"president" + 0.011*"health" + 0.010*"care" + 0.008*"bill" + 0.007*"republican" + 0.007*"barack" + 0.007*"law" + 0.007*"voted" + 0.005*"clinton"'),
 (1,
  '0.017*"tax" + 0.011*"job" + 0.011*"million" + 0.008*"billion" + 0.007*"budget" + 0.006*"new" + 0.006*"cut" + 0.006*"texas" + 0.006*"rate" + 0.006*"school"')]

In [53]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

#### Model with 8 topics

In [54]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_all_split]
print(len(doc_term_matrix))

lda = gensim.models.ldamodel.LdaModel

num_topics=8
ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

12791


In [55]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.018*"county" + 0.012*"dollar" + 0.011*"campaign" + 0.010*"money" + 0.010*"government" + 0.009*"milwaukee" + 0.008*"million" + 0.008*"trade" + 0.008*"voted" + 0.007*"pay"'),
 (1,
  '0.012*"law" + 0.012*"republican" + 0.011*"act" + 0.009*"court" + 0.009*"voter" + 0.008*"party" + 0.007*"school" + 0.007*"dont" + 0.007*"child" + 0.006*"public"'),
 (2,
  '0.067*"obama" + 0.057*"president" + 0.035*"barack" + 0.013*"obamas" + 0.008*"administration" + 0.008*"house" + 0.008*"white" + 0.007*"first" + 0.007*"iran" + 0.006*"economy"'),
 (3,
  '0.022*"rate" + 0.014*"country" + 0.013*"tax" + 0.012*"united" + 0.012*"nation" + 0.011*"american" + 0.010*"world" + 0.010*"student" + 0.010*"texas" + 0.009*"highest"'),
 (4,
  '0.023*"clinton" + 0.016*"hillary" + 0.015*"trump" + 0.013*"donald" + 0.009*"john" + 0.009*"mccain" + 0.007*"border" + 0.007*"woman" + 0.006*"iraq" + 0.006*"police"'),
 (5,
  '0.035*"health" + 0.034*"tax" + 0.030*"care" + 0.019*"bill" + 0.015*"plan" + 0.012*"law" + 0.011*"new"

In [56]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

#### Split data into true and fake statements

In [75]:
df_all_true = df_all[df_all['label']==1]
df_all_fake = df_all[df_all['label']==0]

In [76]:
df_all_true_split = df_all_true['statement'].apply(lambda x : x.split())
df_all_fake_split = df_all_fake['statement'].apply(lambda x : x.split())
df_all_true_split = df_all_true_split.apply(lambda x: remove_stopwords_row(x))
df_all_fake_split = df_all_fake_split.apply(lambda x: remove_stopwords_row(x))

#### True statements, 2 topics

In [77]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_all_true_split]
print(len(doc_term_matrix))

lda = gensim.models.ldamodel.LdaModel

num_topics=2
ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

7134


In [78]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.011*"obama" + 0.010*"president" + 0.008*"republican" + 0.006*"bill" + 0.006*"voted" + 0.006*"barack" + 0.005*"american" + 0.005*"law" + 0.004*"time" + 0.004*"clinton"'),
 (1,
  '0.014*"tax" + 0.010*"million" + 0.010*"job" + 0.008*"health" + 0.007*"new" + 0.006*"rate" + 0.006*"billion" + 0.006*"budget" + 0.006*"care" + 0.006*"cut"')]

In [79]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

#### False statements, 2 topics

In [80]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_all_fake_split]
print(len(doc_term_matrix))

lda = gensim.models.ldamodel.LdaModel

num_topics=2
ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

5657


In [81]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.014*"tax" + 0.011*"health" + 0.010*"care" + 0.009*"job" + 0.007*"million" + 0.006*"government" + 0.006*"billion" + 0.006*"plan" + 0.005*"would" + 0.005*"clinton"'),
 (1,
  '0.012*"obama" + 0.011*"president" + 0.007*"barack" + 0.004*"united" + 0.004*"last" + 0.004*"security" + 0.004*"republican" + 0.004*"congress" + 0.003*"social" + 0.003*"rep"')]

In [82]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

#### True statements, 8 topics

In [83]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_all_true_split]
print(len(doc_term_matrix))

lda = gensim.models.ldamodel.LdaModel

num_topics=8
ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

7134


In [84]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.012*"0" + 0.012*"gov" + 0.011*"scott" + 0.010*"rick" + 0.010*"romney" + 0.010*"illegal" + 0.010*"governor" + 0.009*"mitt" + 0.006*"walker" + 0.006*"perry"'),
 (1,
  '0.021*"clinton" + 0.015*"hillary" + 0.013*"john" + 0.012*"iraq" + 0.012*"mccain" + 0.011*"war" + 0.010*"american" + 0.007*"crime" + 0.006*"death" + 0.006*"marijuana"'),
 (2,
  '0.013*"republican" + 0.012*"trump" + 0.010*"bill" + 0.010*"donald" + 0.009*"law" + 0.009*"senate" + 0.009*"vote" + 0.009*"security" + 0.008*"democrat" + 0.008*"house"'),
 (3,
  '0.025*"obama" + 0.024*"job" + 0.020*"million" + 0.020*"president" + 0.013*"billion" + 0.012*"barack" + 0.011*"debt" + 0.010*"time" + 0.010*"since" + 0.010*"spending"'),
 (4,
  '0.011*"republican" + 0.006*"even" + 0.006*"since" + 0.006*"reagan" + 0.006*"used" + 0.005*"world" + 0.005*"budget" + 0.005*"voted" + 0.005*"party" + 0.005*"time"'),
 (5,
  '0.020*"rate" + 0.018*"school" + 0.015*"country" + 0.014*"texas" + 0.012*"nation" + 0.011*"highest" + 0.011*"student" + 

In [85]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

#### Fake statements, 8 topics

In [86]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_all_fake_split]
print(len(doc_term_matrix))

lda = gensim.models.ldamodel.LdaModel

num_topics=8
ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

5657


In [87]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.008*"work" + 0.008*"united" + 0.006*"city" + 0.006*"candidate" + 0.006*"attack" + 0.006*"oil" + 0.005*"congress" + 0.005*"austin" + 0.005*"terrorist" + 0.005*"clinton"'),
 (1,
  '0.010*"country" + 0.008*"romney" + 0.007*"woman" + 0.007*"number" + 0.007*"law" + 0.007*"gun" + 0.007*"mitt" + 0.006*"trump" + 0.005*"texas" + 0.005*"america"'),
 (2,
  '0.022*"president" + 0.020*"obama" + 0.011*"barack" + 0.006*"would" + 0.006*"job" + 0.005*"abortion" + 0.005*"college" + 0.005*"obamas" + 0.004*"america" + 0.004*"muslim"'),
 (3,
  '0.008*"job" + 0.007*"island" + 0.007*"rhode" + 0.007*"republican" + 0.005*"child" + 0.005*"bill" + 0.005*"national" + 0.005*"dont" + 0.004*"gun" + 0.004*"almost"'),
 (4,
  '0.039*"obama" + 0.029*"president" + 0.023*"barack" + 0.012*"illegal" + 0.010*"since" + 0.010*"bush" + 0.009*"immigrant" + 0.009*"time" + 0.006*"per" + 0.006*"first"'),
 (5,
  '0.013*"security" + 0.012*"social" + 0.012*"clinton" + 0.011*"new" + 0.009*"million" + 0.009*"job" + 0.008*"hill

In [88]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)