# Fake News Exploratory Data Analysis

Load the sample subset of the FakeNewsCorpus data set.

In [1]:
import pandas as pd
import regex as re
from cleantext import clean
# Location of the sample CSV; its first column is used as the row index.
NEWS_SAMPLE_URL = 'https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv'
raw_data = pd.read_csv(NEWS_SAMPLE_URL, index_col=0)
# Preview the first rows to check the load.
raw_data.head()

Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary
0,141,awm.com,unreliable,http://awm.com/church-congregation-brings-gift...,Sometimes the power of Christmas will make you...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Church Congregation Brings Gift to Waitresses ...,Ruth Harris,,[''],,,
1,256,beforeitsnews.com,fake,http://beforeitsnews.com/awakening-start-here/...,AWAKENING OF 12 STRANDS of DNA – “Reconnecting...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,AWAKENING OF 12 STRANDS of DNA – “Reconnecting...,Zurich Times,,[''],,,
2,700,cnnnext.com,unreliable,http://www.cnnnext.com/video/18526/never-hike-...,Never Hike Alone: A Friday the 13th Fan Film U...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Never Hike Alone - A Friday the 13th Fan Film ...,,,[''],Never Hike Alone: A Friday the 13th Fan Film ...,,
3,768,awm.com,unreliable,http://awm.com/elusive-alien-of-the-sea-caught...,"When a rare shark was caught, scientists were ...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Elusive ‘Alien Of The Sea ‘ Caught By Scientis...,Alexander Smith,,[''],,,
4,791,bipartisanreport.com,clickbait,http://bipartisanreport.com/2018/01/21/trumps-...,Donald Trump has the unnerving ability to abil...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Trump’s Genius Poll Is Complete & The Results ...,Gloria Christie,,[''],,,


Perform basic cleaning: lowercase the text and replace URLs, dates, numbers, and emails with the placeholder tokens `url`, `date`, `num`, and `email`.

In [2]:
# Work on a deep copy so the cleaning steps in this cell leave raw_data unchanged.
data = raw_data.copy(deep = True)
# Building blocks for the date regex. cleandates() lower-cases its input
# before matching, so month names only need to be listed in lower case.
_MONTHS = (r"(?:january|february|march|april|may|june|july|august"
           r"|september|october|november|december)")
# Separator between numeric date parts: "/" or "-" only. A "|" inside a
# character class is a literal pipe, not alternation, so it must not be listed.
_SEP = r"[/\-]"

# One capturing group wrapping five alternatives, tried in this order.
date_pattern = re.compile(
    "("
    + "|".join([
        # 25/01, 25-01 or 25/01/2018
        r"\d{1,2}" + _SEP + r"\d{1,2}(?:" + _SEP + r"\d{2,4})?",
        # 2018-01-25
        r"\d{2,4}" + _SEP + r"\d{1,2}" + _SEP + r"\d{1,2}",
        # january 5th 2018 (ordinal suffix and year both required)
        r"\b" + _MONTHS + r"\s\d{1,2}[a-z][a-z](?:\s\d{2,4})",
        # 1st of january, 21st of january 2018 (one- or two-digit day)
        r"\d{1,2}\w?\w?\sof\s" + _MONTHS + r"\b(?:\s\d{2,4})?",
        # january 5, january 5th, january 5, 2018
        r"\b" + _MONTHS + r"\s\d\d?\w?\w?,?(?:\s\d{2,4})?",
    ])
    + ")"
)


def cleandates(datastring):
    """Lower-case ``datastring`` and replace each date-like span with ``"date"``.

    Parameters
    ----------
    datastring : str
        Raw text. It is lower-cased before matching, so the returned text
        is always lower case.

    Returns
    -------
    str
        The lower-cased text with every match of ``date_pattern`` replaced
        by the literal token ``"date"``.

    Notes
    -----
    Month names are matched only at a word boundary, so a word that merely
    ends in a month name (e.g. "dismay 5") is left alone.
    """
    return date_pattern.sub("date", datastring.lower())


# Keyword options handed to clean() for every article; the replace_with_*
# values are the placeholder tokens substituted into the text.
clean_options = {
    "lower": True,
    "no_line_breaks": True,
    "no_emails": True,
    "no_urls": True,
    "no_numbers": True,
    "lang": "en",
    "replace_with_number": "num",
    "replace_with_email": "email",
    "replace_with_url": "url",
}

# Dates are masked first, then each article goes through clean().
dates_masked = data['content'].apply(cleandates)
data['content'] = [clean(entry, **clean_options) for entry in dates_masked]
data['content'].head()

0    sometimes the power of christmas will make you...
1    awakening of num strands of dna - "reconnectin...
2    never hike alone: a friday the 13th fan film u...
3    when a rare shark was caught, scientists were ...
4    donald trump has the unnerving ability to abil...
Name: content, dtype: object

Tokenize the text

In [3]:
import nltk
import itertools
from nltk.corpus import stopwords
# Tokenize each cleaned article and flatten the per-article token lists
# into one corpus-wide list, keeping article order.
tokens_per_article = map(nltk.word_tokenize, data['content'])
tokens = list(itertools.chain.from_iterable(tokens_per_article))
# Note: this rebinds the imported `stopwords` name to the English stopword
# collection itself; later cells use it for membership tests.
stopwords = stopwords.words('english')

Remove stopwords and compute the reduction rate of the vocabulary size after removing them.

In [4]:
# Distinct tokens before any filtering.
vocabulary = set(tokens)
# Look tokens up in a set: testing against the stopword collection directly
# would rescan it for every token in the corpus.
stopword_lookup = set(stopwords)
tokens_no_stopwords = [word for word in tokens if word not in stopword_lookup]
vocabulary_no_stopwords = set(tokens_no_stopwords)
# Fraction of the vocabulary that disappears once stopwords are dropped.
print("Reduction rate of removing stopwords: " + str(1 - len(vocabulary_no_stopwords) / len(vocabulary)))

Reduction rate of removing stopwords: 0.007994186046511587


Stem the words and compute the reduction rate of the vocabulary size.

In [5]:
from nltk.stem.porter import *
# Map every stopword-free vocabulary entry to its Porter stem and keep the
# distinct stems.
stemmer = PorterStemmer()
vocabulary_stem = {stemmer.stem(word) for word in vocabulary_no_stopwords}
# Fraction of the stopword-free vocabulary that stemming merges away.
stem_reduction = 1 - len(vocabulary_stem)/len(vocabulary_no_stopwords)
print("Reduction rate of stemming: " + str(stem_reduction))

Reduction rate of stemming: 0.315995115995116


In [6]:
# Part-of-speech demo: tokenize a sentence in which "trump" is a surname,
# then tag each token. The cell displays the (token, tag) pairs.
text = nltk.word_tokenize("donald trump was president")
nltk.pos_tag(text)

[('donald', 'NN'), ('trump', 'NN'), ('was', 'VBD'), ('president', 'NN')]

In [7]:
# Contrasting part-of-speech demo: here "trump" follows "to" and is used as
# a verb. The cell displays the (token, tag) pairs for comparison.
text2 = nltk.word_tokenize("to trump is to ...")
nltk.pos_tag(text2)

[('to', 'TO'), ('trump', 'VB'), ('is', 'VBZ'), ('to', 'TO'), ('...', ':')]