# This notebook cleans the raw data and creates two datasets that will later be used in a fake and real news article classifier.

In [44]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
# Read the three raw CSV files; the resulting frames are never modified.
fake_original, true_original, mixture = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Raw_datasets/Fake.csv",
        "Raw_datasets/True.csv",
        "Raw_datasets/news_articles.csv",
    )
)

# Independent working copies that the cleaning cells operate on.
fake, true, mixture_clean = (
    raw_frame.copy() for raw_frame in (fake_original, true_original, mixture)
)

In [3]:
# Restrict the mixture frame to the columns that are used later *before*
# dropping missing values. Dropping first would also discard articles whose
# only gap is in a column that is thrown away on the next line.
mixture_columns = ['title', 'text', 'published', 'label',
                   'title_without_stopwords', 'text_without_stopwords']
mixture_clean = mixture_clean[mixture_columns].dropna()

# For the Fake/True frames every column is considered: a row with any missing
# value is removed.
fake = fake.dropna()
true = true.dropna()

In [45]:
def wordCorrect(text):
    """Normalise a raw title/article string.

    Steps, in order:
      1. lower-case the text;
      2. remove square-bracketed spans such as ``[video]``;
      3. remove URLs (``http://...``, ``https://...``, ``www....``);
      4. remove HTML/XML tags;
      5. replace every remaining non-word character with a space;
      6. strip any punctuation that is still present.

    URLs and tags are removed *before* step 5 on purpose: once the non-word
    characters (``:``, ``/``, ``.``, ``<``, ``>``) have been turned into
    spaces, the URL and tag patterns can no longer match anything.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The cleaned string. Runs of spaces are not collapsed.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\W', ' ', text)
    # After the \W pass the only punctuation left is '_' (it is a word
    # character), so this effectively removes underscores.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    return text

In [4]:
# The 'published' column holds timestamps in the format
# YYYY-MM-DDThh:mm:ss.sTZD; rename it to 'date'.
mixture_clean = mixture_clean.rename(columns={"published": "date"})

# Drop rows whose date field contains a URL instead of a timestamp. The
# explicit .copy() makes the filtered result an independent frame, so the
# column assignment below does not raise SettingWithCopyWarning.
mixture_clean = mixture_clean[~mixture_clean['date'].str.contains('http')].copy()

# Parse once as timezone-aware UTC, then keep only the calendar date.
mixture_clean['date'] = pd.to_datetime(mixture_clean['date'], utc=True).dt.date

In [5]:
# Keep rows whose date string contains '20' (presumably the year prefix --
# TODO confirm) and reject rows whose date field holds a URL, in case some URL
# has 20 written inside of it.
fake = fake[fake['date'].str.contains('20') & ~fake['date'].str.contains('http')]
true = true[true['date'].str.contains('20') & ~true['date'].str.contains('http')]

In [6]:
# For both frames: turn the date strings into datetimes and discard the
# 'subject' column.
for news_frame in (fake, true):
    news_frame['date'] = pd.to_datetime(news_frame['date'])
    news_frame.drop(columns=['subject'], inplace=True)

In [51]:
# Run wordCorrect over every free-text column of the mixture frame ...
for text_column in ('title', 'text',
                    'title_without_stopwords', 'text_without_stopwords'):
    mixture_clean[text_column] = mixture_clean[text_column].apply(wordCorrect)

# ... and over the title and body of the Fake/True frames.
for news_frame in (fake, true):
    for text_column in ('title', 'text'):
        news_frame[text_column] = news_frame[text_column].apply(wordCorrect)

Unnamed: 0,title,text,date,label,title_without_stopwords,text_without_stopwords
0,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,2016-10-26,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...
1,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,2016-10-29,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...
2,breaking weiner cooperating with fbi on hillar...,red state fox news sunday reported this morn...,2016-10-30,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...
3,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,2016-11-01,Real,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...
4,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,2016-11-01,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...
5,hillary goes absolutely berserk on protester a...,print hillary goes absolutely berserk she expl...,2016-11-02,Real,hillary goes absolutely berserk protester rall...,print hillary goes absolutely berserk explodes...
6,breaking nypd ready to make arrests in weiner ...,breaking nypd ready to make arrests in weiner ...,2016-11-04,Real,breaking nypd ready make arrests weiner casehi...,breaking nypd ready make arrests weiner casehi...
7,wow whistleblower tells chilling story of mass...,breaking nypd ready to make arrests in weiner ...,2016-11-04,Real,wow whistleblower tells chilling story massive...,breaking nypd ready make arrests weiner casehi...
8,breaking clinton clearedwas this a coordinated...,limbaugh said that the revelations in the wiki...,2016-11-06,Real,breaking clinton clearedwas coordinated last m...,limbaugh said revelations wikileaks material s...
9,evil hillary supporters yell fck trumpburn tru...,email these people are sick and evil they wil...,2016-11-07,Real,evil hillary supporters yell fck trumpburn tru...,email people sick evil stop nothing get way la...


In [7]:
# Boolean mask selecting the articles labelled 'Real'.
is_real = mixture_clean['label'] == 'Real'
real_rows = mixture_clean[is_real]

# Stop-word-free variant: keeps the *_without_stopwords columns and the date.
mix_true_stop = real_rows.drop(columns=['title', 'label', 'text'])

# Plain variant: keeps the original title/text columns and the date.
mix_true = real_rows.drop(
    columns=['title_without_stopwords', 'text_without_stopwords', 'label']
)

In [8]:
# Every article that is not labelled 'Real'.
not_real_rows = mixture_clean[~is_real]

# Stop-word-free variant: keeps the *_without_stopwords columns and the date.
mix_fake_stop = not_real_rows.drop(columns=['title', 'label', 'text'])

# Plain variant: keeps the original title/text columns and the date.
mix_fake = not_real_rows.drop(
    columns=['title_without_stopwords', 'text_without_stopwords', 'label']
)

In [14]:
# Correct order of columns should be [title, text, date]
stopword_free_order = ['title_without_stopwords', 'text_without_stopwords', 'date']
plain_order = ['title', 'text', 'date']

mix_true_stop = mix_true_stop[stopword_free_order]
mix_fake_stop = mix_fake_stop[stopword_free_order]
mix_true = mix_true[plain_order]
mix_fake = mix_fake[plain_order]

# Replace the index left over from filtering with a fresh 0..n-1 range,
# discarding the old index values.
for news_frame in (mix_fake_stop, mix_true_stop, mix_true, mix_fake, fake, true):
    news_frame.reset_index(drop=True, inplace=True)

In [26]:
# (frame, destination) pairs, listed in the order in which they are written.
# NOTE(review): mix_true_stop / mix_fake_stop hold the *_without_stopwords
# columns but are saved as *_with_stop.csv -- confirm the file names are intended.
csv_exports = [
    (mix_true, 'Datasets/mix_true.csv'),
    (mix_fake, 'Datasets/mix_fake.csv'),
    (fake, 'Datasets/fake.csv'),
    (true, 'Datasets/true.csv'),
    (mix_true_stop, 'Datasets/mix_true_with_stop.csv'),
    (mix_fake_stop, 'Datasets/mix_fake_with_stop.csv'),
]

for news_frame, destination in csv_exports:
    news_frame.to_csv(destination, header=True)

In [18]:
# Attach the class label column: 0 on the fake frame, 1 on the true frame.
for news_frame, legit_flag in ((fake, 0), (true, 1)):
    news_frame['legit'] = legit_flag

In [21]:
# Stack the fake rows on top of the true rows and renumber the index 0..n-1.
fakeTrue = pd.concat([fake, true], ignore_index=True)

In [24]:
# Write the combined frame, including its index and the header row.
fakeTrue.to_csv(
    'Datasets/fakeTrue.csv',
    header=True,
    index=True,
)