# This notebook is used for cleaning the data and creating two datasets that will later be used in fake and real news article classifier.

In [1]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
fake_original = pd.read_csv("Raw_datasets/Fake.csv")
true_original = pd.read_csv("Raw_datasets/True.csv")
mixture = pd.read_csv("Raw_datasets/news_articles.csv")
fake = fake_original.copy()
true = true_original.copy()
mixture_clean = mixture.copy()

In [3]:
mixture_clean = mixture_clean.dropna()
mixture_clean = mixture_clean[['title','text', 'published', 'label', 'title_without_stopwords','text_without_stopwords']]

fake = fake.dropna()
true = true.dropna()

In [4]:
def wordCorrect(text):
    text = text.lower()
    text = re.sub('u.s', 'xUSx', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub("https?://\S+|www\. \S+", "", text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('xUSx', 'u.s', text)
    
    return text

In [5]:
mixture_clean.rename(columns={"published":"date"}, inplace=True)
#YYYY-MM-DDThh:mm:ss.sTZD our current date format.

mixture_clean = mixture_clean[~mixture_clean.date.str.contains('http')]



mixture_clean['date'] = pd.to_datetime(mixture_clean['date'], utc=True)
mixture_clean.date = pd.to_datetime(mixture_clean.date).dt.date

In [6]:
fake = fake[fake.date.str.contains('20')]
true = true[true.date.str.contains('20')]

#in case some URL has 20 written inside of them.
fake = fake[~fake.date.str.contains('http')]
true = true[~true.date.str.contains('http')]

In [7]:
fake['date'] = pd.to_datetime(fake['date'])
true['date'] = pd.to_datetime(true['date'])
fake.drop(['subject'], axis=1, inplace=True)
true.drop(['subject'], axis=1, inplace=True)

In [8]:
mixture_clean['title'] = mixture_clean['title'].apply(wordCorrect)
mixture_clean['text'] = mixture_clean['text'].apply(wordCorrect)
mixture_clean['title_without_stopwords'] = mixture_clean['title_without_stopwords'].apply(wordCorrect)
mixture_clean['text_without_stopwords'] = mixture_clean['text_without_stopwords'].apply(wordCorrect)

fake['title'] = fake['title'].apply(wordCorrect)
true['title'] = true['title'].apply(wordCorrect)

true['text'] = true['text'].apply(wordCorrect)
fake['text'] = fake['text'].apply(wordCorrect)

Unnamed: 0,title,text,date
0,as u.s budget fight looms republicans flip t...,washington reuters the head of a conservat...,2017-12-31
1,u.s military to accept transgender recruits o...,washington reuters transgender people will...,2017-12-29
2,senior u.s republican senator let mr muell...,washington reuters the special cou.sel inv...,2017-12-31
3,fbi ru.sia probe helped by australian diplomat...,washington reuters trump campaign adviser ...,2017-12-30
4,trump wants postal service to charge much mor...,seattle washington reuters president donal...,2017-12-29
5,white house congress prepare for talks on spe...,west palm beach fla washington reuters t...,2017-12-29
6,trump says ru.sia probe will be fair but time...,west palm beach fla reuters president don...,2017-12-29
7,factbox trump on twitter dec 29 approval ...,the following statements were posted to the ve...,2017-12-29
8,trump on twitter dec 28 global warming,the following statements were posted to the ve...,2017-12-29
9,alabama official to certify senator elect jone...,washington reuters alabama secretary of st...,2017-12-28


In [9]:
is_real = mixture_clean['label'] == 'Real'
mix_true_stop = mixture_clean[is_real]
mix_true_stop = mix_true_stop.drop(['title', 'label', 'text'], axis=1)

mix_true = mixture_clean[is_real]
mix_true = mix_true.drop(['title_without_stopwords', 'text_without_stopwords', 'label'], axis=1)

In [10]:
mix_fake_stop = mixture_clean[~is_real]
mix_fake_stop = mix_fake_stop.drop(['title', 'label', 'text'], axis=1)

mix_fake = mixture_clean[~is_real]
mix_fake = mix_fake.drop(['title_without_stopwords', 'text_without_stopwords', 'label'], axis=1)

In [11]:
#Correct order of columns should be [title, text, date]
mix_true_stop = mix_true_stop[['title_without_stopwords', 'text_without_stopwords', 'date']]
mix_fake_stop = mix_fake_stop[['title_without_stopwords', 'text_without_stopwords', 'date']]

mix_true = mix_true[['title','text','date']]
mix_fake = mix_fake[['title','text','date']]

mix_fake_stop.reset_index(drop=True, inplace=True)
mix_true_stop.reset_index(drop=True, inplace=True)
mix_true.reset_index(drop=True, inplace=True)
mix_fake.reset_index(drop=True, inplace=True)
fake.reset_index(drop=True, inplace=True)
true.reset_index(drop=True, inplace=True)

In [12]:
mix_true.to_csv('Datasets/mix_true.csv', header =True)
mix_fake.to_csv('Datasets/mix_fake.csv', header =True)

fake.to_csv('Datasets/fake.csv', header =True)
true.to_csv('Datasets/true.csv', header =True)

mix_true_stop.to_csv('Datasets/mix_true_with_stop.csv', header =True)
mix_fake_stop.to_csv('Datasets/mix_fake_with_stop.csv', header =True)

In [13]:
fake['legit'] = 0
true['legit'] = 1
mix_true['legit'] = 1
mix_fake['legit'] = 0

In [14]:
fakeTrue = pd.concat([fake,true])
fakeTrue.reset_index(drop=True, inplace=True)

mix_trueFake = pd.concat([mix_true, mix_fake])
mix_trueFake.reset_index(drop=True, inplace=True)

In [15]:
fakeTrue.to_csv('Datasets/fakeTrue.csv', header=True)

In [16]:
mix_trueFake.to_csv('Datasets/testData.csv', header=True)