# Fake News Classification - EDA

In [1]:
import os
import zipfile
import pandas as pd
import numpy as np

In [2]:
# Local directory the archive is downloaded into and extracted in.
# Keep the trailing slash: other cells build paths with DATA_PATH + <file name>.
DATA_PATH = '../data/'
# If the location or the file names of the dataset hosted on Kaggle change,
# the constants below will need to be updated.
DATA_ARCHIVE = 'fake-and-real-news-dataset.zip'
FAKE_DATA_FILE = 'Fake.csv'
TRUE_DATA_FILE = 'True.csv'
KAGGLE_DATA_LOCATION = 'clmentbisaillon/fake-and-real-news-dataset'

# Download the Data

In [3]:
# Create the data directory, including any missing parent directories, so the
# cell keeps working if DATA_PATH is ever changed to a nested location.
try:
    os.makedirs(DATA_PATH)
except FileExistsError:
    # Expected on every run after the first; report it and carry on.
    print('Data directory already exists')

Data directory already exists


Documentation on the Kaggle API is located at [https://www.kaggle.com/docs/api](https://www.kaggle.com/docs/api)

In [4]:
# Assumes the Kaggle CLI is installed and an API key is correctly configured.
# The download is skipped when the archive is already present in DATA_PATH.
if not os.path.exists(DATA_PATH + DATA_ARCHIVE):
    !kaggle datasets download -d $KAGGLE_DATA_LOCATION -p $DATA_PATH

In [5]:
# Extract the archive when either CSV is missing. Checking only the fake-news
# file would skip extraction if True.csv had been deleted or never unpacked.
expected_files = (FAKE_DATA_FILE, TRUE_DATA_FILE)
if not all(os.path.exists(DATA_PATH + name) for name in expected_files):
    with zipfile.ZipFile(DATA_PATH + DATA_ARCHIVE, 'r') as zip_ref:
        zip_ref.extractall(path=DATA_PATH)

In [6]:
# List the data directory through the shared constant so this check follows
# DATA_PATH if it is ever changed.
os.listdir(DATA_PATH)

['fake-and-real-news-dataset.zip', 'Fake.csv', 'True.csv']

# Load the Data

In [7]:
# Read the fake-news articles from the extracted CSV.
fake_csv_path = os.path.join(DATA_PATH, FAKE_DATA_FILE)
fake_df = pd.read_csv(fake_csv_path)

In [8]:
# Preview the first rows of the fake-news frame.
fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [9]:
# Read the real-news articles from the extracted CSV.
true_csv_path = os.path.join(DATA_PATH, TRUE_DATA_FILE)
true_df = pd.read_csv(true_csv_path)

In [10]:
# Preview the first rows of the real-news frame.
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [11]:
# Tag every real-news row with its class label.
true_df = true_df.assign(label='true')

In [12]:
# Tag every fake-news row with its class label.
fake_df = fake_df.assign(label='fake')

In [13]:
# Stack the fake rows on top of the true rows and renumber the index from 0.
df = pd.concat(objs=(fake_df, true_df), axis=0, ignore_index=True)

In [14]:
# Preview the first rows of the combined frame, including the new label column.
df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",fake


# Clean the Data

## Check for missing data

In [15]:
# Show column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


There are no null values. Next I will check for any placeholder values.

## Check for placeholder values and duplicates

### Title column

In [16]:
# Count how many rows share each title.
title_value_counts = df['title'].value_counts()

In [17]:
# Keep only the titles that appear more than once.
title_value_counts.loc[title_value_counts.gt(1)]

Factbox: Trump fills top jobs for his administration                                                               14
Highlights: The Trump presidency on April 13 at 9:30 P.M. EDT/0130 GMT on Friday                                    8
Factbox: Contenders for senior jobs in Trump's administration                                                       8
Factbox: International reaction to arrest of Reuters reporters in Myanmar                                           6
MEDIA IGNORES Time That Bill Clinton FIRED His FBI Director On Day Before Vince Foster Was Found Dead               6
                                                                                                                   ..
WATCH FULL RESPONSE FROM TRUMP AFTER OBAMACARE VOTE: “Obamacare will explode…Pelosi and Schumer own it” [Video]     2
Trump Revives Keystone and Dakota Access Pipelines                                                                  2
Turkey's Erdogan says U.S. decision to suspend visa serv

There are a lot of titles that are used more than once.  This will need to be reviewed.
First I will look at the most commonly reused title.

In [18]:
# Show every row that carries the most frequently reused title.
most_reused_title = 'Factbox: Trump fills top jobs for his administration'
df.loc[df['title'].eq(most_reused_title)]

Unnamed: 0,title,text,subject,date,label
29701,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"January 19, 2017",True
29875,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"January 11, 2017",True
30096,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"December 21, 2016",True
30118,Factbox: Trump fills top jobs for his administ...,(Reuters) - President-elect Donald Trump will ...,politicsNews,"December 19, 2016",True
30136,Factbox: Trump fills top jobs for his administ...,(Reuters) - President-elect Donald Trump will ...,politicsNews,"December 17, 2016",True
30167,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"December 15, 2016",True
30186,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"December 13, 2016",True
30220,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"December 13, 2016",True
30236,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"December 12, 2016",True
30289,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"December 8, 2016",True


In [19]:
# 29701 is an index label taken from the table above, so look it up by label
# (.loc) rather than by position (.iloc); the two only agree while the index
# is an untouched RangeIndex.
df.loc[29701, 'text']

'(Reuters) - U.S. President-elect Donald Trump will name former Georgia Governor Sonny Perdue as his nominee for secretary of agriculture on Thursday, a senior transition official said on Wednesday. Here is a list of Republican Trump’s selections for top jobs in his administration.  NOTE: Senate confirmation is required for all the posts except national security adviser and White House posts. Tillerson, 64, has spent his entire career at Exxon Mobil Corp, where he rose to chairman and chief executive officer in 2006. A civil engineer by training, the Texan joined the world’s largest publicly traded energy company in 1975 and led several of its operations in the United States as well as in Yemen, Thailand and Russia. As Exxon’s chief executive, he maintained close ties with Moscow and opposed U.S. sanctions against Russia for its incursion into Crimea. Mnuchin, 54, is a successful private equity investor, hedge fund manager and Hollywood financier who spent 17 years at Goldman Sachs Gro

In [20]:
# 29875 is an index label taken from the table above, so look it up by label
# (.loc) rather than by position (.iloc); the two only agree while the index
# is an untouched RangeIndex.
df.loc[29875, 'text']

'(Reuters) - U.S. President-elect Donald Trump on Wednesday announced he has chosen David Shulkin, who currently heads the Department of Veterans Affairs healthcare system, to head the agency. Here is a list of Republican Trump’s selections for top jobs in his administration.  NOTE: Senate confirmation is required for all the posts except national security adviser and White House posts. Tillerson, 64, has spent his entire career at Exxon Mobil Corp, where he rose to chairman and chief executive officer in 2006. A civil engineer by training, the Texan joined the world’s largest publicly traded energy company in 1975 and led several of its operations in the United States as well as in Yemen, Thailand and Russia. As Exxon’s chief executive, he maintained close ties with Moscow and opposed U.S. sanctions against Russia for its incursion into Crimea. Mnuchin, 54, is a successful private equity investor, hedge fund manager and Hollywood financier who spent 17 years at Goldman Sachs Group Inc

This title seems to be a story title that Reuters reused, while changing the underlying story.

Next I'll make a dataframe containing all the rows that have duplicate titles.

In [21]:
# Collect the titles used more than once (stored as a set despite the name).
repeated_title_counts = title_value_counts.loc[title_value_counts.gt(1)]
duplicate_title_list = set(repeated_title_counts.index)

In [22]:
# Subset of the data whose title is shared with at least one other row.
has_repeated_title = df['title'].isin(duplicate_title_list)
df_dup_titles = df.loc[has_repeated_title]

In [23]:
# Display the rows with repeated titles (pandas truncates the rendered output).
df_dup_titles

Unnamed: 0,title,text,subject,date,label
1488,McConnell Says He’ll Obstruct ANY Effort To H...,Siding with Trump s Attorney General Jeff Sess...,News,"May 12, 2017",fake
1535,McConnell Says He’ll Obstruct ANY Effort To H...,Siding with Trump s Attorney General Jeff Sess...,News,"May 10, 2017",fake
9087,BARBRA STREISAND Gives Up On Dream Of Impeachi...,Barbra Streisand was an Obama sycophant and on...,politics,"Dec 31, 2017",fake
9088,WATCH: SENATOR LINDSEY GRAHAM DROPS BOMBSHELL…...,Everyone suspected the sketchy Steele Dossier ...,politics,"Dec 31, 2017",fake
9089,“CONSERVATIVE GAY GUY” BLASTS Pence’s Aspen Ne...,It s been said that good fences make good neig...,politics,"Dec 30, 2017",fake
...,...,...,...,...,...
44884,U.S. puts more pressure on Pakistan to help wi...,WASHINGTON (Reuters) - The United States sugge...,worldnews,"August 21, 2017",true
44886,Trump talks tough on Pakistan's 'terrorist' ha...,ISLAMABAD (Reuters) - Outlining a new strategy...,worldnews,"August 22, 2017",true
44887,"U.S., North Korea clash at U.N. forum over nuc...",GENEVA (Reuters) - North Korea and the United ...,worldnews,"August 22, 2017",true
44889,"U.S., North Korea clash at U.N. forum over nuc...",GENEVA (Reuters) - North Korea and the United ...,worldnews,"August 22, 2017",true


Looking at the above subset of data, I can see that some rows may be complete duplicates (44887 and 44889) and others may be duplicates except for the date (1488 and 1535).

Are there any duplicate rows?

In [24]:
# Rows that repeat an earlier row in every column (first occurrence not flagged).
is_full_duplicate = df_dup_titles.duplicated(keep='first')
df_dup_titles.loc[is_full_duplicate]

Unnamed: 0,title,text,subject,date,label
9942,HILLARY TWEETS MESSAGE In Defense Of DACA…OOPS...,No time to waste we've got to fight with eve...,politics,"Sep 9, 2017",fake
11446,FORMER DEMOCRAT WARNS Young Americans: “Rioter...,"Who is silencing political speech, physically...",politics,"Mar 10, 2017",fake
14925,[VIDEO] #BlackLivesMatter Terrorists Storm Dar...,They were probably just looking for a safe sp...,politics,"Nov 16, 2015",fake
23926,Senate tax bill stalls on deficit-focused 'tri...,WASHINGTON (Reuters) - The U.S. Senate on Thur...,politicsNews,"November 30, 2017",true
24259,Trump warns 'rogue regime' North Korea of grav...,BEIJING (Reuters) - U.S. President Donald Trum...,politicsNews,"November 8, 2017",true
...,...,...,...,...,...
44709,France unveils labor reforms in first step to ...,PARIS (Reuters) - French President Emmanuel Ma...,worldnews,"August 31, 2017",true
44744,Guatemala top court sides with U.N. graft unit...,GUATEMALA CITY (Reuters) - Guatemala s top cou...,worldnews,"August 29, 2017",true
44771,"Europeans, Africans agree renewed push to tack...",PARIS (Reuters) - Europe s big four continen...,worldnews,"August 28, 2017",true
44834,Thailand's ousted PM Yingluck has fled abroad:...,BANGKOK (Reuters) - Ousted Thai prime minister...,worldnews,"August 25, 2017",true


There are 209 duplicate rows that will need to be deleted.

I want to also check for other duplicate situations.

Check for rows with duplicate text

In [25]:
# Rows whose text repeats the text of an earlier row.
is_text_duplicate = df_dup_titles.duplicated(subset=['text'])
df_dup_titles.loc[is_text_duplicate]

Unnamed: 0,title,text,subject,date,label
9114,WHY THIS BLUE-COLLAR DEMOCRAT STRONGHOLD Count...,AP News The regulars amble in before dawn and...,politics,"Dec 27, 2017",fake
9738,LEFTIST STORE OWNER Makes Video Asking Libs To...,Things didn t go as well as Nicholle had hoped...,politics,"Oct 6, 2017",fake
9942,HILLARY TWEETS MESSAGE In Defense Of DACA…OOPS...,No time to waste we've got to fight with eve...,politics,"Sep 9, 2017",fake
11041,Joe Scarborough BERATES Mika Brzezinski Over “...,,politics,"Apr 26, 2017",fake
11190,WATCH TUCKER CARLSON Scorch Sanctuary City May...,,politics,"Apr 6, 2017",fake
...,...,...,...,...,...
44709,France unveils labor reforms in first step to ...,PARIS (Reuters) - French President Emmanuel Ma...,worldnews,"August 31, 2017",true
44744,Guatemala top court sides with U.N. graft unit...,GUATEMALA CITY (Reuters) - Guatemala s top cou...,worldnews,"August 29, 2017",true
44771,"Europeans, Africans agree renewed push to tack...",PARIS (Reuters) - Europe s big four continen...,worldnews,"August 28, 2017",true
44834,Thailand's ousted PM Yingluck has fled abroad:...,BANGKOK (Reuters) - Ousted Thai prime minister...,worldnews,"August 25, 2017",true


There are 5964 rows that have duplicate text.

I see some rows do not have any text, and that will have to be investigated later. 

Check for rows that have duplicate title and text

In [26]:
# Rows whose title and text together repeat an earlier row.
is_title_text_duplicate = df_dup_titles.duplicated(subset=['title', 'text'])
df_dup_titles.loc[is_title_text_duplicate]

Unnamed: 0,title,text,subject,date,label
9942,HILLARY TWEETS MESSAGE In Defense Of DACA…OOPS...,No time to waste we've got to fight with eve...,politics,"Sep 9, 2017",fake
11446,FORMER DEMOCRAT WARNS Young Americans: “Rioter...,"Who is silencing political speech, physically...",politics,"Mar 10, 2017",fake
14925,[VIDEO] #BlackLivesMatter Terrorists Storm Dar...,They were probably just looking for a safe sp...,politics,"Nov 16, 2015",fake
15892,HOUSE INTEL Slaps Subpoenas on McCain Institut...,Please see our previous report below on the Mc...,Government News,"Dec 27, 2017",fake
15893,PRICELESS! WATCH MSNBC HOST’S Shocked Response...,THIS IS PRICELESS! The video below shows just ...,Government News,"Dec 26, 2017",fake
...,...,...,...,...,...
44709,France unveils labor reforms in first step to ...,PARIS (Reuters) - French President Emmanuel Ma...,worldnews,"August 31, 2017",true
44744,Guatemala top court sides with U.N. graft unit...,GUATEMALA CITY (Reuters) - Guatemala s top cou...,worldnews,"August 29, 2017",true
44771,"Europeans, Africans agree renewed push to tack...",PARIS (Reuters) - Europe s big four continen...,worldnews,"August 28, 2017",true
44834,Thailand's ousted PM Yingluck has fled abroad:...,BANGKOK (Reuters) - Ousted Thai prime minister...,worldnews,"August 25, 2017",true


Only 5793 rows have both a duplicate title and duplicate text.

Of the 171 rows that have duplicate text but not a duplicate title, how many have empty text values?

In [27]:
# Rows with repeated titles whose text is exactly one space.
df_dup_titles.loc[df_dup_titles['text'].eq(' ')]

Unnamed: 0,title,text,subject,date,label
10923,TAKE OUR POLL: Who Do You Think President Trum...,,politics,"May 10, 2017",fake
11041,Joe Scarborough BERATES Mika Brzezinski Over “...,,politics,"Apr 26, 2017",fake
11190,WATCH TUCKER CARLSON Scorch Sanctuary City May...,,politics,"Apr 6, 2017",fake
11236,SHOCKER: Public School Turns Computer Lab Into...,,politics,"Apr 1, 2017",fake
11247,MICHAEL FLYNN’S LAWYER Releases Statement Scor...,,politics,"Mar 30, 2017",fake
...,...,...,...,...,...
21816,BALTIMORE BURNS: MARYLAND GOVERNOR BRINGS IN N...,,left-news,"Apr 27, 2015",fake
21826,FULL VIDEO: THE BLOCKBUSTER INVESTIGATION INTO...,,left-news,"Apr 25, 2015",fake
21827,(VIDEO) HILLARY CLINTON: RELIGIOUS BELIEFS MUS...,,left-news,"Apr 25, 2015",fake
21857,(VIDEO)ICE PROTECTING OBAMA: WON’T RELEASE NAM...,,left-news,"Apr 14, 2015",fake


There are more rows with a text value of ' ' than 171, so this issue will need to be dealt with separately.

*Summary for title column* There are close to 11,800 rows that have titles used multiple times.  The use of a duplicate title seems to be a common practice, and some articles seem to be revisions.  A duplicate title will be considered ok, but the duplicate rows and rows with duplicate text for the story will need to be removed.  Rows without text will also need to be removed.

### text column

Check for a text value with one space

In [28]:
# All rows in the full dataset whose text is exactly one space.
df.loc[df['text'].eq(' ')]

Unnamed: 0,title,text,subject,date,label
10923,TAKE OUR POLL: Who Do You Think President Trum...,,politics,"May 10, 2017",fake
11041,Joe Scarborough BERATES Mika Brzezinski Over “...,,politics,"Apr 26, 2017",fake
11190,WATCH TUCKER CARLSON Scorch Sanctuary City May...,,politics,"Apr 6, 2017",fake
11225,MAYOR OF SANCTUARY CITY: Trump Trying To Make ...,,politics,"Apr 2, 2017",fake
11236,SHOCKER: Public School Turns Computer Lab Into...,,politics,"Apr 1, 2017",fake
...,...,...,...,...,...
21826,FULL VIDEO: THE BLOCKBUSTER INVESTIGATION INTO...,,left-news,"Apr 25, 2015",fake
21827,(VIDEO) HILLARY CLINTON: RELIGIOUS BELIEFS MUS...,,left-news,"Apr 25, 2015",fake
21857,(VIDEO)ICE PROTECTING OBAMA: WON’T RELEASE NAM...,,left-news,"Apr 14, 2015",fake
21873,(VIDEO) HYSTERICAL SNL TAKE ON HILLARY’S ANNOU...,,left-news,"Apr 12, 2015",fake


There could be an arbitrary amount of whitespace, so I will replace all text values that are only whitespace with NaN.

In [29]:
# Turn every empty or whitespace-only string, in any column, into NaN.
whitespace_only = r'^\s*$'
df = df.replace(to_replace=whitespace_only, value=np.nan, regex=True)

In [30]:
# Count the missing values in each column.
df.isna().sum(axis=0)

title        0
text       631
subject      0
date         0
label        0
dtype: int64

*Summary for the text column* 631 rows with no text is small compared to the size of the dataset, so they should be dropped.

### Remove Duplicate Data

In [31]:
# Keep the first occurrence of each fully identical row and renumber the index.
df_clean = df.loc[~df.duplicated(keep='first')].reset_index(drop=True)

In [32]:
# Drop every row that has a missing value in any column.
df_clean = df_clean.dropna(axis='index', how='any')

In [33]:
# Keep the first row for each (title, text) pair and renumber the index.
df_clean = df_clean.drop_duplicates(
    subset=['title', 'text'],
    keep='first',
    ignore_index=True,
)

In [34]:
# Keep the first row for each distinct text and renumber the index.
df_clean = df_clean.drop_duplicates(
    subset=['text'],
    keep='first',
    ignore_index=True,
)

In [35]:
# Confirm the row count and non-null counts after cleaning.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38644 entries, 0 to 38643
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    38644 non-null  object
 1   text     38644 non-null  object
 2   subject  38644 non-null  object
 3   date     38644 non-null  object
 4   label    38644 non-null  object
dtypes: object(5)
memory usage: 1.5+ MB


How many rows were dropped?

In [36]:
# Number of rows removed by the cleaning steps.
df.shape[0] - df_clean.shape[0]

6254

## Check Date Range

In [39]:
# Try to parse the whole date column at once. The column is only overwritten
# when every value parses; otherwise it is left untouched and a notice is printed.
try:
    parsed_dates = pd.to_datetime(df_clean['date'])
except ValueError:
    print('Not all date values are dates.')
else:
    df_clean['date'] = parsed_dates

Not all date values are dates.


I will need to filter out the date values that are not dates.