In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

Planned structure:
* Loading the data 
* Format them and combine
* Preprocess and plot (It should be wiser to first plot some of the information about our data and plot again after the preprocess, like avg token size maybe?)

## Data Preprocessing and EDA 

### Loading the datasets

#### FEVER Dataset
Requires some preprocess, these are mostly used for fact extraction and verification so it might be useful for training?

In [86]:
fever_obj = pd.read_json("dataset/FEVER/train.jsonl", lines=True)

In [87]:
fever_obj

Unnamed: 0,id,verifiable,label,claim,evidence
0,75397,VERIFIABLE,SUPPORTS,Nikolaj Coster-Waldau worked with the Fox Broa...,"[[[92206, 104971, Nikolaj_Coster-Waldau, 7], [..."
1,150448,VERIFIABLE,SUPPORTS,Roman Atwood is a content creator.,"[[[174271, 187498, Roman_Atwood, 1]], [[174271..."
2,214861,VERIFIABLE,SUPPORTS,"History of art includes architecture, dance, s...","[[[255136, 254645, History_of_art, 2]]]"
3,156709,VERIFIABLE,REFUTES,Adrienne Bailon is an accountant.,"[[[180804, 193183, Adrienne_Bailon, 0]]]"
4,83235,NOT VERIFIABLE,NOT ENOUGH INFO,System of a Down briefly disbanded in limbo.,"[[[100277, None, None, None]]]"
...,...,...,...,...,...
145444,75062,VERIFIABLE,REFUTES,Led Zeppelin released an eponymous debut album...,"[[[91851, 104659, Led_Zeppelin, 6], [91851, 10..."
145445,149256,VERIFIABLE,SUPPORTS,Taal was romantic.,"[[[292586, 285327, Taal_-LRB-film-RRB-, 0]], [..."
145446,13287,VERIFIABLE,SUPPORTS,Her stars American actress Rooney Mara.,"[[[28520, 34848, Her_-LRB-film-RRB-, 3], [2852..."
145447,13114,VERIFIABLE,SUPPORTS,J. R. R. Tolkien created Gimli.,"[[[28359, 34669, Gimli_-LRB-Middle-earth-RRB-,..."


In [88]:
fever_obj["label"].value_counts()

SUPPORTS           80035
NOT ENOUGH INFO    35639
REFUTES            29775
Name: label, dtype: int64

#### LIAR Dataset
Column names are missing, labels are weird (half-true, pants-fire, etc.), consists somewhat irrelevant information.

In [89]:
liar_dataset = pd.read_csv("dataset/liar/train.tsv", sep ='\t', header=None)

In [90]:
liar_dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10235,5473.json,mostly-true,There are a larger number of shark attacks in ...,"animals,elections",aclu-florida,,Florida,none,0.0,1.0,1.0,1.0,0.0,"interview on ""The Colbert Report"""
10236,3408.json,mostly-true,Democrats have now become the party of the [At...,elections,alan-powell,,Georgia,republican,0.0,0.0,0.0,1.0,0.0,an interview
10237,3959.json,half-true,Says an alternative to Social Security that op...,"retirement,social-security",herman-cain,,Georgia,republican,4.0,11.0,5.0,3.0,3.0,a Republican presidential debate
10238,2253.json,false,On lifting the U.S. Cuban embargo and allowing...,"florida,foreign-policy",jeff-greene,,Florida,democrat,3.0,1.0,3.0,0.0,0.0,a televised debate on Miami's WPLG-10 against ...


In [91]:
# very weird labels
liar_dataset[1].value_counts()

half-true      2114
false          1995
mostly-true    1962
true           1676
barely-true    1654
pants-fire      839
Name: 1, dtype: int64

#### FakeNews
Formatted greatly, does not require that much of a preprocess.

In [92]:
fake_news = pd.read_csv("dataset/fake_news/train.csv")

In [93]:
fake_news

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [94]:
fake_news['label'].value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [95]:
fake_news_clean = fake_news.drop(columns=['id', 'author'])
fake_news_clean

Unnamed: 0,title,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Rapper T. I. unloaded on black celebrities who...,0
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",When the Green Bay Packers lost to the Washing...,0
20797,Macy’s Is Said to Receive Takeover Approach by...,The Macy’s of today grew from the union of sev...,0
20798,"NATO, Russia To Hold Parallel Exercises In Bal...","NATO, Russia To Hold Parallel Exercises In Bal...",1


#### Fake and Real News Dataset
Formatted greatly, includes subject column that gives info about the text. 21k True 23k False

In [96]:
fake_and_real_T = pd.read_csv("dataset/fake_and_real_news_dataset/True.csv")
fake_and_real_F = pd.read_csv("dataset/fake_and_real_news_dataset/Fake.csv")

In [97]:
fake_and_real_T

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"


In [98]:
fake_and_real_T['label'] = 1

In [99]:
fake_and_real_F

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"


In [100]:
fake_and_real_F['label'] = 0

In [101]:
fake_and_real_combined = pd.concat([fake_and_real_T, fake_and_real_F]).sample(frac=1).reset_index(drop=True)
fake_and_real_combined.drop(columns='date', inplace=True)
fake_and_real_combined

Unnamed: 0,title,text,subject,label
0,Maddow: Trump’s Entire Campaign Is One Big Mo...,Rachel Maddow never pulls any punches when it ...,News,0
1,NC Students Pull Racist Prank On Their Classm...,Students at a North Carolina high school have ...,News,0
2,WHOA: It Appears Trump Leaked Classified CIA ...,"I just want people to know, the CIA was hacke...",News,0
3,London's Angel station reopens after suspect p...,LONDON (Reuters) - British police said they ca...,worldnews,1
4,Spanish foreign minister calls Catalonia's lea...,PARIS (Reuters) - The independence speech made...,worldnews,1
...,...,...,...,...
44893,Trump denies seeking nearly tenfold increase i...,WASHINGTON (Reuters) - U.S. President Donald T...,worldnews,1
44894,KARMA: GAY PASTOR SUES WHOLE FOODS For “Anti-G...,Too bad for the gay pastor that Whole Foods ha...,left-news,0
44895,Macron may see 'slackers' become protest rally...,PARIS (Reuters) - President Emmanuel Macron s ...,worldnews,1
44896,Texas Super-Dolt Louie Gohmert SAVAGED For Li...,Trump is lighting the Internet on fire with hi...,News,0


### Formatting the DataFrames and Combining

* Our three datasets do not share the same informatics, some of them have very distinct columns (LIAR) while some of them only have text and label (Fake and Real dataset).
* Since most of the unnecessary information does not represent our data that good, we will only focus on text and label while we apply preprocessing and other operations on them

#### LIAR

TODO:
* Give some insight about the dataset
* Explain why we are dropping the columns

In [102]:
liar_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [103]:
liar_dataset.shape

(10240, 14)

In [104]:
liar_dataset.dtypes

0      object
1      object
2      object
3      object
4      object
5      object
6      object
7      object
8     float64
9     float64
10    float64
11    float64
12    float64
13     object
dtype: object

##### Dropping NaN values

In [105]:
liar_dataset.isnull().sum()

0        0
1        0
2        0
3        2
4        2
5     2897
6     2208
7        2
8        2
9        2
10       2
11       2
12       2
13     102
dtype: int64

In [106]:
liar_dataset.dropna(inplace=True)
liar_dataset.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64

##### Renaming the columns

The original column names weren't available in the csv file but it's available in the README file of the LIAR dataset. For the sake of better insight about the data we will re-map the columns into their original names

In [107]:
# liar dataset
column_mapping = {
    0: 'statement_ID',
    1: 'label',
    2: 'statement',
    3: 'subject',
    4: 'speaker',
    5: 'speaker_job_title',
    6: 'state_info',
    7: 'party_affiliation',
    8: 'barely_true_counts',
    9: 'false_counts',
    10: 'half_true_counts',
    11: 'mostly_true_counts',
    12: 'pants_on_fire_counts',
    13: 'context'
}

liar_dataset = liar_dataset.rename(columns=column_mapping)
liar_dataset.head(n=2)

Unnamed: 0,statement_ID,label,statement,subject,speaker,speaker_job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.


##### Dropping irrelevant columns
Explain why we are dropping these columns

In [108]:
drop_columns = ['statement_ID', 'speaker_job_title', 'state_info', 'party_affiliation',
                "barely_true_counts", "false_counts", "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "speaker", "context", "subject"]

liar_clean = liar_dataset.drop(columns=drop_columns)
liar_clean.rename(columns={'statement':'text'}, inplace=True)
liar_clean


Unnamed: 0,label,text
0,false,Says the Annies List political group supports ...
1,half-true,When did the decline of coal start? It started...
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo..."
5,true,The Chicago Bears have had more starting quart...
7,half-true,I'm the only person on this stage who has work...
...,...,...
10227,pants-fire,"Recently though, the media has reported on tho..."
10228,barely-true,Stopped by Smiley Cookie to pick up some great...
10230,barely-true,The Supreme Courts views are radically out of ...
10231,half-true,"When it comes to the state deficit, Wisconsin ..."


##### Value counts of labels and mapping them as binary
Explain why half-true and barely-true are useless since they will cause problem with other datasets (We need binary classification)

In [109]:
liar_clean['label'].value_counts()

half-true      1413
mostly-true    1363
false          1305
true           1154
barely-true    1052
pants-fire      437
Name: label, dtype: int64

Explain the mapping maybe

In [110]:
liar_clean = liar_clean[~liar_clean['label'].isin(['half-true', 'barely-true'])]
label_mapping = {
    'mostly-true': 1,
    'true': 1,
    'false': 0,
    'pants-fire': 0
}

liar_clean['label'] = liar_clean['label'].map(label_mapping)
liar_clean = liar_clean.reindex(columns=sorted(liar_clean.columns, reverse=True))
liar_clean['label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  liar_clean['label'] = liar_clean['label'].map(label_mapping)


1    2517
0    1742
Name: label, dtype: int64

In [111]:
liar_clean

Unnamed: 0,text,label
0,Says the Annies List political group supports ...,0
2,"Hillary Clinton agrees with John McCain ""by vo...",1
5,The Chicago Bears have had more starting quart...,1
9,Says GOP primary opponents Glenn Grothman and ...,1
10,"For the first time in history, the share of th...",1
...,...,...
10222,"For the first time since the Korean War, total...",1
10223,Says Rick Perry turned down our invitation to ...,1
10226,The proudest accomplishment (of my tenure) was...,1
10227,"Recently though, the media has reported on tho...",0


#### Fake and Real News

TODO:
* Give some insight about the dataset
* Explain why we are dropping the columns

In [112]:
fake_and_real_combined.head(5)

Unnamed: 0,title,text,subject,label
0,Maddow: Trump’s Entire Campaign Is One Big Mo...,Rachel Maddow never pulls any punches when it ...,News,0
1,NC Students Pull Racist Prank On Their Classm...,Students at a North Carolina high school have ...,News,0
2,WHOA: It Appears Trump Leaked Classified CIA ...,"I just want people to know, the CIA was hacke...",News,0
3,London's Angel station reopens after suspect p...,LONDON (Reuters) - British police said they ca...,worldnews,1
4,Spanish foreign minister calls Catalonia's lea...,PARIS (Reuters) - The independence speech made...,worldnews,1


In [113]:
fake_and_real_combined.isnull().sum()

title      0
text       0
subject    0
label      0
dtype: int64

In [114]:
drop_columns = ['title','subject']
fake_and_real_clean = fake_and_real_combined.drop(columns=drop_columns)
fake_and_real_clean.head(n=5)

Unnamed: 0,text,label
0,Rachel Maddow never pulls any punches when it ...,0
1,Students at a North Carolina high school have ...,0
2,"I just want people to know, the CIA was hacke...",0
3,LONDON (Reuters) - British police said they ca...,1
4,PARIS (Reuters) - The independence speech made...,1


#### FakeNews

TODO:
* Give some insight about the dataset
* Explain why we are dropping the columns

In [115]:
# fakenews 
fake_news_clean.head(5)

Unnamed: 0,title,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1


##### Dropping NaN values

In [116]:
fake_news_clean.isnull().sum()

title    558
text      39
label      0
dtype: int64

In [117]:
fake_news_clean.dropna(inplace=True)
fake_news_clean.isnull().sum()

title    0
text     0
label    0
dtype: int64

##### Dropping irrelevant columns

In [118]:
drop_columns = ['title']
fake_news_clean = fake_news_clean.drop(columns=drop_columns)

In [119]:
fake_news_clean.head(n=5)

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


#### Combining all three datasets

In [120]:
text_dataset = pd.concat([liar_clean, fake_and_real_clean, fake_news_clean], ignore_index=True)
text_dataset

Unnamed: 0,text,label
0,Says the Annies List political group supports ...,0
1,"Hillary Clinton agrees with John McCain ""by vo...",1
2,The Chicago Bears have had more starting quart...,1
3,Says GOP primary opponents Glenn Grothman and ...,1
4,"For the first time in history, the share of th...",1
...,...,...
69355,Rapper T. I. unloaded on black celebrities who...,0
69356,When the Green Bay Packers lost to the Washing...,0
69357,The Macy’s of today grew from the union of sev...,0
69358,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [121]:
text_dataset.isnull().sum()

text     0
label    0
dtype: int64

### Data Preprocessing

* Explain why we are removing punctuations, making them lowercase, removing stop words, clean them and potentially why we vectorize/tokenize them.

In [122]:
text_dataset.sample(n=5)

Unnamed: 0,text,label
54729,A combat veteran with PTSD wasn’t allowed to f...,1
16280,SYDNEY (Reuters) - The United States will appl...,1
66266,\nAs if Hillary Clinton was not already facing...,1
39174,We all know how much conservatives love to whi...,0
56619,.@nikkihaley: ”The president is the CEO of the...,0


###