In [3]:
import pandas as pd
import numpy as np
from glob import glob

# LIAR DATASET

In [98]:
def load_csv(part):
    """Read one split of the LIAR dataset into a DataFrame.

    Parameters
    ----------
    part : str
        File stem of the split; the file read is ``./liar_dataset/<part>.tsv``.

    Returns
    -------
    pandas.DataFrame
        The tab-separated file with the 14 column names listed below
        assigned to it (names are supplied here, not read from the file).
    """
    column_names = [
        "id", "label", "text", "subject", "speaker", "job", "state", "party",
        "int0", "int1", "int2", "int3", "int4", "context",
    ]
    tsv_path = "./liar_dataset/" + part + ".tsv"
    return pd.read_csv(tsv_path, sep="\t", names=column_names)

In [44]:
# Load the "test" split (./liar_dataset/test.tsv) into a frame named `test`.
test = load_csv("test")

In [45]:
def sort_label(label):
    """Collapse a six-way LIAR label into the string "false" or "true".

    "pants-fire", "false" and "barely-true" map to "false"; every other
    value (e.g. "half-true", "mostly-true", "true") maps to "true".
    """
    if label in ["pants-fire", "false", "barely-true"]:
        return "false"
    else:
        return "true"


def clean_df(df):
    """Return a new frame holding only ``id``, ``label`` and ``text``.

    The ``label`` column of the result is binarised with ``sort_label``.
    The frame passed in is left untouched: the selected columns are copied
    before the labels are rewritten, so calling this on a raw split does not
    change that split's original six-way labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``id``, ``label`` and ``text``.

    Returns
    -------
    pandas.DataFrame
        Independent copy with columns ``["id", "label", "text"]``.
    """
    # per https://arxiv.org/pdf/1905.04749.pdf they only use text and label
    filtered_df = df.loc[:, ["id", "label", "text"]].copy()

    # 6 labels pants-fire, false, barely-true, half-true, mostly-true, true
    filtered_df["label"] = filtered_df["label"].map(sort_label)

    return filtered_df

In [46]:
# Display the result of cleaning the `test` frame (id, label, text only).
clean_df(test)

Unnamed: 0,id,label,text
0,11972.json,true,Building a wall on the U.S.-Mexico border will...
1,11685.json,false,Wisconsin is on pace to double the number of l...
2,11096.json,false,Says John McCain has done nothing to help the ...
3,5209.json,true,Suzanne Bonamici supports a plan that will cut...
4,9524.json,false,When asked by a reporter whether hes at the ce...
...,...,...,...
1262,7334.json,true,Says his budget provides the highest state fun...
1263,9788.json,false,Ive been here almost every day.
1264,10710.json,false,"In the early 1980s, Sen. Edward Kennedy secret..."
1265,3186.json,false,Says an EPA permit languished under Strickland...


In [40]:
# Clean each split and write it to ./liar_dataset/clean_<split>.csv.
for part in ("train", "test", "valid"):
    df = clean_df(load_csv(part))
    # to_csv is called with its defaults, so the row index is written as the first column.
    df.to_csv("./liar_dataset/clean_" + part + ".csv")

## Post-processing
Clean the output of the triple extractor.

In [8]:
def bug_clean(df):
    """Move stray triples onto their target rows and drop the stray rows.

    A row counts as stray when it has a ``triple`` but no ``text``. For every
    stray row, its ``triple`` is written into the row whose index label equals
    the stray row's ``"Unnamed: 0"`` value; the stray rows are then removed.

    NOTE(review): this presumes ``"Unnamed: 0"`` holds index labels of ``df``
    -- confirm against the extractor's output format.

    The frame is modified in place and the same object is returned.
    """
    stray_mask = df.triple.notna() & df.text.isna()
    stray_rows = df[stray_mask]
    target_labels = stray_rows["Unnamed: 0"].values
    # Label-based assignment: one triple per target label, in matching order.
    df.loc[target_labels, "triple"] = stray_rows["triple"].values
    df.drop(df.index[stray_mask], inplace=True)
    return df

In [332]:
# List every entry directly under ./liar_dataset_triples/ (glob gives no ordering guarantee).
paths = glob("./liar_dataset_triples/*")

In [333]:
# Collect the cleaned frame of every path containing "train" and concatenate
# once at the end. Concatenating inside the loop would re-copy the accumulated
# frame on each iteration and would start from an empty DataFrame.
train_frames = []
for path in paths:
    if "train" not in path:
        continue
    curr_df = pd.read_csv(path)
    curr_df = bug_clean(curr_df)
    train_frames.append(curr_df)

# pd.concat raises on an empty list; keep an empty frame when nothing matched.
main_df = pd.concat(train_frames) if train_frames else pd.DataFrame()

In [334]:
# Keep only id / label / text / triple (in the frame's existing column order)
# and drop exact duplicate rows. The wanted columns are checked explicitly so
# that a missing one is reported by name, rather than surfacing as a failed
# attempt to drop a column that does not exist.
wanted_columns = ["id", "label", "text", "triple"]
missing_columns = [col for col in wanted_columns if col not in main_df.columns]
if missing_columns:
    raise KeyError(f"main_df is missing expected columns: {missing_columns}")
main_df = main_df.loc[:, main_df.columns.isin(wanted_columns)].drop_duplicates()

In [335]:
# Write main_df to disk. The file name contains "train" and the file lands in
# ./liar_dataset_triples/, so any later glob of that folder will list it as well.
main_df.to_csv("./liar_dataset_triples/main_train_clean_triples.csv")

In [4]:
def percent_triples(df):
    """Return the fraction (between 0 and 1) of rows whose ``triple`` is not missing.

    Despite the name, the value is a fraction; multiply by 100 for a
    percentage. An empty frame yields 0.0 instead of a division by zero.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        return 0.0
    # Vectorised count of missing triples instead of summing element by element.
    return 1 - df.triple.isna().sum() / n_rows

def avg_word_count(df):
    """Return the mean number of space-separated tokens per ``text`` entry.

    Missing ``text`` values are skipped (they are not strings and cannot be
    split) and do not count towards the average. Returns 0.0 when there is
    no text at all, instead of dividing by zero.
    """
    texts = df.text.dropna()
    if texts.empty:
        return 0.0
    # split(" ") rather than split(): runs of spaces still produce empty
    # tokens, so the counting rule is unchanged for frames without gaps.
    total_words = sum(len(sentence.split(" ")) for sentence in texts)
    return total_words / len(texts)

In [5]:
# List ./liar_dataset_triples/ again; this listing reflects whatever files exist there now.
paths = glob("./liar_dataset_triples/*")

In [6]:
# Exclude every path containing "clean_train_triple". Filtering with a list
# comprehension keeps `paths` a plain list of str; np.delete would convert it
# into a NumPy string array.
paths = [path for path in paths if "clean_train_triple" not in path]

In [9]:
# Print average sentence length and triple coverage for each path in `paths`.
for path in paths:
    # The split name is inferred from the path: "valid" wins over "test",
    # and anything matching neither is reported as "train".
    if "valid" in path:
        name = "valid"
    elif "test" in path:
        name = "test"
    else:
        name = "train"
    df = bug_clean(pd.read_csv(path))
    p_triples = percent_triples(df)
    sent_size = avg_word_count(df)
    print(f"dataset: {name}. avg sent length: {sent_size:.2f} words. percent triples: {p_triples*100:.2f}%.")

dataset: test. avg sent length: 18.24 words. percent triples: 11.68%.
dataset: valid. avg sent length: 17.93 words. percent triples: 11.06%.
dataset: train. avg sent length: 17.97 words. percent triples: 3.39%.


In [10]:
# Display `df`; going by the execution counts this is the frame left by the last iteration of the statistics loop.
df

Unnamed: 0.1,Unnamed: 0,id,label,text,triple
0,0,2635.json,False,Says the Annies List political group supports ...,
1,1,10540.json,True,When did the decline of coal start? It started...,
2,2,324.json,True,Hillary Clinton agrees with John McCain by vot...,
3,3,1123.json,False,Health care reform legislation is likely to ma...,
4,4,9028.json,True,The economic turnaround started at the end of ...,
...,...,...,...,...,...
10234,2434,1592.json,True,"Under the ruling of the Supreme Court, any lob...",
10235,2435,5473.json,True,There are a larger number of shark attacks in ...,
10236,2436,3408.json,True,Democrats have now become the party of the Atl...,
10237,2437,3959.json,True,Says an alternative to Social Security that op...,"[['County', 'location/hud_county_place/county'..."


# FakeNewsNet 

In [449]:
# Every path three levels below ./fakenewsnet/; get_content reads the first
# level as the source name and the second as the content folder.
paths = glob("./fakenewsnet/*/*/*")

In [450]:
# Start from an empty frame with the three columns that get_content produces.
df = pd.DataFrame(columns=["text", "source", "label"])

In [451]:
def get_content(data, source_path=None):
    """Build one ``{"text", "source", "label"}`` record for an article.

    Parameters
    ----------
    data : mapping
        Parsed article; only ``data["text"]`` is read.
    source_path : str, optional
        Path of the file ``data`` came from, laid out as
        ``.../<source>/<content folder>/<file>``. When omitted, the
        notebook-level variable ``path`` is used, which is what existing
        ``get_content(data)`` calls rely on.

    Returns
    -------
    dict
        ``text`` is the article text, ``source`` is the name of the folder two
        levels above the file, and ``label`` is "false" when the file's parent
        folder is "FakeNewsContent" and "true" otherwise.
    """
    if source_path is None:
        # Backward-compatible fallback: the global set by the calling loop.
        source_path = path
    # Treat "\\" and "/" alike so the same path layout parses on Windows and
    # POSIX; components are taken from the end, so a leading "./" is irrelevant.
    parts = source_path.replace("\\", "/").split("/")
    label = "false" if parts[-2] == "FakeNewsContent" else "true"
    source = parts[-3]
    return {"text": data["text"], "source": source, "label": label}

In [452]:
import json

from tqdm import tqdm

# Collect one record per file and build the frame once at the end.
# DataFrame.append copied the whole frame for every row and no longer exists
# in pandas 2.0+.
records = []
# `path` has to stay a notebook-level loop variable: get_content(data) reads it.
for path in tqdm(paths):
    # JSON text is UTF-8 by specification; do not depend on the platform default.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    records.append(get_content(data))
df = pd.DataFrame(records, columns=["text", "source", "label"])


  0%|                                                                                          | 0/422 [00:00<?, ?it/s][A
 14%|██████████▊                                                                     | 57/422 [00:00<00:00, 565.77it/s][A
 28%|█████████████████████▉                                                         | 117/422 [00:00<00:00, 584.73it/s][A
 42%|█████████████████████████████████▏                                             | 177/422 [00:00<00:00, 587.06it/s][A
 56%|████████████████████████████████████████████▏                                  | 236/422 [00:00<00:00, 586.53it/s][A
 70%|███████████████████████████████████████████████████████▍                       | 296/422 [00:00<00:00, 587.65it/s][A
 84%|██████████████████████████████████████████████████████████████████▍            | 355/422 [00:00<00:00, 581.21it/s][A
100%|███████████████████████████████████████████████████████████████████████████████| 422/422 [00:00<00:00, 580.73it/s][A


In [460]:
# Article counts and label balance for each source.
buzzfeed = df[df.source == "BuzzFeed"]
politifact = df[df.source == "PolitiFact"]

buzzfeed_false = sum(buzzfeed.label == "false")
buzzfeed_true = sum(buzzfeed.label == "true")
politifact_false = sum(politifact.label == "false")
politifact_true = sum(politifact.label == "true")

print(f"""
    BuzzFeed:
    num articles: {buzzfeed.shape[0]},
        false: {buzzfeed_false},
        true: {buzzfeed_true}
    
    PolitiFact:
    num articles: {politifact.shape[0]},
        false: {politifact_false},
        true: {politifact_true}
""")


    BuzzFeed:
    num articles: 182,
        false: 91,
        true: 91
    
    PolitiFact:
    num articles: 240,
        false: 120,
        true: 120



In [461]:
# Save the combined frame; to_csv is called with its defaults, so the row index is written too.
df.to_csv("./fakenewsnet/clean_fakenewsnet.csv")

## Post-processing
Clean the output of the triple extractor.

In [463]:
# List every entry directly under ./fakenewsnet_triples/ (glob gives no ordering guarantee).
paths = glob("./fakenewsnet_triples/*")

In [464]:
# Load the first listed file. glob order is not guaranteed, so which file
# this is can differ between machines; fails with IndexError if nothing matched.
df = pd.read_csv(paths[0])

In [471]:
import ast

# Parse the string stored in the `triple` column at row label 1 back into a
# Python list. ast.literal_eval accepts only Python literals, so text read
# from the CSV can never be executed as code (unlike eval).
ast.literal_eval(df.triple[1])

[['Bill Clinton',
  'people/person/employment_history./business/employment_tenure/title',
  'President'],
 ['Marsha Blackburn',
  'people/person/employment_history./business/employment_tenure/title',
  'Rep.'],
 ['Roger Bate',
  'organization/role/leaders./organization/leadership/person',
  'American Enterprise Institute'],
 ['Hillary Clintons campaign.Blackburn',
  'people/person/employment_history./business/employment_tenure/title',
  'candidate'],
 ['J. Rosenstein',
  'organization/role/leaders./organization/leadership/person',
  'District of Maryland Rod'],
 ['J. Rosenstein',
  'people/person/employment_history./business/employment_tenure/title',
  'Attorney'],
 ['Barack Obamas',
  'people/person/employment_history./business/employment_tenure/title',
  'President']]

# Fake or Real

In [446]:
# List every entry directly under ./fake-or-real/ (glob gives no ordering guarantee).
paths = glob("./fake-or-real/*")

In [448]:
# Read and display the first listed file; the result is not assigned to a variable.
pd.read_csv(paths[0])

Unnamed: 0,uuid,ord_in_thread,author,published,title,text,language,crawled,site_url,country,domain_rank,thread_title,spam_score,main_img_url,replies_count,participants_count,likes,comments,shares,type
0,6a175f46bcd24d39b3e962ad0f29936721db70db,0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,english,2016-10-27T01:49:27.168+03:00,100percentfedup.com,US,25689.0,Muslims BUSTED: They Stole Millions In Gov’t B...,0.000,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
1,2bdc29d12605ef9cf3f09f9875040a7113be5d5b,0,reasoning with facts,2016-10-29T08:47:11.259+03:00,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,english,2016-10-29T08:47:11.259+03:00,100percentfedup.com,US,25689.0,Re: Why Did Attorney General Loretta Lynch Ple...,0.000,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
2,c70e149fdd53de5e61c29281100b9de0ed268bc3,0,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,english,2016-10-31T01:41:49.479+02:00,100percentfedup.com,US,25689.0,BREAKING: Weiner Cooperating With FBI On Hilla...,0.000,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
3,7cf7c15731ac2a116dd7f629bd57ea468ed70284,0,Fed Up,2016-11-01T05:22:00.000+02:00,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,english,2016-11-01T15:46:26.304+02:00,100percentfedup.com,US,25689.0,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,0.068,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias
4,0206b54719c7e241ffe0ad4315b808290dbe6c0f,0,Fed Up,2016-11-01T21:56:00.000+02:00,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,2016-11-01T23:59:42.266+02:00,100percentfedup.com,US,25689.0,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,0.865,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12994,f1b5d0e44803f48732bde854a9fdf95837219b12,2,replaceme,2016-10-26T23:58:00.000+03:00,,It DOES allow you to put a dog face on top of ...,english,2016-10-27T00:37:46.194+03:00,zerohedge.com,US,2435.0,"Snapchat To Raise Up To $4 Billion In IPO, Val...",0.000,,40,32,0,0,0,bs
12995,36011ceba3647e1bea78299b68b6fb705a1fc1ad,3,Freedumb,2016-10-27T00:02:00.000+03:00,,Wait till you see what happens to the valuatio...,english,2016-10-27T00:37:46.220+03:00,zerohedge.com,US,2435.0,"Snapchat To Raise Up To $4 Billion In IPO, Val...",0.000,,40,32,0,0,0,bs
12996,6995d1aa9ac99926106489b14b5530e85358059a,4,major major maj...,2016-10-27T00:06:00.000+03:00,,I'm waiting for the one that puts a pussy on m...,english,2016-10-27T00:37:46.244+03:00,zerohedge.com,US,2435.0,"Snapchat To Raise Up To $4 Billion In IPO, Val...",0.000,,40,32,0,0,0,bs
12997,7de8ae90eee164eb756db6c8a3772288e11d7a94,5,beemasters,2016-10-27T00:09:00.000+03:00,,$4 Billion even after they are known to be kee...,english,2016-10-27T00:37:46.247+03:00,zerohedge.com,US,2435.0,"Snapchat To Raise Up To $4 Billion In IPO, Val...",0.000,,40,32,0,0,0,bs
