In [1]:
import pandas as pd
import numpy as np
from glob import glob

# LIAR DATASET

In [98]:
def load_csv(part):
    columns = ["id", "label", "text", "subject", "speaker", "job", "state", "party", "int0", "int1", "int2", "int3", "int4", "context"]
    return pd.read_csv("./liar_dataset/" + part + ".tsv", sep="\t", names=columns)

In [44]:
test = load_csv("test")

In [45]:
def sort_label(label):
    if label in ["pants-fire", "false", "barely-true"]:
        return "false"
    else:
        return "true"
    
def clean_df(df):
    # 6 labels pants-fire, false, barely-true, half-true, mostly-true, true
    df["label"] = df["label"].apply(lambda label: sort_label(label))
    
    # per https://arxiv.org/pdf/1905.04749.pdf they only use text and label
    filtered_df=df.loc[:, ["id", "label", "text"]]
    
    return filtered_df

In [46]:
clean_df(test)

Unnamed: 0,id,label,text
0,11972.json,true,Building a wall on the U.S.-Mexico border will...
1,11685.json,false,Wisconsin is on pace to double the number of l...
2,11096.json,false,Says John McCain has done nothing to help the ...
3,5209.json,true,Suzanne Bonamici supports a plan that will cut...
4,9524.json,false,When asked by a reporter whether hes at the ce...
...,...,...,...
1262,7334.json,true,Says his budget provides the highest state fun...
1263,9788.json,false,Ive been here almost every day.
1264,10710.json,false,"In the early 1980s, Sen. Edward Kennedy secret..."
1265,3186.json,false,Says an EPA permit languished under Strickland...


In [40]:
for part in ["train", "test", "valid"]:
    df = load_csv(part)
    df = clean_df(df)
    df.to_csv("./liar_dataset/clean_" + part + ".csv")

## Post processing
Clean output from triple extractor

In [8]:
def bug_clean(df):
    condition = np.logical_and(df.triple.notna(), df.text.isna())
    indxs = df[condition]["Unnamed: 0"].values
    triples = df[condition]["triple"].values
    df.loc[indxs, "triple"] = triples
    df.drop(df.index[condition], inplace=True)
    return df

In [332]:
paths = glob("./liar_dataset_triples/*")

In [333]:
main_df = pd.DataFrame()
for path in paths:
    if not path.__contains__("train"):
        continue
    curr_df = pd.read_csv(path)
    curr_df = bug_clean(curr_df)
    main_df = pd.concat([main_df, curr_df])

In [334]:
main_df = main_df.drop(columns=list(set(main_df.columns.values) ^ set(["id", "label", "text", "triple"]))).drop_duplicates()

In [335]:
main_df.to_csv("./liar_dataset_triples/main_train_clean_triples.csv")

In [4]:
def percent_triples(df):
    return 1 - sum(df.triple.isna()) / df.shape[0]

def avg_word_count(df):
    tot = 0
    for sent in df.text.values:
        tot += len(sent.split(" "))
    return tot / df.shape[0]

In [5]:
paths = glob("./liar_dataset_triples/*")

In [6]:
drops = []
for indx, path in enumerate(paths):
    if path.__contains__("clean_train_triple"):
        drops.append(indx)
paths = np.delete(paths, drops)

In [9]:
for path in paths:
    name = "train"
    if path.__contains__("test"):
        name = "test"
    if path.__contains__("valid"):
        name = "valid"
    df = pd.read_csv(path)    
    df = bug_clean(df)
    p_triples = percent_triples(df)
    sent_size = avg_word_count(df)
    print(f"dataset: {name}. avg sent length: {sent_size:.2f} words. percent triples: {p_triples*100:.2f}%.")

dataset: test. avg sent length: 18.24 words. percent triples: 11.68%.
dataset: valid. avg sent length: 17.93 words. percent triples: 11.06%.
dataset: train. avg sent length: 17.97 words. percent triples: 3.39%.


In [10]:
df

Unnamed: 0.1,Unnamed: 0,id,label,text,triple
0,0,2635.json,False,Says the Annies List political group supports ...,
1,1,10540.json,True,When did the decline of coal start? It started...,
2,2,324.json,True,Hillary Clinton agrees with John McCain by vot...,
3,3,1123.json,False,Health care reform legislation is likely to ma...,
4,4,9028.json,True,The economic turnaround started at the end of ...,
...,...,...,...,...,...
10234,2434,1592.json,True,"Under the ruling of the Supreme Court, any lob...",
10235,2435,5473.json,True,There are a larger number of shark attacks in ...,
10236,2436,3408.json,True,Democrats have now become the party of the Atl...,
10237,2437,3959.json,True,Says an alternative to Social Security that op...,"[['County', 'location/hud_county_place/county'..."


In [137]:
import requests 
import urllib
from bs4 import BeautifulSoup

# desktop user-agent
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
# mobile user-agent
MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"

def google(query):
    query = query.replace(' ', '+')
    URL = f"https://google.com/search?q={query}"
    
    headers = {"user-agent" : USER_AGENT}
    resp = requests.get(URL, headers=headers)
    
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        for g in soup.find_all("div", {"class": "g"}):
            titles = g.find_all("h3")
            if titles:
                text = titles[0].text
                if "Wikipedia" in text:
                    return text[:-12]
    return str(resp.status_code)

In [136]:
query = "Obama"
query = query.replace(' ', '+')
URL = f"https://google.com/search?q={query}"

headers = {"user-agent" : USER_AGENT}
resp = requests.get(URL, headers=headers)

if resp.status_code == 200:
    soup = BeautifulSoup(resp.content, "html.parser")
    for g in soup.find_all("div", {"class": "g"}):
        titles = g.find_all("h3")
        if titles:
            text = titles[0].text
            if "Wikipedia" in text:
                 print(text[:-12])
else:
    print(resp)

<Response [429]>


In [138]:
google("obama")

'429'

In [6]:
#!pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

In [7]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


def fb_to_common(freebase_id):
    endpoint_url = "https://query.wikidata.org/sparql"

    query = \
    '''SELECT ?sLabel WHERE { 
        ?s wdt:P646 "''' + freebase_id + '''".
        SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    LIMIT 1'''
    res = get_results(endpoint_url, query)
    try:
        res = res['results']['bindings'][0]['sLabel']['value']
        return res
    except:
        return "No result"

def common_to_fb(common_name):
    endpoint_url = "https://query.wikidata.org/sparql"

    query = '''
    SELECT ?fbid WHERE { 
        ?s wdt:P373 "''' + common_name + '''".
    OPTIONAL {
        ?s wdt:P646 ?fbid .
        }
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    LIMIT 1'''
    res = get_results(endpoint_url, query)
    try:
        res = res['results']['bindings'][0]['fbid']['value']
        return res
    except:
        return common_name + "-1"

In [118]:
train = pd.read_csv("./liar_dataset_triples/train.csv", index_col=0)
valid = pd.read_csv("./liar_dataset_triples/valid.csv", index_col=0)
test = pd.read_csv("./liar_dataset_triples/test.csv", index_col=0)

In [123]:
from tqdm.notebook import tqdm

def google_triples(df):
    # clean df
    df = df.dropna()
    df.drop_duplicates(inplace=True)
    df.triple = df.triple.apply(lambda x: eval(x))
    df = df.reset_index()
    # iterate list of list 
    for i, trip_list in enumerate(tqdm(df.triple.values)):
        fb_triples = []
        for trip in trip_list:
            h, r, t = trip
            h = google(h)
            t = google(t)
            fb_triples.append([h,r,t])
        df.at[i, 'triple'] = fb_triples
    return df

In [125]:
fb_train

Unnamed: 0,index,id,label,text,triple
0,491,6873.json,False,This is what President Obama said the jobless ...,"[[No result, people/person/employment_history...."
1,493,7976.json,True,Says President Obama has cracked down on emplo...,"[[No result, people/person/employment_history...."
2,503,2354.json,True,Says State Rep. Kristi Thibaut was an ACORN lo...,"[[No result, organization/role/leaders./organi..."
3,2484,980.json,False,Secretary Geithner has left the option on the ...,"[[No result, people/person/employment_history...."
4,2486,6457.json,False,Says U.S. Rep. Martin Heinrich spent a trillio...,"[[No result, people/person/employment_history...."
...,...,...,...,...,...
342,2411,1550.json,True,Debt has almost doubled in Austin under Gov. P...,"[[No result, people/person/employment_history...."
343,2419,6032.json,True,Georgia Public Service Commission member Stan ...,"[[No result, organization/role/leaders./organi..."
344,2423,3744.json,True,Says Rick Perry turned down our invitation to ...,"[[No result, people/person/employment_history...."
345,2433,4388.json,False,Mayor Fung wants to punish our childrens educa...,"[[No result, people/person/employment_history...."


In [124]:
fb_train = google_triples(train)
#fb_train.to_csv("./liar_dataset_triples/fb_train.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


HBox(children=(FloatProgress(value=0.0, max=347.0), HTML(value='')))




# FakeNewsNet 

In [449]:
paths = glob("./fakenewsnet/*/*/*")

In [450]:
df = pd.DataFrame(columns=["text", "source", "label"])

In [451]:
def get_content(data):
    text = data['text']
    label = "false" if (path.split("\\")[-2] == "FakeNewsContent") else "true"
    source = path.split("\\")[1]
    return {"text":text, "source": source, "label": label}

In [452]:
from tqdm import tqdm

for path in tqdm(paths):
    with open(path) as f:
        data = json.load(f)
    content = get_content(data)
    df = df.append(content, ignore_index=True)


  0%|                                                                                          | 0/422 [00:00<?, ?it/s][A
 14%|██████████▊                                                                     | 57/422 [00:00<00:00, 565.77it/s][A
 28%|█████████████████████▉                                                         | 117/422 [00:00<00:00, 584.73it/s][A
 42%|█████████████████████████████████▏                                             | 177/422 [00:00<00:00, 587.06it/s][A
 56%|████████████████████████████████████████████▏                                  | 236/422 [00:00<00:00, 586.53it/s][A
 70%|███████████████████████████████████████████████████████▍                       | 296/422 [00:00<00:00, 587.65it/s][A
 84%|██████████████████████████████████████████████████████████████████▍            | 355/422 [00:00<00:00, 581.21it/s][A
100%|███████████████████████████████████████████████████████████████████████████████| 422/422 [00:00<00:00, 580.73it/s][A


In [460]:
buzzfeed = df[df.source == "BuzzFeed"]
politifact = df[df.source == "PolitiFact"]
print(f"""
    BuzzFeed:
    num articles: {buzzfeed.shape[0]},
        false: {sum(buzzfeed.label == "false")},
        true: {sum(buzzfeed.label == "true")}
    
    PolitiFact:
    num articles: {politifact.shape[0]},
        false: {sum(politifact.label == "false")},
        true: {sum(politifact.label == "true")}
""")


    BuzzFeed:
    num articles: 182,
        false: 91,
        true: 91
    
    PolitiFact:
    num articles: 240,
        false: 120,
        true: 120



In [461]:
df.to_csv("./fakenewsnet/clean_fakenewsnet.csv")

## Post processing
Clean output from triple extractor

In [463]:
paths = glob("./fakenewsnet_triples/*")

In [464]:
df = pd.read_csv(paths[0])

In [471]:
eval(df.triple[1])

[['Bill Clinton',
  'people/person/employment_history./business/employment_tenure/title',
  'President'],
 ['Marsha Blackburn',
  'people/person/employment_history./business/employment_tenure/title',
  'Rep.'],
 ['Roger Bate',
  'organization/role/leaders./organization/leadership/person',
  'American Enterprise Institute'],
 ['Hillary Clintons campaign.Blackburn',
  'people/person/employment_history./business/employment_tenure/title',
  'candidate'],
 ['J. Rosenstein',
  'organization/role/leaders./organization/leadership/person',
  'District of Maryland Rod'],
 ['J. Rosenstein',
  'people/person/employment_history./business/employment_tenure/title',
  'Attorney'],
 ['Barack Obamas',
  'people/person/employment_history./business/employment_tenure/title',
  'President']]

# Fake or Real

In [97]:
paths = glob("./fake-or-real/*")

In [98]:
pd.read_csv(paths[0])