In [210]:
import pandas as pd
import numpy as np

In [98]:
def load_csv(part):
    columns = ["id", "label", "text", "subject", "speaker", "job", "state", "party", "int0", "int1", "int2", "int3", "int4", "context"]
    return pd.read_csv("./liar_dataset/" + part + ".tsv", sep="\t", names=columns)

In [44]:
test = load_csv("test")

In [45]:
def sort_label(label):
    if label in ["pants-fire", "false", "barely-true"]:
        return "false"
    else:
        return "true"
    
def clean_df(df):
    # 6 labels pants-fire, false, barely-true, half-true, mostly-true, true
    df["label"] = df["label"].apply(lambda label: sort_label(label))
    
    # per https://arxiv.org/pdf/1905.04749.pdf they only use text and label
    filtered_df=df.loc[:, ["id", "label", "text"]]
    
    return filtered_df

In [46]:
clean_df(test)

Unnamed: 0,id,label,text
0,11972.json,true,Building a wall on the U.S.-Mexico border will...
1,11685.json,false,Wisconsin is on pace to double the number of l...
2,11096.json,false,Says John McCain has done nothing to help the ...
3,5209.json,true,Suzanne Bonamici supports a plan that will cut...
4,9524.json,false,When asked by a reporter whether hes at the ce...
...,...,...,...
1262,7334.json,true,Says his budget provides the highest state fun...
1263,9788.json,false,Ive been here almost every day.
1264,10710.json,false,"In the early 1980s, Sen. Edward Kennedy secret..."
1265,3186.json,false,Says an EPA permit languished under Strickland...


In [40]:
for part in ["train", "test", "valid"]:
    df = load_csv(part)
    df = clean_df(df)
    df.to_csv("./liar_dataset/clean_" + part + ".csv")

## Extract triples with demo

In [291]:
def bug_clean(df):
    condition = np.logical_and(df.triple.notna(), df.text.isna())
    indxs = df[condition]["Unnamed: 0"].values
    triples = df[condition]["triple"].values
    df.loc[indxs, "triple"] = triples
    df.drop(df.index[condition], inplace=True)
    return df

In [292]:
from glob import glob

In [293]:
paths = glob("./liar_dataset_triples/*")

In [294]:
main_df = pd.DataFrame()
for path in paths:
    if not path.__contains__("train"):
        continue
    curr_df = pd.read_csv(path)
    curr_df = bug_clean(curr_df)
    main_df = pd.concat([main_df, curr_df])

In [None]:
main_df =

In [295]:
main_df.to_csv("./liar_dataset_triples/main_train_clean_triples.csv")

In [296]:
def percent_triples(df):
    return 1 - sum(df.triple.isna()) / df.shape[0]

def avg_word_count(df):
    tot = 0
    for sent in df.text.values:
        tot += len(sent.split(" "))
    return tot / df.shape[0]

In [297]:
paths = glob("./liar_dataset_triples/*")

In [298]:
drops = []
for indx, path in enumerate(paths):
    if path.__contains__("clean_train_triple"):
        drops.append(indx)
paths = np.delete(paths, drops)

In [315]:
for path in paths:
    name = "train"
    if path.__contains__("test"):
        name = "test"
    if path.__contains__("valid"):
        name = "valid"
    df = pd.read_csv(path)    
    df = bug_clean(df)
    p_triples = percent_triples(df)
    sent_size = avg_word_count(df)
    print(f"dataset: {name}. avg sent length: {sent_size:.2f} words. percent triples: {p_triples*100:.2f}%.")

dataset: test. avg sent length: 18.24 words. percent triples: 11.68%.
dataset: valid. avg sent length: 17.93 words. percent triples: 11.06%.
dataset: train. avg sent length: 17.97 words. percent triples: 3.39%.


In [300]:
df1 = pd.read_csv(paths[0])

In [303]:
df1.isna().sum()

Unnamed: 0         0
Unnamed: 0.1       0
id                 0
label              0
text               0
triple          1119
dtype: int64

In [317]:
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'id', 'label', 'text',
       'triple', 'Unnamed: 0.1.1.1', 'Unnamed: 0.1.1.1.1'],
      dtype='object')

In [329]:
df.drop(columns=list(set(df.columns.values) ^ set(["id", "label", "text", "triple"]))).drop_duplicates()

Unnamed: 0,id,label,text,triple
0,2635.json,False,Says the Annies List political group supports ...,
1,10540.json,True,When did the decline of coal start? It started...,
2,324.json,True,Hillary Clinton agrees with John McCain by vot...,
3,1123.json,False,Health care reform legislation is likely to ma...,
4,9028.json,True,The economic turnaround started at the end of ...,
...,...,...,...,...
10234,1592.json,True,"Under the ruling of the Supreme Court, any lob...",
10235,5473.json,True,There are a larger number of shark attacks in ...,
10236,3408.json,True,Democrats have now become the party of the Atl...,
10237,3959.json,True,Says an alternative to Social Security that op...,"[['County', 'location/hud_county_place/county'..."


In [321]:
df.columns 

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

['Unnamed: 0.1',
 'Unnamed: 0.1.1',
 'Unnamed: 0',
 'Unnamed: 0.1.1.1.1',
 'Unnamed: 0.1.1.1']