In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
import string
import matplotlib.pyplot as plt

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation/future warnings raised
# by the libraries used below — consider narrowing the filter by category.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw, semicolon-delimited dataset into the working frame `df`.
raw_data_path = 'dataset/data_liar_plus.csv'
df = pd.read_csv(raw_data_path, sep=';')

In [3]:
# Encode the string labels as integer class ids; the fitted encoder is kept in
# `le_multi` so the id -> label mapping can be recovered later.
#
# Plain column assignment replaces the column outright. The previous
# `df.loc[:, 'label'] = ...` form writes into the existing object-dtype column
# on recent pandas, which leaves the encoded ids stored as Python objects
# instead of producing an integer column.
le_multi = LabelEncoder()
df['label'] = le_multi.fit_transform(df['label'])

In [4]:
# Display the label -> integer id mapping learned by the encoder.
dict(zip(le_multi.classes_, range(len(le_multi.classes_))))

{'barely-true': 0,
 'false': 1,
 'half-true': 2,
 'mostly-true': 3,
 'pants-fire': 4,
 'true': 5}

In [5]:
# Normalise the `statement` text, in this order:
#   1. runs of characters that are neither word characters nor whitespace -> ' '
#   2. lower-case, then trim leading/trailing whitespace
#   3. any token containing a digit -> 'NUM'
#   4. runs of two or more whitespace characters -> ' '
#
# `regex=True` is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently skip every substitution. Patterns are
# raw strings so `\w`, `\d` and `\s` are not parsed as (invalid) string escapes.
# The `.str` accessors pass missing values through instead of raising.
df['statement'] = (
    df['statement']
    .str.replace(r'[^\w\s]+', ' ', regex=True)
    .str.lower()
    .str.strip()
    .str.replace(r'\w*\d+\w*', 'NUM', regex=True)
    .str.replace(r'\s{2,}', ' ', regex=True)
)

In [6]:
# Normalise the `context` text, in this order:
#   1. runs of characters that are neither word characters nor whitespace -> ' '
#   2. coerce every value to `str`, lower-case, trim
#   3. any token containing a digit -> 'NUM'
#   4. runs of two or more whitespace characters -> ' '
#
# `regex=True` is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently skip every substitution. Patterns are
# raw strings so `\w`, `\d` and `\s` are not parsed as (invalid) string escapes.
df['context'] = (
    df['context']
    .str.replace(r'[^\w\s]+', ' ', regex=True)
    # Kept from the original: str() turns a missing context into the literal
    # text 'nan'. NOTE(review): confirm downstream code expects that token.
    .map(str)
    .str.lower()
    .str.strip()
    .str.replace(r'\w*\d+\w*', 'NUM', regex=True)
    .str.replace(r'\s{2,}', ' ', regex=True)
)

In [7]:
# Normalise the `subject` text, in this order:
#   1. runs of characters that are neither word characters nor whitespace -> ' '
#   2. lower-case, then trim leading/trailing whitespace
#   3. any token containing a digit -> 'NUM'
#   4. runs of two or more whitespace characters -> ' '
#
# `regex=True` is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently skip every substitution. Patterns are
# raw strings so `\w`, `\d` and `\s` are not parsed as (invalid) string escapes.
# The `.str` accessors pass missing values through instead of raising.
df['subject'] = (
    df['subject']
    .str.replace(r'[^\w\s]+', ' ', regex=True)
    .str.lower()
    .str.strip()
    .str.replace(r'\w*\d+\w*', 'NUM', regex=True)
    .str.replace(r'\s{2,}', ' ', regex=True)
)

In [8]:
# Fold infrequent speakers into a single bucket: a speaker is kept only when
# more than 11 of their rows have a non-null `id_json`; everyone else
# (including rows with no speaker) is relabelled 'unknown_speaker'.
id_counts_by_speaker = df.groupby('speaker')['id_json'].count()
frequent_speakers = id_counts_by_speaker[id_counts_by_speaker > 11].index
df.loc[~df.speaker.isin(frequent_speakers), 'speaker'] = 'unknown_speaker'

In [9]:
# Append one indicator column per distinct speaker (named `speaker_<value>`).
# The dtype is pinned to uint8 so the columns hold 0/1 on every pandas version;
# pandas >= 2.0 would otherwise emit booleans, and the exported CSV would then
# contain True/False instead of 0/1.
speaker_dummies = pd.get_dummies(df['speaker'], prefix='speaker', dtype='uint8')
df = df.join(speaker_dummies)

In [10]:
# Normalise the `speaker_job` text, in this order:
#   1. runs of characters that are neither word characters nor whitespace -> ' '
#   2. lower-case, then trim leading/trailing whitespace
#   3. any token containing a digit -> 'NUM'
#   4. runs of two or more whitespace characters -> ' '
#
# `regex=True` is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently skip every substitution. Patterns are
# raw strings so `\w`, `\d` and `\s` are not parsed as (invalid) string escapes.
# The `.str` accessors pass missing values through instead of raising.
df['speaker_job'] = (
    df['speaker_job']
    .str.replace(r'[^\w\s]+', ' ', regex=True)
    .str.lower()
    .str.strip()
    .str.replace(r'\w*\d+\w*', 'NUM', regex=True)
    .str.replace(r'\s{2,}', ' ', regex=True)
)

In [11]:
# Canonical state name -> raw spellings that should be rewritten to it.
# Rules are applied in the listed order.
# NOTE(review): the District of Columbia spellings and 'Washington state' are
# all folded into the single value 'Washington' — confirm that is intended.
state_aliases = {
    'noStateInfo': ['None', 'Unknown'],
    'Tennessee': ['Tennesse'],
    'Pennsylvania': ['PA - Pennsylvania'],
    'Rhode Island': ['Rhode island'],
    'Texas': ['Tex'],
    'Virginia': ['Virgiia', 'Virgina', 'Virginia director, Coalition to Stop Gun Violence'],
    'Washington': ['Washington D.C.', 'Washington DC', 'Washington state', 'Washington, D.C.'],
}

for canonical_state, raw_spellings in state_aliases.items():
    df.loc[df.state.isin(raw_spellings), 'state'] = canonical_state

In [12]:
# Append one indicator column per distinct state (named `state_<value>`).
# The dtype is pinned to uint8 so the columns hold 0/1 on every pandas version;
# pandas >= 2.0 would otherwise emit booleans, and the exported CSV would then
# contain True/False instead of 0/1.
state_dummies = pd.get_dummies(df['state'], prefix='state', dtype='uint8')
df = df.join(state_dummies)

In [13]:
# Append one indicator column per distinct party (named `party_<value>`).
# The dtype is pinned to uint8 so the columns hold 0/1 on every pandas version;
# pandas >= 2.0 would otherwise emit booleans, and the exported CSV would then
# contain True/False instead of 0/1.
party_dummies = pd.get_dummies(df['party'], prefix='party', dtype='uint8')
df = df.join(party_dummies)

In [14]:
# Remove the five `*_counts` columns from the working frame.
count_columns = [
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
]
df = df.drop(columns=count_columns)

In [15]:
# Normalise the `justification` text, in this order:
#   1. runs of characters that are neither word characters nor whitespace -> ' '
#   2. lower-case, then trim leading/trailing whitespace
#   3. any token containing a digit -> 'NUM'
#   4. runs of two or more whitespace characters -> ' '
#
# `regex=True` is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently skip every substitution. Patterns are
# raw strings so `\w`, `\d` and `\s` are not parsed as (invalid) string escapes.
# The `.str` accessors pass missing values through instead of raising.
df['justification'] = (
    df['justification']
    .str.replace(r'[^\w\s]+', ' ', regex=True)
    .str.lower()
    .str.strip()
    .str.replace(r'\w*\d+\w*', 'NUM', regex=True)
    .str.replace(r'\s{2,}', ' ', regex=True)
)

In [21]:
# Show the first twelve column names of the working frame.
n_leading_columns = 12
df.columns[:n_leading_columns]

Index(['id_json', 'label', 'statement', 'subject', 'speaker', 'speaker_job',
       'state', 'party', 'context', 'justification', 'is_fake', 'data_type'],
      dtype='object')

In [23]:
# Row labels of each partition, selected by the value in `data_type`.
train_index = df.index[df.data_type == 'train']
val_index = df.index[df.data_type == 'val']
test_index = df.index[df.data_type == 'test']

# Columns written to the per-partition files (the one-hot columns are left out).
cols_to_use = [
    'label', 'is_fake',
    'statement', 'subject',
    'speaker', 'speaker_job', 'state', 'party',
    'context', 'justification',
]

# Write one CSV per partition, without the index column.
split_outputs = (
    (train_index, 'dataset/train_data_clean.csv'),
    (val_index, 'dataset/val_data_clean.csv'),
    (test_index, 'dataset/test_data_clean.csv'),
)
for split_rows, split_path in split_outputs:
    df.loc[split_rows, cols_to_use].to_csv(split_path, index=False)

In [25]:
# Persist the full working frame (every column currently in `df`), without the index.
onehot_output_path = 'dataset/data_clean_with_onehot.csv'
df.to_csv(onehot_output_path, index=False)