In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
import string
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw LIAR-PLUS table; the file is semicolon-separated.
raw_data_path = 'dataset/data_liar_plus.csv'
df = pd.read_csv(raw_data_path, sep=';')

**Target LabelEncoder**

In [3]:
# Encode the string class labels as integer codes. The fitted encoder is kept
# in `le_multi` so the code -> class-name mapping can be recovered later.
le_multi = LabelEncoder()
# Assign the whole column rather than using `df.loc[:, 'label'] = ...`:
# pandas >= 2.0 performs the `.loc[:, col]` assignment in place, which leaves
# the column with `object` dtype even though it now holds integers.
df['label'] = le_multi.fit_transform(df['label'])

In [4]:
# Show which integer code the fitted encoder assigned to each class name.
dict(zip(le_multi.classes_, range(len(le_multi.classes_))))

{'barely-true': 0,
 'false': 1,
 'half-true': 2,
 'mostly-true': 3,
 'pants-fire': 4,
 'true': 5}

**text preprocessing for statement, context, subject, speaker_job, justification**
* Replace punctuation with spaces
* Lowercase and strip leading/trailing whitespace
* Replace tokens containing digits with the "NUM" token
* Collapse runs of whitespace into a single space

In [5]:
# Free-text columns that all go through the same cleaning steps.
text_features = [
    'statement',
    'context',
    'subject',
    'speaker_job',
    'justification',
]

In [6]:
# Clean every free-text column with the same four-step pipeline.
#
# `regex=True` is passed explicitly: pandas >= 2.0 defaults `Series.str.replace`
# to literal matching, which would silently turn these patterns into no-ops.
# All patterns are raw strings so `\w`, `\d` and `\s` are not parsed as
# (invalid) Python string escapes. The `.str` accessors are used throughout so
# missing values pass through instead of raising on `.lower()`.
for feature in text_features:
    df[feature] = (
        df[feature]
        .str.replace(r'[^\w\s]+', ' ', regex=True)      # punctuation -> single space
        .str.lower()
        .str.strip()
        # Runs after lowercasing, so the inserted "NUM" token stays upper-case.
        .str.replace(r'\w*\d+\w*', 'NUM', regex=True)   # any token containing a digit
        .str.replace(r'\s{2,}', ' ', regex=True)        # collapse whitespace runs
    )

**text preprocessing for speaker**
* Replace less common speakers with the token "unknown_speaker" (the "less common" definition is based on the 11th percentile; the code keeps only speakers with more than 11 statements)
* One-hot-Encoding

In [7]:
# Count non-null `id_json` entries per speaker and keep the speakers whose
# count exceeds 11; every other row is pooled under one placeholder label.
speaker_counts = df.groupby('speaker', as_index=False).count()[['speaker', 'id_json']]
frequent_speakers = speaker_counts.query("id_json > 11").speaker
is_rare_speaker = ~df.speaker.isin(frequent_speakers)
df.loc[is_rare_speaker, 'speaker'] = 'unknown_speaker'

# Append one indicator column per remaining speaker value.
speaker_dummies = pd.get_dummies(df.speaker, prefix='speaker')
df = df.join(speaker_dummies)

**text preprocessing for state**
* Normalizing noisy state values (misspellings, spelling variants, missing-value markers)
* One-hot-Encoding

In [8]:
# Each entry maps a list of raw spellings to the single value that replaces
# them. Entries are applied in order, exactly like the individual statements
# they stand in for.
state_aliases = [
    (['None', 'Unknown'], 'noStateInfo'),
    (['Tennesse'], 'Tennessee'),
    (['PA - Pennsylvania'], 'Pennsylvania'),
    (['Rhode island'], 'Rhode Island'),
    (['Tex'], 'Texas'),
    (['Virgiia', 'Virgina', 'Virginia director, Coalition to Stop Gun Violence'], 'Virginia'),
    (['Washington D.C.', 'Washington DC', 'Washington state', 'Washington, D.C.'], 'Washington'),
]
for raw_values, canonical in state_aliases:
    df.loc[df.state.isin(raw_values), 'state'] = canonical

# Append one indicator column per normalized state value.
state_dummies = pd.get_dummies(df.state, prefix='state')
df = df.join(state_dummies)

**text preprocessing for party**
* One-hot-Encoding

In [9]:
# Append one indicator column per party value.
party_dummies = pd.get_dummies(df.party, prefix='party')
df = df.join(party_dummies)

**Dropping the speaker history columns to avoid data leakage**

In [10]:
# Per-speaker counts of past verdicts; removed from the frame here.
speaker_history_cols = [
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
]
df = df.drop(columns=speaker_history_cols)

In [11]:
# Row labels of each split, selected by the value of `data_type`.
train_index = df.index[df['data_type'] == 'train']
val_index = df.index[df['data_type'] == 'val']
test_index = df.index[df['data_type'] == 'test']

# Columns written to the per-split files (one-hot columns are left out).
cols_to_use = [
    'label', 'is_fake',
    'statement', 'subject',
    'speaker', 'speaker_job',
    'state', 'party',
    'context', 'justification',
]

# Write one CSV per split, without the index column.
split_outputs = [
    (train_index, 'dataset/train_data_clean.csv'),
    (val_index, 'dataset/val_data_clean.csv'),
    (test_index, 'dataset/test_data_clean.csv'),
]
for split_index, out_path in split_outputs:
    df.loc[split_index, cols_to_use].to_csv(out_path, index=False)

In [12]:
# Save the complete frame, including the one-hot columns, without the index.
onehot_output_path = 'dataset/data_clean_with_onehot.csv'
df.to_csv(onehot_output_path, index=False)