# Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import RegexpTokenizer
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
from nltk.corpus import stopwords
from functools import partial
from nltk.stem import PorterStemmer

In [None]:
# Six-way rating labels. The label-encoding step further down maps the first
# three entries to 0 and every other value to 1.
labels = [
    'pants-fire',
    'false',
    'barely-true',
    'half-true',
    'mostly-true',
    'true',
]

## Load dataset
Load the three TSV files into separate pandas DataFrames and add a column containing the statement length (as the number of words)

In [None]:
# Column names are passed explicitly to read_table, so every row of each file
# is read as data. The same names are used for all three splits.
column_names = [
    'ID', 'LABEL', 'STATEMENT', 'TOPICS', 'SPEAKER', 'ROLE', 'CITY',
    'PARTY', 'H1', 'H2', 'H3', 'H4', 'H5', 'CONTEXT',
]

# For each split: read the file, then insert STATEMENT_LEN (the number of
# whitespace-separated words in STATEMENT) right after the STATEMENT column.
train = pd.read_table('dataset/train.tsv', names=column_names)
train.insert(loc=3, column='STATEMENT_LEN', value=train['STATEMENT'].map(lambda text: len(text.split())))

test = pd.read_table('dataset/test.tsv', names=column_names)
test.insert(loc=3, column='STATEMENT_LEN', value=test['STATEMENT'].map(lambda text: len(text.split())))

valid = pd.read_table('dataset/valid.tsv', names=column_names)
valid.insert(loc=3, column='STATEMENT_LEN', value=valid['STATEMENT'].map(lambda text: len(text.split())))

## Clean the data
Remove statements with excessive word counts, as they represent errors (e.g., they contain multiple statements merged together), and remove statements shorter than 15 words that start with the word "On", since they are probably article titles and as such cannot have a truth value


In [None]:
# Box plot of the statement word counts in the training split.
train.boxplot(column=['STATEMENT_LEN'])
plt.show()

In [None]:
# Drop the three main length outliers from the training split:
# 1606.json, 1993.json and 1720.json.
ids_to_remove = ['1606.json', '1993.json', '1720.json']
train = train.loc[~train['ID'].isin(ids_to_remove)]

In [None]:
# Plot the training split's statement word counts again, now that the
# outlier rows have been dropped.
train.boxplot(column=['STATEMENT_LEN'])
plt.show()

In [None]:
# Box plot of the statement word counts in the test split.
test.boxplot(column=['STATEMENT_LEN'])
plt.show()

In [None]:
# Drop the two main length outliers from the test split:
# 1653.json and 40.json.
ids_to_remove = ['1653.json', '40.json']
test = test.loc[~test['ID'].isin(ids_to_remove)]

In [None]:
# Plot the test split's statement word counts again, now that the outlier
# rows have been dropped.
test.boxplot(column=['STATEMENT_LEN'])
plt.show()

In [None]:
# Remove from the train, test and validation sets the statements that start
# with the standalone word "On" and have fewer than min_len words.
#
# str.match anchors the pattern at the start of the string, and the trailing
# \b requires "On" to be a whole word. A plain startswith('On') would also
# drop short statements that merely begin with those two letters, such as
# "One ...", "Only ..." or "Once ...".
# The match is case-sensitive and relies on the statements not having been
# lower-cased yet at this point.
min_len = 15
train = train[~(train.STATEMENT.str.match(r'On\b') & (train.STATEMENT_LEN < min_len))]
test = test[~(test.STATEMENT.str.match(r'On\b') & (test.STATEMENT_LEN < min_len))]
valid = valid[~(valid.STATEMENT.str.match(r'On\b') & (valid.STATEMENT_LEN < min_len))]

## Encode the labels
1 for true, including 'half-true', 'mostly-true' and 'true'; 0 for false, including 'barely-true', 'false' and 'pants-fire'

In [None]:
# Binarise the labels in every split: a value found among the first three
# entries of `labels` becomes 0, any other value becomes 1.
train['LABEL'] = train['LABEL'].map(lambda tag: 1 if tag not in labels[:3] else 0)
test['LABEL'] = test['LABEL'].map(lambda tag: 1 if tag not in labels[:3] else 0)
valid['LABEL'] = valid['LABEL'].map(lambda tag: 1 if tag not in labels[:3] else 0)

## Case folding
Make all the sentences lowercase

In [None]:
# Lower-case the statement text of every split.
train['STATEMENT'] = train['STATEMENT'].str.lower()
test['STATEMENT'] = test['STATEMENT'].str.lower()
valid['STATEMENT'] = valid['STATEMENT'].str.lower()

## Remove punctuation and stopwords
The stopwords are taken either from a custom list (based on the NLTK corpus, but with some changes) or directly from the NLTK English stopword list

In [None]:
def remove_stopwords_and_punctuation(sentence, stop_words):
    r"""Return `sentence` without punctuation and without stopwords.

    The sentence is split into maximal runs of word characters (regex
    ``\w+``), which discards punctuation and whitespace. Tokens whose
    lower-cased form is in `stop_words` are dropped, and the remaining
    tokens are joined with single spaces in their original order and case.

    Args:
        sentence: Text to clean.
        stop_words: Collection of stopword strings. The comparison
            lower-cases each token but not the stopwords.

    Returns:
        The cleaned sentence, or an empty string when no token survives.
    """
    # Local import keeps this cell self-contained. re.findall with r'\w+'
    # yields the same tokens as RegexpTokenizer(r'\w+').tokenize, without
    # building a tokenizer object on every call.
    import re

    # Set membership is O(1) per token; a list would be scanned every time.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)

    words = re.findall(r'\w+', sentence)
    return " ".join(word for word in words if word.lower() not in stop_words)

In [None]:
# Select the stopword source: True reads the 'custom_stopwords' file,
# False uses NLTK's English stopword list.
custom_stopwords = True

if custom_stopwords:
    # One stopword per line; lines starting with '#' are comments.
    # strip() removes the line terminator without assuming one is present.
    # Slicing off the last character would truncate the final stopword when
    # the file does not end with a newline. Blank lines are skipped.
    with open('custom_stopwords', 'r') as f:
        stopwords_list = [
            line.strip()
            for line in f
            if not line.startswith('#') and line.strip()
        ]
else:
    stopwords_list = stopwords.words('english')

In [None]:
# Bind the stopword collection once so the cleaner can be handed directly to
# Series.apply. A frozenset is bound instead of the list so that the per-token
# membership test is a hash lookup rather than a linear scan.
remove_stopwords_and_punctuation_partial = partial(remove_stopwords_and_punctuation, stop_words=frozenset(stopwords_list))

In [None]:
# Run the stopword/punctuation cleaner over the statements of every split.
train['STATEMENT'] = train['STATEMENT'].map(remove_stopwords_and_punctuation_partial)
test['STATEMENT'] = test['STATEMENT'].map(remove_stopwords_and_punctuation_partial)
valid['STATEMENT'] = valid['STATEMENT'].map(remove_stopwords_and_punctuation_partial)

## Stemming
Apply Porter stemming to all the sentences

In [None]:
# Porter stemmer instance (NLTK implementation).
stemmer = PorterStemmer()

In [None]:
def _stem_statement(statement):
    """Return `statement` with every token replaced by its Porter stem.

    Tokens come from nltk.word_tokenize and are re-joined with single spaces.
    The module-level `stemmer` is reused for every word instead of
    constructing a new PorterStemmer per token.
    """
    # NOTE(review): nltk.word_tokenize relies on NLTK tokenizer data that the
    # setup cell does not download (it only fetches 'stopwords') - confirm the
    # resource is installed in the target environment.
    return " ".join(stemmer.stem(token) for token in nltk.word_tokenize(statement))


# Stem the statements of every split.
train.STATEMENT = train.STATEMENT.apply(_stem_statement)
test.STATEMENT = test.STATEMENT.apply(_stem_statement)
valid.STATEMENT = valid.STATEMENT.apply(_stem_statement)

## Save data
Save the pre-processed data to new CSV files

In [None]:
# Write each pre-processed split to its own CSV file, without the index column.
train.to_csv(path_or_buf="dataset/prep_train.csv", index=False)
test.to_csv(path_or_buf="dataset/prep_test.csv", index=False)
valid.to_csv(path_or_buf="dataset/prep_valid.csv", index=False)