# Clean and process baseline features for LIAR test dataset 

In [None]:
import pandas as pd

# Read the LIAR test split. The file is tab-separated and is read without
# a header row, so the columns are labelled by integer position.
src = 'data/liar_dataset/test.tsv'
raw_data = pd.read_table(src, sep='\t', header=None)

### Clean dataset

In [None]:
# Work on a deep copy so raw_data itself is never modified
clean_data = raw_data.copy(deep=True)

# Keep only the label (column 1) and the statement (column 2) and give
# them the column names used by the FakeNews corpus
clean_data = clean_data[[1, 2]].rename(columns={1: 'type', 2: 'content'})

# Discard rows whose statement text is missing
clean_data.dropna(subset=['content'], inplace=True)

# Discard rows whose label is missing
drop_null_types = clean_data.loc[clean_data['type'].isna()].index
clean_data.drop(index=drop_null_types, inplace=True)

# Discard the in-between truthfulness labels listed below; rows carrying
# any label that is not listed here (e.g. 'true', 'false') are kept
omitted_types = {
    'half-true',
    'mostly-true',
    'barely-true',
    'pants-fire',
}
drop_indexes = clean_data.loc[clean_data['type'].isin(omitted_types)].index
clean_data.drop(index=drop_indexes, inplace=True)

In [None]:
import lib.process_methods as pm
import swifter

# Text pipeline: every step reads the column produced by the previous step
# and stores its own result in a new column.
#   content          -> content_clean     (text cleanup)
#   content_clean    -> content_stopword  (stopword removal)
#   content_stopword -> content_stem      (stemming)
text_pipeline = (
    ('content', 'content_clean', pm.clean_text),
    ('content_clean', 'content_stopword', pm.remove_stopwords),
    ('content_stopword', 'content_stem', pm.remove_word_variations),
)
for source_col, target_col, transform in text_pipeline:
    clean_data[target_col] = clean_data[source_col].swifter.apply(transform)

### Process baseline features

In [None]:
# Dummy-encode a label column (e.g. 'type' holding 'true'/'false') and
# append the resulting indicator column(s) to the frame
def bool_dummies(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return ``df`` with dummy/indicator columns for ``df[col]`` appended.

    Args:
        df: Frame containing the column to encode. It is not modified.
        col: Name of the column whose values are dummy-encoded.

    Returns:
        A new DataFrame holding all original columns followed by the
        indicator columns. Because ``drop_first=True`` is used, the first
        level is dropped, so a two-valued column yields a single indicator
        column named after the remaining level.

    Raises:
        KeyError: If ``col`` is not a column of ``df``.
    """
    # Encode the column the caller asked for rather than a fixed 'type'
    type_data = pd.get_dummies(df[col], drop_first=True)
    return pd.concat([df, type_data], axis=1)

# Append the dummy-encoded label and expose the 'true' indicator column
# under the name 'reliable'
clean_data = bool_dummies(clean_data, 'type').rename(columns={'true': 'reliable'})

In [None]:
import re

# Count how often a tag (e.g. '_num_') occurs in a text
def count_tag(text: str, tag: str) -> int:
    """Return the number of non-overlapping matches of ``tag`` in ``text``.

    ``tag`` is handed to ``re.findall`` unchanged, so it is interpreted as
    a regular expression pattern, not as a plain literal.
    """
    return len(re.findall(tag, text))

# Tag strings searched for in 'content_clean'
# (presumably inserted by pm.clean_text for numbers, dates and URLs - confirm)
num_tag = '_num_'
date_tag = '_date_'
url_tag = '_url_'

# One count column per tag, filled by applying count_tag to 'content_clean'
for count_col, tag_pattern in (
    ('num_count', num_tag),
    ('date_count', date_tag),
    ('url_count', url_tag),
):
    clean_data[count_col] = clean_data['content_clean'].apply(count_tag, tag=tag_pattern)

In [None]:
# Function to count occurrences of a single character in a string
def count_char(text: str, char: str) -> int:
    """Return how many times ``char`` occurs in ``text``.

    Args:
        text: The string to search.
        char: The character (or substring) to count; occurrences are
            counted without overlap, as done by ``str.count``.

    Returns:
        The number of occurrences; 0 for an empty ``text``.
    """
    # Count the requested character, not a fixed ','
    return text.count(char)

# Characters handed to count_char
comma = ','
exclm = '!'

# 'comma_count' and 'exclm_count': count_char applied to 'content_clean'
# with char=',' and char='!' respectively
for count_col, symbol in (('comma_count', comma), ('exclm_count', exclm)):
    clean_data[count_col] = clean_data['content_clean'].apply(count_char, char=symbol)

In [None]:
import nltk
import swifter

# Number of distinct tokens in a text (used as the "word frequency" feature)
def get_word_freq(text: str) -> int:
    """Return how many distinct tokens ``nltk.word_tokenize`` yields for ``text``."""
    unique_tokens = set(nltk.word_tokenize(text))
    return len(unique_tokens)

# distinct-token count of the cleaned text
clean_data['content_word_freq'] = (
    clean_data['content_clean'].swifter.apply(get_word_freq)
)

In [None]:
import nltk
import swifter

# Number of distinct tokens in a text (used as the "word frequency" feature)
def get_word_freq(text: str) -> int:
    """Return how many distinct tokens ``nltk.word_tokenize`` yields for ``text``."""
    unique_tokens = set(nltk.word_tokenize(text))
    return len(unique_tokens)

# distinct-token counts after stopword removal and after stemming
for source_col, freq_col in (
    ('content_stopword', 'stop_word_freq'),
    ('content_stem', 'stem_word_freq'),
):
    clean_data[freq_col] = clean_data[source_col].swifter.apply(get_word_freq)

In [None]:
# Stopword-removal reduction rate: the drop in distinct-token count from
# 'content_word_freq' to 'stop_word_freq', as a percentage of
# 'content_word_freq', rounded to 3 decimals
col_a = clean_data['content_word_freq']
col_b = clean_data['stop_word_freq']
clean_data['stop_reduction_rate'] = (col_a - col_b).div(col_a).mul(100).round(3)

In [None]:
# Stemming reduction rate: the drop in distinct-token count from
# 'content_word_freq' to 'stem_word_freq', as a percentage of
# 'content_word_freq', rounded to 3 decimals
col_a = clean_data['content_word_freq']
col_b = clean_data['stem_word_freq']
clean_data['stem_reduction_rate'] = (col_a - col_b).div(col_a).mul(100).round(3)

In [None]:
# Average number of words per sentence, per article

import swifter

def average_sentence_length(text):
    """Return the mean number of words per sentence, truncated to an int.

    Sentences are the pieces obtained by splitting ``text`` on '.', and
    words are the whitespace-separated tokens of each piece. Pieces that
    contain no word are left out of the average. If no piece contains a
    word, 0 is returned.
    """
    # Word count of every '.'-delimited piece
    words_per_piece = [len(piece.split()) for piece in text.split('.')]

    # Only pieces that actually contain words count as sentences
    sentence_lengths = [count for count in words_per_piece if count > 0]
    if not sentence_lengths:
        return 0

    # int() truncates the fractional part of the mean
    return int(sum(sentence_lengths) / len(sentence_lengths))

# Compute the feature on the raw 'content' text rather than on a cleaned
# column (the function relies on '.' to find sentence boundaries)
clean_data['average_sentence_length'] = (
    clean_data['content'].swifter.apply(average_sentence_length)
)

In [None]:
# Balance the dataset by randomly removing rows from the larger class.
# The class sizes are taken from the data itself rather than hard-coded,
# so the step stays correct if the cleaning above changes the counts.
label_counts = {
    label: int((clean_data['type'] == label).sum())
    for label in ('false', 'true')
}

# Number of surplus rows in the larger class (0 when already balanced)
random_n = abs(label_counts['false'] - label_counts['true'])
majority_label = max(label_counts, key=label_counts.get)

# Fixed random_state keeps the removed rows reproducible between runs
select_type = clean_data[clean_data['type'] == majority_label]
random_rows = select_type.sample(random_n, random_state=28)
clean_data = clean_data.drop(random_rows.index)

In [None]:
# Write the finished feature table to disk as CSV
dst = 'data/liar_dataset/test_features.csv'
clean_data.to_csv(path_or_buf=dst)