# Variable Declarations

In [10]:
# File locations for each split: the raw TSV and the cleaned CSV that goes with it.
raw_train_path, cleaned_train_path = (
    'data/raw/liar_train2.tsv',
    'data/cleaned/cleaned_train.csv',
)
raw_test_path, cleaned_test_path = (
    'data/raw/liar_test2.tsv',
    'data/cleaned/cleaned_test.csv',
)

# Data Cleaning

In [11]:
from scripts.lemmatization import lemmatize_dataframe
from scripts.data_cleaning import load_and_clean_data
from scripts.pos_tagging import pos_for_dataframe
import pandas as pd

# Column names handed to load_and_clean_data for the raw TSV files.
names = [
    'ID', 'Label', 'Statement', 'Subject', 'Speaker', "Speaker's Job", 'State',
    'Party',
    'Barely True Counts', 'False Counts', 'Half True Counts', 'Mostly True Counts',
    'Pants on Fire Counts', 'Context', 'Justification',
]

# Columns passed to load_and_clean_data as the ones to drop.
columns_to_drop = ['ID', "Speaker's Job", 'State']

# Per-speaker history count columns, passed to load_and_clean_data separately.
counts_columns = [
    'Barely True Counts', 'False Counts', 'Half True Counts', 'Mostly True Counts',
    'Pants on Fire Counts',
]


def _process_split(banner, raw_path, cleaned_path):
    """Clean, POS-tag and lemmatize one dataset split, rewriting its cleaned CSV.

    The same sequence of steps is applied to the training and the test split,
    so it lives here once instead of being copy-pasted per split.

    Args:
        banner: Heading printed before the split is processed.
        raw_path: Path of the raw TSV handed to ``load_and_clean_data``.
        cleaned_path: Path of the cleaned CSV. It is read back right after
            ``load_and_clean_data`` runs (presumably that call writes it --
            confirm in scripts/data_cleaning.py) and is then overwritten with
            the lemmatized frame.

    Returns:
        A ``(cleaned, tagged, lemmatized)`` tuple: the frame read from
        ``cleaned_path``, the result of ``pos_for_dataframe`` and the result
        of ``lemmatize_dataframe``, both applied to the 'Statement' column.
    """
    print(banner)
    load_and_clean_data(raw_path, cleaned_path, names, columns_to_drop, counts_columns)

    # Apply POS tagging to the cleaned statements.
    cleaned = pd.read_csv(cleaned_path)
    tagged = pos_for_dataframe(cleaned, 'Statement')

    # Lemmatize the tagged statements.
    lemmatized = lemmatize_dataframe(tagged, 'Statement')

    # Save the result over the cleaned CSV.
    # NOTE(review): DataFrame.to_csv writes the index as an extra unnamed
    # leading column by default. Kept as-is because readers of this file are
    # not visible here; consider index=False once they have been checked.
    lemmatized.to_csv(cleaned_path)
    return cleaned, tagged, lemmatized


# The module-level names below are kept so later cells can still refer to them.
cleaned_train_data, cleaned_data_with_pos_tags, lemmatized_data = _process_split(
    "=== Training Dataset ===", raw_train_path, cleaned_train_path
)

cleaned_test_data, cleaned_data_with_pos_tags, lemmatized_data = _process_split(
    "\n=== Testing Dataset ===", raw_test_path, cleaned_test_path
)

=== Training Dataset ===
             ID        Label  \
0.0   2635.json        false   
1.0  10540.json    half-true   
2.0    324.json  mostly-true   
3.0   1123.json        false   
4.0   9028.json    half-true   

                                             Statement  \
0.0  Says the Annies List political group supports ...   
1.0  When did the decline of coal start? It started...   
2.0  Hillary Clinton agrees with John McCain "by vo...   
3.0  Health care reform legislation is likely to ma...   
4.0  The economic turnaround started at the end of ...   

                                Subject         Speaker         Speaker's Job  \
0.0                            abortion    dwayne-bohac  State representative   
1.0  energy,history,job-accomplishments  scott-surovell        State delegate   
2.0                      foreign-policy    barack-obama             President   
3.0                         health-care    blog-posting                   NaN   
4.0                        e

# POS Tagging

# Lemmatization