## Import Packages & Global Parameters

In [58]:
import os
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from LIWC.liwc_func import Liwc

In [59]:
# Directory holding the raw "<split>.tsv" files that are read below.
RAW_DIR    = "data/raw_data"
# Directory the processed dataset is written to; it is created if missing.
OUTPUT_DIR = "data/proc_data"

In [60]:
# Create the output directory (including parents) if it does not exist yet.
# `exist_ok=True` replaces the separate isdir() check, which could race with
# another process creating the directory between the check and the call.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [61]:
# Project-local LIWC wrapper used to score each statement's text.
# NOTE(review): "en" presumably selects the English dictionary - confirm
# against LIWC.liwc_func.
LIWC = Liwc("en")

# Keys looked up in each LIWC result. Every key becomes one feature column of
# the dataset (in this order); a key missing from a result is recorded as 0.
LIWC_FEATURE_LIST = ['words_per_sentence', 'six_plus_words', 'word_count', 'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'she', 'he', 'they', 'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate', 'verb', 'adj', 'compare', 'interrog', 'number', 'quant', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'social', 'family', 'friend', 'female', 'male', 'cogproc', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio', 'body', 'health', 'sexual', 'ingest', 'drives', 'affiliation', 'achieve', 'power', 'reward', 'risk', 'focuspast', 'focuspresent', 'focusfuture', 'relativ', 'motion', 'space', 'time', 'work', 'leisure', 'home', 'money', 'relig', 'death', 'informal', 'swear', 'netspeak', 'assent', 'nonflu', 'filler']

## Read Data

In [95]:
# Empty seed frame with the two core columns; the per-split frames are
# concatenated onto it when the raw files are read.
data = pd.DataFrame({"label": [], "text": []})

In [96]:
# Load every split, keep only the columns used downstream, binarise the label
# and append the rows to `data`.
split_frames = []
for split in ["train", "validation", "test"]:
    # The TSV files have no header row, so columns are addressed by position.
    # NOTE(review): with pandas' default quoting a '"' inside a statement is
    # treated as a quote character and may merge rows - confirm the row counts
    # match the raw files.
    df = pd.read_csv(os.path.join(RAW_DIR, split + ".tsv"), delimiter="\t", header=None)
    df = df.drop(columns=[0, 3, 4, 8, 9, 10, 11, 12])
    df = df.rename(columns={1: "label", 2: "text", 5: "job", 6: "state", 7: "party", 13: "context"})
    # "half-true" rows are excluded from the binary task. The explicit copy
    # makes the label assignments below write to an independent frame instead
    # of a slice of the original (avoids SettingWithCopyWarning).
    df = df[df["label"] != "half-true"].copy()
    df.loc[df["label"].isin(["true", "mostly-true"]), "label"] = 1
    df.loc[df["label"].isin(["false", "barely-true", "pants-fire"]), "label"] = 0
    split_frames.append(df)

# Concatenate once instead of re-copying the accumulated frame per split.
# Original row labels are kept here; the index is reset before saving.
data = pd.concat([data, *split_frames], axis=0)

## Process Data

### Process Sensitive Attributes

In [None]:
# Binary indicator columns derived from the `party` column: 1 where the value
# equals the party name exactly, 0 otherwise (missing values included).
for party_name in ("democrat", "republican"):
    data[party_name] = (data["party"] == party_name).astype("int64")

### Process Categorical Features

In [102]:
# Categorical pipeline: missing values are replaced by the literal category
# "missing", then the column is one-hot encoded with at most 20 output
# categories; categories unseen during fitting are ignored.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", max_categories=20)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# The transformer is applied to column 0 of whatever 2-D array it is given.
preprocessor = ColumnTransformer(transformers=[("cat", categorical_transformer, [0])])
    
def one_hot_encode(df, column_name):
    """Replace a categorical column with one-hot indicator columns.

    The module-level ``preprocessor`` is re-fitted on the column on every
    call. The indicator columns are named ``<column_name>_<i>``, where ``i``
    is the position of the column in the encoder output.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Name of the categorical column to encode.

    Returns:
        A new DataFrame without ``column_name`` and with the indicator
        columns appended. The frame passed in is not modified.
    """
    encoded = preprocessor.fit_transform(df[column_name].values.reshape(-1, 1))
    # ColumnTransformer returns a sparse matrix only when the output density
    # is below its ``sparse_threshold``; for columns with few categories it
    # returns a dense ndarray, which has no ``toarray`` method.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()
    df = df.drop(column_name, axis=1)
    for i in range(encoded.shape[1]):
        df[column_name + "_" + str(i)] = encoded[:, i]
    return df

In [103]:
# One-hot encode each categorical column in turn; every call drops the source
# column and appends its indicator columns.
for categorical_column in ("job", "state", "party", "context"):
    data = one_hot_encode(data, categorical_column)

### Process Textual Features

In [52]:
# Collect one list of values per LIWC feature, ordered like the rows of `data`.
liwc_features = {k: [] for k in LIWC_FEATURE_LIST}
# Iterate the text column directly instead of materialising a whole row with
# `data.iloc[i]` on every iteration just to read a single field.
for text in data["text"]:
    liwc_result = LIWC.cal_liwc(text)
    for feature in LIWC_FEATURE_LIST:
        # Features absent from the LIWC result are recorded as 0.
        liwc_features[feature].append(liwc_result[feature] if feature in liwc_result else 0)

In [53]:
# Attach every collected LIWC feature list to the dataset as its own column.
for feature_name, feature_values in liwc_features.items():
    data[feature_name] = feature_values

## Save Dataset

In [54]:
# Replace the row labels carried over from the individual splits with a fresh
# 0..n-1 index; the old labels are discarded.
data = data.reset_index(drop=True)

In [56]:
# Persist the processed dataset as a pickle inside the output directory.
output_path = os.path.join(OUTPUT_DIR, "data.pkl")
data.to_pickle(output_path)