In [3]:
import pandas as pd
from copy import deepcopy
from pydantic import BaseModel
from typing import Optional
import json
import csv
import math
import numpy as np
from typing import Dict, List

In [1]:
TRAIN_PATH = "../data/raw/train.tsv"
VAL_PATH = "../data/raw/valid.tsv"
TEST_PATH = "../data/raw/test.tsv"

In [4]:
def read_datapoints(datapath: str) -> List[Dict]:
    with open(datapath) as f:
        reader = csv.DictReader(f, delimiter="\t", fieldnames=[
            "statement_json",
            "label",
            "statement",
            "subject",
            "speaker",
            "speaker_title",
            "state_info",
            "party_affiliation",
            "barely_true_count",
            "false_count",
            "half_true_count",
            "mostly_true_count",
            "pants_fire_count",
            "context",
            "justification"
        ])
        return [row for row in reader]

In [6]:
class Datum(BaseModel):
    statement_json: Optional[str]
    label: Optional[bool]
    statement: str
    subject: Optional[str]
    speaker: Optional[str]
    speaker_title: Optional[str]
    state_info: Optional[str]
    party_affiliation: Optional[str]
    barely_true_count: float
    false_count: float
    half_true_count: float
    mostly_true_count: float
    pants_fire_count: float
    context: Optional[str]
    justification: Optional[str]

In [7]:
def read_json_data(datapath: str) -> List[Datum]:
    with open(datapath) as f:
        data = json.load(f)
        return [Datum(**point) for point in data]

In [8]:
def normalize_and_clean_counts(data):
    normalized_data = []
    for index, datum in enumerate(data):
        normalized_datum = deepcopy(datum) #  preserve immutability of input data
        for count_col in ["barely_true_count",
                          "false_count",
                          "half_true_count",
                          "mostly_true_count",
                          "pants_fire_count"]:
            # First check if that this Datum has that particular column. Can allow use of future data which does not have these columns.
            if count_col in normalized_datum:
                # Cannot pass Nonetype values to float(). If NaN entry we set this to 0
                if normalized_datum[count_col] == None:
                    normalized_datum[count_col] = float(0)
                # Otherwise set the string entry to be floating type
                normalized_datum[count_col] = float(normalized_datum[count_col])
        # Add this normalized datum (Dict) to the normalized data array
        normalized_data.append(normalized_datum)
    return normalized_data

In [9]:
def normalize_and_clean_speaker_title(data: List[Dict]) -> List[Dict]:
    normalized_data = []
    for datum in data:
        # First do simple cleaning
        normalized_datum = deepcopy(datum) # preserve immutability of input data
        old_speaker_title = normalized_datum["speaker_title"]
        # We have some NaN values (Nonetype) in dataset, cannot pass that to .lower() method etc. Give them value "none"
        if old_speaker_title == None:
            old_speaker_title = "Unknown"
        new_speaker_title = old_speaker_title.lower().strip().replace("-", " ") if old_speaker_title is not None else None
        # Then canonicalize
        if new_speaker_title in CANONICAL_SPEAKER_TITLES:
            new_speaker_title = CANONICAL_SPEAKER_TITLES[new_speaker_title]
        normalized_datum["speaker_title"] = new_speaker_title
        normalized_data.append(normalized_datum)
    return normalized_data

In [65]:
def normalize_and_clean_state_info(data: List[Dict]) -> List[Dict]:
    normalized_data = []
    for datum in data:
        normalized_datum = deepcopy(datum) # preserve immutability of input data
        old_state_info = normalized_datum["state_info"]
        # We have some NaN values (Nonetype) in dataset, cannot pass that to .lower() method etc. Give them value "none"
        if old_state_info == None:
            old_state_info = "Unknown"
        new_state_info = old_state_info.lower().strip().replace("-", " ") if old_state_info is not None else None
        # Check to see if this cleaned state_info datum is in our predefined canonical dictionary
        if new_state_info in CANONICAL_STATE:
            # Set it to its canonical form
            new_state_info = CANONICAL_STATE[new_state_info]
        # Enter the cleaned state_info into our new normalized datum
        normalized_datum["state_info"] = new_state_info
        # Add this cleaned datum to the new normalized dataset
        normalized_data.append(normalized_datum)
    return normalized_data

In [11]:
train_data = read_datapoints(TRAIN_PATH)

In [12]:
# type(train_data) # it is a list
train_data[0]

{'statement_json': '2635.json',
 'label': 'false',
 'statement': 'Says the Annies List political group supports third-trimester abortions on demand.',
 'subject': 'abortion',
 'speaker': 'dwayne-bohac',
 'speaker_title': 'State representative',
 'state_info': 'Texas',
 'party_affiliation': 'republican',
 'barely_true_count': '0',
 'false_count': '1',
 'half_true_count': '0',
 'mostly_true_count': '0',
 'pants_fire_count': '0',
 'context': 'a mailer',
 'justification': None}

In [13]:
type(train_data[0]['barely_true_count'])

str

In [14]:
# we are getting two None's in our train_data somewhere. 
for idx, datum in enumerate(train_data):
    if datum['barely_true_count'] == None:
        print(idx)
        

2142
9375


In [15]:
train_data[2142]

{'statement_json': '638.json',
 'label': 'false',
 'statement': 'The fact is that although we have had a president who is opposed to abortion over the last eight years, abortions have not gone down.\'\'\tabortion\tbarack-obama\tPresident\tIllinois\tdemocrat\t70\t71\t160\t163\t9\ta TV interview with megachurch pastor Rick Warren in Lake Forest, Calif.\n2724.json\ttrue\tMost of the jobs that we lost were lost before the economic policies we put in place had any effect.\teconomy,job-accomplishments,jobs,stimulus\tbarack-obama\tPresident\tIllinois\tdemocrat\t70\t71\t160\t163\t9\tan interview on The Daily Show with Jon Stewart"',
 'subject': None,
 'speaker': None,
 'speaker_title': None,
 'state_info': None,
 'party_affiliation': None,
 'barely_true_count': None,
 'false_count': None,
 'half_true_count': None,
 'mostly_true_count': None,
 'pants_fire_count': None,
 'context': None,
 'justification': None}

In [16]:
train_data[9375]

{'statement_json': '1626.json',
 'label': 'false',
 'statement': "Joe, I keep hearing you every morning talking about the biggest tax increase in history, but you don't mention it's also the biggest tax cut in history.''\thealth-care,taxes\trichard-durbin\tSenator\tIllinois\tdemocrat\t0\t2\t1\t0\t1\ta comment on the Morning Joe'' show on MSNBC.",
 'subject': None,
 'speaker': None,
 'speaker_title': None,
 'state_info': None,
 'party_affiliation': None,
 'barely_true_count': None,
 'false_count': None,
 'half_true_count': None,
 'mostly_true_count': None,
 'pants_fire_count': None,
 'context': None,
 'justification': None}

In [17]:
cleaned_data_counts = normalize_and_clean_counts(train_data)

In [18]:
cleaned_data_counts[9375]

{'statement_json': '1626.json',
 'label': 'false',
 'statement': "Joe, I keep hearing you every morning talking about the biggest tax increase in history, but you don't mention it's also the biggest tax cut in history.''\thealth-care,taxes\trichard-durbin\tSenator\tIllinois\tdemocrat\t0\t2\t1\t0\t1\ta comment on the Morning Joe'' show on MSNBC.",
 'subject': None,
 'speaker': None,
 'speaker_title': None,
 'state_info': None,
 'party_affiliation': None,
 'barely_true_count': 0.0,
 'false_count': 0.0,
 'half_true_count': 0.0,
 'mostly_true_count': 0.0,
 'pants_fire_count': 0.0,
 'context': None,
 'justification': None}

We can see there are still None entries for the other columns like subject, speaker, context, justification.
I handle the party_affliation and speaker_title, state_info etc. but still need to clean subject and speaker, can't have any NaN values passed to the model. Sklearn does not like. 
Probably can just delete justification. Not context though. 

In [20]:
def remove_justification_col(data: List[Dict]) -> List[Dict]:
    normalized_data = []
    for datum in data:
        normalized_datum = deepcopy(datum) # preserve immutability of input data
        if 'justification' in normalized_datum:
            del normalized_datum['justification']
        normalized_data.append(normalized_datum)
    return normalized_data

In [21]:
cleaned_just = remove_justification_col(cleaned_data_counts)

In [22]:
cleaned_just[9375]

{'statement_json': '1626.json',
 'label': 'false',
 'statement': "Joe, I keep hearing you every morning talking about the biggest tax increase in history, but you don't mention it's also the biggest tax cut in history.''\thealth-care,taxes\trichard-durbin\tSenator\tIllinois\tdemocrat\t0\t2\t1\t0\t1\ta comment on the Morning Joe'' show on MSNBC.",
 'subject': None,
 'speaker': None,
 'speaker_title': None,
 'state_info': None,
 'party_affiliation': None,
 'barely_true_count': 0.0,
 'false_count': 0.0,
 'half_true_count': 0.0,
 'mostly_true_count': 0.0,
 'pants_fire_count': 0.0,
 'context': None}

In [23]:
def normalize_and_clean_context(data: List[Dict]) -> List[Dict]:
    normalized_data = []
    for datum in data:
        normalized_datum = deepcopy(datum)
        old_context = normalized_datum['context']
        if old_context == None:
            old_context = "Unknown"
        new_context = old_context.lower().strip().replace("-", " ")
        normalized_datum["context"] = new_context
        normalized_data.append(normalized_datum)
    return normalized_data

In [24]:
total = 0
#string_testing_for = 'news' # 697
#string_testing_for = 'speech' #1062
# string_testing_for = 'TV ad' #310
string_testing_for = 'ad'

for datum in train_data:
    if datum['context']!= None and string_testing_for in datum['context']:
        total += 1
print(total)

1545


In [25]:
cleaned_context = normalize_and_clean_context(cleaned_just)

In [26]:
cleaned_context[9375]

{'statement_json': '1626.json',
 'label': 'false',
 'statement': "Joe, I keep hearing you every morning talking about the biggest tax increase in history, but you don't mention it's also the biggest tax cut in history.''\thealth-care,taxes\trichard-durbin\tSenator\tIllinois\tdemocrat\t0\t2\t1\t0\t1\ta comment on the Morning Joe'' show on MSNBC.",
 'subject': None,
 'speaker': None,
 'speaker_title': None,
 'state_info': None,
 'party_affiliation': None,
 'barely_true_count': 0.0,
 'false_count': 0.0,
 'half_true_count': 0.0,
 'mostly_true_count': 0.0,
 'pants_fire_count': 0.0,
 'context': 'unknown'}

There are only two NaN values for speaker and subject we need to get rid of. 

Will put all of these functions into a python script and run sequentially.