In [1]:
# Import Dependencies

In [2]:
import pandas as pd
import plotly.express as px
import math
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from typing import Dict, List

In [3]:
# Bring in the data

In [4]:
TRAIN_PATH = '../data/raw/train2.tsv'
VAL_PATH = '../data/raw/val2.tsv'
TEST_PATH = '../data/raw/test2.tsv'

In [5]:
train_df = pd.read_csv(TRAIN_PATH, sep="\t", names=["id", "statement_json", "label", "statement", "subject", "speaker", "speaker_title", "state_info", "party_affiliation", "barely_true_count", "false_count", "half_true_count", "mostly_true_count", "pants_fire_count", "context", "justification"])
val_df = pd.read_csv(VAL_PATH, sep="\t", names=["id", "statement_json", "label", "statement", "subject", "speaker", "speaker_title", "state_info", "party_affiliation", "barely_true_count", "false_count", "half_true_count", "mostly_true_count", "pants_fire_count", "context", "justification"])
test_df = pd.read_csv(TEST_PATH, sep="\t", names=["id", "statement_json", "label", "statement", "subject", "speaker", "speaker_title", "state_info", "party_affiliation", "barely_true_count", "false_count", "half_true_count", "mostly_true_count", "pants_fire_count", "context", "justification"])

In [6]:
# Display at most 500 columns/rows
pd.options.display.max_colwidth = 500
pd.options.display.max_rows = 500

In [7]:
# Basic Statistics

In [8]:
len(train_df)

10242

In [9]:
len(val_df)

1284

In [10]:
len(test_df)

1267

In [12]:
train_df.columns

Index(['id', 'statement_json', 'label', 'statement', 'subject', 'speaker',
       'speaker_title', 'state_info', 'party_affiliation', 'barely_true_count',
       'false_count', 'half_true_count', 'mostly_true_count',
       'pants_fire_count', 'context', 'justification'],
      dtype='object')

In [13]:
train_df.head()

Unnamed: 0,id,statement_json,label,statement,subject,speaker,speaker_title,state_info,party_affiliation,barely_true_count,false_count,half_true_count,mostly_true_count,pants_fire_count,context,justification
0,0.0,2635.json,false,Says the Annies List political group supports third-trimester abortions on demand.,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,"That's a premise that he fails to back up. Annie's List makes no bones about being comfortable with candidates who oppose further restrictions on late-term abortions. Then again, this year its backing two House candidates who voted for more limits."
1,1.0,10540.json,half-true,When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started when natural gas took off That started to begin in President (George W. ) Bushs administration. ""No doubt, natural gas has been gaining ground on coal in generating electricity. The trend started in the 1990s but clearly gained speed during the Bush administration when the production of natural gas -- a competitor of coal -- picked up. But analysts give little credit or blame to Bush for that trend. They note that other factors, such as technologic..."
2,2.0,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by voting to give George Bush the benefit of the doubt on Iran.""",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,"Obama said he would have voted against the amendment if he had been present. So though Clinton may have ""agreed"" with McCain on the issue, they did not technically vote the same way on it. To say that voting for Kyl-Lieberman is ""giving George Bush the benefit of the doubt on Iran"" remains a contentious issue. But Obama's main point is that Clinton and McCain were on the same side, and that is correct."
3,3.0,1123.json,false,Health care reform legislation is likely to mandate free sex change surgeries.,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,"The release may have a point that Mikulskis comment could open the door to ""medically necessary"" coverage which conceivably may include sex-change operations. But it's unclear whether her amendment will remain in the legislation, and there's nothing specific in the legislation on sex-change procedures and nothing else solid that indicates such coverage will be provided. The news release cherry-picked a few fleeting references to gender and sexual orientation in completely unrelated contexts ..."
4,4.0,9028.json,half-true,The economic turnaround started at the end of my term.,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround started at the end of my term. ""During Crists last year in office, Floridas economy experienced notable gains in personal income and industrial production, and more marginal improvements in the unemployment rate and in payroll employment. But GDP didnt grow again until Scott took office. Economists say Crist deserves some credit for the economic turnaround because he accepted federal stimulus dollars, but they add that any state is inevitably buffeted..."


In [14]:
train_df.dropna(subset=['label'], inplace=True)
len(train_df)

10240

In [15]:
# Normalized distribution of labels (roughly equal except for the flagrantly false statement "pants-fire")
train_df.label.value_counts(normalize=True)

half-true      0.206445
false          0.194824
mostly-true    0.191602
true           0.163672
barely-true    0.161523
pants-fire     0.081934
Name: label, dtype: float64

In [16]:
label_ratios = train_df.label.value_counts(normalize=True)
px.bar(label_ratios, x=label_ratios.index, y=label_ratios.values, labels={"index": "label", "y": "ratios"}, title="Label Distribution")

In [17]:
# Notice a huge number of speaker titles
train_df.speaker_title.nunique()

1184

In [18]:
train_df.speaker_title[train_df.speaker_title.notnull()]

0                                 State representative
1                                       State delegate
2                                            President
5                           Wisconsin Assembly speaker
7                                            President
                             ...                      
10230                                  President-Elect
10232                                          Senator
10233                      State Senator, 8th District
10234                      Senior editor, The Atlantic
10241    chairman of the Republican National Committee
Name: speaker_title, Length: 7343, dtype: object

In [19]:
# A lot of repetition in speaker_title - not canonicalized
train_df.speaker_title.value_counts()[:20]

President                        492
U.S. Senator                     479
Governor                         391
President-Elect                  273
U.S. senator                     263
Presidential candidate           254
Former governor                  176
U.S. Representative              172
Milwaukee County Executive       149
Senator                          147
State Senator                    108
U.S. representative              103
U.S. House of Representatives    102
Attorney                          81
Congressman                       80
Social media posting              78
Governor of New Jersey            78
Co-host on CNN's "Crossfire"      73
State Representative              72
State representative              66
Name: speaker_title, dtype: int64

In [20]:
train_df.speaker.value_counts()

barack-obama                                   488
donald-trump                                   273
hillary-clinton                                239
mitt-romney                                    176
scott-walker                                   149
                                              ... 
lorraine-fende                                   1
nfederation-o-independent-business-virginia      1
jim-moore                                        1
scott-surovell                                   1
alan-powell                                      1
Name: speaker, Length: 2910, dtype: int64

In [21]:
train_df.speaker.nunique()

2910

In [22]:
affiliation_counts = train_df.party_affiliation.value_counts()
px.bar(affiliation_counts, x=affiliation_counts.index, y=affiliation_counts.values, labels={"index": "affiliation", "y": "counts"}, title="Counts Per Affiliation")

In [23]:
# Convert from 6-way scale to binary scale
def get_binary_label(label: str) -> bool:
    if label in {'pants-fire', 'barely-true', 'false'}:
        return False
    elif label in {'true', 'half-true', 'motly-true'}:
        return True

In [24]:
train_df['binary_label'] = train_df.label.apply(get_binary_label)

In [27]:
party_groups = train_df.groupby(["party_affiliation"])

In [28]:
party_groups.get_group("republican").binary_label.value_counts(normalize=True)

False    0.591029
True     0.408971
Name: binary_label, dtype: float64

In [29]:
party_groups.get_group("democrat").binary_label.value_counts(normalize=True)

True     0.555424
False    0.444576
Name: binary_label, dtype: float64

In [30]:
train_df.binary_label.value_counts(normalize=True)

False    0.54216
True     0.45784
Name: binary_label, dtype: float64

In [31]:
unigram_lens = train_df.statement.str.split().str.len()

In [32]:
px.histogram(unigram_lens, x=unigram_lens.values, labels={"x": "unigram lens"}, title="Unigram Length Distribution")

In [33]:
unigram_lens.median()

17.0

In [34]:

unigram_lens.mean()

18.030859375

In [35]:

unigram_lens.max()

479

In [36]:
# Ran into some noisy labels for certain columns so have to remove it
train_df[train_df.pants_fire_count == "a television interview"]

Unnamed: 0,id,statement_json,label,statement,subject,speaker,speaker_title,state_info,party_affiliation,barely_true_count,false_count,half_true_count,mostly_true_count,pants_fire_count,context,justification,binary_label


In [37]:
# Drop column with invalid type for `pants_fire`
train_df.drop(6134, inplace=True)

In [38]:

# Separate true samples from false ones
true_ex = train_df[train_df.binary_label == True]
false_ex = train_df[train_df.binary_label == False]

In [39]:
train_df.barely_true_count.describe()

count    10237.00000
mean        11.53082
std         18.97315
min          0.00000
25%          0.00000
50%          2.00000
75%         12.00000
max         70.00000
Name: barely_true_count, dtype: float64

In [40]:
# TODO (mihail): Include feature for credit history counts (binned)
barely_true_counts = train_df.barely_true_count.value_counts().sort_index()
px.bar(barely_true_counts, x=barely_true_counts.index, y=barely_true_counts.values, labels={"index": "credit", "y": "counts"}, title="Barely True Credit Distribution")


In [41]:
px.histogram(train_df, x="barely_true_count", labels={"x": "credit score"}, title="Barely True Credit Histogram", nbins=10)

In [43]:
barely_true_counts.values

array([3025, 1512,  815,  489,  236,  317,  190,  236,  171,  246,  104,
        289,  112,   49,  115,   70,   69,   63,  134,   56,  149,  115,
        142,  148,  117,  176,   92,  239,  273,  488])

In [44]:
train_df.false_count.describe()

count    10237.000000
mean        13.284556
std         24.113327
min          0.000000
25%          0.000000
50%          2.000000
75%         12.000000
max        114.000000
Name: false_count, dtype: float64

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10239 entries, 0 to 10241
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10239 non-null  float64
 1   statement_json     10239 non-null  object 
 2   label              10239 non-null  object 
 3   statement          10239 non-null  object 
 4   subject            10237 non-null  object 
 5   speaker            10237 non-null  object 
 6   speaker_title      7342 non-null   object 
 7   state_info         8031 non-null   object 
 8   party_affiliation  10237 non-null  object 
 9   barely_true_count  10237 non-null  float64
 10  false_count        10237 non-null  float64
 11  half_true_count    10237 non-null  float64
 12  mostly_true_count  10237 non-null  float64
 13  pants_fire_count   10237 non-null  float64
 14  context            10137 non-null  object 
 15  justification      10153 non-null  object 
 16  binary_label       827

In [46]:
train_df.half_true_count.describe()

count    10237.000000
mean        17.133926
std         35.848256
min          0.000000
25%          0.000000
50%          3.000000
75%         13.000000
max        160.000000
Name: half_true_count, dtype: float64

In [47]:

train_df.mostly_true_count.describe()

count    10237.000000
mean        16.434014
std         36.153445
min          0.000000
25%          0.000000
50%          3.000000
75%         11.000000
max        163.000000
Name: mostly_true_count, dtype: float64

In [48]:

train_df.pants_fire_count.describe()

count    10237.000000
mean         6.201231
std         16.129705
min          0.000000
25%          0.000000
50%          1.000000
75%          5.000000
max        105.000000
Name: pants_fire_count, dtype: float64

In [49]:

train_df.pants_fire_count.astype(float).describe()

count    10237.000000
mean         6.201231
std         16.129705
min          0.000000
25%          0.000000
50%          1.000000
75%          5.000000
max        105.000000
Name: pants_fire_count, dtype: float64

In [50]:
true_ex.statement.str.split().str.len().describe()

count    3789.000000
mean       18.454473
std         8.651553
min         2.000000
25%        13.000000
50%        17.000000
75%        23.000000
max       242.000000
Name: statement, dtype: float64

In [51]:

false_ex.statement.str.split().str.len().describe()

count    4488.000000
mean       17.488191
std         9.371839
min         2.000000
25%        12.000000
50%        16.000000
75%        22.000000
max       317.000000
Name: statement, dtype: float64

In [52]:

# Sample true and false examples to observe characteristics
true_ex.sample(frac=0.2).head(5)

Unnamed: 0,id,statement_json,label,statement,subject,speaker,speaker_title,state_info,party_affiliation,barely_true_count,false_count,half_true_count,mostly_true_count,pants_fire_count,context,justification,binary_label
730,730.0,8881.json,half-true,"Savings that I achieved helped fund cameras on the border. As a result, more than 21,500 arrests have been made and 46 tons of narcotics confiscated.","crime,immigration,state-budget,state-finances",todd-staples,state agriculture commissioner,Texas,republican,0.0,1.0,4.0,3.0,1.0,a campaign email blast,"Staples said savings he achieved helped fund cameras on the border and that as a result, more than 21,500 arrests were made and 46 tons of narcotics were confiscated. Its documented that the grant money supported the border effort. But we dont see a way to tie particular arrests and confiscations to the grants. Also, this statement leaves out the critical fact that about 90 percent of the operations funding has come from federal aid awarded through the governors office.",True
765,765.0,7960.json,half-true,Not one tax has been raised since Ive been governor.,taxes,chris-christie,Governor of New Jersey,New Jersey,republican,10.0,17.0,27.0,19.0,8.0,"an interview on the ""Ask The Governor"" monthly radio call-in program","Christie said on a radio program, ""Not one tax has been raised since I've been governor. ""The governor is correct that the major taxes in New Jersey that generate revenue have not increased. But he doesnt acknowledge that cuts hes made to tax-relief programs have resulted in tax increases for certain segments of the population. Several experts we talked to agreed that reductions in tax-relief programs can result in higher taxes owed for some people.",True
2441,2453.0,6602.json,half-true,Under the Obama economy ... utility bills are higher.,economy,mitt-romney,Former governor,Massachusetts,republican,34.0,32.0,58.0,33.0,19.0,his acceptance speech at the Republican National Convention in Tampa,"By our count, Form N-400 from the Department of Homeland Security has 110 questions (first and last name count as one, not two). And though it doesn't include worksheets like FAFSA, it has its share of complicated questions. It's no easy task to document every trip of 24 hours or more outside the United States in the past five years.",True
2082,2094.0,8132.json,half-true,"If we dont spend money on a pier, then that money does wipe into the general funds of the city or the county, and if you send it to the county, you never see it again.",taxes,bill-foster,"Mayor, St. Petersburg",Florida,republican,1.0,0.0,2.0,2.0,0.0,a mayoral debate,"Bishop said that the page was initially posted in February 2010 -- that was a few months after the September arrests of Eggelletion, Gallagher and Salesman. Only time will tell if any other members of the Broward County Commission get arrested.",True
2673,2685.0,6752.json,true,Says that in the U.S. House of Representatives weve had bipartisan support for the repeal of Obamacare for getting rid of cap and trade for building the Keystone Pipeline.,"bipartisanship,climate-change,congress,congressional-rules,voting-record",marsha-blackburn,U.S. Representative,Tennessee,republican,2.0,2.0,1.0,0.0,0.0,"an appearance on ""Face the Nation"" on CBS","The Edwards campaign Website uses the Center for American Progress, a left-leaning think tank for its figure of about 56 million people without accounts. The center got the number from a 2002 report by the General Accounting Office, which used U. S. Census data to estimate that 55. 9-million adults did not hold checking, savings or money market accounts in 1999. But that number is at the high end of the available estimates. According to the Federal Reserve Board's 2004 Survey of Consumer Fi...",True


In [53]:

stripped = false_ex.state_info.copy().str.strip()
false_ex.loc[:, "state_info"] = stripped

stripped = true_ex.state_info.copy().str.strip()
true_ex.loc[:, "state_info"] = stripped

In [54]:
# Clean up the variants of state info 
CANONICAL_TO_VARIANTS = {
    "Tennessee": {"Tennessee", "Tennesse"},
    "Washington D.C.": {"District of Columbia", "Washington D.C.", "Washington, D.C.", "Washington DC"},
    "Texas": {"Tex", "Texas"}, 
    "Washington": {"Washington", "Washington state"},
    "Virginia": {"Virginia", "Virgina", "Virgiia"},
    "Pennsylvania": {"Pennsylvania", "PA - Pennsylvania"},
    "Rhode Island": {"Rhode Island", "Rhode island"},
    "Ohio": {"Ohio", "ohio"}
}

def get_variant_to_canonical(can_to_var: Dict):
    variant_to_canonical = {}
    for canonical, variant in can_to_var.items():
        for var in variant:
            variant_to_canonical[var] = canonical
    return variant_to_canonical

variant_to_canonical = get_variant_to_canonical(CANONICAL_TO_VARIANTS)

In [55]:

def clean_variant(state_info, variant_to_canonical):
    if state_info in variant_to_canonical.keys():
        return variant_to_canonical[state_info]
    return state_info

true_ex.loc[:, "state_info"] = true_ex.state_info.apply(lambda x: clean_variant(x, variant_to_canonical))
false_ex.loc[:, "state_info"] = false_ex.state_info.apply(lambda x: clean_variant(x, variant_to_canonical))

In [56]:
# State Info Distribution