In [280]:
import pandas as pd
import numpy as np
import sklearn as sk

In [281]:
df1 = pd.read_csv("Politifact with biodata NA dropped.csv")

In [282]:
# https://towardsdatascience.com/clean-a-messy-date-column-with-mixed-formats-in-pandas-1a88808edbf7

# standardise the format of all date columnns: YYYY-MM-DD
df1['claim date formatted'] = pd.to_datetime(df1['claim date'],infer_datetime_format=True)
df1['birthdate formatted'] = pd.to_datetime(df1['bio.birthday'],dayfirst=True)
df1['start formatted'] = pd.to_datetime(df1['start'],dayfirst=True)
df1['end formatted'] = pd.to_datetime(df1['end'],dayfirst=True)

df1[['claim date', 'claim date formatted', 'bio.birthday', 'birthdate formatted', 'start', 'start formatted',
    'end', 'end formatted']].sample(5) # standardised date format: YYYY-MM-DD

Unnamed: 0,claim date,claim date formatted,bio.birthday,birthdate formatted,start,start formatted,end,end formatted
5753,"May 3, 2016",2016-05-03,26/10/1947,1947-10-26,3/1/2001,2001-01-03,3/1/2007,2007-01-03
3054,"April 9, 2019",2019-04-09,15/12/1954,1954-12-15,6/1/2009,2009-01-06,3/1/2015,2015-01-03
4162,"November 30, 2010",2010-11-30,9/11/1952,1952-11-09,5/1/1993,1993-01-05,3/1/1995,1995-01-03
5269,"October 9, 2016",2016-10-09,26/10/1947,1947-10-26,3/1/2001,2001-01-03,3/1/2007,2007-01-03
3849,"January 21, 2021",2021-01-21,8/4/1955,1955-04-08,5/1/2011,2011-01-05,3/1/2017,2017-01-03


In [283]:
def getAgeWhenClaimWasMade(birthdate, claimdate):
    birth_year = birthdate.strftime('%Y-%m-%d').split('-')[0]
    claim_year = claimdate.strftime('%Y-%m-%d').split('-')[0]
    curr_age = int(claim_year) - int(birth_year)
    return curr_age
df1['claimmer age'] = df1.apply(lambda x: getAgeWhenClaimWasMade(x['birthdate formatted'], x['claim date formatted']), axis=1)
df1[['birthdate formatted', 'claim date formatted', 'claimmer age']].sample(10)

Unnamed: 0,birthdate formatted,claim date formatted,claimmer age
2135,1970-12-22,2021-01-06,51
5175,1966-12-09,2019-07-09,53
296,1972-08-07,2022-02-01,50
5835,1955-04-13,2013-09-04,58
2998,1961-07-15,2013-04-19,52
2496,1963-06-06,2011-05-20,48
1524,1952-12-07,2013-08-03,61
2793,1869-05-01,2017-06-01,148
1147,1970-01-29,2011-10-09,41
2262,1961-08-04,2014-04-11,53


In [284]:
np.sort(df1['claimmer age'].unique())

array([ 25,  26,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  92,  94, 106,
       107, 108, 115, 123, 137, 138, 141, 146, 148, 150, 151, 157, 158,
       161, 172, 173, 174, 176, 193, 195, 196, 204, 210, 217, 218, 220,
       221, 224, 227, 230, 234, 235, 239, 245, 247, 248, 266, 267])

<div class="alert alert-warning">
- Notice abnormally old ages!

In [285]:
# check thru politicians with dates that don't make sense
# https://bioguide.congress.gov/
abnormal_age_idx = df1[df1['claimmer age'] > 94].index
abnormal_age_names = df1.iloc[abnormal_age_idx].groupby('name').size().index
df1.iloc[abnormal_age_idx].drop_duplicates(subset='name')[['name','claimmer age', 'claim source',
                                                           'birthdate formatted',
                                                          'start formatted', 'end formatted','claim date formatted']] # 20 names to manually check thru and correct the dates

Unnamed: 0,name,claimmer age,claim source,birthdate formatted,start formatted,end formatted,claim date formatted
107,John Carter,218,a press release,1792-09-10,1821-12-03,1823-03-03,2010-03-21
131,Richard Clarke,172,a panel on ABC This Week,1843-02-09,1889-12-02,1891-03-03,2015-04-26
239,Mark Harris,239,A campaign ad,1779-01-27,1821-12-03,1823-03-03,2018-09-20
277,David Barton,227,an open letter,1783-12-14,1821-12-03,1825-03-03,2010-06-12
316,James Peterson,123,a ruling from the bench,1894-02-11,1933-03-09,1935-01-03,2017-06-23
323,James Woods,148,a tweet,1868-02-04,1917-04-02,1919-03-03,2016-07-31
350,Charles Cooper,157,arguments before the US Supreme Court,1856-01-16,1893-08-07,1895-03-03,2013-03-26
385,John Delaney,141,an interview,1878-08-21,1917-04-02,1919-03-03,2019-08-27
588,John Stockton,196,an interview,1826-08-02,1865-12-04,1867-03-03,2022-01-23
676,David Lewis,151,a tweet,1869-05-01,1911-04-04,1913-03-03,2020-03-26


<div class="alert alert-warning">
    
- The above are the politicians with anbormally old ages.
- After checking on https://bioguide.congress.gov/, confirmed that they have already passed on (way) before claim date. 
- To drop these invalid rows.

In [286]:
print(f'There are {len(abnormal_age_names)} unique invalid politicians who already passed on before claim date.\n')
print(f'They are found on {len(abnormal_age_idx)} rows which are to be dropped.')

There are 31 unique invalid politicians who already passed on before claim date.

They are found on 102 rows which are to be dropped.


In [287]:
# drop rows with these politicians who have alr passed on by index - abnormal_age_idx
df1.drop(abnormal_age_idx, inplace=True)
# reset df2 index
df1.reset_index(drop=True)

Unnamed: 0.2,Unnamed: 0.1,index,Unnamed: 0,name,claim date,claim source,claim,issue,accuracy rating,id.bioguide,...,start,end,state,full name,accuracy,claim date formatted,birthdate formatted,start formatted,end formatted,claimmer age
0,0,2,2,Mark Kelly,"August 5, 2022",an ad,Blake Masters “wants to pass a national ban on...,abortion,mostly-true,K000377,...,2/12/2020,3/1/2023,AZ,Mark Kelly,1,2022-08-05,1964-02-21,2020-12-02,2023-01-03,58
1,1,9,9,Mariannette Miller-Meeks,"July 17, 2022",a newsletter,The Democrats’ Women’s Health Protection Act o...,abortion,barely-true,M001215,...,3/1/2021,3/1/2023,IA,Mariannette Miller-Meeks,0,2022-07-17,1955-09-06,2021-01-03,2023-01-03,67
2,2,25,25,Catherine Cortez Masto,"June 14, 2022",an ad,"Adam Laxalt ""supports eliminating Nevada's pro...",abortion,half-true,C001113,...,3/1/2017,3/1/2023,NV,Catherine Cortez Masto,0,2022-06-14,1964-03-29,2017-01-03,2023-01-03,58
3,3,28,28,Tammy Baldwin,"May 4, 2022",TV interview,"""Our Supreme Court has never taken away a cons...",abortion,FALSE,B001230,...,6/1/1999,3/1/2001,WI,Tammy Baldwin,0,2022-05-04,1962-02-11,1999-01-06,2001-01-03,60
4,4,29,29,Ron Johnson,"May 11, 2022",News release,In the immediate wake of a fire and vandalism ...,abortion,barely-true,J000293,...,5/1/2011,3/1/2017,WI,Ron Johnson,0,2022-05-11,1955-04-08,2011-01-05,2017-01-03,67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6289,6391,25913,25913,Jeff Miller,"November 19, 2009",a newsletter,"""Although Democrats represent only 10 of Flori...",workers,barely-true,M001144,...,23/10/2001,3/1/2003,FL,Jeff Miller,0,2009-11-19,1959-06-27,2001-10-23,2003-01-03,50
6290,6392,25914,25914,Jeff Miller,"November 19, 2009",a newsletter,Recovery.gov listed congressional districts th...,workers,mostly-true,M001144,...,23/10/2001,3/1/2003,FL,Jeff Miller,1,2009-11-19,1959-06-27,2001-10-23,2003-01-03,50
6291,6393,25918,25918,Barack Obama,"May 27, 2009",a speech,"In the 100 days since its passage, the economi...",workers,barely-true,O000167,...,4/1/2005,16/11/2008,IL,Barack Obama,0,2009-05-27,1961-08-04,2005-01-04,2008-11-16,48
6292,6394,25919,25919,Arlen Specter,"March 24, 2009",a speech on the Senate floor,"The Employee Free Choice Act mandates the ""eli...",workers,mostly-true,S000709,...,5/1/1981,3/1/1987,PA,Arlen Specter,1,2009-03-24,1930-02-12,1981-01-05,1987-01-03,79


## Train Test Split

<div class="alert alert-warning">
    
- To split the data into train and test datasets prior to applying countvectorizer / tfidfvectorizer/ other feature engineering steps to prevent data leakage.
    
    Reference:
- https://stackoverflow.com/questions/54491953/can-i-use-countvectorizer-on-both-test-and-train-data-at-the-same-time-or-do-i-n
- https://stackoverflow.com/questions/49444262/normalize-data-before-or-after-split-of-training-and-testing-data

In [288]:
from sklearn.model_selection import train_test_split

# Set "accuracy" as the y variable
y = df1["accuracy"] 

# Set everything other than accuracy as the X variables
X = df1.drop(columns=["accuracy"]) 

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,test_size=0.30, random_state=42)

### CountVectorizer `claim`

In [289]:
# https://stackoverflow.com/questions/52972368/select-top-n-tfidf-features-for-a-given-document

train_claim = X_train['claim']
test_claim = X_test['claim']

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_claim_vec = TfidfVectorizer(lowercase=False, stop_words="english")
tfidf_claim_vec.fit(train_claim)
tfidf_claim_vec_train = tfidf_claim_vec.transform(train_claim).toarray()
tfidf_claim_vec_test = tfidf_claim_vec.transform(test_claim).toarray()

tfidf_claim_vec_test.shape,tfidf_claim_vec_test.shape

((1889, 6687), (1889, 6687))

### CountVectorizer `claim source`

In [290]:
X_train['claim source']

1221                                          a tweet
95                               a fundraising letter
1982                                         an email
1562                              an interview on CNN
704                                          a debate
                            ...                      
6038                                   a news release
2934                            an interview on MSNBC
3648                                          a tweet
4580                                         a speech
3575    a tweet during the State of the Union address
Name: claim source, Length: 4405, dtype: object

In [291]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_src = CountVectorizer(stop_words='english', lowercase=True)
vectorizer_src.fit(X_train['claim source'].values)
src_vec_train = vectorizer_src.transform(X_train['claim source']) # fit & transform on train

# only transform on test
src_vec_test = vectorizer_src.transform(X_test['claim source'].values)

src_vec_train.shape

(4405, 949)

### One-Hot Encoding (OHE) for Categorical Features

- apply on categorical features: ['issue', 'state', 'bio.gender', 'type', 'party']

#### OHE all categorical variables tgt

In [292]:
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import classification_report, confusion_matrix

In [293]:
categorical_variables = ['issue', 'state', 'bio.gender', 'type', 'party']

X_train_categorical = X_train[categorical_variables]
X_test_categorical = X_test[categorical_variables]

# One-hot encoding
enc_lr = OneHotEncoder(handle_unknown="ignore")

transformer_lr = make_column_transformer((enc_lr, categorical_variables), remainder="passthrough")
X_train_categorical = transformer_lr.fit_transform(X_train_categorical)
X_test_categorical = transformer_lr.transform(X_test_categorical)

age_train = X_train['claimmer age'].values.reshape(-1,1)
age_test = X_test['claimmer age'].values.reshape(-1,1)

X_train_vec = np.hstack([age_train, src_vec_train.toarray(),tfidf_claim_vec_train ])
X_test_vec = np.hstack([age_test, src_vec_test.toarray(),tfidf_claim_vec_test])

# then combine with the numerical matrices of the other features 
X_train_combined = np.hstack([X_train_vec, X_train_categorical.toarray()])
X_test_combined = np.hstack([X_test_vec, X_test_categorical.toarray()])

columns = ['claimmer age','issue', 'state', 'bio.gender', 'type', 'party',]

In [294]:
columns = ['claimmer age']
columns.extend(list(vectorizer_src.vocabulary_.keys()))
columns.extend(list(tfidf_claim_vec.vocabulary_.keys()))
lr_cols = transformer_lr.transformers_[0][1].categories_
for i in range(5):
    columns.extend(lr_cols[i])


In [295]:
columns.__len__()

7775

In [296]:
X_train_combined.shape, X_test_combined.shape

((4405, 7775), (1889, 7775))

In [302]:
y_train.shape,X_train_combined.shape

((4405,), (4405, 7775))

In [307]:
df_train = pd.DataFrame(data=X_train_combined, columns=columns)
df_train['accuracy'] = y_train.values
df_train.to_csv("train bio.csv")
df_test = pd.DataFrame(data=X_test_combined, columns=columns)
df_test['accuracy'] = y_test.values
df_test.to_csv("test bio.csv")