In [22]:
import pandas as pd
import numpy as np
import sklearn as sk

In [23]:
df1 = pd.read_csv("politifact_with_bio_data.csv")

In [24]:
df1.head()

Unnamed: 0.2,Unnamed: 0,index,Unnamed: 0.1,name,claim_date,claim_source,claim,issue,accuracy_rating,id.bioguide,id.wikipedia,bio.gender,bio.birthday,type,party,start,end,state,full_name,accuracy
0,0,2,2,Mark Kelly,"August 5, 2022",an ad,Blake Masters “wants to pass a national ban on...,abortion,mostly-true,K000377,Mark Kelly,M,21/2/1964,sen,Democrat,2/12/2020,3/1/2023,AZ,Mark Kelly,1
1,1,9,9,Mariannette Miller-Meeks,"July 17, 2022",a newsletter,The Democrats’ Women’s Health Protection Act o...,abortion,barely-true,M001215,Mariannette Miller-Meeks,F,6/9/1955,rep,Republican,3/1/2021,3/1/2023,IA,Mariannette Miller-Meeks,0
2,2,25,25,Catherine Cortez Masto,"June 14, 2022",an ad,"Adam Laxalt ""supports eliminating Nevada's pro...",abortion,half-true,C001113,Catherine Cortez Masto,F,29/3/1964,sen,Democrat,3/1/2017,3/1/2023,NV,Catherine Cortez Masto,0
3,3,28,28,Tammy Baldwin,"May 4, 2022",TV interview,"""Our Supreme Court has never taken away a cons...",abortion,FALSE,B001230,Tammy Baldwin,F,11/2/1962,rep,Democrat,6/1/1999,3/1/2001,WI,Tammy Baldwin,0
4,4,29,29,Ron Johnson,"May 11, 2022",News release,In the immediate wake of a fire and vandalism ...,abortion,barely-true,J000293,Ron Johnson (Wisconsin politician),M,8/4/1955,sen,Republican,5/1/2011,3/1/2017,WI,Ron Johnson,0


##### Drop rows of the same claim but categorised under different issues to avoid data leakage when splitting

In [25]:
print("there are: {rows}".format(rows=df1.shape))
print("but only {n} unique claims".format(n=df1['claim'].unique().__len__()))

there are: (6396, 20)
but only 4027 unique claims


Upon inspection this is the result of politcas grouping the same article under different issues. The duplicates need to be removed to avoid data leakage.

In [26]:
df2 = df1.drop_duplicates(subset=['claim'], keep='first')

In [27]:
claims = df2['claim']
issues = []
for c in claims:
    issues.append(' '.join(df1[df1['claim']==c]['issue'].unique()))

In [28]:
df2['issues'] = issues

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['issues'] = issues


In [29]:
df1 = df2
df1.reset_index(inplace=True)

In [30]:
df1.shape

(4027, 22)

In [31]:
# https://towardsdatascience.com/clean-a-messy-date-column-with-mixed-formats-in-pandas-1a88808edbf7

# standardise the format of all date columnns: YYYY-MM-DD
df1['claim_date_formatted'] = pd.to_datetime(df1['claim_date'],infer_datetime_format=True)
df1['birthdate_formatted'] = pd.to_datetime(df1['bio.birthday'],dayfirst=True)
df1['start_formatted'] = pd.to_datetime(df1['start'],dayfirst=True)
df1['end_formatted'] = pd.to_datetime(df1['end'],dayfirst=True)

df1[['claim_date', 'claim_date_formatted', 'bio.birthday', 'birthdate_formatted', 'start', 'start_formatted',
    'end', 'end_formatted']].sample(5) # standardised date format: YYYY-MM-DD

Unnamed: 0,claim_date,claim_date_formatted,bio.birthday,birthdate_formatted,start,start_formatted,end,end_formatted
554,"January 20, 2012",2012-01-20,19/11/1947,1947-11-19,6/1/1987,1987-01-06,3/1/1989,1989-01-03
1983,"May 14, 2012",2012-05-14,27/2/1954,1954-02-27,4/1/1995,1995-01-04,3/1/1997,1997-01-03
3944,"November 5, 2010",2010-11-05,17/11/1949,1949-11-17,3/1/1991,1991-01-03,3/1/1993,1993-01-03
604,"November 19, 2020",2020-11-19,11/2/1962,1962-02-11,6/1/1999,1999-01-06,3/1/2001,2001-01-03
1506,"July 17, 2011",2011-07-17,28/5/1971,1971-05-28,5/1/2011,2011-01-05,3/1/2017,2017-01-03


In [None]:
def getAgeWhenClaimWasMade(birthdate, claimdate):
    birth_year = birthdate.strftime('%Y-%m-%d').split('-')[0]
    claim_year = claimdate.strftime('%Y-%m-%d').split('-')[0]
    curr_age = int(claim_year) - int(birth_year)
    return curr_age
df1['claimmer age'] = df1.apply(lambda x: getAgeWhenClaimWasMade(x['birthdate formatted'], x['claim date formatted']), axis=1)
df1[['birthdate formatted', 'claim date formatted', 'claimmer age']].sample(10)

In [None]:
np.sort(df1['claimmer age'].unique())

<div class="alert alert-warning">
- Notice abnormally old ages!

In [None]:
abnormal_age_idx = df1[df1['claimmer age'] > 94].index
abnormal_age_names = df1.iloc[abnormal_age_idx].groupby('name').size().index
df1.iloc[abnormal_age_idx].drop_duplicates(subset='name')

In [None]:
# check thru politicians with dates that don't make sense
# https://bioguide.congress.gov/
abnormal_age_idx = df1[df1['claimmer age'] > 94].index
abnormal_age_names = df1.iloc[abnormal_age_idx].groupby('name').size().index
df1.iloc[abnormal_age_idx].drop_duplicates(subset='name')[['name','claimmer age', 'claim source',
                                                           'birthdate formatted',
                                                          'start formatted', 'end formatted','claim date formatted']] # 20 names to manually check thru and correct the dates

<div class="alert alert-warning">
    
- The above are the politicians with anbormally old ages.
- After checking on https://bioguide.congress.gov/, confirmed that they have already passed on (way) before claim date. 
- To drop these invalid rows.

In [None]:
print(f'There are {len(abnormal_age_names)} unique invalid politicians.\n')
print(f'They are found on {len(abnormal_age_idx)} rows which are to be dropped.')

Upon inspection and googling we learn that this was due to a mismatch when joining the two tables(bio data and Politifact).

In [None]:
# drop rows with these politicians who have alr passed on by index - abnormal_age_idx
df1.drop(abnormal_age_idx, inplace=True)
# reset df2 index
df1.reset_index(drop=True)

## Train Test Split

<div class="alert alert-warning">
    
- To split the data into train and test datasets prior to applying countvectorizer / tfidfvectorizer/ other feature engineering steps to prevent data leakage.
    
    Reference:
- https://stackoverflow.com/questions/54491953/can-i-use-countvectorizer-on-both-test-and-train-data-at-the-same-time-or-do-i-n
- https://stackoverflow.com/questions/49444262/normalize-data-before-or-after-split-of-training-and-testing-data

In [None]:
from sklearn.model_selection import train_test_split

# Set "accuracy" as the y variable
y = df1["accuracy"] 

# Set everything other than accuracy as the X variables
X = df1.drop(columns=["accuracy"]) 

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,test_size=0.30, random_state=42)

### CountVectorizer `claim`

In [None]:
# https://stackoverflow.com/questions/52972368/select-top-n-tfidf-features-for-a-given-document

train_claim = X_train['claim']
test_claim = X_test['claim']

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_claim_vec = TfidfVectorizer(lowercase=False, stop_words="english")
tfidf_claim_vec.fit(train_claim)
tfidf_claim_vec_train = tfidf_claim_vec.transform(train_claim).toarray()
tfidf_claim_vec_test = tfidf_claim_vec.transform(test_claim).toarray()

tfidf_claim_vec_test.shape,tfidf_claim_vec_test.shape

### Subjectivity of `claim`

In [None]:
from textblob import TextBlob
X_train['claim subjectivity'] = X_train['claim'].apply(lambda x:TextBlob(x).sentiment.subjectivity)
X_test['claim subjectivity']  = X_test['claim'].apply(lambda x:TextBlob(x).sentiment.subjectivity)
import seaborn as sns
import matplotlib.pyplot as plt
sns.kdeplot(x="claim subjectivity", data=X_train[y_train==0])
sns.kdeplot(x="claim subjectivity", data=X_train[y_train==1])
plt.legend(['real','fake'])
plt.show()

### CountVectorizer `claim source`

In [None]:
X_train['claim source']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_src = CountVectorizer(stop_words='english', lowercase=True)
vectorizer_src.fit(X_train['claim source'].values)
src_vec_train = vectorizer_src.transform(X_train['claim source']) # fit & transform on train

# only transform on test
src_vec_test = vectorizer_src.transform(X_test['claim source'].values)

src_vec_train.shape

### CountVectorizer `Issues`

In [None]:
vectorizer_issues = CountVectorizer()
vectorizer_issues.fit(X_train['issues'].values)
issues_vec_train = vectorizer_issues.transform(X_train['issues']) # fit & transform on train
issues_vec_test = vectorizer_issues.transform(X_test['issues'].values)
issues_vec_train.shape

### One-Hot Encoding (OHE) for Categorical Features

- apply on categorical features: ['issue', 'state', 'bio.gender', 'type', 'party']

#### OHE all categorical variables tgt

In [None]:
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
categorical_variables = ['state', 'bio.gender', 'type', 'party']

X_train_categorical = X_train[categorical_variables]
X_test_categorical = X_test[categorical_variables]

# One-hot encoding
enc_lr = OneHotEncoder(handle_unknown="ignore")

transformer_lr = make_column_transformer((enc_lr, categorical_variables), remainder="passthrough")
X_train_categorical = transformer_lr.fit_transform(X_train_categorical)
X_test_categorical = transformer_lr.transform(X_test_categorical)

age_train = X_train['claimmer age'].values.reshape(-1,1)
age_test = X_test['claimmer age'].values.reshape(-1,1)
subjectivity_train = X_train['claim subjectivity'].values.reshape(-1,1)
subjectivity_test = X_test['claim subjectivity'].values.reshape(-1,1)

X_train_vec = np.hstack([age_train, subjectivity_train, src_vec_train.toarray(), issues_vec_train.toarray(),tfidf_claim_vec_train])
X_test_vec = np.hstack([age_test, subjectivity_test, src_vec_test.toarray(), issues_vec_test.toarray(), tfidf_claim_vec_test])

# then combine with the numerical matrices of the other features 
X_train_combined = np.hstack([X_train_vec, X_train_categorical.toarray()])
X_test_combined = np.hstack([X_test_vec, X_test_categorical.toarray()])

In [None]:
columns = ['claimmer age','claim subjectivity']
columns.extend(list(vectorizer_src.vocabulary_.keys()))
columns.extend(list(vectorizer_issues.vocabulary_.keys()))
columns.extend(list(tfidf_claim_vec.vocabulary_.keys()))
lr_cols = transformer_lr.transformers_[0][1].categories_ #state, bio.gender, type, party
for i in range(4):
    columns.extend(lr_cols[i])


In [None]:
columns.__len__()

In [None]:
X_train_combined.shape, X_test_combined.shape

In [None]:
y_train.shape,X_train_combined.shape

In [None]:
df_train = pd.DataFrame(data=X_train_combined, columns=columns)
df_train['accuracy'] = y_train.values
df_train.to_csv("train bio.csv")
df_test = pd.DataFrame(data=X_test_combined, columns=columns)
df_test['accuracy'] = y_test.values
df_test.to_csv("test bio.csv")