In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight
from scipy.optimize import linear_sum_assignment

## 500K dataset

In [31]:
data_500 = pd.read_csv("/Users/nirugidla/Documents/GitHub/milestone2_MADS/data_500k.csv", low_memory=False)
data_500

Unnamed: 0,RECORD_ID,ADD_TYPE,AFAMPROFLS,AGE,AI_COUNTY_NAME,AIRCOND,APP_CHILD,APP_MENBIG,APP_TODDLR,APP_WOMEN,...,VTR_PRI16,VTR_PRI17,VTR_PRI18,VTR_PRI19,VTR_PRI20,VTR_PRI21,VTR_PRI22,WORKWOMAN,YEARBUILT,ZIP
0,403390,S,,21.0,Fairbanks North Star,,,,,,...,,,,,,,,,,99705
1,62285,H,,,Anchorage,,,,,,...,,,,,,,,,,99506
2,331355,,,91.0,Kenai Peninsula,,,,,,...,,,,,,,,,,99603
3,206320,H,,65.0,Anchorage,,,,,,...,,,,,,,Y,,,99567
4,188078,S,,76.0,Juneau,,,,,,...,Y,,Y,,,,Y,Y,1985,99801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,349635,H,,20.0,BIBB,,,,,,...,,,,,,,,,,31204
499996,420654,S,,50.0,COWETA,A,,,,,...,,,,,,,,,2003,30263
499997,131262,S,,19.0,ROCKDALE,,,,,,...,,,,,,,,,,30013
499998,315673,H,,21.0,BARROW,,,,,,...,,,,,,,,,,30680


In [32]:
remove_columns = [
    'PRFL_LGBT_SUPPORT',
    'PRFL_LIBERAL_NEWS',
    'PRFL_MARIJUANA_REFORM',
    'PRFL_BIDEN_SUPPORT',
    'PRFL_BORDER_SECURITY',
    'PRFL_CONSERVATIVE_NEWS',
    'PRFL_IMMIGRATION_REFORM',
    'PRFL_OBAMA',
    'PRFL_PERSUADABLE_VOTER',
    'PRFL_POLITICAL_IDEOLOGY',
    'PRFL_SANDERS_SUPPORT',
    'PRFL_TRUMP_SUPPORT',
    'ZIP',
    
    'VTR_GEN00', 'VTR_GEN01', 'VTR_GEN02', 'VTR_GEN03', 'VTR_GEN04', 'VTR_GEN05', 'VTR_GEN06', 'VTR_GEN07', 'VTR_GEN08', 'VTR_GEN09', 'VTR_GEN10', 'VTR_GEN11', 'VTR_GEN12', 'VTR_GEN13', 'VTR_GEN14', 'VTR_GEN15', 'VTR_GEN16', 'VTR_GEN17', 'VTR_GEN18', 'VTR_GEN19', 'VTR_GEN20', 'VTR_GEN21', 'VTR_GEN22', 'VTR_OTH00', 'VTR_OTH01', 'VTR_OTH02', 'VTR_OTH03', 'VTR_OTH04', 'VTR_OTH05', 'VTR_OTH06', 'VTR_OTH07', 'VTR_OTH08', 'VTR_OTH09', 'VTR_OTH10', 'VTR_OTH11', 'VTR_OTH12', 'VTR_OTH13', 'VTR_OTH14', 'VTR_OTH15', 'VTR_OTH16', 'VTR_OTH17', 'VTR_OTH18', 'VTR_OTH19', 'VTR_OTH20', 'VTR_OTH21', 'VTR_OTH22', 'VTR_PPP00', 'VTR_PPP04', 'VTR_PPP08', 'VTR_PPP12', 'VTR_PPP16', 'VTR_PPP20', 'VTR_PRI00', 'VTR_PRI01', 'VTR_PRI02', 'VTR_PRI03', 'VTR_PRI04', 'VTR_PRI05', 'VTR_PRI06', 'VTR_PRI07', 'VTR_PRI08', 'VTR_PRI09', 'VTR_PRI10', 'VTR_PRI11', 'VTR_PRI12', 'VTR_PRI13', 'VTR_PRI14', 'VTR_PRI15', 'VTR_PRI16', 'VTR_PRI17', 'VTR_PRI18', 'VTR_PRI19', 'VTR_PRI20', 'VTR_PRI21', 'VTR_PRI22',
    
        
      'PRFL_CHOICELIFE', 'TOD_PRES_D_2016_PREC', 'TOD_PRES_O_2016',
    'TOD_PRES_R_2016', 'TOD_PRES_R_2016_PREC', 'TOD_PRES_R_2020_PREC', 'VP_PPP',
    'AGE', 'CNSUS_PCTW',
    
    'PARTY_MIX', 'PRFL_MINWAGE', 'PRFL_FENCE_SITTER', 
]
# Drop the list of columns from the dataset
data_500.drop(columns=remove_columns, errors='ignore', inplace=True)


In [None]:
def clean_party_create_infer(data_500):
    #Replace blanks and spaces with NaN
    corrected_data.replace(r'^\s*$', np.nan, regex=True, inplace=True)

    # Update the 'I' values in PARTY_CODE to 'N' for rows where STATE is 'SD'
    corrected_data.loc[(corrected_data['STATE'] == 'SD') & (corrected_data['PARTY_CODE'] == 'I'), 'PARTY_CODE'] = 'N'

    # Update the 'U' values in PARTY_CODE to 'N' for rows where STATE is in ['DC', 'LA', 'RI']
    states_to_update = ['DC', 'LA', 'RI']
    corrected_data.loc[(corrected_data['STATE'].isin(states_to_update)) & (corrected_data['PARTY_CODE'] == 'U'), 'PARTY_CODE'] = 'N'

    # Update the 'I' values in PARTY_CODE to 'N' for rows where STATE is 'OK'
    corrected_data.loc[(corrected_data['STATE'] == 'OK') & (corrected_data['PARTY_CODE'] == 'I'), 'PARTY_CODE'] = 'N'
    
    print("N's after corrections")
    print(len(corrected_data.loc[corrected_data['PARTY_CODE'].isin(['N'])]))
    
    print("U's after corrections")
    print(len(corrected_data.loc[corrected_data['PARTY_CODE'].isin(['U'])]))
    
    print("inferred Rs:")
    print(len(corrected_data.loc[corrected_data['PARTY_CODE'].isin(['S'])]))

    print("inferred Ds:")
    print(len(corrected_data.loc[corrected_data['PARTY_CODE'].isin(['E'])]))

    # Define function to infer party based on given conditions
    def set_inferred_party(row):
        not_in_list = ['D', 'R', 'M', 'P', 'X', 'Z']
        rpx = ['R', 'P', 'X']
        dmz = ['D', 'M', 'Z']
        if row['PARTY_CODE'] not in ['D', 'R','E','S']:
            if (
               (row['FUND_POLIT'] == 'R' or row['DON_POLCONS'] or row['PRFL_HEALTHCARE_REFORM'] == '2' or
                row['PRFL_2NDAMEND'] == 'Y' or row['PRFL_CHOICELIFE'] == '1') and row['FUND_POLIT'] != 'D' and
                not row['DON_POLLIB'] and all(row[col] not in not_in_list for col in
                ['VTR_PRI' + "{:02}".format(i) for i in range(22, 2, -1)] + ['VTR_PPP' + "{:02}".format(i) for i in [20, 16, 12, 8, 4, 0]])
           ) or (
               sum(1 for col in ['VTR_PRI' + "{:02}".format(i) for i in range(22, 2, -1)] + ['VTR_PPP' + "{:02}".format(i) for i in [20, 16, 12, 8, 4, 0]]
                if row[col] in rpx) > sum(1 for col in ['VTR_PRI' + "{:02}".format(i) for i in range(22, 2, -1)] +
                ['VTR_PPP' + "{:02}".format(i) for i in [20, 16, 12, 8, 4, 0]] if row[col] in dmz)
           ):
                return 'S'
            elif (
               (row['FUND_POLIT'] == 'D' or row['DON_POLLIB'] or row['PRFL_HEALTHCARE_REFORM'] == '1' or
                row['PRFL_CHOICELIFE'] == '2') and row['FUND_POLIT'] != 'R' and not row['DON_POLCONS'] and
                all(row[col] not in not_in_list for col in ['VTR_PRI' + "{:02}".format(i) for i in range(22, 2, -1)] +
                ['VTR_PPP' + "{:02}".format(i) for i in [20, 16, 12, 8, 4, 0]])
           ) or (
               sum(1 for col in ['VTR_PRI' + "{:02}".format(i) for i in range(22, 2, -1)] + ['VTR_PPP' + "{:02}".format(i) for i in [20, 16, 12, 8, 4, 0]]
                if row[col] in rpx) < sum(1 for col in ['VTR_PRI' + "{:02}".format(i) for i in range(22, 2, -1)] +
                ['VTR_PPP' + "{:02}".format(i) for i in [20, 16, 12, 8, 4, 0]] if row[col] in dmz)
           ):
                return 'E'
        return row['PARTY_CODE']

    # Applying the function
    corrected_data['PARTY_CODE'] = corrected_data.apply(set_inferred_party, axis=1)
   
    print("new inferred Rs:")
    print(len(corrected_data.loc[corrected_data['PARTY_CODE'].isin(['S'])]))

    print("new inferred Ds:")
    print(len(corrected_data.loc[corrected_data['PARTY_CODE'].isin(['E'])]))
          

    # Create party_mapping using defaultdict.Lambda sets default (all other non-D and non-R party registrations to NaN.
    infer_party_mapping = defaultdict(lambda: float('nan'))
    infer_party_mapping.update({
        'D': 'D',  # Registered Democrats are mapped to Democrat
        'E': 'D',  # E (Inferred Democrats) are mapped to Democrat
        'R': 'R',  # Registered Republicans are mapped to Republican 
        'S': 'R'   # S (Inferred Republicans) are mapped to Republican
    })
    # Map PARTY_CODE to INFER_PARTY
    corrected_data['INFER_PARTY'] = corrected_data['PARTY_CODE'].map(infer_party_mapping)

    # Define the mapping for PARTY_CODE modifications
    party_code_mapping = {
        'E': float('nan'),
        'S': float('nan'),
        'U': float('nan'),
        'A': 'O',
        'B': 'O',
        'C': 'O',
        'F': 'O',
        'G': 'O',
        'H': 'O',
        'I': 'O',
        'J': 'O',
        'K': 'O',
        'L': 'L',
        'P': 'O',
        'Q': 'O',
        'T': 'O',
        'V': 'O',
        'W': 'O',
        'Y': 'O',
        'Z': 'O'
    }
    # Apply the mapping to the PARTY_CODE column
    corrected_data['PARTY_CODE'] = corrected_data['PARTY_CODE'].replace(party_code_mapping)

    print("Final PARTY_CODE values:")
    print(corrected_data['PARTY_CODE'].unique())
    print('Final INFER_PARTY values:')
    print(corrected_data['INFER_PARTY'].unique())
    print('INFER_PARTY Ds and Rs:')
    print(len(corrected_data.loc[corrected_data['INFER_PARTY'].isin(['D', 'R'])]))
    print('INFER_PARTY NaNs:')
    print(corrected_data['INFER_PARTY'].isna().sum())
    
    
    voter_columns = [col for col in corrected_data.columns if col.startswith("VTR")]

    #Create column with total number of votes in voter_columns per row
    corrected_data['VTR_TOTAL_VOTES'] = corrected_data[voter_columns].notnull().sum(axis=1)

    #Sum Democrat and Republican totals
    corrected_data['VTR_TOTAL_DVOTES'] = corrected_data[voter_columns].isin(['D', 'M', 'Z']).sum(axis=1)
    corrected_data['VTR_TOTAL_RVOTES'] = corrected_data[voter_columns].isin(['R', 'P', 'X']).sum(axis=1)

    #Create conditions that evaluate whether someone has cast at least two more votes for Democrats or Republicans: 
    conditions = [
        corrected_data['VTR_TOTAL_DVOTES'] - corrected_data['VTR_TOTAL_RVOTES'] >= 2,
        corrected_data['VTR_TOTAL_RVOTES'] - corrected_data['VTR_TOTAL_DVOTES'] >= 2
    ]

    choices = ['D', 'R']

    corrected_data['VTR_INFER_PARTY'] = np.select(conditions, choices, default=np.nan)
    corrected_data['VTR_INFER_PARTY'].replace('nan', np.nan, inplace=True)

    # Assign a 'D' or an 'R' to INFER_PARTY if either condition is true:
    corrected_data.loc[(corrected_data['VTR_INFER_PARTY'] == 'D') , 'INFER_PARTY'] = 'D'
    corrected_data.loc[(corrected_data['VTR_INFER_PARTY'] == 'R') , 'INFER_PARTY'] = 'R'

    print('Final INFER_PARTY Ds and Rs:')
    print(len(corrected_data.loc[corrected_data['INFER_PARTY'].isin(['D', 'R'])]))
    print('Final INFER_PARTY NaNs:')
    print(corrected_data['INFER_PARTY'].isna().sum())
    
    
    return (corrected_data)
clean_party_create_infer(data_500)


In [33]:
# # Load the dataset again
# #data = pd.read_csv('data/surveydata.csv')
# 
# corrected_data = data_500.copy()
# 
# # Update the 'I' values in PARTY_CODE to 'N' for rows where STATE is 'SD'
# corrected_data.loc[(corrected_data['STATE'] == 'SD') & (corrected_data['PARTY_CODE'] == 'I'), 'PARTY_CODE'] = 'N'
# 
# # Update the 'U' values in PARTY_CODE to 'N' for rows where STATE is in ['DC', 'LA', 'RI']
# states_to_update = ['DC', 'LA', 'RI']
# corrected_data.loc[(corrected_data['STATE'].isin(states_to_update)) & (corrected_data['PARTY_CODE'] == 'U'), 'PARTY_CODE'] = 'N'
# 
# # Update the 'I' values in PARTY_CODE to 'N' for rows where STATE is 'OK'
# corrected_data.loc[(corrected_data['STATE'] == 'OK') & (corrected_data['PARTY_CODE'] == 'I'), 'PARTY_CODE'] = 'N'
# 
# 
# #categorical_columns
# # Create a mapping dictionary for PARTY_CODE to INFER_PARTY values
# party_mapping = {
#     'D': 'D',
#     'E': 'D',
#     'R': 'R',
#     'S': 'R',
#     'N': float('nan'),
#     'U': float('nan'),
#     'A': float('nan'),
#     'B': float('nan'),
#     'C': float('nan'),
#     'F': float('nan'),
#     'G': float('nan'),
#     'H': float('nan'),
#     'I': float('nan'),
#     'J': float('nan'),
#     'K': float('nan'),
#     'L': float('nan'),
#     'P': float('nan'),
#     'Q': float('nan'),
#     'T': float('nan'),
#     'V': float('nan'),
#     'W': float('nan'),
#     'Y': float('nan'),
#     'Z': float('nan'),
#     'O': float('nan'),
# }
# 
# # Create the INFER_PARTY column using the mapping
# corrected_data['INFER_PARTY'] = corrected_data['PARTY_CODE'].map(party_mapping)
# 
# print(corrected_data['PARTY_CODE'].unique())
# 
# 
# # Display the unique values in the INFER_PARTY column to ensure correctness
# unique_infer_party = corrected_data['INFER_PARTY'].unique()
# 
# print('unique_infer_party:')
# 
# print(unique_infer_party)
# 
# # Define the mapping for PARTY_CODE modifications
# party_code_mapping = {
#     'E': float('nan'),
#     'S': float('nan'),
#     'U': float('nan'),
#     'A': 'O',
#     'B': 'O',
#     'C': 'O',
#     'F': 'O',
#     'G': 'O',
#     'H': 'O',
#     'I': 'O',
#     'J': 'O',
#     'K': 'O',
#     'L': 'L',
#     'P': 'O',
#     'Q': 'O',
#     'T': 'O',
#     'V': 'O',
#     'W': 'O',
#     'Y': 'O',
#     'Z': 'O'
# }
# 
# # Apply the mapping to the PARTY_CODE column
# corrected_data['PARTY_CODE'] = corrected_data['PARTY_CODE'].replace(party_code_mapping)
# 
# # Check the unique values of PARTY_CODE after the modifications
# unique_party_code_after_modifications = corrected_data['PARTY_CODE'].unique()
# 
# print("unique_party_code_after_modifications:")
# 
# print(unique_party_code_after_modifications)

['N' 'R' 'O' 'D' 'A' 'F' 'P' 'G' 'L' 'U' 'W' 'B' 'I' 'Y' 'V' 'H' ' ' 'S'
 'E' 'Q' 'Z']
unique_infer_party:
[nan 'R' 'D']
unique_party_code_after_modifications:
['N' 'R' 'O' 'D' 'L' nan ' ']


In [34]:
data_500['PARTY_CODE'].unique()

array(['N', 'R', 'O', 'D', 'A', 'F', 'P', 'G', 'L', 'U', 'W', 'B', 'I',
       'Y', 'V', 'H', ' ', 'S', 'E', 'Q', 'Z'], dtype=object)

In [35]:
engineered_data = corrected_data.copy()
voter_columns = [col for col in engineered_data.columns if col.startswith("VTR")]
#Create column with total number of votes in voter_columns per row
engineered_data['VTR_TOTAL_VOTES'] = engineered_data[voter_columns].notnull().sum(axis=1)
#Sum Democrat and Republican totals
engineered_data['VTR_TOTAL_DVOTES'] = engineered_data[voter_columns].isin(['D', 'M', 'Z']).sum(axis=1)
engineered_data['VTR_TOTAL_RVOTES'] = engineered_data[voter_columns].isin(['R', 'P', 'X']).sum(axis=1)
#Create conditions that evaluate whether more votes have been cast for Democrats or Republicans, and assign 'D' and 'R' to new column based on conditions
conditions = [
    engineered_data['VTR_TOTAL_DVOTES'] > engineered_data['VTR_TOTAL_RVOTES'],
    engineered_data['VTR_TOTAL_DVOTES'] < engineered_data['VTR_TOTAL_RVOTES']
]
choices = ['D', 'R']
engineered_data['VTR_INFER_PARTY'] = np.select(conditions, choices, default=np.nan)
#Create conditions that evaluate whether a voter is a swing voter or not
conditions_swing = [
    (engineered_data['VTR_TOTAL_DVOTES'] > 2) & (engineered_data['VTR_TOTAL_RVOTES'] > 2),
    ((engineered_data['VTR_TOTAL_DVOTES'] > 1) & (engineered_data['VTR_TOTAL_RVOTES'] == 0)) | ((engineered_data['VTR_TOTAL_RVOTES'] > 1) & (engineered_data['VTR_TOTAL_DVOTES'] == 0))
]
choices_swing = ['Y', 'N']
engineered_data['VTR_INFER_SWING'] = np.select(conditions_swing, choices_swing, default=np.nan)
#sampledf = engineered_data[['PARTY_CODE','INFER_PARTY','VTR_TOTAL_DVOTES','VTR_TOTAL_RVOTES','VTR_INFER_PARTY','VTR_INFER_SWING']]
#print(sampledf.head(50))
#Add values to INFER_PARTY and correct any other INFER_PARTY values that don't meet the conditions above:
print(sum(engineered_data['INFER_PARTY'].isna())) #291 NaNs for INFER_PARTY before
engineered_data.loc[(engineered_data['VTR_INFER_PARTY'] == 'D') & (engineered_data['VTR_INFER_SWING'] == 'N'), 'INFER_PARTY'] = 'D'
engineered_data.loc[(engineered_data['VTR_INFER_PARTY'] == 'R') & (engineered_data['VTR_INFER_SWING'] == 'N'), 'INFER_PARTY'] = 'R'
engineered_data.loc[(engineered_data['VTR_INFER_PARTY'] == 'D') & (engineered_data['VTR_INFER_SWING'] == 'Y'), 'INFER_PARTY'] =  float('nan')
engineered_data.loc[(engineered_data['VTR_INFER_PARTY'] == 'R') & (engineered_data['VTR_INFER_SWING'] == 'Y'), 'INFER_PARTY'] =  float('nan')
#Also delete any 'D' or 'R' INFER_PARTY labels for anyone deemed a "swing voter" based on criteria above of voting for both parties at least 3 times each:
engineered_data.loc[(engineered_data['VTR_INFER_SWING'] == 'Y') , 'INFER_PARTY'] = float('nan')
print(sum(engineered_data['INFER_PARTY'].isna())) #291 NaNs for INFER_PARTY before
#Drop auxiliary columns used for math, keeping 'VTR_TOTAL_VOTES', 'VTR_INFER_SWING', and the updated 'INFER_PARTY' columns as features:
engineered_data = engineered_data.drop(columns=['VTR_TOTAL_DVOTES','VTR_TOTAL_RVOTES','VTR_INFER_PARTY'])
# Drop the list of columns from the dataset
#engineered_data_cleaned = engineered_data.drop(columns=remove_columns, errors='ignore')

# features_to_remove = [
#     'PRFL_CHOICELIFE', 'TOD_PRES_D_2016_PREC', 'TOD_PRES_O_2016',
#     'TOD_PRES_R_2016', 'TOD_PRES_R_2016_PREC', 'TOD_PRES_R_2020_PREC', 'VP_PPP',
#     'AGE', 'CNSUS_PCTW'
# ]
# Assuming 'engineered_data' is your DataFrame, remove the less important features
#engineered_data = engineered_data.drop(columns=features_to_remove, errors='ignore')

154352
154352


In [36]:
# voter_columns = [col for col in engineered_data.columns if col.startswith("VTR")]
# # Function to infer party
# def infer_party(row):
#     # Check for VTR_PPP20 value first
#     if row['VTR_PPP20'] in ['D', 'M', 'Z']:
#         return 'D'
#     elif row['VTR_PPP20'] in ['R', 'P', 'X']:
#         return 'R'
#     # If VTR_PPP20 condition is not met, continue with existing logic
#     last_votes = [vote for vote in row[voter_columns] if vote in ['D', 'M', 'Z', 'R', 'P', 'X']][-2:]
#     if len(last_votes) == 2:
#         if all(vote in ['D', 'M', 'Z'] for vote in last_votes):
#             return 'D'
#         elif all(vote in ['R', 'P', 'X'] for vote in last_votes):
#             return 'R'
#     return np.nan
# # Create new series with inferred parties
# inferred_parties = engineered_data.apply(infer_party, axis=1)
# print(sum(engineered_data['INFER_PARTY'].isna()))

## Data Cleaning

In [37]:
data_500.sample(n=500)

Unnamed: 0,RECORD_ID,ADD_TYPE,AFAMPROFLS,AGE,AI_COUNTY_NAME,AIRCOND,APP_CHILD,APP_MENBIG,APP_TODDLR,APP_WOMEN,...,TRAIL_CNT,TRAVEL,VOTER_CNT,VOTER_TRLR,VP_GEN,VP_OTH,VP_PPP,VP_PRI,WORKWOMAN,YEARBUILT
11558,138598,S,,39.0,PLACER,,,,,,...,1,,14,1,50.00,0.0,100.00,0.0,Y,
135845,264469,S,,40.0,BALDWIN,,,,,,...,2,,8,1,40.00,0.0,0.00,10.0,,2014
330455,289286,S,,40.0,MEDINA,,,,,,...,1,,1,0,0.00,0.0,0.00,0.0,,
216587,148015,S,,44.0,WASHTENAW,,,,,,...,1,,4,0,20.00,0.0,0.00,0.0,,
257848,398789,S,,44.0,MECKLENBURG,,,,,,...,1,,4,0,28.57,0.0,50.00,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62262,263044,S,,74.0,SUSSEX,A,,,,,...,1,,21,1,60.00,10.0,33.33,50.0,Y,1999
111662,337263,H,,46.0,COOK,,,,,,...,1,,8,0,20.00,0.0,0.00,0.0,,
41375,452029,S,,27.0,LOS ANGELES,,,,,,...,1,,2,0,33.33,0.0,100.00,0.0,,
82407,450742,S,,34.0,COLUMBIA,,,,,,...,2,,0,1,0.00,0.0,0.00,0.0,,


In [38]:
# Data Loading
#data_500 = pd.read_csv("/Users/nirugidla/Documents/GitHub/milestone2_MADS/data_500k.csv", low_memory=False)

# Data Cleaning - Column Names
data_500.columns = data_500.columns.str.strip()

# Data Cleaning - Drop ZIP
#data_500.drop('ZIP', axis=1, inplace=True)
# Drop the list of columns from the dataset
data_500_cleaned = data_500.drop(columns=remove_columns, errors='ignore')

# Data Cleaning - Drop Duplicates
data_500.drop_duplicates(inplace=True)

# Data Cleaning - Object Columns
for col in data_500.columns:
    if data_500[col].dtype == 'object':
        data_500[col] = data_500[col].str.strip()

# Data Cleaning - Empty Strings
data_500.replace('', 'Unknown', inplace=True)

# Data Cleaning - NaN for Object Types
data_500.loc[:, data_500.dtypes == 'object'] = data_500.loc[:, data_500.dtypes == 'object'].fillna('Unknown')

# Data Cleaning - Drop Columns and Rows with All NaNs
data_500.dropna(axis=1, how='all', inplace=True)
data_500.dropna(axis=0, how='all', inplace=True)

# Identify numeric and non-numeric columns
numeric_cols = data_500.select_dtypes(include=['int64', 'float64']).columns
non_numeric_cols = data_500.select_dtypes(exclude=['int64', 'float64']).columns

# Data Cleaning - Removing Non-Numeric Columns with More Than 90% Missing Data
missing_data_percentage = data_500.isnull().mean() * 100
non_numeric_cols_to_remove = missing_data_percentage[non_numeric_cols]
non_numeric_cols_to_remove = non_numeric_cols_to_remove[non_numeric_cols_to_remove > 90].index.tolist()
data_500_reduced = data_500.drop(columns=non_numeric_cols_to_remove)

# Update the list of non-numeric columns after removal
non_numeric_cols = data_500_reduced.select_dtypes(exclude=['int64', 'float64']).columns

# Identifying Specific Types of Non-Numeric Columns
cols_with_Y_or_Unknown = [col for col in non_numeric_cols if set(data_500_reduced[col].unique()) <= {'Y', 'Unknown'}]
cols_with_more_than_two_categories = [col for col in non_numeric_cols if len(data_500_reduced[col].unique()) > 2]

# Print identified columns
print("Columns with 'Y' or 'Unknown':", cols_with_Y_or_Unknown, len(cols_with_Y_or_Unknown))
print("Columns with more than two categories:", cols_with_more_than_two_categories, len(cols_with_more_than_two_categories))

Columns with 'Y' or 'Unknown': ['AFAMPROFLS', 'APP_CHILD', 'APP_MENBIG', 'APP_TODDLR', 'APP_WOMEN', 'APP_WOMPET', 'APP_WOMPLS', 'APP_YNGMEN', 'ARTS', 'AUTOACCES', 'AUTOWORK', 'BOATING', 'BROADERLIV', 'CARDUSER', 'CATOWNER', 'CH_0002FEM', 'CH_0002MAL', 'CH_0002UNK', 'CH_0305FEM', 'CH_0305MAL', 'CH_0305UNK', 'CH_0610FEM', 'CH_0610MAL', 'CH_0610UNK', 'CH_1115FEM', 'CH_1115MAL', 'CH_1115UNK', 'CH_1617FEM', 'CH_1617MAL', 'CH_1617UNK', 'CHRISTFAM', 'COL_ANTIQ', 'COL_ARTS', 'COL_COIN', 'COL_SPORT', 'COL_STAMP', 'COMPHOMOFC', 'COMPUTERS', 'COOK_GEN', 'CURRAFFAIR', 'DEPTSTCRD', 'DIETING', 'DIYLIV', 'DOGOWNER', 'DON_ANML', 'DON_ARTCUL', 'DON_CHARIT', 'DON_CHILD', 'DON_ENVIR', 'DON_ENVWLD', 'DON_HEALTH', 'DON_INTAID', 'DON_OTHER', 'DON_POLCONS', 'DON_POLIT', 'DON_POLLIB', 'DON_RELIG', 'DON_VET', 'DONATION', 'EDU_ONLINE', 'EQUESTRIAN', 'EXER_GROUP', 'GAMING', 'GARDENER', 'GOLF', 'GRANDCHLD', 'HEALTHBEAU', 'HEATHMED', 'HH_SENIOR', 'HH_VETERAN', 'HH_YOUNGAD', 'HIGHBROW', 'HIGHENDAPP', 'HISTMIL', 'HI

## Just the columns with Yes or Unknown non_numeric_cols 

In [39]:
data_reduced_with_Y_or_Unknown = data_500[cols_with_Y_or_Unknown].copy()
data_reduced_with_Y_or_Unknown.shape

(500000, 131)

In [40]:
# Assuming cols_with_Y_or_Unknown has been identified and data_500 has been cleaned
data_reduced_with_Y_or_Unknown = engineered_data[cols_with_Y_or_Unknown].copy()

# Adding the target column to this data
data_reduced_with_Y_or_Unknown['PARTY_CODE'] = engineered_data['PARTY_CODE']

# Remove rows where 'PARTY_CODE' is missing, as it's our target variable
data_reduced_with_Y_or_Unknown = data_reduced_with_Y_or_Unknown[data_reduced_with_Y_or_Unknown['PARTY_CODE'].notna()]

# Sample 100,000 rows from the data
data_sample = data_reduced_with_Y_or_Unknown.sample(n=100000, random_state=42)
data_sample = data_sample.drop(columns='INFER_PARTY', errors='ignore')

# Count instances of each class in 'PARTY_CODE' again to filter out classes with fewer than 2 instances
class_counts = data_sample['PARTY_CODE'].value_counts()
valid_classes = class_counts[class_counts >= 2].index.tolist()
data_sample = data_sample[data_sample['PARTY_CODE'].isin(valid_classes)]

# Label-encode 'PARTY_CODE' column
le = LabelEncoder()
data_sample['PARTY_CODE'] = le.fit_transform(data_sample['PARTY_CODE'].astype(str))

# One-Hot Encoding
data_one_hot = pd.get_dummies(data_sample, columns=cols_with_Y_or_Unknown, drop_first=True)

# Splitting the Data into Training and Test Sets
X = data_one_hot.drop('PARTY_CODE', axis=1)
y = data_one_hot['PARTY_CODE']

# Perform train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize and Train XGBoost Classifier
xgb = XGBClassifier(objective='multi:softmax', random_state=42)
xgb.fit(X_train, y_train)

# Make Predictions and Evaluate the Model
y_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"XGBoost Model Accuracy: {accuracy}")

XGBoost Model Accuracy: 0.4291


In [41]:
# Assuming cols_with_Y_or_Unknown has been identified and engineered_data has been cleaned
data_reduced_with_Y_or_Unknown = engineered_data[cols_with_Y_or_Unknown].copy()

# Adding the target column to this data
data_reduced_with_Y_or_Unknown['INFER_PARTY'] = engineered_data['INFER_PARTY']

# Remove rows where 'INFER_PARTY' is missing, as it's our target variable
data_reduced_with_Y_or_Unknown = data_reduced_with_Y_or_Unknown[data_reduced_with_Y_or_Unknown['INFER_PARTY'].notna()]

# Sample 100,000 rows from the data
data_sample = data_reduced_with_Y_or_Unknown.sample(n=100000, random_state=42)

# Count instances of each class in 'INFER_PARTY' again to filter out classes with fewer than 2 instances
class_counts = data_sample['INFER_PARTY'].value_counts()
valid_classes = class_counts[class_counts >= 2].index.tolist()

# Check for sufficient number of valid classes
if len(valid_classes) < 2:
    raise ValueError("Insufficient number of valid classes for multi-class classification.")

data_sample = data_sample[data_sample['INFER_PARTY'].isin(valid_classes)]

# Label-encode 'INFER_PARTY' column
le = LabelEncoder()
data_sample['INFER_PARTY'] = le.fit_transform(data_sample['INFER_PARTY'].astype(str))

# One-Hot Encoding
data_one_hot = pd.get_dummies(data_sample, columns=cols_with_Y_or_Unknown, drop_first=True)

# Splitting the Data into Training and Test Sets
X = data_one_hot.drop('INFER_PARTY', axis=1)
y = data_one_hot['INFER_PARTY']

# Perform train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize and Train XGBoost Classifier
xgb = XGBClassifier(objective='multi:softmax', random_state=42, num_class=len(valid_classes))
xgb.fit(X_train, y_train)

# Make Predictions and Evaluate the Model
y_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"XGBoost Model Accuracy: {accuracy}")

XGBoost Model Accuracy: 0.63435


In [42]:
def run_xgboost(sample_size, num_runs, top_N_features):
    results_list = []

    for run in range(1, num_runs + 1):
        # Assuming cols_with_Y_or_Unknown has been identified and data_500 has been cleaned
        data_reduced_with_Y_or_Unknown = engineered_data[cols_with_Y_or_Unknown].copy()
        data_reduced_with_Y_or_Unknown = data_reduced_with_Y_or_Unknown.drop(columns='PARTY_CODE', errors='ignore')

        # Adding the target column to this data
        data_reduced_with_Y_or_Unknown['INFER_PARTY'] = engineered_data['INFER_PARTY']

        # Remove rows where 'PARTY_CODE' is missing, as it's our target variable
        data_reduced_with_Y_or_Unknown = data_reduced_with_Y_or_Unknown[data_reduced_with_Y_or_Unknown['INFER_PARTY'].notna()]

        # Sample data
        data_sample = data_reduced_with_Y_or_Unknown.sample(n=sample_size, random_state=run)

        # Dynamic Class Handling
        class_counts = data_sample['INFER_PARTY'].value_counts()
        valid_classes = class_counts[class_counts >= 1].index.tolist()
        data_sample = data_sample[data_sample['INFER_PARTY'].isin(valid_classes)]

        # Label-encode 'PARTY_CODE' column
        le = LabelEncoder()
        data_sample['INFER_PARTY'] = le.fit_transform(data_sample['INFER_PARTY'].astype(str))

        # One-Hot Encoding
        data_one_hot = pd.get_dummies(data_sample, columns=cols_with_Y_or_Unknown, drop_first=True)

        # Splitting the Data into Training and Test Sets
        X = data_one_hot.drop('INFER_PARTY', axis=1)
        y = data_one_hot['INFER_PARTY']

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Initialize and Train XGBoost Classifier
        xgb = XGBClassifier(objective='multi:softmax', num_class=len(valid_classes), random_state=42)
        xgb.fit(X_train, y_train)

        # Make Predictions and Evaluate the Model
        y_pred = xgb.predict(X_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')

        # Get top N features
        feature_importances = xgb.feature_importances_
        sorted_idx = feature_importances.argsort()[::-1][:top_N_features]
        top_features = X.columns[sorted_idx].tolist()

        # Append to results list
        results_list.append({
            'Run': run,
            'Accuracy': accuracy,
            'F1_Score': f1,
            'Recall': recall,
            'Top_N_Features': top_features
        })

    # Create a DataFrame from the results list
    results_df = pd.DataFrame(results_list)
    
    return results_df

# Example of how to call this function
result = run_xgboost(sample_size=10000, num_runs=2, top_N_features=10)
result

Unnamed: 0,Run,Accuracy,F1_Score,Recall,Top_N_Features
0,1,0.6145,0.599599,0.6145,"[PRFL_2NDAMEND_Y, DON_POLCONS_Y, PETS_Y, DON_P..."
1,2,0.6055,0.59371,0.6055,"[PRFL_2NDAMEND_Y, DON_POLCONS_Y, DON_POLLIB_Y,..."


In [16]:
def run_xgboost(sample_size, num_runs, top_N_features):
    results_list = []

    for run in range(1, num_runs + 1):
        # Assuming cols_with_Y_or_Unknown has been identified and data_500 has been cleaned
        data_reduced_with_Y_or_Unknown = engineered_data[cols_with_Y_or_Unknown].copy()

        # Adding the target column to this data
        data_reduced_with_Y_or_Unknown['INFER_PARTY'] = engineered_data['INFER_PARTY']

        # Remove rows where 'PARTY_CODE' is missing, as it's our target variable
        data_reduced_with_Y_or_Unknown = data_reduced_with_Y_or_Unknown[data_reduced_with_Y_or_Unknown['INFER_PARTY'].notna()]

        # Sample data
        data_sample = data_reduced_with_Y_or_Unknown.sample(n=sample_size, random_state=run)

        # Dynamic Class Handling
        class_counts = data_sample['INFER_PARTY'].value_counts()
        valid_classes = class_counts[class_counts >= 1].index.tolist()
        
         # Check if we have more than one class, else skip this run
        if len(valid_classes) > 1:
            data_sample = data_sample[data_sample['INFER_PARTY'].isin(valid_classes)]

            # Label-encode 'PARTY_CODE' column
            le = LabelEncoder()
            data_sample['INFER_PARTY'] = le.fit_transform(data_sample['INFER_PARTY'].astype(str))
    
            # One-Hot Encoding
            data_one_hot = pd.get_dummies(data_sample, columns=cols_with_Y_or_Unknown, drop_first=True)
    
            # Splitting the Data into Training and Test Sets
            X = data_one_hot.drop('INFER_PARTY', axis=1)
            y = data_one_hot['INFER_PARTY']
    
            # Train-test split
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y
            )
    
            # Initialize and Train XGBoost Classifier
            xgb = XGBClassifier(objective='multi:softmax', num_class=len(valid_classes), random_state=42)
            xgb.fit(X_train, y_train)
    
            # Make Predictions and Evaluate the Model
            y_pred = xgb.predict(X_test)
    
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted')
            recall = recall_score(y_test, y_pred, average='weighted')
    
            # Get top N features
            feature_importances = xgb.feature_importances_
            sorted_idx = feature_importances.argsort()[::-1][:top_N_features]
            top_features = X.columns[sorted_idx].tolist()
    
            # Append to results list
            results_list.append({
                'Run': run,
                'Accuracy': accuracy,
                'F1_Score': f1,
                'Recall': recall,
                'Top_N_Features': top_features
            })
        else:
            print(f"Skipping run {run} due to insufficient classes.")

    # Create a DataFrame from the results list
    results_df = pd.DataFrame(results_list)
    
    return results_df

# Example of how to call this function
result = run_xgboost(sample_size=10000, num_runs=2, top_N_features=10)
result

Unnamed: 0,Run,Accuracy,F1_Score,Recall,Top_N_Features
0,1,0.6145,0.599599,0.6145,"[PRFL_2NDAMEND_Y, DON_POLCONS_Y, PETS_Y, DON_P..."
1,2,0.6055,0.59371,0.6055,"[PRFL_2NDAMEND_Y, DON_POLCONS_Y, DON_POLLIB_Y,..."


In [17]:
def run_xgboost(sample_size, num_runs, top_N_features):
    results_list = []

    for run in range(1, num_runs + 1):
        # Assuming cols_with_Y_or_Unknown has been identified and data_500 has been cleaned
        data_reduced_with_Y_or_Unknown = data_500[cols_with_Y_or_Unknown].copy()

        # Adding the target column to this data
        data_reduced_with_Y_or_Unknown['PARTY_CODE'] = engineered_data['PARTY_CODE']

        # Remove rows where 'PARTY_CODE' is missing, as it's our target variable
        data_reduced_with_Y_or_Unknown = data_reduced_with_Y_or_Unknown[data_reduced_with_Y_or_Unknown['PARTY_CODE'].notna()]

        # Sample data
        data_sample = data_reduced_with_Y_or_Unknown.sample(n=sample_size, random_state=run)

        # Dynamic Class Handling
        class_counts = data_sample['PARTY_CODE'].value_counts()
        valid_classes = class_counts[class_counts >= 2].index.tolist()
        data_sample = data_sample[data_sample['PARTY_CODE'].isin(valid_classes)]

        # Label-encode 'PARTY_CODE' column
        le = LabelEncoder()
        data_sample['PARTY_CODE'] = le.fit_transform(data_sample['PARTY_CODE'].astype(str))

        # One-Hot Encoding
        data_one_hot = pd.get_dummies(data_sample, columns=cols_with_Y_or_Unknown, drop_first=True)

        # Splitting the Data into Training and Test Sets
        X = data_one_hot.drop('PARTY_CODE', axis=1)
        y = data_one_hot['PARTY_CODE']

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Initialize and Train XGBoost Classifier
        xgb = XGBClassifier(objective='multi:softmax', random_state=42)
        xgb.fit(X_train, y_train)

        # Make Predictions and Evaluate the Model
        y_pred = xgb.predict(X_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')

        # Get top N features
        feature_importances = xgb.feature_importances_
        sorted_idx = feature_importances.argsort()[::-1][:top_N_features]
        top_features = X.columns[sorted_idx].tolist()

        # Append to results list
        results_list.append({
            'Run': run,
            'Accuracy': accuracy,
            'F1_Score': f1,
            'Recall': recall,
            'Top_N_Features': top_features
        })

    # Create a DataFrame from the results list
    results_df = pd.DataFrame(results_list)
    
    return results_df

# Example of how to call this function
result = run_xgboost(sample_size=10000, num_runs=2, top_N_features=10)
result

Unnamed: 0,Run,Accuracy,F1_Score,Recall,Top_N_Features
0,1,0.448,0.422753,0.448,"[PRFL_BLM_SUPPORT_Y, PRFL_GUN_CONTROL_Y, PRFL_..."
1,2,0.426,0.398875,0.426,"[PRFL_BLM_SUPPORT_Y, PRFL_GUN_CONTROL_Y, MAILO..."


In [18]:
print(engineered_data['INFER_PARTY'].nunique())

2


In [19]:
for run in result['Top_N_Features']:
    print(run)
    print("+++++")

['PRFL_BLM_SUPPORT_Y', 'PRFL_GUN_CONTROL_Y', 'PRFL_2NDAMEND_Y', 'RD_SCIFI_Y', 'PRFL_METOO_SUPPORT_Y', 'PRFL_AMZN_PRIME_Y', 'RELIGINSP_Y', 'CH_1617FEM_Y', 'DON_POLLIB_Y', 'CH_1617MAL_Y']
+++++
['PRFL_BLM_SUPPORT_Y', 'PRFL_GUN_CONTROL_Y', 'MAILORDRSP_Y', 'RD_RELIG_Y', 'COL_STAMP_Y', 'PRFL_METOO_SUPPORT_Y', 'PRFL_2NDAMEND_Y', 'MOTORCYCLE_Y', 'CH_1617FEM_Y', 'DON_POLLIB_Y']
+++++


In [20]:
# Assuming result is your DataFrame and 'Top_N_Features' is the column with the lists of top features
all_features = [feature for sublist in result['Top_N_Features'].tolist() for feature in sublist]

# Count the frequency of each feature
feature_counts = Counter(all_features)

# Find the most unique features (those that appear only once across all runs)
most_unique_features = [feature for feature, count in feature_counts.items() if count == 1]

# Find the most common features (those that appear the most across all runs)
most_common_features = [feature for feature, count in feature_counts.most_common()]

print("Most Unique Features:", most_unique_features)
print("Most Common Features:", most_common_features)

Most Unique Features: ['RD_SCIFI_Y', 'PRFL_AMZN_PRIME_Y', 'RELIGINSP_Y', 'CH_1617MAL_Y', 'MAILORDRSP_Y', 'RD_RELIG_Y', 'COL_STAMP_Y', 'MOTORCYCLE_Y']
Most Common Features: ['PRFL_BLM_SUPPORT_Y', 'PRFL_GUN_CONTROL_Y', 'PRFL_2NDAMEND_Y', 'PRFL_METOO_SUPPORT_Y', 'CH_1617FEM_Y', 'DON_POLLIB_Y', 'RD_SCIFI_Y', 'PRFL_AMZN_PRIME_Y', 'RELIGINSP_Y', 'CH_1617MAL_Y', 'MAILORDRSP_Y', 'RD_RELIG_Y', 'COL_STAMP_Y', 'MOTORCYCLE_Y']


## Just the non_numeric_cols with more than 2 values

In [21]:
data_reduced_with_more_than_two_categories = data_500[cols_with_more_than_two_categories].copy()
data_reduced_with_more_than_two_categories.shape

(500000, 49)

In [22]:
# Assuming cols_with_Y_or_Unknown has been identified and data_500 has been cleaned
data_reduced_with_more_than_two_categories = data_500[cols_with_more_than_two_categories].copy()

# Adding the target column to this data
data_reduced_with_more_than_two_categories['PARTY_CODE'] = data_500['PARTY_CODE']

# Remove rows where 'PARTY_CODE' is missing, as it's our target variable
data_reduced_with_more_than_two_categories = data_reduced_with_more_than_two_categories[data_reduced_with_more_than_two_categories['PARTY_CODE'].notna()]

# Sample 100,000 rows from the data
data_sample = data_reduced_with_more_than_two_categories.sample(n=100000, random_state=42)

# Count instances of each class in 'PARTY_CODE' again to filter out classes with fewer than 2 instances
class_counts = data_sample['PARTY_CODE'].value_counts()
valid_classes = class_counts[class_counts >= 2].index.tolist()
data_sample = data_sample[data_sample['PARTY_CODE'].isin(valid_classes)]

# Label-encode 'PARTY_CODE' column
le = LabelEncoder()
data_sample['PARTY_CODE'] = le.fit_transform(data_sample['PARTY_CODE'].astype(str))
print("Columns after label encoding:", data_sample.columns)


# Identify the columns in `cols_with_more_than_two_categories` that are actually present in `data_sample`
# Identify the columns in `cols_with_more_than_two_categories` that are actually present in `data_sample`
cols_to_encode = [col for col in cols_with_more_than_two_categories if col in data_sample.columns and col != 'PARTY_CODE']

# One-Hot Encoding
data_one_hot = pd.get_dummies(data_sample, columns=cols_to_encode, drop_first=True)
#print("Columns after one-hot encoding:", data_one_hot.columns)
#print("Columns before dropping PARTY_CODE:", data_one_hot.columns)

# Splitting the Data into Training and Test Sets
X = data_one_hot.drop('PARTY_CODE', axis=1)
y = data_one_hot['PARTY_CODE']

# Perform train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Sanitize column names to remove characters not allowed by XGBoost
X_train.columns = X_train.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')
X_test.columns = X_test.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')

# Initialize and Train XGBoost Classifier
xgb = XGBClassifier(objective='multi:softmax', random_state=42)
xgb.fit(X_train, y_train)

# Make Predictions and Evaluate the Model
y_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"XGBoost Model Accuracy: {accuracy}")

Columns after label encoding: Index(['ADD_TYPE', 'AI_COUNTY_NAME', 'AIRCOND', 'ASSMLCODE', 'BUS_OWNER',
       'CENSUS_ST', 'CNS_MEDINC', 'CONG_DIST', 'COUNTY_ST', 'COUNTY_TYPE',
       'CRD_RANGE', 'CREDRATE', 'EDUCATION', 'ETHNIC_INFER', 'ETHNICCODE',
       'ETHNICCONF', 'ETHNICGRP', 'FUND_POLIT', 'GENDER_MIX', 'GENERATION',
       'HH_NUMGEN', 'HH_SIZE', 'HOMEMKTVAL', 'HOMEOWNER', 'HOMEOWNRNT',
       'INCOMESTHH', 'LANGUAGE', 'LENGTH_RES', 'LIFESTAGE_CLUSTER', 'NETWORTH',
       'NUMCHILD', 'OCCDETAIL', 'OCCUPATION', 'PARTY_CODE', 'PERSONS_HH',
       'POOL', 'PRFL_TEAPARTY', 'RELIGION', 'SEX', 'ST_LO_HOUS', 'ST_UP_HOUS',
       'STATE', 'STATUS', 'TOD_PRES_DIFF_2016', 'TOD_PRES_DIFF_2016_PREC',
       'TOD_PRES_DIFF_2020_PREC', 'VOTER_CNT', 'VOTER_TRLR', 'YEARBUILT'],
      dtype='object')
XGBoost Model Accuracy: 0.5749


In [23]:
def timer_decorator(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Elapsed time: {elapsed_time:.2f} seconds")
        return result
    return wrapper

In [24]:
@timer_decorator
def run_xgboost(data, cols_with_more_than_two_categories, sample_size=100000, num_runs=1, top_N_features=10):
    results_list = []

    for run in range(1, num_runs + 1):
        print(f"Starting run {run}...")
        # Replicate the original code inside the loop
        data_reduced_with_more_than_two_categories = data[cols_with_more_than_two_categories].copy()
        data_reduced_with_more_than_two_categories['PARTY_CODE'] = data['PARTY_CODE']
        data_reduced_with_more_than_two_categories = data_reduced_with_more_than_two_categories[data_reduced_with_more_than_two_categories['PARTY_CODE'].notna()]
        data_sample = data_reduced_with_more_than_two_categories.sample(n=sample_size, random_state=42)
        
        class_counts = data_sample['PARTY_CODE'].value_counts()
        valid_classes = class_counts[class_counts >= 2].index.tolist()
        data_sample = data_sample[data_sample['PARTY_CODE'].isin(valid_classes)]
        
        le = LabelEncoder()
        data_sample['PARTY_CODE'] = le.fit_transform(data_sample['PARTY_CODE'].astype(str))
        
        cols_to_encode = [col for col in cols_with_more_than_two_categories if col in data_sample.columns and col != 'PARTY_CODE']
        data_one_hot = pd.get_dummies(data_sample, columns=cols_to_encode, drop_first=True)
        
        X = data_one_hot.drop('PARTY_CODE', axis=1)
        y = data_one_hot['PARTY_CODE']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        
        X_train.columns = X_train.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')
        X_test.columns = X_test.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')
        
        xgb = XGBClassifier(objective='multi:softmax', random_state=42)
        xgb.fit(X_train, y_train)

        y_pred = xgb.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')

        feature_importances = xgb.feature_importances_
        sorted_idx = feature_importances.argsort()[::-1][:top_N_features]
        top_features = X.columns[sorted_idx].tolist()

        results_list.append({
            'Run': run,
            'Accuracy': accuracy,
            'F1_Score': f1,
            'Recall': recall,
            'Top_N_Features': top_features
        })

    results_df = pd.DataFrame(results_list)
    
    return results_df

# Assuming data_500 and cols_with_more_than_two_categories are already defined
result = run_xgboost(data_500, cols_with_more_than_two_categories, sample_size=50000, num_runs=2, top_N_features=10)


Starting run 1...
Starting run 2...
Elapsed time: 384.54 seconds


In [None]:
#result['Top_N_Features'][0]
result

In [19]:
from collections import Counter

# Assuming result is your DataFrame and 'Top_N_Features' is the column with the lists of top features
all_features = [feature for sublist in result['Top_N_Features'].tolist() for feature in sublist]

# Count the frequency of each feature
feature_counts = Counter(all_features)

# Find the most unique features (those that appear only once across all runs)
most_unique_features = [feature for feature, count in feature_counts.items() if count == 1]

# Find the most common features (those that appear the most across all runs)
most_common_features = [feature for feature, count in feature_counts.most_common()]

print("Most Unique Features:", most_unique_features)
print("Most Common Features:", most_common_features)

Most Unique Features: []
Most Common Features: ['CENSUS_ST_48', 'PRFL_POLITICAL_IDEOLOGY_L', 'CENSUS_ST_13', 'PRFL_POLITICAL_IDEOLOGY_Unknown', 'TOD_PRES_DIFF_2020_PREC_Unknown']


#### modeling for 'PRFL_POLITICAL_IDEOLOGY'

In [9]:
@timer_decorator
def run_xgboost(data, cols_with_more_than_two_categories, sample_size=100000, num_runs=1, top_N_features=10):
    results_list = []
    
    # Remove duplicates from cols_with_more_than_two_categories
    cols_with_more_than_two_categories = list(set(cols_with_more_than_two_categories))

    for run in range(1, num_runs + 1):
        print(f"Starting run {run}...")
        
        data_reduced_with_more_than_two_categories = data[cols_with_more_than_two_categories].copy()
        
        # Change target variable to 'PRFL_POLITICAL_IDEOLOGY'
        data_reduced_with_more_than_two_categories['PRFL_POLITICAL_IDEOLOGY'] = data['PRFL_POLITICAL_IDEOLOGY']
        
        # Filter out rows where 'PRFL_POLITICAL_IDEOLOGY' is missing
        data_reduced_with_more_than_two_categories = data_reduced_with_more_than_two_categories[data_reduced_with_more_than_two_categories['PRFL_POLITICAL_IDEOLOGY'].notna()]

        data_sample = data_reduced_with_more_than_two_categories.sample(n=sample_size, random_state=42)
        
        class_counts = data_sample['PRFL_POLITICAL_IDEOLOGY'].value_counts()
        valid_classes = class_counts[class_counts >= 2].index.tolist()
        data_sample = data_sample[data_sample['PRFL_POLITICAL_IDEOLOGY'].isin(valid_classes)]
        
        le = LabelEncoder()
        data_sample['PRFL_POLITICAL_IDEOLOGY'] = le.fit_transform(data_sample['PRFL_POLITICAL_IDEOLOGY'].astype(str))
        
        cols_to_encode = [col for col in cols_with_more_than_two_categories if col in data_sample.columns and col != 'PRFL_POLITICAL_IDEOLOGY']
        data_one_hot = pd.get_dummies(data_sample, columns=cols_to_encode, drop_first=True)
        
        # Check for duplicate columns before training
        duplicate_columns = data_one_hot.columns[data_one_hot.columns.duplicated()]
        if len(duplicate_columns) > 0:
            print(f"Warning: Duplicate columns found: {duplicate_columns}")

        # Check for invalid data types
        invalid_dtypes = data_one_hot.select_dtypes(exclude=['int', 'float', 'bool', 'category']).columns
        if len(invalid_dtypes) > 0:
            print(f"Warning: Invalid data types found: {invalid_dtypes}")

        # Drop 'PRFL_POLITICAL_IDEOLOGY' to prepare data for training
        X = data_one_hot.drop('PRFL_POLITICAL_IDEOLOGY', axis=1)
        y = data_one_hot['PRFL_POLITICAL_IDEOLOGY']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        
        X_train.columns = X_train.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')
        X_test.columns = X_test.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')
        
        xgb = XGBClassifier(objective='multi:softmax', random_state=42)
        xgb.fit(X_train, y_train)

        y_pred = xgb.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')

        feature_importances = xgb.feature_importances_
        sorted_idx = feature_importances.argsort()[::-1][:top_N_features]
        top_features = X.columns[sorted_idx].tolist()

        results_list.append({
            'Run': run,
            'Accuracy': accuracy,
            'F1_Score': f1,
            'Recall': recall,
            'Top_N_Features': top_features
        })

    results_df = pd.DataFrame(results_list)
    
    return results_df

# Assuming data_500 and cols_with_more_than_two_categories are already defined
# Include 'PARTY_CODE' in the cols_with_more_than_two_categories list
if 'PARTY_CODE' not in cols_with_more_than_two_categories:
    cols_with_more_than_two_categories.append('PARTY_CODE')

result = run_xgboost(data_500, cols_with_more_than_two_categories, sample_size=5000, num_runs=2, top_N_features=10)
result

Starting run 1...
Starting run 2...
Elapsed time: 79.81 seconds


Unnamed: 0,Run,Accuracy,F1_Score,Recall,Top_N_Features
0,1,0.9364,0.932831,0.9364,"[PARTY_CODE_S, PARTY_CODE_U, PARTY_CODE_N, PAR..."
1,2,0.9364,0.932831,0.9364,"[PARTY_CODE_S, PARTY_CODE_U, PARTY_CODE_N, PAR..."


## Model training on all features

In [None]:
@timer_decorator
def run_xgboost(data, cols_with_more_than_two_categories, sample_size=100000, num_runs=1, top_N_features=10):
    results_list = []
    
    # Remove duplicates from cols_with_more_than_two_categories
    cols_with_more_than_two_categories = list(set(cols_with_more_than_two_categories))

    for run in range(1, num_runs + 1):
        print(f"Starting run {run}...")
        
        data_reduced_with_more_than_two_categories = data[cols_with_more_than_two_categories].copy()
        
        # Change target variable to 'PRFL_POLITICAL_IDEOLOGY'
        data_reduced_with_more_than_two_categories['PRFL_POLITICAL_IDEOLOGY'] = data['PRFL_POLITICAL_IDEOLOGY']
        
        # Filter out rows where 'PRFL_POLITICAL_IDEOLOGY' is missing
        data_reduced_with_more_than_two_categories = data_reduced_with_more_than_two_categories[data_reduced_with_more_than_two_categories['PRFL_POLITICAL_IDEOLOGY'].notna()]

        data_sample = data_reduced_with_more_than_two_categories.sample(n=sample_size, random_state=42)
        
        class_counts = data_sample['PRFL_POLITICAL_IDEOLOGY'].value_counts()
        valid_classes = class_counts[class_counts >= 2].index.tolist()
        data_sample = data_sample[data_sample['PRFL_POLITICAL_IDEOLOGY'].isin(valid_classes)]
        
        le = LabelEncoder()
        data_sample['PRFL_POLITICAL_IDEOLOGY'] = le.fit_transform(data_sample['PRFL_POLITICAL_IDEOLOGY'].astype(str))
        
        cols_to_encode = [col for col in cols_with_more_than_two_categories if col in data_sample.columns and col != 'PRFL_POLITICAL_IDEOLOGY']
        data_one_hot = pd.get_dummies(data_sample, columns=cols_to_encode, drop_first=True)
        
        # Check for duplicate columns before training
        duplicate_columns = data_one_hot.columns[data_one_hot.columns.duplicated()]
        if len(duplicate_columns) > 0:
            print(f"Warning: Duplicate columns found: {duplicate_columns}")

        # Check for invalid data types
        invalid_dtypes = data_one_hot.select_dtypes(exclude=['int', 'float', 'bool', 'category']).columns
        if len(invalid_dtypes) > 0:
            print(f"Warning: Invalid data types found: {invalid_dtypes}")

        # Drop 'PRFL_POLITICAL_IDEOLOGY' to prepare data for training
        X = data_one_hot.drop('PRFL_POLITICAL_IDEOLOGY', axis=1)
        y = data_one_hot['PRFL_POLITICAL_IDEOLOGY']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        
        X_train.columns = X_train.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')
        X_test.columns = X_test.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')
        
        xgb = XGBClassifier(objective='multi:softmax', random_state=42)
        xgb.fit(X_train, y_train)

        y_pred = xgb.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')

        feature_importances = xgb.feature_importances_
        sorted_idx = feature_importances.argsort()[::-1][:top_N_features]
        top_features = X.columns[sorted_idx].tolist()

        results_list.append({
            'Run': run,
            'Accuracy': accuracy,
            'F1_Score': f1,
            'Recall': recall,
            'Top_N_Features': top_features
        })

    results_df = pd.DataFrame(results_list)
    
    return results_df

# Assuming data_500 and cols_with_more_than_two_categories are already defined
# Include 'PARTY_CODE' in the cols_with_more_than_two_categories list
if 'PARTY_CODE' not in cols_with_more_than_two_categories:
    cols_with_more_than_two_categories.append('PARTY_CODE')

result = run_xgboost(data_500, cols_with_more_than_two_categories, sample_size=50000, num_runs=2, top_N_features=10)
result

# All Columns - Dealing with Imbalanced Data

### 'PARTY_CODE' as target variable

In [27]:
def preprocess_data(engineered_data, sample_size=10000):
    # Copy the data
    data_with_all_columns = engineered_data.copy()
    
    # Remove rows where the target column 'PARTY_CODE' is NaN
    data_with_all_columns = data_with_all_columns[data_with_all_columns['PARTY_CODE'].notna()]
    
    # Sample the data
    data_sample = data_with_all_columns.sample(n=sample_size, random_state=42)
    
    # Keep only the classes that have at least 2 samples
    class_counts = data_sample['PARTY_CODE'].value_counts()
    valid_classes = class_counts[class_counts >= 2].index.tolist()
    data_sample = data_sample[data_sample['PARTY_CODE'].isin(valid_classes)]
    
    # Label encode the target variable
    le = LabelEncoder()
    data_sample['PARTY_CODE'] = le.fit_transform(data_sample['PARTY_CODE'].astype(str))
    
    # Identify numeric and non-numeric columns
    numeric_cols = data_sample.select_dtypes(include=['int64', 'float64']).columns.difference(['PARTY_CODE'])
    non_numeric_cols = data_sample.select_dtypes(exclude=['int64', 'float64']).columns.difference(['PARTY_CODE'])
    
    # One-hot encode non-numeric columns
    data_one_hot = pd.get_dummies(data_sample, columns=non_numeric_cols, drop_first=True)
    
    # Split into features and labels
    #X = data_one_hot.drop('PARTY_CODE', axis=1)
    X = data_one_hot.drop(columns=['PARTY_CODE', 'INFER_PARTY'])
    y = data_one_hot['PARTY_CODE']
    
    # Handle special characters in column names that XGBoost doesn't like
    X.columns = X.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    return X_train, X_test, y_train, y_test

# Usage
# Assuming engineered_data is already defined
X_train, X_test, y_train, y_test = preprocess_data(engineered_data, sample_size=10000)

KeyError: "['INFER_PARTY'] not found in axis"

### 'INFER_PARTY' as target variable

In [None]:
def preprocess_data(engineered_data, sample_size=10000):
    # Copy the data
    data_with_all_columns = engineered_data.copy()
    
    # Remove rows where the target column 'INFER_PARTY' is NaN
    data_with_all_columns = data_with_all_columns[data_with_all_columns['INFER_PARTY'].notna()]
    
    # Sample the data
    data_sample = data_with_all_columns.sample(n=sample_size, random_state=42)
    
    # Keep only the classes that have at least 2 samples
    class_counts = data_sample['INFER_PARTY'].value_counts()
    valid_classes = class_counts[class_counts >= 2].index.tolist()
    data_sample = data_sample[data_sample['INFER_PARTY'].isin(valid_classes)]
    
    # Label encode the target variable
    le = LabelEncoder()
    data_sample['INFER_PARTY'] = le.fit_transform(data_sample['INFER_PARTY'].astype(str))
    
    # Identify numeric and non-numeric columns
    numeric_cols = data_sample.select_dtypes(include=['int64', 'float64']).columns.difference(['INFER_PARTY'])
    non_numeric_cols = data_sample.select_dtypes(exclude=['int64', 'float64']).columns.difference(['INFER_PARTY'])
    
    # One-hot encode non-numeric columns
    data_one_hot = pd.get_dummies(data_sample, columns=non_numeric_cols, drop_first=True)
    
    # Split into features and labels
    X = data_one_hot.drop('INFER_PARTY', axis=1)
    y = data_one_hot['INFER_PARTY']
    
    # Handle special characters in column names that XGBoost doesn't like
    X.columns = X.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    return X_train, X_test, y_train, y_test

# Usage
# Assuming engineered_data is already defined
X_train, X_test, y_train, y_test = preprocess_data(engineered_data, sample_size=10000)


#### Resampling Techniques

In [13]:
from imblearn.over_sampling import SMOTE

# Impute missing values with mean
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Determine the smallest class count
min_class_count = np.min(np.bincount(y_train))

# Apply SMOTE to the imputed training data
# Set n_neighbors to min_class_count - 1 or a default small number
n_neighbors = min(min_class_count - 1, 5)
smote = SMOTE(random_state=42, k_neighbors=n_neighbors)
X_res, y_res = smote.fit_resample(X_train_imputed, y_train)

# Train the XGBoost model
xgb = XGBClassifier(objective='multi:softmax', random_state=42)
xgb.fit(X_res, y_res)

# Make predictions on the imputed test set
y_pred = xgb.predict(X_test_imputed)

# Calculate the metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.925
Recall: 0.925
F1 Score: 0.9128656777493607


In [15]:
from imblearn.under_sampling import RandomUnderSampler

# Impute missing values with mean
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Apply RandomUnderSampler to the imputed training data
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train_imputed, y_train)

# Train the XGBoost model
xgb = XGBClassifier(objective='multi:softmax', random_state=42)
xgb.fit(X_res, y_res)

# Make predictions on the imputed test set
y_pred = xgb.predict(X_test_imputed)

# Calculate the metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.413
Recall: 0.413
F1 Score: 0.484446515639784


#### Weighted Loss Function

In [17]:
# Count the occurrences of each class in the target variable
counter = Counter(y_train)

# Calculate the number of samples
total_samples = len(y_train)

# Compute class weights
class_weights = {cls: float(total_samples / count) for cls, count in counter.items()}

# Map the weights to each sample in y_train
sample_weights = [class_weights[cls] for cls in y_train]

# Initialize XGBoost with multi:softmax objective
xgb = XGBClassifier(objective='multi:softmax', random_state=42)

# Fit the model, passing in the sample weights
xgb.fit(X_train, y_train, sample_weight=sample_weights)

# Make predictions
y_pred = xgb.predict(X_test)

# Calculate the metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.9145
Recall: 0.9145
F1 Score: 0.9094697333354964


#### Cost-sensitive Learning

In [24]:
def create_cost_matrix(y):
    # Count the frequency of each class in the target variable
    class_freq = Counter(y)
    
    # Calculate the inverse frequency
    inv_freq = {k: 1.0 / v for k, v in class_freq.items()}
    
    # Create the cost matrix
    cost_matrix = {}
    for class1 in class_freq.keys():
        cost_matrix[class1] = {}
        for class2 in class_freq.keys():
            if class1 == class2:
                continue
            cost_matrix[class1][class2] = inv_freq[class1] + inv_freq[class2]
            
    return cost_matrix

# Assuming y_train contains your training labels, and it's a pandas Series
# For demonstration, I'm using the value_counts information you provided
y_train_demo = ['D'] * 110365 + ['R'] * 85218 + ['N'] * 75028 + ['O'] * 8121 + ['L'] * 1621 + ['  '] * 45
cost_matrix = create_cost_matrix(y_train_demo)

print(cost_matrix)

{'D': {'R': 2.0795453623443604e-05, 'N': 2.2389200977768253e-05, 'O': 0.00013219838820189577, 'L': 0.0006259639897705816, '  ': 0.02223128306578676}, 'R': {'D': 2.0795453623443604e-05, 'N': 2.5062967472140136e-05, 'O': 0.00013487215469626767, 'L': 0.0006286377562649534, '  ': 0.022233956832281132}, 'N': {'D': 2.2389200977768253e-05, 'R': 2.5062967472140136e-05, 'O': 0.00013646590205059232, 'L': 0.0006302315036192781, '  ': 0.022235550579635454}, 'O': {'D': 0.00013219838820189577, 'R': 0.00013487215469626767, 'N': 0.00013646590205059232, 'L': 0.0007400406908434056, '  ': 0.02234535976685958}, 'L': {'D': 0.0006259639897705816, 'R': 0.0006286377562649534, 'N': 0.0006302315036192781, 'O': 0.0007400406908434056, '  ': 0.02283912536842827}, '  ': {'D': 0.02223128306578676, 'R': 0.022233956832281132, 'N': 0.022235550579635454, 'O': 0.02234535976685958, 'L': 0.02283912536842827}}


In [28]:
def compute_custom_sample_weight(cost_matrix, y):
    sample_weight = np.zeros(len(y))
    
    for i, class1 in enumerate(y):
        for class2, cost in cost_matrix[class1].items():
            sample_weight[i] += (y == class2).sum() * cost
            
    return sample_weight

# Assuming y_train is your actual training labels and it's a NumPy array
# For demonstration, converting y_train_demo to a NumPy array
#y_train_demo_np = np.array(y_train_demo)
# This should be your actual y_train, converted to a NumPy array
y_train_np = np.array(y_train) 

# Compute sample weights
sample_weight = compute_custom_sample_weight(cost_matrix, y_train_np)

# Fit the model
xgb = XGBClassifier(objective='multi:softmax', random_state=42)
xgb.fit(X_train, y_train, sample_weight=sample_weight)

# Rest of the code for predictions and metrics...



KeyError: 5

In [22]:
print(engineered_data['PARTY_CODE'].value_counts())

PARTY_CODE
D    110365
R     85218
N     75028
O      8121
L      1621
         45
Name: count, dtype: int64
