In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight
from scipy.optimize import linear_sum_assignment
import ipywidgets as widgets

## 500K dataset

In [19]:
data_500 = pd.read_csv("/Users/nirugidla/Documents/GitHub/milestone2_MADS/data_500k.csv", low_memory=False)
data_500

Unnamed: 0,RECORD_ID,ADD_TYPE,AFAMPROFLS,AGE,AI_COUNTY_NAME,AIRCOND,APP_CHILD,APP_MENBIG,APP_TODDLR,APP_WOMEN,...,VTR_PRI16,VTR_PRI17,VTR_PRI18,VTR_PRI19,VTR_PRI20,VTR_PRI21,VTR_PRI22,WORKWOMAN,YEARBUILT,ZIP
0,403390,S,,21.0,Fairbanks North Star,,,,,,...,,,,,,,,,,99705
1,62285,H,,,Anchorage,,,,,,...,,,,,,,,,,99506
2,331355,,,91.0,Kenai Peninsula,,,,,,...,,,,,,,,,,99603
3,206320,H,,65.0,Anchorage,,,,,,...,,,,,,,Y,,,99567
4,188078,S,,76.0,Juneau,,,,,,...,Y,,Y,,,,Y,Y,1985,99801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,349635,H,,20.0,BIBB,,,,,,...,,,,,,,,,,31204
499996,420654,S,,50.0,COWETA,A,,,,,...,,,,,,,,,2003,30263
499997,131262,S,,19.0,ROCKDALE,,,,,,...,,,,,,,,,,30013
499998,315673,H,,21.0,BARROW,,,,,,...,,,,,,,,,,30680


In [20]:
remove_columns = [
    'PRFL_LGBT_SUPPORT',
    'PRFL_LIBERAL_NEWS',
    'PRFL_MARIJUANA_REFORM',
    'PRFL_BIDEN_SUPPORT',
    'PRFL_BORDER_SECURITY',
    'PRFL_CONSERVATIVE_NEWS',
    'PRFL_IMMIGRATION_REFORM',
    'PRFL_OBAMA',
    'PRFL_PERSUADABLE_VOTER',
    'PRFL_POLITICAL_IDEOLOGY',
    'PRFL_SANDERS_SUPPORT',
    'PRFL_TRUMP_SUPPORT',
    'ZIP',
    
    'VTR_GEN00', 'VTR_GEN01', 'VTR_GEN02', 'VTR_GEN03', 'VTR_GEN04', 'VTR_GEN05', 'VTR_GEN06', 'VTR_GEN07', 'VTR_GEN08', 'VTR_GEN09', 'VTR_GEN10', 'VTR_GEN11', 'VTR_GEN12', 'VTR_GEN13', 'VTR_GEN14', 'VTR_GEN15', 'VTR_GEN16', 'VTR_GEN17', 'VTR_GEN18', 'VTR_GEN19', 'VTR_GEN20', 'VTR_GEN21', 'VTR_GEN22', 'VTR_OTH00', 'VTR_OTH01', 'VTR_OTH02', 'VTR_OTH03', 'VTR_OTH04', 'VTR_OTH05', 'VTR_OTH06', 'VTR_OTH07', 'VTR_OTH08', 'VTR_OTH09', 'VTR_OTH10', 'VTR_OTH11', 'VTR_OTH12', 'VTR_OTH13', 'VTR_OTH14', 'VTR_OTH15', 'VTR_OTH16', 'VTR_OTH17', 'VTR_OTH18', 'VTR_OTH19', 'VTR_OTH20', 'VTR_OTH21', 'VTR_OTH22', 'VTR_PPP00', 'VTR_PPP04', 'VTR_PPP08', 'VTR_PPP12', 'VTR_PPP16', 'VTR_PPP20', 'VTR_PRI00', 'VTR_PRI01', 'VTR_PRI02', 'VTR_PRI03', 'VTR_PRI04', 'VTR_PRI05', 'VTR_PRI06', 'VTR_PRI07', 'VTR_PRI08', 'VTR_PRI09', 'VTR_PRI10', 'VTR_PRI11', 'VTR_PRI12', 'VTR_PRI13', 'VTR_PRI14', 'VTR_PRI15', 'VTR_PRI16', 'VTR_PRI17', 'VTR_PRI18', 'VTR_PRI19', 'VTR_PRI20', 'VTR_PRI21', 'VTR_PRI22',
    
        
      'PRFL_CHOICELIFE', 'TOD_PRES_D_2016_PREC', 'TOD_PRES_O_2016',
    'TOD_PRES_R_2016', 'TOD_PRES_R_2016_PREC', 'TOD_PRES_R_2020_PREC', 'VP_PPP',
    'AGE', 'CNSUS_PCTW',
    
    'PARTY_MIX', 'PRFL_MINWAGE', 'PRFL_FENCE_SITTER', 
]
# Drop the list of columns from the dataset
data_500.drop(columns=remove_columns, errors='ignore', inplace=True)


In [21]:
# Load the dataset again
#data = pd.read_csv('data/surveydata.csv')

corrected_data = data_500.copy()

# Update the 'I' values in PARTY_CODE to 'N' for rows where STATE is 'SD'
corrected_data.loc[(corrected_data['STATE'] == 'SD') & (corrected_data['PARTY_CODE'] == 'I'), 'PARTY_CODE'] = 'N'

# Update the 'U' values in PARTY_CODE to 'N' for rows where STATE is in ['DC', 'LA', 'RI']
states_to_update = ['DC', 'LA', 'RI']
corrected_data.loc[(corrected_data['STATE'].isin(states_to_update)) & (corrected_data['PARTY_CODE'] == 'U'), 'PARTY_CODE'] = 'N'

# Update the 'I' values in PARTY_CODE to 'N' for rows where STATE is 'OK'
corrected_data.loc[(corrected_data['STATE'] == 'OK') & (corrected_data['PARTY_CODE'] == 'I'), 'PARTY_CODE'] = 'N'


#categorical_columns
# Create a mapping dictionary for PARTY_CODE to INFER_PARTY values
party_mapping = {
    'D': 'D',
    'E': 'D',
    'R': 'R',
    'S': 'R',
    'N': float('nan'),
    'U': float('nan'),
    'A': float('nan'),
    'B': float('nan'),
    'C': float('nan'),
    'F': float('nan'),
    'G': float('nan'),
    'H': float('nan'),
    'I': float('nan'),
    'J': float('nan'),
    'K': float('nan'),
    'L': float('nan'),
    'P': float('nan'),
    'Q': float('nan'),
    'T': float('nan'),
    'V': float('nan'),
    'W': float('nan'),
    'Y': float('nan'),
    'Z': float('nan'),
    'O': float('nan'),
}

# Create the INFER_PARTY column using the mapping
corrected_data['INFER_PARTY'] = corrected_data['PARTY_CODE'].map(party_mapping)

print(corrected_data['PARTY_CODE'].unique())


# Display the unique values in the INFER_PARTY column to ensure correctness
unique_infer_party = corrected_data['INFER_PARTY'].unique()

print('unique_infer_party:')

print(unique_infer_party)

# Define the mapping for PARTY_CODE modifications
party_code_mapping = {
    'E': float('nan'),
    'S': float('nan'),
    'U': float('nan'),
    'A': 'O',
    'B': 'O',
    'C': 'O',
    'F': 'O',
    'G': 'O',
    'H': 'O',
    'I': 'O',
    'J': 'O',
    'K': 'O',
    'L': 'L',
    'P': 'O',
    'Q': 'O',
    'T': 'O',
    'V': 'O',
    'W': 'O',
    'Y': 'O',
    'Z': 'O'
}

# Apply the mapping to the PARTY_CODE column
corrected_data['PARTY_CODE'] = corrected_data['PARTY_CODE'].replace(party_code_mapping)

# Check the unique values of PARTY_CODE after the modifications
unique_party_code_after_modifications = corrected_data['PARTY_CODE'].unique()

print("unique_party_code_after_modifications:")

print(unique_party_code_after_modifications)

['N' 'R' 'O' 'D' 'A' 'F' 'P' 'G' 'L' 'U' 'W' 'B' 'I' 'Y' 'V' 'H' ' ' 'S'
 'E' 'Q' 'Z']
unique_infer_party:
[nan 'R' 'D']
unique_party_code_after_modifications:
['N' 'R' 'O' 'D' 'L' nan ' ']


In [22]:
data_500['PARTY_CODE'].unique()

array(['N', 'R', 'O', 'D', 'A', 'F', 'P', 'G', 'L', 'U', 'W', 'B', 'I',
       'Y', 'V', 'H', ' ', 'S', 'E', 'Q', 'Z'], dtype=object)

In [23]:
engineered_data = corrected_data.copy()
voter_columns = [col for col in engineered_data.columns if col.startswith("VTR")]
#Create column with total number of votes in voter_columns per row
engineered_data['VTR_TOTAL_VOTES'] = engineered_data[voter_columns].notnull().sum(axis=1)
#Sum Democrat and Republican totals
engineered_data['VTR_TOTAL_DVOTES'] = engineered_data[voter_columns].isin(['D', 'M', 'Z']).sum(axis=1)
engineered_data['VTR_TOTAL_RVOTES'] = engineered_data[voter_columns].isin(['R', 'P', 'X']).sum(axis=1)
#Create conditions that evaluate whether more votes have been cast for Democrats or Republicans, and assign 'D' and 'R' to new column based on conditions
conditions = [
    engineered_data['VTR_TOTAL_DVOTES'] > engineered_data['VTR_TOTAL_RVOTES'],
    engineered_data['VTR_TOTAL_DVOTES'] < engineered_data['VTR_TOTAL_RVOTES']
]
choices = ['D', 'R']
engineered_data['VTR_INFER_PARTY'] = np.select(conditions, choices, default=np.nan)
#Create conditions that evaluate whether a voter is a swing voter or not
conditions_swing = [
    (engineered_data['VTR_TOTAL_DVOTES'] > 2) & (engineered_data['VTR_TOTAL_RVOTES'] > 2),
    ((engineered_data['VTR_TOTAL_DVOTES'] > 1) & (engineered_data['VTR_TOTAL_RVOTES'] == 0)) | ((engineered_data['VTR_TOTAL_RVOTES'] > 1) & (engineered_data['VTR_TOTAL_DVOTES'] == 0))
]
choices_swing = ['Y', 'N']
engineered_data['VTR_INFER_SWING'] = np.select(conditions_swing, choices_swing, default=np.nan)
#sampledf = engineered_data[['PARTY_CODE','INFER_PARTY','VTR_TOTAL_DVOTES','VTR_TOTAL_RVOTES','VTR_INFER_PARTY','VTR_INFER_SWING']]
#print(sampledf.head(50))
#Add values to INFER_PARTY and correct any other INFER_PARTY values that don't meet the conditions above:
print(sum(engineered_data['INFER_PARTY'].isna())) #291 NaNs for INFER_PARTY before
engineered_data.loc[(engineered_data['VTR_INFER_PARTY'] == 'D') & (engineered_data['VTR_INFER_SWING'] == 'N'), 'INFER_PARTY'] = 'D'
engineered_data.loc[(engineered_data['VTR_INFER_PARTY'] == 'R') & (engineered_data['VTR_INFER_SWING'] == 'N'), 'INFER_PARTY'] = 'R'
engineered_data.loc[(engineered_data['VTR_INFER_PARTY'] == 'D') & (engineered_data['VTR_INFER_SWING'] == 'Y'), 'INFER_PARTY'] =  float('nan')
engineered_data.loc[(engineered_data['VTR_INFER_PARTY'] == 'R') & (engineered_data['VTR_INFER_SWING'] == 'Y'), 'INFER_PARTY'] =  float('nan')
#Also delete any 'D' or 'R' INFER_PARTY labels for anyone deemed a "swing voter" based on criteria above of voting for both parties at least 3 times each:
engineered_data.loc[(engineered_data['VTR_INFER_SWING'] == 'Y') , 'INFER_PARTY'] = float('nan')
print(sum(engineered_data['INFER_PARTY'].isna())) #291 NaNs for INFER_PARTY before
#Drop auxiliary columns used for math, keeping 'VTR_TOTAL_VOTES', 'VTR_INFER_SWING', and the updated 'INFER_PARTY' columns as features:
engineered_data = engineered_data.drop(columns=['VTR_TOTAL_DVOTES','VTR_TOTAL_RVOTES','VTR_INFER_PARTY'])
# Drop the list of columns from the dataset
#engineered_data_cleaned = engineered_data.drop(columns=remove_columns, errors='ignore')

features_to_remove = [
    'PRFL_CHOICELIFE', 'TOD_PRES_D_2016_PREC', 'TOD_PRES_O_2016',
    'TOD_PRES_R_2016', 'TOD_PRES_R_2016_PREC', 'TOD_PRES_R_2020_PREC', 'VP_PPP',
    'AGE', 'CNSUS_PCTW'
]
# Assuming 'engineered_data' is your DataFrame, remove the less important features
engineered_data = engineered_data.drop(columns=features_to_remove, errors='ignore')

154352
154352


## Data Cleaning

In [24]:
# Data Loading
#data_500 = pd.read_csv("/Users/nirugidla/Documents/GitHub/milestone2_MADS/data_500k.csv", low_memory=False)

# Data Cleaning - Column Names
data_500.columns = data_500.columns.str.strip()

# Data Cleaning - Drop ZIP
#data_500.drop('ZIP', axis=1, inplace=True)
# Drop the list of columns from the dataset
data_500_cleaned = data_500.drop(columns=remove_columns, errors='ignore')

# Data Cleaning - Drop Duplicates
data_500.drop_duplicates(inplace=True)

# Data Cleaning - Object Columns
for col in data_500.columns:
    if data_500[col].dtype == 'object':
        data_500[col] = data_500[col].str.strip()

# Data Cleaning - Empty Strings
data_500.replace('', 'Unknown', inplace=True)

# Data Cleaning - NaN for Object Types
data_500.loc[:, data_500.dtypes == 'object'] = data_500.loc[:, data_500.dtypes == 'object'].fillna('Unknown')

# Data Cleaning - Drop Columns and Rows with All NaNs
data_500.dropna(axis=1, how='all', inplace=True)
data_500.dropna(axis=0, how='all', inplace=True)



In [25]:
# Identify numeric and non-numeric columns
numeric_cols = data_500.select_dtypes(include=['int64', 'float64']).columns
non_numeric_cols = data_500.select_dtypes(exclude=['int64', 'float64']).columns

# Data Cleaning - Removing Non-Numeric Columns with More Than 90% Missing Data
missing_data_percentage = data_500.isnull().mean() * 100
non_numeric_cols_to_remove = missing_data_percentage[non_numeric_cols]
non_numeric_cols_to_remove = non_numeric_cols_to_remove[non_numeric_cols_to_remove > 90].index.tolist()
data_500_reduced = data_500.drop(columns=non_numeric_cols_to_remove)

# Update the list of non-numeric columns after removal
non_numeric_cols = data_500_reduced.select_dtypes(exclude=['int64', 'float64']).columns

# Identifying Specific Types of Non-Numeric Columns
cols_with_Y_or_Unknown = [col for col in non_numeric_cols if set(data_500_reduced[col].unique()) <= {'Y', 'Unknown'}]
cols_with_more_than_two_categories = [col for col in non_numeric_cols if len(data_500_reduced[col].unique()) > 2]

# Print identified columns
print("Columns with 'Y' or 'Unknown':", cols_with_Y_or_Unknown, len(cols_with_Y_or_Unknown))
print("Columns with more than two categories:", cols_with_more_than_two_categories, len(cols_with_more_than_two_categories))

Columns with 'Y' or 'Unknown': ['AFAMPROFLS', 'APP_CHILD', 'APP_MENBIG', 'APP_TODDLR', 'APP_WOMEN', 'APP_WOMPET', 'APP_WOMPLS', 'APP_YNGMEN', 'ARTS', 'AUTOACCES', 'AUTOWORK', 'BOATING', 'BROADERLIV', 'CARDUSER', 'CATOWNER', 'CH_0002FEM', 'CH_0002MAL', 'CH_0002UNK', 'CH_0305FEM', 'CH_0305MAL', 'CH_0305UNK', 'CH_0610FEM', 'CH_0610MAL', 'CH_0610UNK', 'CH_1115FEM', 'CH_1115MAL', 'CH_1115UNK', 'CH_1617FEM', 'CH_1617MAL', 'CH_1617UNK', 'CHRISTFAM', 'COL_ANTIQ', 'COL_ARTS', 'COL_COIN', 'COL_SPORT', 'COL_STAMP', 'COMPHOMOFC', 'COMPUTERS', 'COOK_GEN', 'CURRAFFAIR', 'DEPTSTCRD', 'DIETING', 'DIYLIV', 'DOGOWNER', 'DON_ANML', 'DON_ARTCUL', 'DON_CHARIT', 'DON_CHILD', 'DON_ENVIR', 'DON_ENVWLD', 'DON_HEALTH', 'DON_INTAID', 'DON_OTHER', 'DON_POLCONS', 'DON_POLIT', 'DON_POLLIB', 'DON_RELIG', 'DON_VET', 'DONATION', 'EDU_ONLINE', 'EQUESTRIAN', 'EXER_GROUP', 'GAMING', 'GARDENER', 'GOLF', 'GRANDCHLD', 'HEALTHBEAU', 'HEATHMED', 'HH_SENIOR', 'HH_VETERAN', 'HH_YOUNGAD', 'HIGHBROW', 'HIGHENDAPP', 'HISTMIL', 'HI

In [26]:
def timer_decorator(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Elapsed time: {elapsed_time:.2f} seconds")
        return result
    return wrapper

In [27]:
print(engineered_data['PARTY_CODE'].value_counts())

PARTY_CODE
D    110365
R     85218
N     75028
O      8121
L      1621
         45
Name: count, dtype: int64


In [28]:
# Remove or replace records where 'PARTY_CODE' is " " (white spaces)
engineered_data = engineered_data[engineered_data['PARTY_CODE'] != " "]

# Class distribution for 'PARTY_CODE'
print("Original Class Distribution for PARTY_CODE:", Counter(engineered_data['PARTY_CODE']))

# Features and Target for 'PARTY_CODE'
X_party = engineered_data.drop(columns=['PARTY_CODE', 'INFER_PARTY'])
y_party = engineered_data['PARTY_CODE']

# Train-Test Split for 'PARTY_CODE'
X_train_party, X_test_party, y_train_party, y_test_party = train_test_split(
    X_party, y_party, test_size=0.2, random_state=42
)

# Model Pipeline for 'PARTY_CODE'
pipeline_party = Pipeline([
    ('scaler', StandardScaler()),
    ('xgboost', XGBClassifier())
])

# Cross-Validation for 'PARTY_CODE'
scores_party = cross_val_score(pipeline_party, X_train_party, y_train_party, cv=5)
print("Cross-Validation Scores for PARTY_CODE:", scores_party)

# Fit the model for 'PARTY_CODE'
pipeline_party.fit(X_train_party, y_train_party)

Original Class Distribution for PARTY_CODE: Counter({'D': 110365, 'R': 85218, nan: 78296, 'N': 75028, nan: 71769, nan: 69537, 'O': 8121, 'L': 1621})


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/pipeline.py", line 416, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/pipeline.py", line 370, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/joblib/memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/pipeline.py", line 950, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/utils/_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/base.py", line 918, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/preprocessing/_data.py", line 837, in fit
    return self.partial_fit(X, y, sample_weight)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/preprocessing/_data.py", line 873, in partial_fit
    X = self._validate_data(
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/base.py", line 604, in _validate_data
    out = check_array(X, input_name="X", **check_params)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/utils/validation.py", line 917, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/utils/_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/pandas/core/generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'S'

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/pipeline.py", line 416, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/pipeline.py", line 370, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/joblib/memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/pipeline.py", line 950, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/utils/_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/base.py", line 918, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/preprocessing/_data.py", line 837, in fit
    return self.partial_fit(X, y, sample_weight)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/preprocessing/_data.py", line 873, in partial_fit
    X = self._validate_data(
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/base.py", line 604, in _validate_data
    out = check_array(X, input_name="X", **check_params)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/utils/validation.py", line 917, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/sklearn/utils/_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/Users/nirugidla/Library/Python/3.10/lib/python/site-packages/pandas/core/generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'H'


In [None]:
y_party.value_counts()

# All Columns - Dealing with Imbalanced Data

### 'PARTY_CODE' as target variable

In [9]:
def preprocess_data(engineered_data, sample_size=10000):
    # Copy the data
    data_with_all_columns = engineered_data.copy()
    
    # Remove rows where the target column 'PARTY_CODE' is NaN
    data_with_all_columns = data_with_all_columns[data_with_all_columns['PARTY_CODE'].notna()]
    
    # Sample the data
    data_sample = data_with_all_columns.sample(n=sample_size, random_state=42)
    
    # Keep only the classes that have at least 2 samples
    class_counts = data_sample['PARTY_CODE'].value_counts()
    valid_classes = class_counts[class_counts >= 2].index.tolist()
    data_sample = data_sample[data_sample['PARTY_CODE'].isin(valid_classes)]
    
    # Label encode the target variable
    le = LabelEncoder()
    data_sample['PARTY_CODE'] = le.fit_transform(data_sample['PARTY_CODE'].astype(str))
    
    # Identify numeric and non-numeric columns
    numeric_cols = data_sample.select_dtypes(include=['int64', 'float64']).columns.difference(['PARTY_CODE'])
    non_numeric_cols = data_sample.select_dtypes(exclude=['int64', 'float64']).columns.difference(['PARTY_CODE'])
    
    # One-hot encode non-numeric columns
    data_one_hot = pd.get_dummies(data_sample, columns=non_numeric_cols, drop_first=True)
    
    # Split into features and labels
    X = data_one_hot.drop('PARTY_CODE', axis=1)
    y = data_one_hot['PARTY_CODE']
    
    # Handle special characters in column names that XGBoost doesn't like
    X.columns = X.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    return X_train, X_test, y_train, y_test

# Usage
# Assuming engineered_data is already defined
X_train, X_test, y_train, y_test = preprocess_data(engineered_data, sample_size=10000)

### 'INFER_PARTY' as target variable

In [None]:
def preprocess_data(engineered_data, sample_size=10000):
    # Copy the data
    data_with_all_columns = engineered_data.copy()
    
    # Remove rows where the target column 'INFER_PARTY' is NaN
    data_with_all_columns = data_with_all_columns[data_with_all_columns['INFER_PARTY'].notna()]
    
    # Sample the data
    data_sample = data_with_all_columns.sample(n=sample_size, random_state=42)
    
    # Keep only the classes that have at least 2 samples
    class_counts = data_sample['INFER_PARTY'].value_counts()
    valid_classes = class_counts[class_counts >= 2].index.tolist()
    data_sample = data_sample[data_sample['INFER_PARTY'].isin(valid_classes)]
    
    # Label encode the target variable
    le = LabelEncoder()
    data_sample['INFER_PARTY'] = le.fit_transform(data_sample['INFER_PARTY'].astype(str))
    
    # Identify numeric and non-numeric columns
    numeric_cols = data_sample.select_dtypes(include=['int64', 'float64']).columns.difference(['INFER_PARTY'])
    non_numeric_cols = data_sample.select_dtypes(exclude=['int64', 'float64']).columns.difference(['INFER_PARTY'])
    
    # One-hot encode non-numeric columns
    data_one_hot = pd.get_dummies(data_sample, columns=non_numeric_cols, drop_first=True)
    
    # Split into features and labels
    X = data_one_hot.drop('INFER_PARTY', axis=1)
    y = data_one_hot['INFER_PARTY']
    
    # Handle special characters in column names that XGBoost doesn't like
    X.columns = X.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    return X_train, X_test, y_train, y_test

# Usage
# Assuming engineered_data is already defined
X_train, X_test, y_train, y_test = preprocess_data(engineered_data, sample_size=10000)


#### Resampling Techniques

In [13]:
from imblearn.over_sampling import SMOTE

# Impute missing values with mean
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Determine the smallest class count
min_class_count = np.min(np.bincount(y_train))

# Apply SMOTE to the imputed training data
# Set n_neighbors to min_class_count - 1 or a default small number
n_neighbors = min(min_class_count - 1, 5)
smote = SMOTE(random_state=42, k_neighbors=n_neighbors)
X_res, y_res = smote.fit_resample(X_train_imputed, y_train)

# Train the XGBoost model
xgb = XGBClassifier(objective='multi:softmax', random_state=42)
xgb.fit(X_res, y_res)

# Make predictions on the imputed test set
y_pred = xgb.predict(X_test_imputed)

# Calculate the metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.925
Recall: 0.925
F1 Score: 0.9128656777493607


In [15]:
from imblearn.under_sampling import RandomUnderSampler

# Impute missing values with mean
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Apply RandomUnderSampler to the imputed training data
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train_imputed, y_train)

# Train the XGBoost model
xgb = XGBClassifier(objective='multi:softmax', random_state=42)
xgb.fit(X_res, y_res)

# Make predictions on the imputed test set
y_pred = xgb.predict(X_test_imputed)

# Calculate the metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.413
Recall: 0.413
F1 Score: 0.484446515639784


#### Weighted Loss Function

In [17]:
# Count the occurrences of each class in the target variable
counter = Counter(y_train)

# Calculate the number of samples
total_samples = len(y_train)

# Compute class weights
class_weights = {cls: float(total_samples / count) for cls, count in counter.items()}

# Map the weights to each sample in y_train
sample_weights = [class_weights[cls] for cls in y_train]

# Initialize XGBoost with multi:softmax objective
xgb = XGBClassifier(objective='multi:softmax', random_state=42)

# Fit the model, passing in the sample weights
xgb.fit(X_train, y_train, sample_weight=sample_weights)

# Make predictions
y_pred = xgb.predict(X_test)

# Calculate the metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.9145
Recall: 0.9145
F1 Score: 0.9094697333354964


#### Cost-sensitive Learning

In [24]:
def create_cost_matrix(y):
    # Count the frequency of each class in the target variable
    class_freq = Counter(y)
    
    # Calculate the inverse frequency
    inv_freq = {k: 1.0 / v for k, v in class_freq.items()}
    
    # Create the cost matrix
    cost_matrix = {}
    for class1 in class_freq.keys():
        cost_matrix[class1] = {}
        for class2 in class_freq.keys():
            if class1 == class2:
                continue
            cost_matrix[class1][class2] = inv_freq[class1] + inv_freq[class2]
            
    return cost_matrix

# Assuming y_train contains your training labels, and it's a pandas Series
# For demonstration, I'm using the value_counts information you provided
y_train_demo = ['D'] * 110365 + ['R'] * 85218 + ['N'] * 75028 + ['O'] * 8121 + ['L'] * 1621 + ['  '] * 45
cost_matrix = create_cost_matrix(y_train_demo)

print(cost_matrix)

{'D': {'R': 2.0795453623443604e-05, 'N': 2.2389200977768253e-05, 'O': 0.00013219838820189577, 'L': 0.0006259639897705816, '  ': 0.02223128306578676}, 'R': {'D': 2.0795453623443604e-05, 'N': 2.5062967472140136e-05, 'O': 0.00013487215469626767, 'L': 0.0006286377562649534, '  ': 0.022233956832281132}, 'N': {'D': 2.2389200977768253e-05, 'R': 2.5062967472140136e-05, 'O': 0.00013646590205059232, 'L': 0.0006302315036192781, '  ': 0.022235550579635454}, 'O': {'D': 0.00013219838820189577, 'R': 0.00013487215469626767, 'N': 0.00013646590205059232, 'L': 0.0007400406908434056, '  ': 0.02234535976685958}, 'L': {'D': 0.0006259639897705816, 'R': 0.0006286377562649534, 'N': 0.0006302315036192781, 'O': 0.0007400406908434056, '  ': 0.02283912536842827}, '  ': {'D': 0.02223128306578676, 'R': 0.022233956832281132, 'N': 0.022235550579635454, 'O': 0.02234535976685958, 'L': 0.02283912536842827}}


In [17]:
def compute_custom_sample_weight(cost_matrix, y):
    sample_weight = np.zeros(len(y))
    
    for i, class1 in enumerate(y):
        for class2, cost in cost_matrix[class1].items():
            sample_weight[i] += (y == class2).sum() * cost
            
    return sample_weight

# Assuming y_train is your actual training labels and it's a NumPy array
# For demonstration, converting y_train_demo to a NumPy array
y_train_demo_np = np.array(y_train_demo)

# Compute sample weights
sample_weight = compute_custom_sample_weight(cost_matrix, y_train_demo_np)

# Fit the model
xgb = XGBClassifier(objective='multi:softmax', random_state=42)
xgb.fit(X_train, y_train, sample_weight=sample_weight)

# Rest of the code for predictions and metrics...


NameError: name 'y_train_demo' is not defined

In [22]:
print(engineered_data['PARTY_CODE'].value_counts())

PARTY_CODE
D    110365
R     85218
N     75028
O      8121
L      1621
         45
Name: count, dtype: int64
