In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
import xgboost as xgb
from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score
import seaborn as sns
import time

## 500K dataset

In [5]:
data_500 = pd.read_csv("/Users/nirugidla/Documents/GitHub/milestone2_MADS/data_500k.csv", low_memory=False)
data_500

Unnamed: 0,RECORD_ID,ADD_TYPE,AFAMPROFLS,AGE,AI_COUNTY_NAME,AIRCOND,APP_CHILD,APP_MENBIG,APP_TODDLR,APP_WOMEN,...,VTR_PRI16,VTR_PRI17,VTR_PRI18,VTR_PRI19,VTR_PRI20,VTR_PRI21,VTR_PRI22,WORKWOMAN,YEARBUILT,ZIP
0,403390,S,,21.0,Fairbanks North Star,,,,,,...,,,,,,,,,,99705
1,62285,H,,,Anchorage,,,,,,...,,,,,,,,,,99506
2,331355,,,91.0,Kenai Peninsula,,,,,,...,,,,,,,,,,99603
3,206320,H,,65.0,Anchorage,,,,,,...,,,,,,,Y,,,99567
4,188078,S,,76.0,Juneau,,,,,,...,Y,,Y,,,,Y,Y,1985,99801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,349635,H,,20.0,BIBB,,,,,,...,,,,,,,,,,31204
499996,420654,S,,50.0,COWETA,A,,,,,...,,,,,,,,,2003,30263
499997,131262,S,,19.0,ROCKDALE,,,,,,...,,,,,,,,,,30013
499998,315673,H,,21.0,BARROW,,,,,,...,,,,,,,,,,30680


In [6]:
remove_columns = [
    'PRFL_LGBT_SUPPORT',
    'PRFL_LIBERAL_NEWS',
    'PRFL_MARIJUANA_REFORM',
    'PRFL_BIDEN_SUPPORT',
    'PRFL_BORDER_SECURITY',
    'PRFL_CONSERVATIVE_NEWS',
    'PRFL_IMMIGRATION_REFORM',
    'PRFL_OBAMA',
    'PRFL_PERSUADABLE_VOTER',
    'PRFL_POLITICAL_IDEOLOGY',
    'PRFL_SANDERS_SUPPORT',
    'PRFL_TRUMP_SUPPORT',
    'ZIP',
    
    'VTR_GEN00', 'VTR_GEN01', 'VTR_GEN02', 'VTR_GEN03', 'VTR_GEN04', 'VTR_GEN05', 'VTR_GEN06', 'VTR_GEN07', 'VTR_GEN08', 'VTR_GEN09', 'VTR_GEN10', 'VTR_GEN11', 'VTR_GEN12', 'VTR_GEN13', 'VTR_GEN14', 'VTR_GEN15', 'VTR_GEN16', 'VTR_GEN17', 'VTR_GEN18', 'VTR_GEN19', 'VTR_GEN20', 'VTR_GEN21', 'VTR_GEN22', 'VTR_OTH00', 'VTR_OTH01', 'VTR_OTH02', 'VTR_OTH03', 'VTR_OTH04', 'VTR_OTH05', 'VTR_OTH06', 'VTR_OTH07', 'VTR_OTH08', 'VTR_OTH09', 'VTR_OTH10', 'VTR_OTH11', 'VTR_OTH12', 'VTR_OTH13', 'VTR_OTH14', 'VTR_OTH15', 'VTR_OTH16', 'VTR_OTH17', 'VTR_OTH18', 'VTR_OTH19', 'VTR_OTH20', 'VTR_OTH21', 'VTR_OTH22', 'VTR_PPP00', 'VTR_PPP04', 'VTR_PPP08', 'VTR_PPP12', 'VTR_PPP16', 'VTR_PPP20', 'VTR_PRI00', 'VTR_PRI01', 'VTR_PRI02', 'VTR_PRI03', 'VTR_PRI04', 'VTR_PRI05', 'VTR_PRI06', 'VTR_PRI07', 'VTR_PRI08', 'VTR_PRI09', 'VTR_PRI10', 'VTR_PRI11', 'VTR_PRI12', 'VTR_PRI13', 'VTR_PRI14', 'VTR_PRI15', 'VTR_PRI16', 'VTR_PRI17', 'VTR_PRI18', 'VTR_PRI19', 'VTR_PRI20', 'VTR_PRI21', 'VTR_PRI22',
    
        
      'PRFL_CHOICELIFE', 'TOD_PRES_D_2016_PREC', 'TOD_PRES_O_2016',
    'TOD_PRES_R_2016', 'TOD_PRES_R_2016_PREC', 'TOD_PRES_R_2020_PREC', 'VP_PPP',
    'AGE', 'CNSUS_PCTW',
    
    'PARTY_MIX', 'PRFL_MINWAGE', 'PRFL_FENCE_SITTER'
]
# Drop the list of columns from the dataset
data_500.drop(columns=remove_columns, errors='ignore', inplace=True)


## Data Cleaning

In [7]:
# Data Loading
#data_500 = pd.read_csv("/Users/nirugidla/Documents/GitHub/milestone2_MADS/data_500k.csv", low_memory=False)

# Data Cleaning - Column Names
data_500.columns = data_500.columns.str.strip()

# Data Cleaning - Drop Duplicates
data_500.drop_duplicates(inplace=True)

# Data Cleaning - Object Columns
for col in data_500.columns:
    if data_500[col].dtype == 'object':
        data_500[col] = data_500[col].str.strip()

# Data Cleaning - Empty Strings
data_500.replace('', 'Unknown', inplace=True)

# Data Cleaning - NaN for Object Types
data_500.loc[:, data_500.dtypes == 'object'] = data_500.loc[:, data_500.dtypes == 'object'].fillna('Unknown')

# Data Cleaning - Drop Columns and Rows with All NaNs
data_500.dropna(axis=1, how='all', inplace=True)
data_500.dropna(axis=0, how='all', inplace=True)

# Identify numeric and non-numeric columns
numeric_cols = data_500.select_dtypes(include=['int64', 'float64']).columns
non_numeric_cols = data_500.select_dtypes(exclude=['int64', 'float64']).columns

# Data Cleaning - Removing Non-Numeric Columns with More Than 90% Missing Data
missing_data_percentage = data_500.isnull().mean() * 100
non_numeric_cols_to_remove = missing_data_percentage[non_numeric_cols]
non_numeric_cols_to_remove = non_numeric_cols_to_remove[non_numeric_cols_to_remove > 90].index.tolist()
data_500_reduced = data_500.drop(columns=non_numeric_cols_to_remove)

# Update the list of non-numeric columns after removal
non_numeric_cols = data_500_reduced.select_dtypes(exclude=['int64', 'float64']).columns

# Identifying Specific Types of Non-Numeric Columns
cols_with_Y_or_Unknown = [col for col in non_numeric_cols if set(data_500_reduced[col].unique()) <= {'Y', 'Unknown'}]
cols_with_more_than_two_categories = [col for col in non_numeric_cols if len(data_500_reduced[col].unique()) > 2]

# Print identified columns
print("Columns with 'Y' or 'Unknown':", cols_with_Y_or_Unknown, len(cols_with_Y_or_Unknown))
print("Columns with more than two categories:", cols_with_more_than_two_categories, len(cols_with_more_than_two_categories))

Columns with 'Y' or 'Unknown': ['AFAMPROFLS', 'APP_CHILD', 'APP_MENBIG', 'APP_TODDLR', 'APP_WOMEN', 'APP_WOMPET', 'APP_WOMPLS', 'APP_YNGMEN', 'ARTS', 'AUTOACCES', 'AUTOWORK', 'BOATING', 'BROADERLIV', 'CARDUSER', 'CATOWNER', 'CH_0002FEM', 'CH_0002MAL', 'CH_0002UNK', 'CH_0305FEM', 'CH_0305MAL', 'CH_0305UNK', 'CH_0610FEM', 'CH_0610MAL', 'CH_0610UNK', 'CH_1115FEM', 'CH_1115MAL', 'CH_1115UNK', 'CH_1617FEM', 'CH_1617MAL', 'CH_1617UNK', 'CHRISTFAM', 'COL_ANTIQ', 'COL_ARTS', 'COL_COIN', 'COL_SPORT', 'COL_STAMP', 'COMPHOMOFC', 'COMPUTERS', 'COOK_GEN', 'CURRAFFAIR', 'DEPTSTCRD', 'DIETING', 'DIYLIV', 'DOGOWNER', 'DON_ANML', 'DON_ARTCUL', 'DON_CHARIT', 'DON_CHILD', 'DON_ENVIR', 'DON_ENVWLD', 'DON_HEALTH', 'DON_INTAID', 'DON_OTHER', 'DON_POLCONS', 'DON_POLIT', 'DON_POLLIB', 'DON_RELIG', 'DON_VET', 'DONATION', 'EDU_ONLINE', 'EQUESTRIAN', 'EXER_GROUP', 'GAMING', 'GARDENER', 'GOLF', 'GRANDCHLD', 'HEALTHBEAU', 'HEATHMED', 'HH_SENIOR', 'HH_VETERAN', 'HH_YOUNGAD', 'HIGHBROW', 'HIGHENDAPP', 'HISTMIL', 'HI

In [8]:
# Load the dataset again
#data = pd.read_csv('data/surveydata.csv')

corrected_data = data_500.copy()

# Update the 'I' values in PARTY_CODE to 'N' for rows where STATE is 'SD'
corrected_data.loc[(corrected_data['STATE'] == 'SD') & (corrected_data['PARTY_CODE'] == 'I'), 'PARTY_CODE'] = 'N'

# Update the 'U' values in PARTY_CODE to 'N' for rows where STATE is in ['DC', 'LA', 'RI']
states_to_update = ['DC', 'LA', 'RI']
corrected_data.loc[(corrected_data['STATE'].isin(states_to_update)) & (corrected_data['PARTY_CODE'] == 'U'), 'PARTY_CODE'] = 'N'

# Update the 'I' values in PARTY_CODE to 'N' for rows where STATE is 'OK'
corrected_data.loc[(corrected_data['STATE'] == 'OK') & (corrected_data['PARTY_CODE'] == 'I'), 'PARTY_CODE'] = 'N'


#categorical_columns
# Create a mapping dictionary for PARTY_CODE to INFER_PARTY values
party_mapping = {
    'D': 'D',
    'E': 'D',
    'R': 'R',
    'S': 'R',
    'N': float('nan'),
    'U': float('nan'),
    'A': float('nan'),
    'B': float('nan'),
    'C': float('nan'),
    'F': float('nan'),
    'G': float('nan'),
    'H': float('nan'),
    'I': float('nan'),
    'J': float('nan'),
    'K': float('nan'),
    'L': float('nan'),
    'P': float('nan'),
    'Q': float('nan'),
    'T': float('nan'),
    'V': float('nan'),
    'W': float('nan'),
    'Y': float('nan'),
    'Z': float('nan'),
    'O': float('nan'),
}

# Create the INFER_PARTY column using the mapping
corrected_data['INFER_PARTY'] = corrected_data['PARTY_CODE'].map(party_mapping)

print(corrected_data['PARTY_CODE'].unique())


# Display the unique values in the INFER_PARTY column to ensure correctness
unique_infer_party = corrected_data['INFER_PARTY'].unique()

print('unique_infer_party:')

print(unique_infer_party)

# Define the mapping for PARTY_CODE modifications
party_code_mapping = {
    'E': float('nan'),
    'S': float('nan'),
    'U': float('nan'),
    'A': 'O',
    'B': 'O',
    'C': 'O',
    'F': 'O',
    'G': 'O',
    'H': 'O',
    'I': 'O',
    'J': 'O',
    'K': 'O',
    'L': 'L',
    'P': 'O',
    'Q': 'O',
    'T': 'O',
    'V': 'O',
    'W': 'O',
    'Y': 'O',
    'Z': 'O'
}

# Apply the mapping to the PARTY_CODE column
corrected_data['PARTY_CODE'] = corrected_data['PARTY_CODE'].replace(party_code_mapping)

# Check the unique values of PARTY_CODE after the modifications
unique_party_code_after_modifications = corrected_data['PARTY_CODE'].unique()

print("unique_party_code_after_modifications:")

print(unique_party_code_after_modifications)

['N' 'R' 'O' 'D' 'A' 'F' 'P' 'G' 'L' 'U' 'W' 'B' 'I' 'Y' 'V' 'H' 'Unknown'
 'S' 'E' 'Q' 'Z']
unique_infer_party:
[nan 'R' 'D']
unique_party_code_after_modifications:
['N' 'R' 'O' 'D' 'L' nan 'Unknown']


In [9]:
engineered_data = corrected_data.copy()
voter_columns = [col for col in engineered_data.columns if col.startswith("VTR")]
#Create column with total number of votes in voter_columns per row
engineered_data['VTR_TOTAL_VOTES'] = engineered_data[voter_columns].notnull().sum(axis=1)
#Sum Democrat and Republican totals
engineered_data['VTR_TOTAL_DVOTES'] = engineered_data[voter_columns].isin(['D', 'M', 'Z']).sum(axis=1)
engineered_data['VTR_TOTAL_RVOTES'] = engineered_data[voter_columns].isin(['R', 'P', 'X']).sum(axis=1)
#Create conditions that evaluate whether more votes have been cast for Democrats or Republicans, and assign 'D' and 'R' to new column based on conditions
conditions = [
    engineered_data['VTR_TOTAL_DVOTES'] > engineered_data['VTR_TOTAL_RVOTES'],
    engineered_data['VTR_TOTAL_DVOTES'] < engineered_data['VTR_TOTAL_RVOTES']
]
choices = ['D', 'R']
engineered_data['VTR_INFER_PARTY'] = np.select(conditions, choices, default=np.nan)
#Create conditions that evaluate whether a voter is a swing voter or not
conditions_swing = [
    (engineered_data['VTR_TOTAL_DVOTES'] > 2) & (engineered_data['VTR_TOTAL_RVOTES'] > 2),
    ((engineered_data['VTR_TOTAL_DVOTES'] > 1) & (engineered_data['VTR_TOTAL_RVOTES'] == 0)) | ((engineered_data['VTR_TOTAL_RVOTES'] > 1) & (engineered_data['VTR_TOTAL_DVOTES'] == 0))
]
choices_swing = ['Y', 'N']
engineered_data['VTR_INFER_SWING'] = np.select(conditions_swing, choices_swing, default=np.nan)
#sampledf = engineered_data[['PARTY_CODE','INFER_PARTY','VTR_TOTAL_DVOTES','VTR_TOTAL_RVOTES','VTR_INFER_PARTY','VTR_INFER_SWING']]
#print(sampledf.head(50))
#Add values to INFER_PARTY and correct any other INFER_PARTY values that don't meet the conditions above:
print(sum(engineered_data['INFER_PARTY'].isna())) #291 NaNs for INFER_PARTY before
engineered_data.loc[(engineered_data['VTR_INFER_PARTY'] == 'D') & (engineered_data['VTR_INFER_SWING'] == 'N'), 'INFER_PARTY'] = 'D'
engineered_data.loc[(engineered_data['VTR_INFER_PARTY'] == 'R') & (engineered_data['VTR_INFER_SWING'] == 'N'), 'INFER_PARTY'] = 'R'
engineered_data.loc[(engineered_data['VTR_INFER_PARTY'] == 'D') & (engineered_data['VTR_INFER_SWING'] == 'Y'), 'INFER_PARTY'] =  float('nan')
engineered_data.loc[(engineered_data['VTR_INFER_PARTY'] == 'R') & (engineered_data['VTR_INFER_SWING'] == 'Y'), 'INFER_PARTY'] =  float('nan')
#Also delete any 'D' or 'R' INFER_PARTY labels for anyone deemed a "swing voter" based on criteria above of voting for both parties at least 3 times each:
engineered_data.loc[(engineered_data['VTR_INFER_SWING'] == 'Y') , 'INFER_PARTY'] = float('nan')
print(sum(engineered_data['INFER_PARTY'].isna())) #291 NaNs for INFER_PARTY before
#Drop auxiliary columns used for math, keeping 'VTR_TOTAL_VOTES', 'VTR_INFER_SWING', and the updated 'INFER_PARTY' columns as features:
engineered_data = engineered_data.drop(columns=['VTR_TOTAL_DVOTES','VTR_TOTAL_RVOTES','VTR_INFER_PARTY'])

154352
154352


## Just the columns with Yes or Unknown non_numeric_cols 

In [21]:
data_reduced_with_Y_or_Unknown = data_500[cols_with_Y_or_Unknown].copy()
data_reduced_with_Y_or_Unknown.shape

(500000, 131)

In [20]:
def run_xgboost(sample_size, num_runs, top_N_features):
    results_list = []

    for run in range(1, num_runs + 1):
        # Assuming cols_with_Y_or_Unknown has been identified and data_500 has been cleaned
        data_reduced_with_Y_or_Unknown = engineered_data[cols_with_Y_or_Unknown].copy()

        # Adding the target column to this data
        data_reduced_with_Y_or_Unknown['INFER_PARTY'] = engineered_data['INFER_PARTY']

        # Remove rows where 'PARTY_CODE' is missing, as it's our target variable
        data_reduced_with_Y_or_Unknown = data_reduced_with_Y_or_Unknown[data_reduced_with_Y_or_Unknown['INFER_PARTY'].notna()]

        # Sample data
        data_sample = data_reduced_with_Y_or_Unknown.sample(n=sample_size, random_state=run)

        # Dynamic Class Handling
        class_counts = data_sample['INFER_PARTY'].value_counts()
        valid_classes = class_counts[class_counts >= 1].index.tolist()
        data_sample = data_sample[data_sample['INFER_PARTY'].isin(valid_classes)]

        # Label-encode 'PARTY_CODE' column
        le = LabelEncoder()
        data_sample['INFER_PARTY'] = le.fit_transform(data_sample['INFER_PARTY'].astype(str))

        # One-Hot Encoding
        data_one_hot = pd.get_dummies(data_sample, columns=cols_with_Y_or_Unknown, drop_first=True)

        # Splitting the Data into Training and Test Sets
        X = data_one_hot.drop('INFER_PARTY', axis=1)
        y = data_one_hot['INFER_PARTY']

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        print("y_test", y_test.value_counts())
        
        
        # Initialize and Train XGBoost Classifier
        xgb = XGBClassifier(objective='multi:softmax', num_class=len(valid_classes), random_state=42)
        xgb.fit(X_train, y_train)

        # Make Predictions and Evaluate the Model
        y_pred = xgb.predict(X_test)
        y_pred_value_counts = Counter(y_pred)
        print("y_pred_value_counts",y_pred_value_counts)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test.to_list(), y_pred)
        
        true_positive = sum(1 for a, b in zip(y_test.to_list(), y_pred) if a == 1 and b == 1)
        new_recall = true_positive/ (939)
        print(new_recall)

        # Get top N features
        feature_importances = xgb.feature_importances_
        sorted_idx = feature_importances.argsort()[::-1][:top_N_features]
        top_features = X.columns[sorted_idx].tolist()

        # Append to results list
        results_list.append({
            'Run': run,
            'Accuracy': accuracy,
            'F1_Score': f1,
            'Recall': recall,
            'Top_N_Features': top_features
        })

    # Create a DataFrame from the results list
    results_df = pd.DataFrame(results_list)
    
    return results_df

# Example of how to call this function
result = run_xgboost(sample_size=10000, num_runs=1, top_N_features=10)
result

y_test INFER_PARTY
0    1061
1     939
Name: count, dtype: int64
y_pred_value_counts Counter({0: 1266, 1: 734})
0.5047923322683706


Unnamed: 0,Run,Accuracy,F1_Score,Recall,Top_N_Features
0,1,0.6375,0.566647,0.504792,"[PRFL_BLM_SUPPORT_Y, PRFL_2NDAMEND_Y, PRFL_MET..."


In [23]:
# Assuming result is your DataFrame and 'Top_N_Features' is the column with the lists of top features
all_features = [feature for sublist in result['Top_N_Features'].tolist() for feature in sublist]

# Count the frequency of each feature
feature_counts = Counter(all_features)

# Find the most unique features (those that appear only once across all runs)
most_unique_features = [feature for feature, count in feature_counts.items() if count == 1]

# Find the most common features (those that appear the most across all runs)
most_common_features = [feature for feature, count in feature_counts.most_common()]

print("Most Unique Features:", most_unique_features)
print("Most Common Features:", most_common_features)

Most Unique Features: ['PRFL_HEALTHCARE_Y', 'CH_1115MAL_Y', 'CH_1115FEM_Y', 'AUTOWORK_Y', 'HOME_DECOR_Y', 'MOTORCYCLE_Y', 'CHRISTFAM_Y', 'HUNTSHOOT_Y']
Most Common Features: ['PRFL_BLM_SUPPORT_Y', 'PRFL_2NDAMEND_Y', 'PRFL_METOO_SUPPORT_Y', 'PRFL_GUN_CONTROL_Y', 'DON_POLLIB_Y', 'DON_POLCONS_Y', 'PRFL_HEALTHCARE_Y', 'CH_1115MAL_Y', 'CH_1115FEM_Y', 'AUTOWORK_Y', 'HOME_DECOR_Y', 'MOTORCYCLE_Y', 'CHRISTFAM_Y', 'HUNTSHOOT_Y']


## Just the non_numeric_cols with more than 2 values

In [24]:
data_reduced_with_more_than_two_categories = data_500[cols_with_more_than_two_categories].copy()
data_reduced_with_more_than_two_categories.shape

(500000, 49)

In [1]:
def timer_decorator(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Elapsed time: {elapsed_time:.2f} seconds")
        return result
    return wrapper

@timer_decorator
def run_xgboost(data, cols_with_more_than_two_categories, sample_size=100000, num_runs=1, top_N_features=10):
    results_list = []

    for run in range(1, num_runs + 1):
        print(f"Starting run {run}...")
        
        data_reduced_with_more_than_two_categories = data[cols_with_more_than_two_categories].copy()
        
        # Using 'INFER_PARTY' as the target variable
        data_reduced_with_more_than_two_categories['INFER_PARTY'] = data['INFER_PARTY']
        
        # Remove rows where 'INFER_PARTY' is missing, as it's our target variable
        data_reduced_with_more_than_two_categories = data_reduced_with_more_than_two_categories[data_reduced_with_more_than_two_categories['INFER_PARTY'].notna()]

        # Sample data
        data_sample = data_reduced_with_more_than_two_categories.sample(n=sample_size, random_state=42)
        
        # Dynamic Class Handling
        class_counts = data_sample['INFER_PARTY'].value_counts()
        valid_classes = class_counts[class_counts >= 1].index.tolist()
        data_sample = data_sample[data_sample['INFER_PARTY'].isin(valid_classes)]
        
        le = LabelEncoder()
        data_sample['INFER_PARTY'] = le.fit_transform(data_sample['INFER_PARTY'].astype(str))
        
        cols_to_encode = [col for col in cols_with_more_than_two_categories if col in data_sample.columns and col != 'INFER_PARTY']
        data_one_hot = pd.get_dummies(data_sample, columns=cols_to_encode, drop_first=True)
        
        X = data_one_hot.drop('INFER_PARTY', axis=1)
        y = data_one_hot['INFER_PARTY']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        X_train.columns = X_train.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')
        X_test.columns = X_test.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_')
        print("Training set class distribution:\n", y_train.value_counts())
        print("Testing set class distribution:\n", y_test.value_counts())
        
        num_classes = len(np.unique(y_train))

        xgb = XGBClassifier(objective='multi:softmax', num_class=num_classes, random_state=42)
        xgb.fit(X_train, y_train)

        y_pred = xgb.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')

        print(f"Run {run} - Accuracy: {accuracy}, F1 Score: {f1}, Recall: {recall}")
        feature_importances = xgb.feature_importances_
        sorted_idx = feature_importances.argsort()[::-1][:top_N_features]
        top_features = X.columns[sorted_idx].tolist()

        results_list.append({
            'Run': run,
            'Accuracy': accuracy,
            'F1_Score': f1,
            'Recall': recall,
            'Top_N_Features': top_features
        })

    results_df = pd.DataFrame(results_list)
    
    return results_df

# Assuming engineered_data and cols_with_more_than_two_categories are already defined
result = run_xgboost(engineered_data, cols_with_more_than_two_categories, sample_size=50000, num_runs=2, top_N_features=10)

NameError: name 'engineered_data' is not defined

In [26]:
#result['Top_N_Features'][0]
result

Unnamed: 0,Run,Accuracy,F1_Score,Recall,Top_N_Features
0,1,0.911,0.911058,0.911,"[PARTY_CODE_R, ETHNIC_INFER_C, TOD_PRES_DIFF_2..."
1,2,0.911,0.911058,0.911,"[PARTY_CODE_R, ETHNIC_INFER_C, TOD_PRES_DIFF_2..."


In [27]:
# Assuming result is your DataFrame and 'Top_N_Features' is the column with the lists of top features
all_features = [feature for sublist in result['Top_N_Features'].tolist() for feature in sublist]

# Count the frequency of each feature
feature_counts = Counter(all_features)

# Find the most unique features (those that appear only once across all runs)
most_unique_features = [feature for feature, count in feature_counts.items() if count == 1]

# Find the most common features (those that appear the most across all runs)
most_common_features = [feature for feature, count in feature_counts.most_common()]

print("Most Unique Features:", most_unique_features)
print("Most Common Features:", most_common_features)

Most Unique Features: []
Most Common Features: ['PARTY_CODE_R', 'ETHNIC_INFER_C', 'TOD_PRES_DIFF_2020_PREC_Unknown', 'CENSUS_ST_12', 'CENSUS_ST_42', 'ADD_TYPE_Unknown', 'CENSUS_ST_13', 'CENSUS_ST_34', 'FUND_POLIT_D', 'CENSUS_ST_06']


In [28]:
# Load the dataset
file_path = '/Users/nirugidla/Documents/GitHub/milestone2_MADS/surveydata.csv'
survey_data = pd.read_csv(file_path)

# Create a copy for corrections
corrected_data = survey_data.copy()

# Replace blanks and spaces with NaN
corrected_data.replace(r'^\s*$', np.nan, regex=True, inplace=True)

# Update 'PARTY_CODE' based on certain conditions in 'STATE'
states_to_update_for_U = ['DC', 'LA', 'RI']
states_to_update_for_I = ['SD', 'OK']

# Set 'PARTY_CODE' to 'N' for rows where 'STATE' is in the predefined lists and 'PARTY_CODE' is 'U' or 'I'
corrected_data.loc[(corrected_data['STATE'].isin(states_to_update_for_U)) & (corrected_data['PARTY_CODE'] == 'U'), 'PARTY_CODE'] = 'N'
corrected_data.loc[(corrected_data['STATE'].isin(states_to_update_for_I)) & (corrected_data['PARTY_CODE'] == 'I'), 'PARTY_CODE'] = 'N'

# Count the number of rows where 'PARTY_CODE' is 'N' or 'U'
count_N_U = len(corrected_data.loc[corrected_data['PARTY_CODE'].isin(['N', 'U'])])

print(f"Number of rows with 'PARTY_CODE' as 'N' or 'U': {count_N_U}")

Number of rows with 'PARTY_CODE' as 'N' or 'U': 273


In [29]:
from collections import defaultdict

# Create a copy to de-fragment the DataFrame
corrected_data = corrected_data.copy()

# Create party_mapping using defaultdict. Lambda sets default (all other non-D and non-R party registrations to NaN).
infer_party_mapping = defaultdict(lambda: float('nan'))
infer_party_mapping.update({
    'D': 'D',
    'E': 'D',
    'R': 'R',
    'S': 'R'
})

# Map PARTY_CODE to INFER_PARTY
corrected_data['INFER_PARTY'] = corrected_data['PARTY_CODE'].map(infer_party_mapping)

# Define the mapping for PARTY_CODE modifications
party_code_mapping = {
    'E': float('nan'),
    'S': float('nan'),
    'U': float('nan'),
    'A': 'O',
    'B': 'O',
    'C': 'O',
    'F': 'O',
    'G': 'O',
    'H': 'O',
    'I': 'O',
    'J': 'O',
    'K': 'O',
    'L': 'L',
    'P': 'O',
    'Q': 'O',
    'T': 'O',
    'V': 'O',
    'W': 'O',
    'Y': 'O',
    'Z': 'O'
}

# Apply the mapping to the PARTY_CODE column
corrected_data['PARTY_CODE'] = corrected_data['PARTY_CODE'].replace(party_code_mapping)

# The rest of your code
print("Final PARTY_CODE values:")
print(corrected_data['PARTY_CODE'].unique())
print('Final INFER_PARTY values:')
print(corrected_data['INFER_PARTY'].unique())
print('Total INFER_PARTY Ds and Rs:')
print(len(corrected_data.loc[corrected_data['INFER_PARTY'].isin(['D', 'R'])]))
print('INFER_PARTY NaNs:')
print(corrected_data['INFER_PARTY'].isna().sum())


Final PARTY_CODE values:
[nan 'R' 'D' 'N' 'O' 'L']
Final INFER_PARTY values:
['D' 'R' nan]
Total INFER_PARTY Ds and Rs:
1269
INFER_PARTY NaNs:
291


In [30]:
# Create a de-fragmented copy of the DataFrame to avoid performance issues
corrected_data = corrected_data.copy()

# Identify columns starting with "VTR"
voter_columns = [col for col in corrected_data.columns if col.startswith("VTR")]

# Calculate total votes, Democrat votes, and Republican votes
corrected_data['VTR_TOTAL_VOTES'] = corrected_data[voter_columns].notnull().sum(axis=1)
corrected_data['VTR_TOTAL_DVOTES'] = corrected_data[voter_columns].isin(['D', 'M', 'Z']).sum(axis=1)
corrected_data['VTR_TOTAL_RVOTES'] = corrected_data[voter_columns].isin(['R', 'P', 'X']).sum(axis=1)

# Create conditions to infer party based on vote counts
conditions = [
    corrected_data['VTR_TOTAL_DVOTES'] - corrected_data['VTR_TOTAL_RVOTES'] >= 2,
    corrected_data['VTR_TOTAL_RVOTES'] - corrected_data['VTR_TOTAL_DVOTES'] >= 2,
]
choices = ['D', 'R']

# Apply conditions to create a new column
corrected_data['VTR_INFER_PARTY'] = np.select(conditions, choices, default=np.nan)
corrected_data['VTR_INFER_PARTY'].replace('nan', np.nan, inplace=True)

# Update the INFER_PARTY column based on VTR_INFER_PARTY
corrected_data.loc[(corrected_data['VTR_INFER_PARTY'] == 'D'), 'INFER_PARTY'] = 'D'
corrected_data.loc[(corrected_data['VTR_INFER_PARTY'] == 'R'), 'INFER_PARTY'] = 'R'

# Display statistics
print("Total number of D's and R's in INFER_PARTY column:", len(corrected_data.loc[corrected_data['INFER_PARTY'].isin(['D', 'R'])]))
print("Total number of NaNs in INFER_PARTY column:", corrected_data['INFER_PARTY'].isna().sum())

Total number of D's and R's in INFER_PARTY column: 1291
Total number of NaNs in INFER_PARTY column: 269


In [26]:
# Load the dataset again
survey_data = pd.read_csv('/Users/nirugidla/Documents/GitHub/milestone2_MADS/surveydata.csv')

corrected_data=survey_data

#Replace blanks and spaces with NaN
corrected_data.replace(r'^\s*$', np.nan, regex=True, inplace=True)

# Update the 'I' values in PARTY_CODE to 'N' for rows where STATE is 'SD'
corrected_data.loc[(corrected_data['STATE'] == 'SD') & (corrected_data['PARTY_CODE'] == 'I'), 'PARTY_CODE'] = 'N'

# Update the 'U' values in PARTY_CODE to 'N' for rows where STATE is in ['DC', 'LA', 'RI']
states_to_update = ['DC', 'LA', 'RI']
corrected_data.loc[(corrected_data['STATE'].isin(states_to_update)) & (corrected_data['PARTY_CODE'] == 'U'), 'PARTY_CODE'] = 'N'

# Update the 'I' values in PARTY_CODE to 'N' for rows where STATE is 'OK'
corrected_data.loc[(corrected_data['STATE'] == 'OK') & (corrected_data['PARTY_CODE'] == 'I'), 'PARTY_CODE'] = 'N'

print(len(corrected_data.loc[corrected_data['PARTY_CODE'].isin(['N', 'U'])]))

273


In [27]:
from collections import defaultdict
# Create party_mapping using defaultdict.Lambda sets default (all other non-D and non-R party registrations to NaN).
infer_party_mapping = defaultdict(lambda: float('nan'))
infer_party_mapping.update({
    'D': 'D',  # Registered Democrats are mapped to Democrat
    'E': 'D',  # E (Inferred Democrats) are mapped to Democrat
    'R': 'R',  # Registered Republicans are mapped to Republican 
    'S': 'R'   # S (Inferred Republicans) are mapped to Republican
})
# Map PARTY_CODE to INFER_PARTY
corrected_data['INFER_PARTY'] = corrected_data['PARTY_CODE'].map(infer_party_mapping)

# Define the mapping for PARTY_CODE modifications
party_code_mapping = {
    'E': float('nan'),
    'S': float('nan'),
    'U': float('nan'),
    'A': 'O',
    'B': 'O',
    'C': 'O',
    'F': 'O',
    'G': 'O',
    'H': 'O',
    'I': 'O',
    'J': 'O',
    'K': 'O',
    'L': 'L',
    'P': 'O',
    'Q': 'O',
    'T': 'O',
    'V': 'O',
    'W': 'O',
    'Y': 'O',
    'Z': 'O'
}
# Apply the mapping to the PARTY_CODE column
corrected_data['PARTY_CODE'] = corrected_data['PARTY_CODE'].replace(party_code_mapping)

print("Final PARTY_CODE values:")
print(corrected_data['PARTY_CODE'].unique())
print('Final INFER_PARTY values:')
print(corrected_data['INFER_PARTY'].unique())
print('Total INFER_PARTY Ds and Rs:')
print(len(corrected_data.loc[corrected_data['INFER_PARTY'].isin(['D', 'R'])]))
print('INFER_PARTY NaNs:')
print(corrected_data['INFER_PARTY'].isna().sum())

Final PARTY_CODE values:
[nan 'R' 'D' 'N' 'O' 'L']
Final INFER_PARTY values:
['D' 'R' nan]
Total INFER_PARTY Ds and Rs:
1269
INFER_PARTY NaNs:
291


  corrected_data['INFER_PARTY'] = corrected_data['PARTY_CODE'].map(infer_party_mapping)


In [23]:
voter_columns = [col for col in corrected_data.columns if col.startswith("VTR")]

#Create column with total number of votes in voter_columns per row
corrected_data['VTR_TOTAL_VOTES'] = corrected_data[voter_columns].notnull().sum(axis=1)

#Sum Democrat and Republican totals
corrected_data['VTR_TOTAL_DVOTES'] = corrected_data[voter_columns].isin(['D', 'M', 'Z']).sum(axis=1)
corrected_data['VTR_TOTAL_RVOTES'] = corrected_data[voter_columns].isin(['R', 'P', 'X']).sum(axis=1)

#Create conditions that evaluate whether someone has cast at least two more votes for Democrats or Republicans: 
conditions = [
    corrected_data['VTR_TOTAL_DVOTES'] - corrected_data['VTR_TOTAL_RVOTES'] >= 2,
    corrected_data['VTR_TOTAL_DVOTES'] - corrected_data['VTR_TOTAL_RVOTES'] >= 2
]

choices = ['D', 'R']

corrected_data['VTR_INFER_PARTY'] = np.select(conditions, choices, default=np.nan)
corrected_data['VTR_INFER_PARTY'].replace('nan', np.nan, inplace=True)

# Assign a 'D' or an 'R' to INFER_PARTY if either condition is true:
corrected_data.loc[(corrected_data['VTR_INFER_PARTY'] == 'D') , 'INFER_PARTY'] = 'D'
corrected_data.loc[(corrected_data['VTR_INFER_PARTY'] == 'R') , 'INFER_PARTY'] = 'R'

print("Total number of D's and R's in INFER_PARTY column:")
print(len(corrected_data.loc[corrected_data['INFER_PARTY'].isin(['D','R'])])) 
print("Total number of NaNs in INFER_PARTY column:")
print(sum(corrected_data['INFER_PARTY'].isna()))

Total number of D's and R's in INFER_PARTY column:
1280
Total number of NaNs in INFER_PARTY column:
280


  corrected_data['VTR_TOTAL_VOTES'] = corrected_data[voter_columns].notnull().sum(axis=1)
  corrected_data['VTR_TOTAL_DVOTES'] = corrected_data[voter_columns].isin(['D', 'M', 'Z']).sum(axis=1)
  corrected_data['VTR_TOTAL_RVOTES'] = corrected_data[voter_columns].isin(['R', 'P', 'X']).sum(axis=1)
  corrected_data['VTR_INFER_PARTY'] = np.select(conditions, choices, default=np.nan)


In [None]:
# Load the dataset again
data = pd.read_csv('data/surveydata.csv')

corrected_data=data

#Replace blanks and spaces with NaN
corrected_data.replace(r'^\s*$', np.nan, regex=True, inplace=True)

# Update the 'I' values in PARTY_CODE to 'N' for rows where STATE is 'SD'
corrected_data.loc[(corrected_data['STATE'] == 'SD') & (corrected_data['PARTY_CODE'] == 'I'), 'PARTY_CODE'] = 'N'

# Update the 'U' values in PARTY_CODE to 'N' for rows where STATE is in ['DC', 'LA', 'RI']
states_to_update = ['DC', 'LA', 'RI']
corrected_data.loc[(corrected_data['STATE'].isin(states_to_update)) & (corrected_data['PARTY_CODE'] == 'U'), 'PARTY_CODE'] = 'N'

# Update the 'I' values in PARTY_CODE to 'N' for rows where STATE is 'OK'
corrected_data.loc[(corrected_data['STATE'] == 'OK') & (corrected_data['PARTY_CODE'] == 'I'), 'PARTY_CODE'] = 'N'

print(len(corrected_data.loc[corrected_data['PARTY_CODE'].isin(['N', 'U'])]))

# Clean PARTY_CODE and create INFER_PARTY column indicating D or R for all rows, both inferred and real
[543]:
from collections import defaultdict
# Create party_mapping using defaultdict.Lambda sets default (all other non-D and non-R party registrations to NaN).
infer_party_mapping = defaultdict(lambda: float('nan'))
infer_party_mapping.update({
    'D': 'D',  # Registered Democrats are mapped to Democrat
    'E': 'D',  # E (Inferred Democrats) are mapped to Democrat
    'R': 'R',  # Registered Republicans are mapped to Republican 
    'S': 'R'   # S (Inferred Republicans) are mapped to Republican
})
# Map PARTY_CODE to INFER_PARTY
corrected_data['INFER_PARTY'] = corrected_data['PARTY_CODE'].map(infer_party_mapping)

# Define the mapping for PARTY_CODE modifications
party_code_mapping = {
    'E': float('nan'),
    'S': float('nan'),
    'U': float('nan'),
    'A': 'O',
    'B': 'O',
    'C': 'O',
    'F': 'O',
    'G': 'O',
    'H': 'O',
    'I': 'O',
    'J': 'O',
    'K': 'O',
    'L': 'L',
    'P': 'O',
    'Q': 'O',
    'T': 'O',
    'V': 'O',
    'W': 'O',
    'Y': 'O',
    'Z': 'O'
}
# Apply the mapping to the PARTY_CODE column
corrected_data['PARTY_CODE'] = corrected_data['PARTY_CODE'].replace(party_code_mapping)

print("Final PARTY_CODE values:")
print(corrected_data['PARTY_CODE'].unique())
print('Final INFER_PARTY values:')
print(corrected_data['INFER_PARTY'].unique())
print('Total INFER_PARTY Ds and Rs:')
print(len(corrected_data.loc[corrected_data['INFER_PARTY'].isin(['D', 'R'])]))
print('INFER_PARTY NaNs:')
print(corrected_data['INFER_PARTY'].isna().sum())

voter_columns = [col for col in corrected_data.columns if col.startswith("VTR")]

#Create column with total number of votes in voter_columns per row
corrected_data['VTR_TOTAL_VOTES'] = corrected_data[voter_columns].notnull().sum(axis=1)

#Sum Democrat and Republican totals
corrected_data['VTR_TOTAL_DVOTES'] = corrected_data[voter_columns].isin(['D', 'M', 'Z']).sum(axis=1)
corrected_data['VTR_TOTAL_RVOTES'] = corrected_data[voter_columns].isin(['R', 'P', 'X']).sum(axis=1)

#Create conditions that evaluate whether someone has cast at least two more votes for Democrats or Republicans: 
conditions = [
    corrected_data['VTR_TOTAL_DVOTES'] - corrected_data['VTR_TOTAL_RVOTES'] >= 2,
    corrected_data['VTR_TOTAL_DVOTES'] - corrected_data['VTR_TOTAL_RVOTES'] >= 2
]

choices = ['D', 'R']

corrected_data['VTR_INFER_PARTY'] = np.select(conditions, choices, default=np.nan)
corrected_data['VTR_INFER_PARTY'].replace('nan', np.nan, inplace=True)

# Assign a 'D' or an 'R' to INFER_PARTY if either condition is true:
corrected_data.loc[(corrected_data['VTR_INFER_PARTY'] == 'D') , 'INFER_PARTY'] = 'D'
corrected_data.loc[(corrected_data['VTR_INFER_PARTY'] == 'R') , 'INFER_PARTY'] = 'R'

print("Total number of D's and R's in INFER_PARTY column:")
print(len(corrected_data.loc[corrected_data['INFER_PARTY'].isin(['D','R'])])) 
print("Total number of NaNs in INFER_PARTY column:")
print(sum(corrected_data['INFER_PARTY'].isna()))