In [19]:
import pandas as pd
import numpy as np
import time
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

## 500K dataset

In [4]:
# Data Loading
data_500 = pd.read_csv("/Users/nirugidla/Documents/GitHub/milestone2_MADS/data_500k.csv", low_memory=False)

# Data Cleaning - Column Names
data_500.columns = data_500.columns.str.strip()

# Data Cleaning - Drop ZIP
data_500.drop('ZIP', axis=1, inplace=True)

# Data Cleaning - Drop Duplicates
data_500.drop_duplicates(inplace=True)

# Data Cleaning - Object Columns
for col in data_500.columns:
    if data_500[col].dtype == 'object':
        data_500[col] = data_500[col].str.strip()

# Data Cleaning - Empty Strings
data_500.replace('', 'Unknown', inplace=True)

# Data Cleaning - NaN for Object Types
data_500.loc[:, data_500.dtypes == 'object'] = data_500.loc[:, data_500.dtypes == 'object'].fillna('Unknown')

# Data Cleaning - Drop Columns and Rows with All NaNs
data_500.dropna(axis=1, how='all', inplace=True)
data_500.dropna(axis=0, how='all', inplace=True)

# Identify numeric and non-numeric columns
numeric_cols = data_500.select_dtypes(include=['int64', 'float64']).columns
non_numeric_cols = data_500.select_dtypes(exclude=['int64', 'float64']).columns

# Data Cleaning - Removing Non-Numeric Columns with More Than 90% Missing Data
missing_data_percentage = data_500.isnull().mean() * 100
non_numeric_cols_to_remove = missing_data_percentage[non_numeric_cols]
non_numeric_cols_to_remove = non_numeric_cols_to_remove[non_numeric_cols_to_remove > 90].index.tolist()
data_500_reduced = data_500.drop(columns=non_numeric_cols_to_remove)

# Update the list of non-numeric columns after removal
non_numeric_cols = data_500_reduced.select_dtypes(exclude=['int64', 'float64']).columns

# Identifying Specific Types of Non-Numeric Columns
cols_with_Y_or_Unknown = [col for col in non_numeric_cols if set(data_500_reduced[col].unique()) <= {'Y', 'Unknown'}]
cols_with_more_than_two_categories = [col for col in non_numeric_cols if len(data_500_reduced[col].unique()) > 2]

# Print identified columns
print("Columns with 'Y' or 'Unknown':", cols_with_Y_or_Unknown, len(cols_with_Y_or_Unknown))
print("Columns with more than two categories:", cols_with_more_than_two_categories, len(cols_with_more_than_two_categories))

Columns with 'Y' or 'Unknown': ['AFAMPROFLS', 'APP_CHILD', 'APP_MENBIG', 'APP_TODDLR', 'APP_WOMEN', 'APP_WOMPET', 'APP_WOMPLS', 'APP_YNGMEN', 'ARTS', 'AUTOACCES', 'AUTOWORK', 'BOATING', 'BROADERLIV', 'CARDUSER', 'CATOWNER', 'CH_0002FEM', 'CH_0002MAL', 'CH_0002UNK', 'CH_0305FEM', 'CH_0305MAL', 'CH_0305UNK', 'CH_0610FEM', 'CH_0610MAL', 'CH_0610UNK', 'CH_1115FEM', 'CH_1115MAL', 'CH_1115UNK', 'CH_1617FEM', 'CH_1617MAL', 'CH_1617UNK', 'CHRISTFAM', 'COL_ANTIQ', 'COL_ARTS', 'COL_COIN', 'COL_SPORT', 'COL_STAMP', 'COMPHOMOFC', 'COMPUTERS', 'COOK_GEN', 'CURRAFFAIR', 'DEPTSTCRD', 'DIETING', 'DIYLIV', 'DOGOWNER', 'DON_ANML', 'DON_ARTCUL', 'DON_CHARIT', 'DON_CHILD', 'DON_ENVIR', 'DON_ENVWLD', 'DON_HEALTH', 'DON_INTAID', 'DON_OTHER', 'DON_POLCONS', 'DON_POLIT', 'DON_POLLIB', 'DON_RELIG', 'DON_VET', 'DONATION', 'EDU_ONLINE', 'EQUESTRIAN', 'EXER_GROUP', 'GAMING', 'GARDENER', 'GOLF', 'GRANDCHLD', 'HEALTHBEAU', 'HEATHMED', 'HH_SENIOR', 'HH_VETERAN', 'HH_YOUNGAD', 'HIGHBROW', 'HIGHENDAPP', 'HISTMIL', 'HI

## time decorator

In [None]:
def timer_decorator(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Elapsed time: {elapsed_time:.2f} seconds")
        return result
    return wrapper

In [29]:
@timer_decorator
def run_random_forest(data, sample_size=100000, num_runs=1, top_N_features=10):
    results_list = []

    for run in range(1, num_runs + 1):
        print(f"Starting Random Forest run {run}...")
        
        # Copy the original data to work with
        data_copy = data.copy()
        
        # Drop rows with missing 'PARTY_CODE'
        data_copy = data_copy.dropna(subset=['PARTY_CODE'])
        
        # Sample data
        data_sample = data_copy.sample(n=sample_size, random_state=42+run)
        
        # Encode 'PARTY_CODE' column
        le = LabelEncoder()
        data_sample['PARTY_CODE'] = le.fit_transform(data_sample['PARTY_CODE'].astype(str))
        
        # Select numerical columns
        numerical_cols = data_sample.select_dtypes(include=['int64', 'float64']).columns
        
        # Impute missing values with column means
        imputer = SimpleImputer(strategy='mean')
        data_sample[numerical_cols] = imputer.fit_transform(data_sample[numerical_cols])
        
        # Extract features and target variable
        X = data_sample[numerical_cols]
        y = data_sample['PARTY_CODE']
        
        # Train-test split without stratified sampling
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Initialize Random Forest Classifier
        rf = RandomForestClassifier(random_state=42+run)
        
        # Fit the model
        rf.fit(X_train, y_train)
        
        # Make predictions
        y_pred = rf.predict(X_test)
        
        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        
        # Get top N features using SelectKBest with chi-squared
        k_best = SelectKBest(score_func=chi2, k=top_N_features)
        k_best.fit(X_train, y_train)
        top_feature_indices = np.argsort(k_best.scores_)[::-1][:top_N_features]
        top_features = X.columns[top_feature_indices].tolist()
        
        # Append results to the list
        results_list.append({
            'Run': run,
            'Accuracy': accuracy,
            'F1_Score': f1,
            'Recall': recall,
            'Top_N_Features': top_features
        })

    # Create a DataFrame from the results list
    results_df = pd.DataFrame(results_list)

    return results_df

# Assuming data_500 is your dataset, you can call the function like this:
result = run_random_forest(data_500, sample_size=50000, num_runs=2, top_N_features=10)
result

Starting Random Forest run 1...
Starting Random Forest run 2...
Elapsed time: 32.87 seconds


Unnamed: 0,Run,Accuracy,F1_Score,Recall,Top_N_Features
0,1,0.9841,0.979634,0.9841,"[CENSUS_TRK, RECORD_ID, VP_PPP, CNSUS_PCTB, PA..."
1,2,0.9871,0.983469,0.9871,"[CENSUS_TRK, RECORD_ID, VP_PPP, CNSUS_PCTB, PA..."


In [30]:
result['Top_N_Features'][0]

['CENSUS_TRK',
 'RECORD_ID',
 'VP_PPP',
 'CNSUS_PCTB',
 'PARTY_CODE',
 'TOD_PRES_R_2016_PREC',
 'TOD_PRES_D_2016_PREC',
 'CNSUS_PCTW',
 'VP_PRI',
 'TOD_PRES_R_2020_PREC']

#### Gradient boosting

In [32]:
@timer_decorator
def run_gradient_boosting(data, sample_size=100000, num_runs=1, top_N_features=10):
    results_list = []

    for run in range(1, num_runs + 1):
        print(f"Starting Gradient Boosting run {run}...")
        
        # Copy the original data to work with
        data_copy = data.copy()
        
        # Drop rows with missing 'PARTY_CODE'
        data_copy = data_copy.dropna(subset=['PARTY_CODE'])
        
        # Sample data
        data_sample = data_copy.sample(n=sample_size, random_state=42)
        
        # Encode 'PARTY_CODE' column
        le = LabelEncoder()
        data_sample['PARTY_CODE'] = le.fit_transform(data_sample['PARTY_CODE'].astype(str))
        
        # Check for classes with only one instance after sampling and encoding
        class_counts = data_sample['PARTY_CODE'].value_counts()
        min_class_count = class_counts.min()
        
        # Select numerical columns
        numerical_cols = data_sample.select_dtypes(include=['int64', 'float64']).columns
        
        # Impute missing values with column means
        imputer = SimpleImputer(strategy='mean')
        data_sample[numerical_cols] = imputer.fit_transform(data_sample[numerical_cols])
        
        # Extract features and target variable
        X = data_sample[numerical_cols]
        y = data_sample['PARTY_CODE']
        
        # Train-test split
        if min_class_count > 1:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
        else:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        
        # Initialize Gradient Boosting Classifier
        gb = GradientBoostingClassifier(random_state=42)
        
        # Fit the model
        gb.fit(X_train, y_train)
        
        # Make predictions
        y_pred = gb.predict(X_test)
        
        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted', zero_division='warn')
        
        # Get top N features using SelectKBest with chi-squared
        k_best = SelectKBest(score_func=chi2, k=top_N_features)
        k_best.fit(X_train, y_train)
        top_feature_indices = np.argsort(k_best.scores_)[::-1][:top_N_features]
        top_features = X.columns[top_feature_indices].tolist()
        
        # Append results to the list
        results_list.append({
            'Run': run,
            'Accuracy': accuracy,
            'F1_Score': f1,
            'Recall': recall,
            'Top_N_Features': top_features
        })

    # Create a DataFrame from the results list
    results_df = pd.DataFrame(results_list)

    return results_df

# Assuming data_500 is your dataset, you can call the function like this:
result = run_gradient_boosting(data_500, sample_size=50000, num_runs=2, top_N_features=10)
result

Starting Gradient Boosting run 1...
Starting Gradient Boosting run 2...
Elapsed time: 456.50 seconds


Unnamed: 0,Run,Accuracy,F1_Score,Recall,Top_N_Features
0,1,0.9992,0.9992,0.9992,"[CENSUS_TRK, RECORD_ID, VP_PPP, CNSUS_PCTB, PA..."
1,2,0.9992,0.9992,0.9992,"[CENSUS_TRK, RECORD_ID, VP_PPP, CNSUS_PCTB, PA..."


In [33]:
result['Top_N_Features'][0]

['CENSUS_TRK',
 'RECORD_ID',
 'VP_PPP',
 'CNSUS_PCTB',
 'PARTY_CODE',
 'TOD_PRES_R_2016_PREC',
 'TOD_PRES_D_2016_PREC',
 'CNSUS_PCTW',
 'VP_PRI',
 'TOD_PRES_R_2020_PREC']

In [28]:
@timer_decorator
def run_gradient_boosting(data, sample_size=100000, num_runs=1, top_N_features=10):
    results_list = []

    for run in range(1, num_runs + 1):
        print(f"Starting Gradient Boosting run {run}...")
        
        # Copy the original data to work with
        data_copy = data.copy()
        
        # Check class distribution
        print("Class distribution before dropping NaNs: ", data_copy['PARTY_CODE'].value_counts())
        
        # Drop rows with missing 'PARTY_CODE'
        data_copy = data_copy.dropna(subset=['PARTY_CODE'])
        
        # Check class distribution
        print("Class distribution after dropping NaNs: ", data_copy['PARTY_CODE'].value_counts())
        
        # Sample data
        data_sample = data_copy.sample(n=sample_size, random_state=42+run)
        
        # Check class distribution
        print("Class distribution after sampling: ", data_sample['PARTY_CODE'].value_counts())
        
        # Encode 'PARTY_CODE' column
        le = LabelEncoder()
        data_sample['PARTY_CODE'] = le.fit_transform(data_sample['PARTY_CODE'].astype(str))
        
        # Extract features and target variable
        X = data_sample.drop('PARTY_CODE', axis=1)
        y = data_sample['PARTY_CODE']
        
        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42+run, stratify=y)
        
        # Initialize Gradient Boosting Classifier
        gb = GradientBoostingClassifier(random_state=42+run)
        
        # Fit the model
        gb.fit(X_train, y_train)
        
        # Make predictions
        y_pred = gb.predict(X_test)
        
        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)  # Changed zero_division to 1
        
        # Get top N features using SelectKBest with chi-squared
        k_best = SelectKBest(score_func=chi2, k=top_N_features)
        k_best.fit(X_train, y_train)
        top_feature_indices = np.argsort(k_best.scores_)[::-1][:top_N_features]
        top_features = X.columns[top_feature_indices].tolist()
        
        # Append results to the list
        results_list.append({
            'Run': run,
            'Accuracy': accuracy,
            'F1_Score': f1,
            'Recall': recall,
            'Top_N_Features': top_features
        })

    # Create a DataFrame from the results list
    results_df = pd.DataFrame(results_list)

    return results_df

# Assuming data_500 is your dataset, you can call the function like this:
result = run_gradient_boosting(data_500, sample_size=50000, num_runs=2, top_N_features=10)
result
result['Top_N_Features'][0]

['CENSUS_TRK', 'RECORD_ID', 'VP_PPP', 'CNSUS_PCTB', 'PARTY_CODE']

#### Logistic Regression

In [20]:
@timer_decorator
def run_logistic_regression(data, sample_size=100000, num_runs=1, top_N_features=10):
    results_list = []

    for run in range(1, num_runs + 1):
        print(f"Starting Logistic Regression run {run}...")
        
        # Copy the original data to work with
        data_copy = data.copy()
        
        # Drop rows with missing 'PARTY_CODE'
        data_copy = data_copy.dropna(subset=['PARTY_CODE'])
        
        # Sample data
        data_sample = data_copy.sample(n=sample_size, random_state=42+run)
        
        # Encode 'PARTY_CODE' column
        le = LabelEncoder()
        data_sample['PARTY_CODE'] = le.fit_transform(data_sample['PARTY_CODE'].astype(str))
        
        # Check for classes with only one instance after sampling and encoding
        class_counts = data_sample['PARTY_CODE'].value_counts()
        min_class_count = class_counts.min()
        
        # Select numerical columns
        numerical_cols = data_sample.select_dtypes(include=['int64', 'float64']).columns
        
        # Impute missing values with column means
        imputer = SimpleImputer(strategy='mean')
        data_sample[numerical_cols] = imputer.fit_transform(data_sample[numerical_cols])
        
        # Extract features and target variable
        X = data_sample[numerical_cols]
        y = data_sample['PARTY_CODE']
        
        # Train-test split
        if min_class_count > 1:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
        else:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        
        # Initialize Logistic Regression model
        lr = LogisticRegression(C=0.1, max_iter=10000, random_state=42+run)
    
        # Fit the model
        lr.fit(X_train, y_train)
        
        # Make predictions
        y_pred = lr.predict(X_test)
        
        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted', zero_division='warn')
        
        # Get top N features using SelectKBest with chi-squared
        k_best = SelectKBest(score_func=chi2, k=top_N_features)
        k_best.fit(X_train, y_train)
        top_feature_indices = np.argsort(k_best.scores_)[::-1][:top_N_features]
        top_features = X.columns[top_feature_indices].tolist()
        
        # Append results to the list
        results_list.append({
            'Run': run,
            'Accuracy': accuracy,
            'F1_Score': f1,
            'Recall': recall,
            'Top_N_Features': top_features
        })

    # Create a DataFrame from the results list
    results_df = pd.DataFrame(results_list)

    return results_df

# Assuming data_500 is your dataset, you can call the function like this:
result = run_logistic_regression(data_500, sample_size=50000, num_runs=2, top_N_features=10)
result

Starting Logistic Regression run 1...
Starting Logistic Regression run 2...
Starting Logistic Regression run 3...
Starting Logistic Regression run 4...
Starting Logistic Regression run 5...
Starting Logistic Regression run 6...
Starting Logistic Regression run 7...
Starting Logistic Regression run 8...
Starting Logistic Regression run 9...
Starting Logistic Regression run 10...
Elapsed time: 363.45 seconds


Unnamed: 0,Run,Accuracy,F1_Score,Recall,Top_N_Features
0,1,0.7568,0.7438,0.7568,"[PARTY_CODE, PARTY_MIX, VP_PPP, CNSUS_PCTB, TO..."
1,2,0.7568,0.7438,0.7568,"[PARTY_CODE, PARTY_MIX, VP_PPP, CNSUS_PCTB, TO..."
2,3,0.7568,0.7438,0.7568,"[PARTY_CODE, PARTY_MIX, VP_PPP, CNSUS_PCTB, TO..."
3,4,0.7568,0.7438,0.7568,"[PARTY_CODE, PARTY_MIX, VP_PPP, CNSUS_PCTB, TO..."
4,5,0.7568,0.7438,0.7568,"[PARTY_CODE, PARTY_MIX, VP_PPP, CNSUS_PCTB, TO..."
5,6,0.7568,0.7438,0.7568,"[PARTY_CODE, PARTY_MIX, VP_PPP, CNSUS_PCTB, TO..."
6,7,0.7568,0.7438,0.7568,"[PARTY_CODE, PARTY_MIX, VP_PPP, CNSUS_PCTB, TO..."
7,8,0.7568,0.7438,0.7568,"[PARTY_CODE, PARTY_MIX, VP_PPP, CNSUS_PCTB, TO..."
8,9,0.7568,0.7438,0.7568,"[PARTY_CODE, PARTY_MIX, VP_PPP, CNSUS_PCTB, TO..."
9,10,0.7568,0.7438,0.7568,"[PARTY_CODE, PARTY_MIX, VP_PPP, CNSUS_PCTB, TO..."


## Data Cleaning