In [8]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from preprocessing_utils import *

# Directory holding the cleaned milestone datasets. It defaults to the
# original external-volume location so existing runs behave exactly as before;
# set the MILESTONE_DATA_DIR environment variable to run the notebook on a
# machine where the data lives somewhere else.
_data_dir = os.environ.get(
    "MILESTONE_DATA_DIR", "/Volumes/DeepLearner/MADS/Milestone_data"
)

# Paths are kept as plain strings so downstream cells that treat them as str
# keep working.
survey_data_path = os.path.join(_data_dir, "survey_cleaned.csv")
data_5k_path = os.path.join(_data_dir, "data_5k_cleaned.csv")

In [9]:
# Read the cleaned survey CSV and discard the two columns that are not used
# as features ('Unnamed: 0' is presumably a leftover saved index -- TODO confirm).
df_survey = (
    pd.read_csv(survey_data_path, low_memory=False)
    .drop(columns=['Unnamed: 0', 'RECORD_ID'])
)

# Preview the first rows together with the (rows, columns) shape.
df_survey.head(), df_survey.shape

(  ADD_TYPE   AGE AI_COUNTY_NAME  CENSUS_ST  CENSUS_TRK  CNS_MEDINC  \
 0        S  63.0         MARION         18    390602.0       728.0   
 1        S  73.0           POLK         12     14123.0       511.0   
 2        S  86.0    LOS ANGELES          6    403703.0       728.0   
 3        S  55.0          OCEAN         34    714200.0       868.0   
 4        S  73.0      JEFFERSON          8      9832.0       793.0   
 
    CNSUS_PCTA  CNSUS_PCTB  CNSUS_PCTH  CNSUS_PCTI  ...  VTR_PRI20  VTR_PRI21  \
 0    0.000000   30.357143    7.142857         0.0  ...          N          N   
 1    0.000000   28.571429   14.285714         0.0  ...          N          N   
 2   24.324324    0.000000   43.243243         0.0  ...          N          N   
 3    0.000000    0.000000   13.636364         0.0  ...          N          N   
 4    0.000000    8.450704   14.788732         0.0  ...          Y          N   
 
    VTR_PRI22    ZIP           Q1_Candidate                  Q2_Support  \
 0       

In [14]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

def preprocess_dataframe(df, use_frequency_encoding=False):
    """
    Turn the cleaned survey/voter dataframe into an encoded feature frame.

    Steps, applied in this order:
      1. Upper-case every string cell.
      2. Replace 'N' with 'UNKNOWN' in PRFL_MINWAGE.
      3. One-hot encode a fixed list of categorical columns, then every
         column whose name starts with 'PRFL_' (first level dropped).
      4. Split each TOD_PRES_DIFF_* column into '<col>_num' (first run of
         digits, as float) and '<col>_party' (first 'R' or 'D'); the
         original column is kept.
      5. Cast the count-like columns to int.
      6. Label-encode the columns in `label_cols` (includes HH_SIZE, which
         was cast to int in the previous step).
      7. Drop ETHNICCODE.
      8. Optionally add '<col>_freq' relative-frequency columns for ZIP,
         STATE and COUNTY_ST (the source columns are kept).

    The first step builds a new frame, so the caller's dataframe is not
    modified.

    Parameters:
        df (DataFrame): Input dataframe. Must contain every column named in
            the lists below.
        use_frequency_encoding (bool): Whether to add frequency-encoded
            features for ZIP, STATE and COUNTY_ST.

    Returns:
        DataFrame: Preprocessed dataframe.

    Raises:
        KeyError: Propagated from pandas if an expected column is missing.
        ValueError: Propagated from pandas if one of the int-cast columns
            contains NaN or other values that cannot be converted to int.
    """

    def _upper_if_str(value):
        # Leave non-string cells (numbers, NaN, ...) untouched.
        return value.upper() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; check the class (not the instance) so a column that
    # happens to be called "map" cannot be mistaken for the method.
    if hasattr(pd.DataFrame, "map"):
        df = df.map(_upper_if_str)
    else:
        df = df.applymap(_upper_if_str)

    df['PRFL_MINWAGE'] = df['PRFL_MINWAGE'].replace('N', 'UNKNOWN')

    # One-hot encoding
    one_hot_cols = [
        'CENSUS_ST', 'AI_COUNTY_NAME', 'ADD_TYPE', 'CENSUS_TRK', 'CONG_DIST',
        'COUNTY_TYPE', 'DON_CHARIT', 'DON_POLIT', 'ETHNIC_INFER',
        'GENDER_MIX', 'GENERATION', 'HOMEOWNER', 'HOMEOWNRNT', 'LANGUAGE',
        'LIFESTAGE_CLUSTER', 'PARTY_CODE', 'PARTY_MIX', 'PRESENCHLD', 'RELIGION',
        'SEX', 'ST_LO_HOUS', 'ST_UP_HOUS', 'STATUS'
    ]

    df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

    # Handle PRFL_ columns (collected after the first encoding pass, exactly
    # as before, so the resulting column order is unchanged).
    prfl_cols = [col for col in df.columns if col.startswith('PRFL_')]
    df = pd.get_dummies(df, columns=prfl_cols, drop_first=True)

    # Splitting columns: raw-string patterns avoid the invalid "\d" escape
    # warning, and expand=False yields a Series for the single capture group.
    split_cols = ['TOD_PRES_DIFF_2016', 'TOD_PRES_DIFF_2016_PREC', 'TOD_PRES_DIFF_2020_PREC']
    for col in split_cols:
        df[col + '_num'] = df[col].str.extract(r'(\d+)', expand=False).astype('float')
        df[col + '_party'] = df[col].str.extract(r'([RD])', expand=False)

    # Convert to int
    int_cols = ['VOTER_CNT', 'TRAIL_CNT', 'CNS_MEDINC', 'HH_SIZE', 'LENGTH_RES', 'PERSONS_HH']
    df[int_cols] = df[int_cols].astype(int)

    # Label encoding: a fresh encoder per column, so codes are independent
    # between columns.
    label_cols = ['CREDRATE', 'EDUCATION', 'HH_SIZE', 'HOMEMKTVAL', 'INCOMESTHH', 'NETWORTH']
    for col in label_cols:
        df[col] = LabelEncoder().fit_transform(df[col])

    # Drop columns (non-inplace: same result, no hidden mutation).
    drop_cols = ['ETHNICCODE']
    df = df.drop(columns=drop_cols)

    # Frequency encoding for ZIP, State, and County
    if use_frequency_encoding:
        freq_cols = ['ZIP', 'STATE', 'COUNTY_ST']
        for col in freq_cols:
            # Share of rows carrying each distinct value of the column.
            freq_map = df[col].value_counts(normalize=True)
            df[col + '_freq'] = df[col].map(freq_map)

    return df



In [15]:
# Encode the survey frame, including the optional frequency features.
# NOTE: this rebinds df_survey to the processed frame, so the cell is not
# idempotent -- running it a second time passes the already-encoded frame back
# into preprocess_dataframe, which then fails with a KeyError because columns
# such as PRFL_MINWAGE have been replaced by their dummy columns. Re-run the
# load cell first if this cell needs to be executed again.
df_survey = preprocess_dataframe(df_survey, use_frequency_encoding=True)
# Preview the first rows together with the (rows, columns) shape.
df_survey.head(), df_survey.shape

(    AGE  CNS_MEDINC  CNSUS_PCTA  CNSUS_PCTB  CNSUS_PCTH  CNSUS_PCTI  \
 0  63.0         728    0.000000   30.357143    7.142857         0.0   
 1  73.0         511    0.000000   28.571429   14.285714         0.0   
 2  86.0         728   24.324324    0.000000   43.243243         0.0   
 3  55.0         868    0.000000    0.000000   13.636364         0.0   
 4  73.0         793    0.000000    8.450704   14.788732         0.0   
 
    CNSUS_PCTM  CNSUS_PCTO  CNSUS_PCTP  CNSUS_PCTW  ...  PRFL_VETERAN_Y  \
 0    5.357143         0.0    0.000000   57.142857  ...               0   
 1    7.142857         0.0    0.000000   50.000000  ...               0   
 2    0.000000         0.0    0.000000   32.432432  ...               0   
 3    2.272727         0.0    0.000000   84.090909  ...               0   
 4    4.225352         0.0    0.704225   71.830986  ...               0   
 
    TOD_PRES_DIFF_2016_num  TOD_PRES_DIFF_2016_party  \
 0                    23.0                         D   
 1