In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Read ../data/cps_clean_v3.csv into `df`; the cells below derive every
# transformed feature from this frame.
df = pd.read_csv(filepath_or_buffer="../data/cps_clean_v3.csv")

In [3]:
# Per-column tally of missing entries.
missing_counts = df.isnull().sum(axis=0)

# Columns are discarded when they have strictly more than 100000 missing
# values (an absolute row count, not a fraction of the frame).
cols_to_drop = missing_counts.loc[missing_counts.gt(100000)].index

# Rebind `df` to a copy without the sparse columns.
df = df.drop(labels=cols_to_drop, axis="columns")

# Report which columns were removed.
print("Dropped columns:", cols_to_drop.tolist())

Dropped columns: ['ELDCH', 'YNGCH', 'YRIMMIG', 'CLASSWKR', 'UHRSWORKT', 'WKSTAT', 'CLASSWLY', 'FULLPART', 'WKXPNS', 'NWLOOKWK', 'WANTJOB']


In [None]:
# Column groups that drive the transformations in the cells below.

# Copied into new_df as-is when it is first created.
# ("oh" presumably stands for one-hot; no encoding is applied in this section.)
oh_columns = [
    "RELATE", "SEX", "MARST", "VETSTAT", "FTYPE",
    "FAMKIND", "FAMREL", "CITIZEN", "NATIVITY", "HISPAN",
    "EMPSTAT", "LABFORCE", "SCHLCOLL", "DIFFHEAR", "DIFFEYE",
    "DIFFREM", "DIFFPHYS", "DIFFMOB", "DIFFCARE", "DIFFANY",
    "WORKLY", "PENSION", "MIGRATE1", "DISABWRK", "QUITSICK",
]

# Coded categoricals that are replaced by *_pca_* columns looked up from the
# key CSV files.
sbert_columns = ["OCC", "IND", "OCCLY", "INDLY", "BPL", "RACE"]

# Income amounts; these are clipped and log10-transformed.
inc_columns = [
    "FTOTVAL", "INCTOT", "INCWAGE", "INCBUS", "INCFARM", "INCSS",
    "INCWELFR", "INCSSI", "INCINT", "INCUNEMP", "INCWKCOM", "INCVET",
    "INCSURV", "INCDISAB", "INCDIVID", "INCRENT", "INCEDUC", "INCCHILD",
    "INCASIST", "INCOTHER", "ADJGINC", "TAXINC",
]

# Carried over to new_df unchanged.
# NOTE: the name is misspelled ("exluded"); it is kept because a later cell
# refers to it by this exact name.
exluded_columns = [
    "YEAR", "INCLOG", "INCZERO_ONE", "INCZERO_TWO", "INCPER_DELTA",
    "comp_zero", "comp_central", "comp_promo", "comp_demo",
]

# Prediction target.
target_columns = ["INCLOG_DELTA"]

# Numeric columns that are standardised with StandardScaler.
ss_columns = [
    "AGE", "FAMSIZE", "NCHILD", "NCHLT5", "NSIBS", "EDUC", "WKSWORK1",
    "UHRSWORKLY", "FIRMSIZE", "NUMEMPS", "MTHWELFR", "HEALTH", "INCPER",
]

In [5]:
# Start the output frame as an independent copy of the oh_columns subset of df,
# so later column additions never write back into df.
new_df = df.loc[:, oh_columns].copy()

In [None]:
# Log-compress every income column in one vectorised step:
#   1. floor each value at 100, so zeros and negatives never reach log10;
#   2. take log10 and divide by 6 (a value of 10**6 therefore maps to 1.0).
# The results are stored in new_df under the original column names.
# NOTE(review): the floor is 100, not 1 — every value <= 100 collapses to
# log10(100) / 6 = 1/3. Confirm that this is the intended lower bound.
new_df[inc_columns] = np.log10(df[inc_columns].clip(lower=100)) / 6

In [7]:
# Standardise the ss_columns of df.
scaler = StandardScaler()

# Learn the per-column statistics, then apply them to the same data.
scaled_values = scaler.fit(df[ss_columns]).transform(df[ss_columns])

# Wrap the resulting array in a frame that reuses df's index and the original
# column names, so it lines up row-for-row with new_df.
scaled_df = pd.DataFrame(data=scaled_values, index=df.index, columns=ss_columns)

# Append the standardised columns to the right of new_df.
new_df = pd.concat(objs=[new_df, scaled_df], axis="columns")

In [8]:
def _load_pca_key(path, code_col, prefix):
    """Read a code-to-PCA key file and keep only its PCA columns.

    Args:
        path: CSV file to read.
        code_col: Column of the CSV holding the category codes; it becomes
            the index of the returned frame.
        prefix: Only columns whose names start with this prefix are kept.

    Returns:
        DataFrame indexed by ``code_col`` containing the ``prefix`` columns.
    """
    return pd.read_csv(path).set_index(code_col).filter(regex=f"^{prefix}")


def _attach_pca(frame, codes, key_df, name, rename=None):
    """Left-join the PCA columns that correspond to ``codes`` onto ``frame``.

    Args:
        frame: DataFrame to extend; it is not modified.
        codes: Series of category codes, aligned to ``frame`` by index.
        key_df: Lookup table indexed by code (see ``_load_pca_key``).
        name: Label of the code column; used for the temporary merge key
            (``f"{name}_temp"``) and in the error message.
        rename: Optional ``(old_prefix, new_prefix)`` pair applied to the
            lookup column names before merging, so the same key file can be
            attached twice under different names.

    Returns:
        A new DataFrame with the lookup columns appended. Codes missing from
        ``key_df`` yield NaN in the appended columns.

    Raises:
        ValueError: If the merge changes the number of rows, which means the
            key file lists the same code more than once.
    """
    if rename is not None:
        old_prefix, new_prefix = rename
        key_df = key_df.rename(columns=lambda col: col.replace(old_prefix, new_prefix))

    temp_col = f"{name}_temp"
    merged = (
        frame.assign(**{temp_col: codes})
        .merge(key_df, left_on=temp_col, right_index=True, how="left")
        .drop(columns=[temp_col])
    )

    # A left join must keep exactly one output row per input row; duplicated
    # codes in the key file would silently multiply rows and break the
    # positional assignments made in later cells.
    if len(merged) != len(frame):
        raise ValueError(
            f"Merging {name} changed the row count from {len(frame)} to "
            f"{len(merged)}; the key file contains duplicate codes."
        )
    return merged


# Each key file is read once, even when it serves two code columns.
occ_key = _load_pca_key("../data/occ_key.csv", "OCC_CODES", "occ_pca_")
ind_key = _load_pca_key("../data/ind_key.csv", "IND_CODES", "ind_pca_")
bpl_key = _load_pca_key("../data/bpl_key.csv", "BPL_CODES", "bpl_pca_")
race_key = _load_pca_key("../data/race_key.csv", "RACE_CODES", "race_pca_")

# (code column in df, lookup table, optional prefix rename), listed in the
# order in which the PCA columns are appended to new_df.
pca_sources = [
    ("OCC", occ_key, None),
    ("OCCLY", occ_key, ("occ_pca_", "occly_pca_")),
    ("IND", ind_key, None),
    ("INDLY", ind_key, ("ind_pca_", "indly_pca_")),
    ("BPL", bpl_key, None),
    ("RACE", race_key, None),
]

for code_col, key_df, rename in pca_sources:
    new_df = _attach_pca(new_df, df[code_col], key_df, code_col, rename)


In [9]:
# Carry the prediction target (target_columns, i.e. INCLOG_DELTA) over from df
# untransformed; values are aligned on the index.
new_df[target_columns] = df[target_columns]

In [10]:
# Pass the exluded_columns through from df untouched, aligned on the index.
new_df = new_df.assign(**{name: df[name] for name in exluded_columns})

In [12]:
# Write the assembled feature table to disk, omitting the index column.
new_df.to_csv(path_or_buf="../data/cleanv3_transform.csv", index=False)

In [13]:
# Standardise INCPER on its own. This rebinds `scaler`, so the multi-column
# scaler fitted on ss_columns is no longer reachable under that name.
# NOTE(review): INCPER is also listed in ss_columns, so new_df may already hold
# an identically standardised INCPER column — confirm this cell is still needed.
scaler = StandardScaler()
scaled_incpers = scaler.fit(df[['INCPER']]).transform(df[['INCPER']])

# The scaler returns a single-column 2-D array; store that column in new_df
# (positional assignment, so new_df must have the same row order as df).
new_df['INCPER'] = scaled_incpers[:, 0]

# Re-write the output file so that it includes the INCPER column set above.
new_df.to_csv(path_or_buf="../data/cleanv3_transform.csv", index=False)

In [None]:
# Extra feature specification; the code that consumes it is not in this section.
# Each inner list holds tuples that are presumably
# (column, transform kind, parameter) — TODO confirm against the consumer.
#
# An earlier assignment of [[('INCTOT', 'spline', 20)]] was overwritten right
# away by the value below and never took effect, so that dead store has been
# removed. Append [('INCTOT', 'spline', 20)] here if both specs are wanted.
new_features_to_add = [
    [('EDUC', 'spline', 5)],
]