In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the CSV file
df = pd.read_csv("../data/CPS_Clean1.csv")

In [3]:
# Remove columns ending with '_2'
cols_to_drop = [col for col in df.columns if col.endswith('_2')]
df.drop(columns=cols_to_drop, inplace=True)

# Rename columns ending with '_1' by removing the suffix
df.rename(columns=lambda x: x[:-2] if x.endswith('_1') else x, inplace=True)

In [36]:
# Remove instances where BPL or NATIVITY are missing
df.dropna(subset=["BPL", "NATIVITY", "LABFORCE"], inplace=True)

In [43]:
df.drop(columns=["ASIAN", "DURUNEMP", "WHYUNEMP", "WHYABSNT", "WNFTLOOK", 
                 "WKSUNEM1", "WNLWNILF", "INCRETIR", 
                 "HOURWAGE", "PAIDHOUR", "UNION", "EARNWEEK", "POPSTAT"], inplace=True, errors='ignore')

In [44]:
# Split based on occupational presence
df_work = df[df["OCC"] != 0]
df_0 = df[df["OCC"] == 0]

In [48]:
# Assuming df_work and df_0 are already defined, get the number of rows in each.
n_instances_work = len(df_work)
n_instances_0 = len(df_0)

# Initialize the report dictionary
report = {}

# Loop through each column (feature) in df_work (assuming df_0 shares the same features)
for col in df_work.columns:
    # For df_work (numeric columns only)
    if pd.api.types.is_numeric_dtype(df_work[col]):
        work_median = df_work[col].median()
        # Count instances in df_work equal to the median value
        work_median_count = (df_work[col] == work_median).sum()
        # Calculate percentage of median values for df_work
        work_median_percent = (work_median_count / n_instances_work) * 100
    else:
        work_median = "Not Applicable"
        work_median_percent = "Not Applicable"
    
    # Missing data for df_work: count NaNs, and for string columns also empty strings.
    missing_work = df_work[col].isna()
    if pd.api.types.is_string_dtype(df_work[col]):
        missing_work |= (df_work[col].str.strip() == '')
    missing_count_work = missing_work.sum()
    missing_percent_work = (missing_count_work / n_instances_work) * 100

    # For df_0:
    if col in df_0.columns:
        if pd.api.types.is_numeric_dtype(df_0[col]):
            median_0 = df_0[col].median()
            median_count_0 = (df_0[col] == median_0).sum()
            median_percent_0 = (median_count_0 / n_instances_0) * 100
        else:
            median_0 = "Not Applicable"
            median_percent_0 = "Not Applicable"

        # Missing data for df_0
        missing_0 = df_0[col].isna()
        if pd.api.types.is_string_dtype(df_0[col]):
            missing_0 |= (df_0[col].str.strip() == '')
        missing_count_0 = missing_0.sum()
        missing_percent_0 = (missing_count_0 / n_instances_0) * 100
    else:
        median_0 = median_percent_0 = missing_percent_0 = "Column not found in df_0"
    
    # Save the results in the dictionary without "min" and "max"
    report[col] = {
        "median": work_median,                        # median for df_work
        "median_df_0": median_0,                        # median for df_0
        "median_percent": work_median_percent,          # percent of values equal to the median in df_work
        "median_percent_df_0": median_percent_0,          # percent of values equal to the median in df_0 (for numeric col)
        "missing_percent": missing_percent_work,        # percent of missing data in df_work
        "missing_percent_df_0": missing_percent_0         # percent of missing data in df_0
    }

# Create a DataFrame from the report dictionary.
report_df = pd.DataFrame(report).T

# Set pandas options to display all columns side-by-side.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)  # Adjust width as needed

print(report_df)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


                    median   median_df_0  median_percent  median_percent_df_0  missing_percent  missing_percent_df_0
YEAR          2.015000e+03  2.015000e+03        7.044177             7.152032         0.000000              0.000000
SERIAL        4.551900e+04  4.856200e+04        0.001599             0.002072         0.000000              0.000000
MONTH         3.000000e+00  3.000000e+00      100.000000           100.000000         0.000000              0.000000
CPSID         2.014120e+13  2.015020e+13        0.000640             0.000000         0.000000              0.000000
ASECFLAG      1.000000e+00  1.000000e+00      100.000000           100.000000         0.000000              0.000000
ASECWTH       1.681290e+03  1.785205e+03        0.002558             0.000000         0.000000              0.000000
PERNUM        1.000000e+00  2.000000e+00       52.719557            31.104528         0.000000              0.000000
CPSIDP        2.014120e+13  2.015020e+13        0.000000        

In [46]:
# Fill missing values with the mode for specified categorical columns
mode_cols = ["VETSTAT", "WKSTAT", "CLASSWLY", "FULLPART", "PENSION"]
for col in mode_cols:
    if col in df_work.columns:
        mode_val = df_work[col].mode(dropna=True)
        if not mode_val.empty:
            df_work[col] = df_work[col].fillna(mode_val.iloc[0])

# Fill missing values with the median for specified numerical columns
median_cols = ["UHRSWORKT", "UHRSWORK1", "UHRSWORKLY", "FIRMSIZE", "NUMEMPS"]
for col in median_cols:
    if col in df_work.columns:
        median_val = df_work[col].median()
        df_work[col] = df_work[col].fillna(median_val)

df_work["WKXPNS"] = df_work["WKXPNS"].fillna(0)

# Fill missing SCHLCOLL values by mapping the modal value for that row's AGE
age_mode = df_work.groupby("AGE")["SCHLCOLL"].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
df_work["SCHLCOLL"] = df_work.apply(lambda row: age_mode[row["AGE"]] if pd.isna(row["SCHLCOLL"]) else row["SCHLCOLL"], axis=1)
df_work.loc[(df_work["AGE"] > 50) & (df_work["SCHLCOLL"].isna()), "SCHLCOLL"] = 5
df_work.loc[(df_work["AGE"] == 15) & (df_work["SCHLCOLL"].isna()), "SCHLCOLL"] = 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_work[col] = df_work[col].fillna(mode_val.iloc[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_work[col] = df_work[col].fillna(median_val)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_work[col] = df_work[col].fillna(median_val)
A value is trying to be set on a copy of a slice from a Da

In [39]:
# Fill missing values with the mode for specified categorical columns
mode_cols = ["VETSTAT", "PENSION", "DIFFHEAR", "DIFFHEAR", "DIFFEYE", "DIFFREM", "DIFFPHYS", "DIFFMOB", "DIFFCARE", "DIFFANY", "NWLOOKWK", "WANTJOB"]
for col in mode_cols:
    if col in df_0.columns:
        mode_val = df_0[col].mode(dropna=True)
        if not mode_val.empty:
            df_0[col] = df_0[col].fillna(mode_val.iloc[0])

# Fill missing values with the median for specified numerical columns
median_cols = ["UHRSWORKLY", "FIRMSIZE", "NUMEMPS"]
for col in median_cols:
    if col in df_0.columns:
        median_val = df_0[col].median()
        df_0[col] = df_0[col].fillna(median_val)

# Fill missing SCHLCOLL values by mapping the modal value for that row's AGE
age_mode = df_0.groupby("AGE")["SCHLCOLL"].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
df_0["SCHLCOLL"] = df_0.apply(lambda row: age_mode[row["AGE"]] if pd.isna(row["SCHLCOLL"]) else row["SCHLCOLL"], axis=1)
df_0.loc[(df_0["AGE"] > 50) & (df_0["SCHLCOLL"].isna()), "SCHLCOLL"] = 5
df_0.loc[(df_0["AGE"] == 15) & (df_0["SCHLCOLL"].isna()), "SCHLCOLL"] = 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_0[col] = df_0[col].fillna(mode_val.iloc[0])
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_0[col] = df_0[col].fillna(median_val)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_0[col] = df_0[col].fillna(median_val)
A value is trying t

In [50]:
combined_df = pd.concat([df_work, df_0], ignore_index=True)

In [51]:
# Define the adjustment factors by YEAR
adjustment_factors = {
    2009: 1.42,
    2010: 1.397,
    2011: 1.355,
    2012: 1.327,
    2013: 1.308,
    2014: 1.287,
    2015: 1.286,
    2016: 1.27,
    2017: 1.243,
    2018: 1.213,
    2019: 1.192,
    2020: 1.177,
    2021: 1.124,
    2022: 1.041,
    2023: 1
}

# Create a Series for the adjustment factor corresponding to each row's YEAR
adjustment_series = combined_df["YEAR"].map(adjustment_factors)

# Multiply all columns whose name contains "INC" by the adjustment factor, exempting specific columns
exempt_inc = {"INCPER", "INCPER_DELTA"}
for col in combined_df.columns:
    if "INC" in col and col not in exempt_inc and pd.api.types.is_numeric_dtype(combined_df[col]):
        combined_df[col] = combined_df[col] * adjustment_series

In [52]:
# Define the exception columns
# save_exceptions = {"YEAR", "SERIAL", "MONTH", "CPSID", "ASECFLAG", 
#               "ASECWTH", "PERNUM", "CPSIDP", "CPSIDV", "ASECWT", "INCCHANGE"}

save_exceptions = {"INCCHANGE", "SERIAL", "MONTH", "CPSID", "ASECFLAG", 
               "ASECWTH", "PERNUM", "CPSIDP", "CPSIDV", "ASECWT"}

# Create a new DataFrame with columns not in exceptions
df_to_save = combined_df[[col for col in combined_df.columns if col not in save_exceptions]]

# Save the DataFrame to a CSV file
df_to_save.to_csv("../data/cps_clean_v2.csv", index=False)