In [None]:
import pandas as pd

In [41]:
# Input CSV file names, keyed by a short label for each data source.
csvs = dict(
    cluster="Cluster_merge_raw.csv",
    realtor="Realtor - Market Hotness.csv",
    statsAmerica="StatsAmerica - Population by Age and Sex - Clean.csv",
)

In [105]:
# read in, clean FIPS data

# Raw cluster table from the local CSV.
df_cluster = pd.read_csv(csvs["cluster"])

# County unemployment table; "fips" is read as text rather than as a number.
df_unemp = pd.read_csv(
    "https://raw.githubusercontent.com/plotly/datasets/master/fips-unemp-16.csv",
    dtype={"fips": str},
)

df_cluster = (
    df_cluster
    # Keep only the rows where the two geography ids agree.
    .loc[lambda frame: frame.ibrc_geo_id_pop == frame.ibrc_geo_id_combo]
    # Discard the leftover index column and columns not carried forward.
    .drop(
        columns=[
            "Unnamed: 0",
            "ibrc_geo_id_pop",
            "ibrc_geo_id_sc",
            "county_fips_hot",
            "description_sc",
        ]
    )
    .rename(columns={"ibrc_geo_id_combo": "fips"})
    # Turn the id into a zero-padded, 5-character string so it can be joined
    # against the text "fips" key of df_unemp.
    .assign(
        fips=lambda frame: frame["fips"].astype("int64").astype(str).str.zfill(5)
    )
    # Inner join: rows whose fips has no match in df_unemp are dropped.
    .merge(df_unemp, on="fips", how="inner")
)

In [106]:
# organize columns
#
# Each list below names one thematic group of df_cluster columns.

# Identifier columns (excluded from normalization further down).
index_cols = ["fips", "description_pop", "year_pop"]

big5_cols = [
    "Agreeableness_sc",
    "Conscientiousness_sc",
    "Extraversion_sc",
    "Neuroticism_sc",
    "Openness_sc",
]

# Religiosity plus every column whose name contains "population".
demographic_cols = [
    "Religiosity_sc",
    *(name for name in df_cluster.columns if "population" in name),
]

pol_cols = [
    "Belief In Science_sc",
    "Collectivism_sc",
    "Gender Equality_sc",
]

personality_cols = [
    "Conflict Awareness_sc",
    "Empathy_sc",
    "Hopefulness_sc",
    "Risk Taking_sc",
    "Selflessness_sc",
    "Tolerance_sc",
    "Work Ethic_sc",
]

economy_cols = [
    "Employment Rate_sc",
    "unemp",
    "Entrepreneurship_sc",
    "Income Mobility_sc",
    "Income Per Capita_sc",
]

# Every column whose name ends in "_hot".
hot_cols = [name for name in df_cluster.columns if name.endswith("_hot")]

In [107]:
# add proportion variables to population by age columns
#
# For each selected population count column, add a "prop_<column>" column holding
# that count divided by total_population_pop, and record the new column names.

# Columns to convert: the "population_*" columns plus the male/female totals.
count_cols = [name for name in demographic_cols if name.startswith("population_")]
count_cols = count_cols + ["male_population_pop", "female_population_pop"]

demographic_prop_cols = []

for pop_col in count_cols:
    prop_name = f"prop_{pop_col}"
    total_population = df_cluster["total_population_pop"]
    df_cluster[prop_name] = df_cluster[pop_col] / total_population
    demographic_prop_cols.append(prop_name)

In [114]:
# normalize features
#
# Work on a copy so df_cluster itself is left untouched; every column that is not an
# identifier or a "_hot" column gets a companion "<column>_norm" column.

df_normalized = df_cluster.copy()

skipped_cols = index_cols + hot_cols
feature_cols = [name for name in df_normalized.columns if name not in skipped_cols]

for col in feature_cols:
    # Rejected alternative: rescale to 1-100. this is an issue as it assumes a uniform
    # distribution. percentiles not calculated accurately.
    # df_normalized[f"{col}_norm"] = (df_normalized[col] - df_normalized[col].min())*(100/(df_normalized[col].max() - df_normalized[col].min()))

    # Current approach: z-score (value minus column mean, divided by column std).
    # assumes normal distribution.
    col_mean = df_normalized[col].mean()
    col_std = df_normalized[col].std()
    df_normalized[f"{col}_norm"] = (df_normalized[col] - col_mean) / col_std

In [117]:
# save data
#
# Write the normalized table to CSV without the row index. `index` is documented as
# a bool, so pass False explicitly instead of relying on None being falsy.
df_normalized.to_csv("cluster_merge_clean_normalized.csv", index=False)