In [None]:
import os 
# Move the working directory up one level so that the `src` package imported
# below and the relative database path resolve (presumably the notebook sits
# one directory under the project root — confirm).
# Guarded: an unconditional chdir climbs one more level every time this cell
# is re-run, and also breaks when the kernel already starts at the root.
if not os.path.isdir("src"):
    os.chdir("..")

In [None]:
import polars as pl
import geopandas as gpd
import arviz as az
import numpy as np
from src.data_process import DataReg
from scipy.spatial import distance

# Apply the ArviZ dark-grid plotting style.
az.style.use("arviz-darkgrid")

# Data-access object used by the cells below to pull every dataset.
# The path is relative, so it is resolved against the current working directory.
DATABASE_FILE = "data.ddb"
dr = DataReg(database_file=DATABASE_FILE)

In [None]:
# Records from the main data set, restricted to industry code "72".
df = dr.data_set().filter(pl.col("industry_code") == "72")

# DP03 table with `geoid` copied into an `area_fips` column, so it carries the
# same key name that the later join with `df` uses.
df_dp03 = dr.pull_dp03().with_columns(pl.col("geoid").alias("area_fips"))

In [None]:
# Per row: average of the three monthly employment levels.
# Per (area_fips, year) group: mean of those row averages.
# NOTE: this overwrites `df`, so the month columns are gone afterwards and the
# cell cannot be re-run without re-running the load cell first.
monthly_total = (
    pl.col("month1_emplvl") + pl.col("month2_emplvl") + pl.col("month3_emplvl")
)
df = df.group_by(["area_fips", "year"]).agg(
    (monthly_total / 3).mean().alias("employment")
)

In [None]:
# Left join: every employment row is kept, with DP03 columns attached where a
# match exists. `validate="m:1"` asks polars to check that the key pair is
# unique on the DP03 side. The result is ordered by the same keys.
join_keys = ["area_fips", "year"]
data = (
    df.join(df_dp03, on=join_keys, how="left", validate="m:1")
    .sort(join_keys)
)

In [None]:
# Single-year cross-section (2015) of the joined panel.
data2 = data.filter(pl.col("year").eq(2015))

In [None]:
# Inspect the column names available after the join.
data.columns

In [None]:
# Mahalanobis distance of every row of `data2` from the sample centroid,
# measured over the three variables listed below.
selected_cols = ["commute_car", "employment", "total_population"]
data_np = data2.select(selected_cols).to_numpy().astype(np.float64)

# `data` is built with a left join, so rows without a DP03 match can carry
# missing values (NaN once converted to NumPy). A single NaN would poison the
# mean and covariance and turn *every* distance into NaN, so the moments are
# estimated from complete rows only. Incomplete rows still get a (NaN) entry,
# which keeps the result aligned with `data2` row for row.
complete_rows = ~np.isnan(data_np).any(axis=1)
mean_vec = data_np[complete_rows].mean(axis=0)
cov_matrix = np.cov(data_np[complete_rows], rowvar=False)
# Raises numpy.linalg.LinAlgError if the covariance matrix is singular.
inv_cov_matrix = np.linalg.inv(cov_matrix)

# d_i = sqrt((x_i - mu)^T  S^-1  (x_i - mu)), evaluated for all rows at once
# instead of one Python-level call per row.
centered = data_np - mean_vec
mahalanobis_distances = np.sqrt(
    np.einsum("ij,jk,ik->i", centered, inv_cov_matrix, centered)
)

# Attach the distances as a new `mahalanobis` column.
data2 = data2.with_columns(
    pl.Series(name="mahalanobis", values=mahalanobis_distances)
)

In [None]:
# Display the 2015 cross-section, now including the `mahalanobis` column.
data2

In [None]:
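# Disabled: cubic interpolation of the columns below within each `area_fips`.
# NOTE: written against the pandas API (`groupby`/`transform`, column
# assignment); `data` is a polars DataFrame in this notebook, so this needs
# porting before it can be re-enabled.
# columns = [
#     "total_population",
#     "own_children6",
#     "own_children17",
#     "commute_car",
#     "total_house",
#     "with_social_security",
#     "food_stamp",
# ]
# for col in columns:
#     data[col] = data.groupby("area_fips")[col].transform(
#         lambda group: group.interpolate(method="cubic")
#     )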

In [None]:
# Pull the county shapes through the data-access object and display them
# (presumably a GeoDataFrame, given the `.merge`/`.plot` calls further down).
gdf = dr.pull_county_shapes()
gdf

In [None]:
# Display the joined employment / DP03 panel before merging it onto the shapes.
data

In [None]:
# Attach the employment / DP03 panel to the shapes. The left merge keeps every
# shape row; `validate="1:m"` makes pandas check that `area_fips` is unique on
# the shapes side (one shape, many panel rows).
panel_pd = data.to_pandas()
gdf = gdf.merge(panel_pd, how="left", on=["area_fips"], validate="1:m")

# 06081

In [None]:
# Inspect the column names of the merged frame.
gdf.columns

In [None]:
# Narrow wage-oriented view of the merged frame.
#
# It is stored under its own name instead of overwriting `gdf`: the cells below
# still read `food_stamp`, `geo_id`, the geometry and many other columns from
# `gdf`, so replacing `gdf` with this subset breaks a top-to-bottom run.
wage_cols = ['county_name', 'area_fips', 'year',
       'qtr', 'industry_code', 'agglvl_code', 'avg_wkly_wage', 'qtrly_estabs','state_name', 'min_wage',
       'total_population']

# Selecting a label that is absent raises KeyError, and several of these are
# not guaranteed to survive the (area_fips, year) aggregation done earlier.
# Report the absent ones and select only what the frame actually has.
missing_cols = [col for col in wage_cols if col not in gdf.columns]
if missing_cols:
    print(f"Columns not present in gdf, skipped: {missing_cols}")
present_cols = [col for col in wage_cols if col in gdf.columns]

# Keep the active geometry column so the subset can still be drawn as a map.
geometry_col = gdf.geometry.name
if geometry_col not in present_cols:
    present_cols.append(geometry_col)
gdf_wage = gdf[present_cols]
# tmp[(tmp["industry_code"] == "72") & (tmp["year"] == 2016) & (tmp["qtr"] == 1)].plot("avg_wkly_wage",legend=True)


In [None]:
# Ratio of `food_stamp` to `total_population`, stored as a new column.
gdf["food_stamp_per"] = gdf["food_stamp"].div(gdf["total_population"])

In [None]:
# Map `food_stamp_per` for 2019 rows whose `geo_id` begins with "55"
# (presumably a state FIPS prefix — confirm).
is_2019 = gdf["year"] == 2019
geo_id_starts_55 = gdf["geo_id"].str.startswith("55")
gdf[is_2019 & geo_id_starts_55].plot("food_stamp_per")

In [None]:
# Numeric period key: the year shifted one decimal place with the quarter added
# (e.g. year 2019 and qtr 3 give 20193, assuming `qtr` is a single digit).
gdf["date"] = gdf["year"].mul(10).add(gdf["qtr"])
gdf

In [None]:
# Columns summarised per group.
columns_to_aggregate = [
    'total_qtrly_wages', 'avg_wkly_wage', 'qtrly_estabs',
    'total_population', 'in_labor_force', 'unemployment',
    'own_children6', 'own_children17',
    'commute_car', 'commute_time', 'total_house',
    'inc_less_10k', 'inc_10k_15k', 'inc_15k_25k', 'inc_25k_35k',
    'inc_35k_50k', 'inc_50k_75k', 'inc_75k_100k', 'inc_100k_150k',
    'inc_150k_200k', 'inc_more_200k',
    'with_social_security', 'food_stamp',
]

# Mean and standard deviation of every column above for each
# (geometry, area_fips, industry_code) combination.
group_keys = ['geometry', 'area_fips', "industry_code"]
aggregated_gdf = (
    gdf.groupby(group_keys)[columns_to_aggregate]
    .agg(['mean', 'std'])
    .reset_index()
)

# Flatten the two-level header into "<column>_<stat>". The group keys have an
# empty second level, so they come out with a trailing underscore
# ("geometry_", "area_fips_", "industry_code_").
aggregated_gdf.columns = [
    f"{name}_{stat}".strip() for name, stat in aggregated_gdf.columns
]
aggregated_gdf = aggregated_gdf.reset_index(drop=True)


In [None]:
# Rebuild a GeoDataFrame from the aggregated table. The geometry column is
# called "geometry_" at this point because the header flattening in the
# previous cell appends "_" to each group key.
aggregated_gdf = gpd.GeoDataFrame(aggregated_gdf, geometry="geometry_")

In [None]:
# Display the aggregated GeoDataFrame.
aggregated_gdf

In [None]:
# Map the mean weekly wage for industry code "72".
# The row mask must come from `aggregated_gdf` itself: a mask built from `gdf`
# has a different index and length and cannot be aligned with the aggregated
# rows. After the header flattening the key column is named "industry_code_".
is_industry_72 = aggregated_gdf["industry_code_"] == "72"
aggregated_gdf[is_industry_72].plot("avg_wkly_wage_mean", legend=True)

In [None]:
# NOTE(review): unfinished cell — the only grouping key is the empty string,
# which presumably raises KeyError unless `gdf` has a column named "".
# TODO: fill in the intended grouping key(s) or remove the cell.
temp = gdf.groupby([""])

In [None]:
# Map `avg_wkly_wage` for rows with year 2023, qtr 1 and industry code "11".
wage_map_rows = (
    gdf["year"].eq(2023) & gdf["qtr"].eq(1) & gdf["industry_code"].eq("11")
)
gdf[wage_map_rows].plot("avg_wkly_wage")

In [None]:
# Map `with_social_security` for rows with year 2023, qtr 1 and industry code "11".
social_security_rows = (
    (gdf["year"] == 2023)
    & (gdf["qtr"] == 1)
    & (gdf["industry_code"] == "11")
)
gdf.loc[social_security_rows].plot("with_social_security")