In [None]:
import os


def enter_project_root(marker="src"):
    """Move the working directory up one level unless it already is the project root.

    The notebook imports from the ``src`` package and therefore has to run from
    the directory that contains it. An unconditional ``os.chdir("..")`` climbs
    one directory further every time the cell is re-run; checking for the
    marker directory first makes the cell idempotent.

    Parameters
    ----------
    marker : str
        Name of a directory expected to exist in the project root.
    """
    if not os.path.isdir(marker):
        os.chdir("..")


enter_project_root()

In [None]:
from src.data.data_process import DataReg
import polars as pl
import arviz as az
import matplotlib.dates as mdates
import numpy as np
import geopandas as gpd
import causalpy as cp
import pandas as pd
import arviz as az
import bambi as bmb
from shapely import wkt
import matplotlib.pyplot as plt
# Project data handle shared by the cells below: they run SQL through
# `dr.conn.sql(...)` and build the ZIP-code geometry table with
# `dr.make_spatial_table()`.
dr = DataReg()

In [None]:
# Pull the three monthly employment counts per record, together with the
# ZIP code, quarter and year they belong to, into a polars DataFrame.
qcew_query = "SELECT first_month_employment, second_month_employment, third_month_employment, ui_addr_5_zip, qtr, year FROM qcewtable"
qcew_relation = dr.conn.sql(qcew_query)
df = qcew_relation.pl()

In [None]:
# --- Population density per ZIP code and urban classification ---------------
REFERENCE_YEAR = 2019            # DP03 year used to classify ZIP codes
URBAN_DENSITY_THRESHOLD = 500    # minimum `pop_area` for a ZIP code to count as urban
# Area, in square metres, that one density unit refers to.
# NOTE(review): 2564102.5641026 equals 1e6 / 0.39, i.e. roughly one square mile;
# the exact figure is 2,589,988.11 m^2. Value kept unchanged so the
# classification does not shift -- confirm which one is intended.
DENSITY_AREA_UNIT_M2 = 2564102.5641026

dp03 = dr.conn.sql("SELECT * FROM DP03Table").pl()

# ZIP-code polygons: geometries arrive as WKT text and are parsed into shapely objects.
pr_zips = gpd.GeoDataFrame(dr.make_spatial_table().df())
pr_zips["geometry"] = pr_zips["geometry"].apply(wkt.loads)
pr_zips = pr_zips.set_geometry("geometry")
pr_zips["zipcode"] = pr_zips["zipcode"].astype(str)

# One polygon can match several DP03 rows (validate="1:m").
# The coordinates are declared as EPSG:4326 and then projected to EPSG:32161
# (NAD83 / Puerto Rico & Virgin Is., metre units) so that areas are in m^2.
gdf = (
    pr_zips.join(
        dp03.to_pandas().set_index("zipcode"), on="zipcode", how="inner", validate="1:m"
    )
    .reset_index(drop=True)
    .set_crs(epsg=4326)
    .to_crs(epsg=32161)
)

# Measure the area only after projecting; taken earlier it would be in squared
# degrees and inconsistent with the density computed from it.
gdf["area"] = gdf.area
gdf["pop_area"] = gdf["total_population"] / (gdf["area"] / DENSITY_AREA_UNIT_M2)

gdf_reference_year = gdf[gdf["year"] == REFERENCE_YEAR]
urban_zips = gdf_reference_year.loc[
    gdf_reference_year["pop_area"] >= URBAN_DENSITY_THRESHOLD, "zipcode"
].unique()

# Backward-compatible alias: later cells read `is_rural`, although the
# selection (density >= threshold) contains the *urban* ZIP codes.
is_rural = urban_zips

In [None]:
# `pr_zips` (ZIP-code polygons with string ZIP codes) is built in the previous
# cell; reuse it instead of rebuilding the spatial table a second time.
zip_codes = pr_zips["zipcode"].tolist()

# Empty wide template: a string `date` column plus one Int32 column per ZIP code.
# It is not referenced by the later cells visible in this notebook.
df_master = pl.DataFrame(
    schema=[
        ("date", pl.String),
        *((f"zip_{zip_code}", pl.Int32) for zip_code in zip_codes),
    ]
)

# Total employment per (year, quarter, ZIP code), restricted to ZIP codes that
# have a polygon. Rows with a null in any selected column are dropped first.
# The final sort makes the row order reproducible; `group_by` alone does not
# guarantee one.
tmp = (
    df.drop_nulls()
    .filter(pl.col("ui_addr_5_zip").is_in(zip_codes))
    .group_by(["year", "qtr", "ui_addr_5_zip"])
    .agg(
        pl.col(
            "first_month_employment",
            "second_month_employment",
            "third_month_employment",
        ).sum()
    )
    .sort(["year", "qtr", "ui_addr_5_zip"])
)
tmp


In [None]:
# Reshape the quarterly aggregates to one row per (ZIP code, month) and attach
# an ISO date string for the first day of that month.

# Position of each employment column inside its quarter.
MONTH_POSITION_IN_QUARTER = {
    "first_month_employment": 1,
    "second_month_employment": 2,
    "third_month_employment": 3,
}

employment_long = tmp.unpivot(
    index=["year", "qtr", "ui_addr_5_zip"],  # identifier columns kept on every row
    on=list(MONTH_POSITION_IN_QUARTER),      # columns turned into rows
    variable_name="month",                   # which of the three columns the value came from
    value_name="employment",
)

# Fail here with a clear message instead of letting a placeholder string reach
# the date parsing in a later cell.
invalid_quarters = employment_long.filter(~pl.col("qtr").is_between(1, 4))
if invalid_quarters.height > 0:
    raise ValueError(
        f"{invalid_quarters.height} rows have a quarter outside 1-4; cannot derive a date"
    )

# Calendar month = 3 * (quarter - 1) + position within the quarter (1..12).
calendar_month = (pl.col("qtr").cast(pl.Int32) - 1) * 3 + pl.col("month").replace_strict(
    MONTH_POSITION_IN_QUARTER, return_dtype=pl.Int32
)

df = (
    employment_long.with_columns(
        date=pl.col("year").cast(pl.String)
        + "-"
        + calendar_month.cast(pl.String).str.zfill(2)
        + "-01"
    )
    .select(["date", "ui_addr_5_zip", "employment"])
    # Despite its name, `is_rural` holds the ZIP codes whose 2019 density is at
    # or above the threshold, so membership marks a ZIP code as urban.
    .with_columns(urban=pl.col("ui_addr_5_zip").is_in(list(is_rural)))
)
df


In [None]:
# Date on which the treatment starts.
treatment_date = pd.to_datetime('2023-01-01')

# Build the modelling frame in one chain:
#   1. parse the date strings and order the rows chronologically;
#   2. flag observations on or after the treatment date (boolean column);
#   3. replace the date by the number of days relative to the treatment date;
#   4. add a `unit` label per ZIP code and turn the `urban` flag into 0/1.
# `post_treatment` is listed before `date` because it needs the parsed
# timestamps, which the `date` entry then overwrites with day offsets.
data = (
    df.to_pandas()
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")
    .reset_index(drop=True)
    .assign(
        post_treatment=lambda frame: frame["date"] >= treatment_date,
        date=lambda frame: (frame["date"] - treatment_date).dt.days,
        unit=lambda frame: "zip_" + frame["ui_addr_5_zip"],
        urban=lambda frame: frame["urban"].astype(int),
    )
)

data

In [None]:
# Difference-in-differences fit: `urban` is the group indicator and the
# `urban:post_treatment` interaction in the formula carries the effect.
# The model is fitted by MCMC sampling, so the seed is fixed to make a
# Restart & Run All reproduce the same posterior.
RANDOM_SEED = 42

result = cp.DifferenceInDifferences(
    data,
    formula="employment ~ 1 + urban*post_treatment",
    time_variable_name="date",
    group_variable_name="urban",
    model=cp.pymc_models.LinearRegression(sample_kwargs={"random_seed": RANDOM_SEED}),
)

In [None]:
# Plot the fitted difference-in-differences result and keep the figure/axes handles.
# NOTE(review): `round_to=3` presumably controls the rounding of the reported
# estimates -- confirm against the CausalPy documentation.
fig, ax = result.plot(round_to=3)

In [None]:
# Save through the figure handle returned by `result.plot`. With the default
# inline backend the figure is closed once its cell has finished, so a
# `plt.savefig` call in a later cell would write a new, empty figure.
fig.savefig("test.png")