In [None]:
import os 
# Move the working directory one level up; the `src` package import and the
# relative "data.ddb" path used below are resolved from there.
# Gotcha: not idempotent -- each re-run of this cell climbs one more level.
os.chdir(os.pardir)

In [None]:
import polars as pl
import geopandas as gpd
import pandas as pd
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pymc as pm
import bambi as bmb
from src.data_process import DataReg
import requests

import arviz as az

import causalpy as cp

# Apply the ArviZ "arviz-darkgrid" plotting style for the figures drawn below.
az.style.use("arviz-darkgrid")


# Project data-access object; the cells below only call `dr.data_set()` on it.
# NOTE(review): presumably "data.ddb" is resolved relative to the working
# directory set in the first cell -- confirm against `src.data_process`.
dr = DataReg(database_file="data.ddb")

In [None]:
# First pass: within industry code "72", compute each row's average of the
# three monthly employment levels, then collect every area that has at least
# one row where that average is exactly zero. The next cell excludes them.
df = (
    dr.data_set()
    .filter(pl.col("industry_code") == "72")
    .with_columns(
        total_employment=(
            pl.col("month1_emplvl")
            + pl.col("month2_emplvl")
            + pl.col("month3_emplvl")
        )
        / 3
    )
)
remove = (
    df.filter(pl.col("total_employment") == 0)
    .select("area_fips")
    .unique()
    .to_series()
    .to_list()
)

In [None]:
# Rebuild the working frame from the raw data set: industry code "72" only,
# without the areas listed in `remove`, and only years before 2020.
df = (
    dr.data_set()
    .filter(
        (pl.col("industry_code") == "72")
        & (~pl.col("area_fips").is_in(remove))
        & (pl.col("year") < 2020)
    )
    .with_columns(
        # Quarter label: year, the letter "Q", then the quarter number.
        date=pl.col("year").cast(pl.String) + "Q" + pl.col("qtr").cast(pl.String),
        dummy=pl.lit(1),
        # Prefix the code with "i"; presumably so the pivoted column names can
        # be used as terms in the model formula built further down.
        area_fips=pl.lit("i") + pl.col("area_fips"),
        # Natural log of the mean of the three monthly employment levels.
        total_employment=(
            (
                pl.col("month1_emplvl")
                + pl.col("month2_emplvl")
                + pl.col("month3_emplvl")
            )
            / 3
        ).log(),
    )
)
df

In [None]:
# Long -> wide: one row per `date`, one column per `area_fips`, cells holding
# `total_employment`; then convert to pandas and index the rows by date label.
data = (
    df.pivot(on="area_fips", index="date", values="total_employment")
    .to_pandas()
    .set_index("date")
)
data

In [None]:
# Column bookkeeping for the wide frame: the treated column, every column,
# and every column except the treated one. (The "country" naming is kept
# because later cells refer to these names; the columns are area codes.)
target_country = "i06081"
all_countries = list(data.columns)
other_countries = list(data.columns.difference({target_country}))

In [None]:
# Plot every area's log-employment series in grey, highlight the treated area
# in red, and mark the intervention quarter (Q1 2016) with a dotted line.
#
# `data.index` holds quarter labels such as "2016Q1" (strings), so matplotlib
# treats the x axis as categorical. The marker therefore has to be given as a
# label of the same kind; a bare number like 20161 is read as an axis position
# far outside the plotted range.
intervention_quarter = "2016Q1"

fig, ax = plt.subplots()
for col in data.columns:
    ax.plot(data.index, data[col], color="grey", alpha=0.2)

ax.plot(data.index, data[target_country], color="red", lw=3)

# formatting
ax.set(
    title="Log average quarterly employment by area",
    xlabel="Quarter",
    ylabel="Log employment",
)
ax.axvline(x=intervention_quarter, color="r", ls=":")

In [None]:
from scipy import stats

# Select the candidate areas whose series is significantly correlated with the
# treated area's series (Pearson test, p-value below 0.001). The resulting list
# `l` supplies the right-hand-side terms of the model formula.
# The treated series is looked up through `target_country` rather than a
# repeated literal, and no other specific area code is required to exist.
y = data[target_country].values
l = [
    area
    for area in other_countries
    if stats.pearsonr(data[area].values, y).pvalue < 0.001
]
len(l)

In [None]:
# Model formula: treated area on the left, no intercept ("0 +"), and the
# selected areas joined with " + " on the right.
formula = f"{target_country} ~ 0 + {' + '.join(l)}"

print(formula)

In [None]:
# Sampler settings forwarded to the CausalPy model (fixed seed for repeatable fits).
sample_kwargs = {"target_accept": 0.99, "random_seed": 787}

# `data` is indexed by quarter labels such as "2016Q1" (strings), so the
# treatment time must be a label of the same kind: an integer like 20161 cannot
# be compared with a string index. "YYYYQn" labels sort lexicographically in
# chronological order, so the pre/post split on this label is well defined.
treatment_time = "2016Q1"

result = cp.SyntheticControl(
    data,
    treatment_time,
    formula=formula,
    model=cp.pymc_models.WeightedSumFitter(sample_kwargs=sample_kwargs),
)

In [None]:
# import aesara.tensor as at
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
# import pymc.sampling_jax
import seaborn as sns

# plt.style.use("bmh")
# plt.rcParams["figure.figsize"] = [10, 6]
# plt.rcParams["figure.dpi"] = 100
# plt.rcParams["figure.facecolor"] = "white"


In [None]:
# Remote CSV with the smoking panel used by the cigarette-sales cells below.
# Reading it requires network access every time this cell runs.
data_path = "https://raw.githubusercontent.com/matheusfacure/python-causality-handbook/master/causal-inference-for-the-brave-and-true/data/smoking.csv"

raw_data_df = pd.read_csv(data_path)

# Preview the first rows only.
raw_data_df.head()

In [None]:
# Working copy of the smoking panel without three covariate columns.
# `drop` already returns a new frame, so `raw_data_df` itself is left untouched.
# NOTE(review): this rebinds `df`, which earlier cells use for the polars
# employment frame -- any later cell that needs that frame must run before this one.
df = raw_data_df.drop(
    columns=[
        "lnincome",
        "beer",
        "age15to24",
    ]
)
df

In [None]:
# Show which columns the current `df` carries.
df.columns

In [None]:
# Long-format panel for plotting: area code, quarter label, log employment and a
# boolean `controls` flag, converted to pandas.
# NOTE(review): `controls` is True for "i06081", the same code assigned to
# `target_country` earlier, so the flag marks the treated area rather than the
# control areas -- confirm the intended naming.
# NOTE(review): `.select` / `pl.col` need the polars employment frame, but on a
# top-to-bottom run `df` has already been rebound to the pandas smoking frame by
# the time this cell executes -- confirm the intended execution order or names.
data = df.select(pl.col("area_fips", "date", "total_employment")).with_columns(controls=pl.when(pl.col("area_fips") == "i06081").then(True).otherwise(False)).to_pandas()
# Parse the quarter labels as quarterly periods and store them as timestamps.
data["date"] = pd.PeriodIndex(df['date'], freq='Q').to_timestamp()
data

In [None]:
# Sanity check: show the dtype of the `date` column after the conversion above.
print(data['date'].dtype)

In [None]:
# Average log employment per quarter, split by the `controls` flag, with a
# dotted marker at the start of Q1 2016.
fig, ax = plt.subplots()

(
    data.groupby(["date", "controls"], as_index=False)
    # The string alias selects pandas' own group-wise mean; handing the NumPy
    # callable to `agg` is deprecated in recent pandas releases.
    .agg({"total_employment": "mean"})
    .pipe(
        (sns.lineplot, "data"),
        x="date",
        y="total_employment",
        hue="controls",
        marker="o",
        ax=ax,
    )
)
ax.axvline(
    x=pd.to_datetime("2016-01-01"),
    linestyle=":",
    lw=2,
    color="C2",
    label="Treatment (2016Q1)",
)

ax.legend(loc="upper right")
# Labels describe what is actually plotted here (log employment), replacing
# the cigarette-sales wording left over from the smoking example.
ax.set(
    title="Mean log employment by group",
    xlabel="Quarter",
    ylabel="Log employment",
)


In [None]:
# Average per-capita cigarette sales per year, split by the `california` flag,
# with a dotted marker at 1988 labelled "Proposition 99".
fig, ax = plt.subplots()

(
    df.groupby(["year", "california"], as_index=False)
    # The string alias selects pandas' own group-wise mean; handing the NumPy
    # callable to `agg` is deprecated in recent pandas releases.
    .agg({"cigsale": "mean"})
    .pipe(
        (sns.lineplot, "data"),
        x="year",
        y="cigsale",
        hue="california",
        marker="o",
        ax=ax,
    )
)
ax.axvline(
    x=1988,
    linestyle=":",
    lw=2,
    color="C2",
    label="Proposition 99",
)
ax.legend(loc="upper right")
ax.set(
    title="Gap in per-capita cigarette sales (in packs)",
    ylabel="Cigarette Sales Trend"
)


In [None]:
# Split the panel into pre- and post-treatment rows and reshape each part so
# that the units given by `state` end up as columns after the transpose.
# NOTE(review): the `data` frame built earlier in this notebook carries the
# columns area_fips, date, total_employment and controls; `after_treatment`,
# `state` and `year` are not among them, so the query/pivot below cannot run
# against it as written. Confirm which frame and column names are intended.
features = ["total_employment"]

# Rows where `after_treatment` is False.
pre_df = (
    data
    .query("~ after_treatment")
    .pivot(index='state', columns="year", values=features)
    .T
)

# Rows where `after_treatment` is True.
post_df = (
    data
    .query("after_treatment")
    .pivot(index='state', columns="year", values=features)
    .T
)

In [None]:
# Column label of the unit modelled as the target.
# NOTE(review): presumably 3 identifies the treated unit -- confirm.
idx = 3

# Pre-treatment period: target vector, matrix of the remaining units, and the
# distinct years covered.
y_pre = pre_df[idx].to_numpy()
x_pre = pre_df.drop(columns=idx).to_numpy()
pre_years = pre_df.reset_index()["year"].unique()
n_pre = pre_years.size

# Post-treatment period: same three pieces.
y_post = post_df[idx].to_numpy()
x_post = post_df.drop(columns=idx).to_numpy()
post_years = post_df.reset_index()["year"].unique()
n_post = post_years.size

# Number of remaining (non-target) units, i.e. the length of the weight vector.
k = x_pre.shape[1]

In [None]:
# Synthetic-control regression: the target series is modelled as a weighted
# sum of the other units' series plus Gaussian noise.
with pm.Model() as model:
    # Data containers, so the post-treatment arrays can be swapped in later.
    x = pm.Data("x", x_pre)
    y = pm.Data("y", y_pre)
    # Unit weights with a Dirichlet prior (concentration 1/k per component).
    beta = pm.Dirichlet("beta", a=np.full(k, 1 / k))
    sigma = pm.HalfNormal("sigma", sigma=5)
    # Weighted combination of the other units, tracked as a named quantity.
    mu = pm.Deterministic("mu", pm.math.dot(x, beta))
    likelihood = pm.Normal("likelihood", mu=mu, sigma=sigma, observed=y)

pm.model_to_graphviz(model)

In [None]:
# Fit the model on the pre-treatment data and draw the matching posterior
# predictive samples. The seed is fixed (same value as the CausalPy fit above)
# so that re-running the notebook reproduces the same draws.
with model:
    idata = pm.sample(draws=4000, chains=4, random_seed=787)
    posterior_predictive_pre = pm.sample_posterior_predictive(
        trace=idata, random_seed=787
    )

In [None]:
# Forest plot of the posterior of the weight vector `beta`, with chains combined.
az.plot_forest(data=idata, combined=True, var_names=["beta"])

In [None]:
# Swap the post-treatment arrays into the model's data containers and draw
# posterior predictive samples of the likelihood from the already-fitted trace.
# The seed is fixed so that re-running the notebook reproduces the same draws.
with model:
    pm.set_data(new_data={"x": x_post, "y": y_post})
    posterior_predictive_post = pm.sample_posterior_predictive(
        trace=idata, var_names=["likelihood"], random_seed=787
    )

In [None]:
# Overlay the model's pre- and post-treatment posterior predictive means and
# HDI bands on the observed per-group cigarette-sales averages.
pre_ppc = posterior_predictive_pre.posterior_predictive["likelihood"][:, :, :n_pre]
post_ppc = posterior_predictive_post.posterior_predictive["likelihood"][:, :, :n_post]

# Average over all posterior draws by dimension name, leaving one value per
# time point (no dependence on the positional order of the dimensions).
pre_posterior_mean = pre_ppc.mean(dim=("chain", "draw"))
post_posterior_mean = post_ppc.mean(dim=("chain", "draw"))


fig, ax = plt.subplots()

(
    df.groupby(["year", "california"], as_index=False)
    # The string alias selects pandas' own group-wise mean; handing the NumPy
    # callable to `agg` is deprecated in recent pandas releases.
    .agg({"cigsale": "mean"})
    .assign(
        california=lambda x: x.california.map(
            {True: "is_california", False: "is_not_california"}
        )
    )
    .pipe(
        (sns.lineplot, "data"),
        x="year",
        y="cigsale",
        hue="california",
        alpha=0.5,
        ax=ax,
    )
)
ax.axvline(
    x=1988,
    linestyle=":",
    lw=2,
    color="C2",
    label="Proposition 99",
)
sns.lineplot(
    x=pre_years,
    y=pre_posterior_mean,
    color="C1",
    marker="o",
    label="pre-treatment posterior predictive mean",
    ax=ax,
)
sns.lineplot(
    x=post_years,
    y=post_posterior_mean,
    color="C2",
    marker="o",
    label="post-treatment posterior predictive mean",
    ax=ax,
)
az.plot_hdi(
    x=pre_years,
    y=pre_ppc,
    smooth=True,
    color="C1",
    fill_kwargs={"label": "pre-treatment posterior predictive (94% HDI)"},
    ax=ax,
)
az.plot_hdi(
    x=post_years,
    y=post_ppc,
    smooth=True,
    color="C2",
    fill_kwargs={"label": "post-treatment posterior predictive (94% HDI)"},
    ax=ax,
)
ax.legend(loc="lower left")
ax.set(
    title="Gap in per-capita cigarette sales (in packs)", ylabel="Cigarette Sales Trend"
)