In [None]:
import os

# Step out of the notebook's folder so that `src` imports and relative
# `data/...` paths resolve. The guard makes the cell idempotent: once the
# working directory already contains `src` (after a first run, or when the
# kernel was started there) it is left untouched instead of climbing one
# level further on every re-run.
if not os.path.isdir("src"):
    os.chdir("..")

In [None]:
from src.data.data_process import DataReg
import arviz as az
import bambi as bmb
import matplotlib.pyplot as plt
from linearmodels.panel import RandomEffects, PanelOLS
import polars as pl
import pymc as pm
import numpy as np
import pandas as pd
import theano.tensor as tt
import numpy as np
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment
# (the URL and AUTH variables are read further down).
load_dotenv()

# Apply ArviZ's "arviz-darkgrid" style to the plots in this notebook.
az.style.use("arviz-darkgrid")
# Project data-access object; later cells use its `conn`, `regular_data`
# and `notify` members.
dr = DataReg()



In [None]:
# Send the message "hi" through `dr.notify`, taking the URL and auth value
# from the environment. Note that `str()` turns a missing variable into the
# literal string "None" rather than raising.
notify_url = str(os.environ.get("URL"))
notify_auth = str(os.environ.get("AUTH"))
dr.notify(url=notify_url, auth=notify_auth, msg="hi")

In [None]:
# Fetch every `qcewtable` row for one specific EIN.
# INFO: we could manually determine whether these firms are foreign.
dr.conn.sql("SELECT * FROM qcewtable WHERE ein=='660567825'").pl()

In [None]:
# Build the regression dataset for the "72-food" NAICS group and display it.
data = dr.regular_data(naics="72-food")
data

In [None]:
# Fixed-effects style regression of log employment fitted directly in PyMC.
# Restrict the sample to rows flagged as non-foreign.
data_pr = data[data["foreign"] == 0]
# data_pr = data_pr[data_pr["year"] > 2020]

# Continuous predictors and the response as plain NumPy arrays.
X = data_pr[['log_k_index', 'own_children6', 'own_children17',
             'commute_car', 'food_stamp', 'with_social_security']].values
y = data_pr['log_total_employment'].values

# 'date' and 'ein' are treated as categorical and one-hot encoded; the first
# level of each is dropped so the dummies are not collinear with the intercept.
date_dummies = pd.get_dummies(data_pr['date'], prefix='date', drop_first=True)
ein_dummies = pd.get_dummies(data_pr['ein'], prefix='ein', drop_first=True)

# Full design matrix: continuous columns followed by the date and ein dummies.
X_full = np.hstack([X, date_dummies.values, ein_dummies.values])

with pm.Model() as model:
    # Very wide (weakly informative) Normal priors; wide, but not truly flat.
    alpha = pm.Normal('alpha', mu=0, sigma=1000)

    # One coefficient per design-matrix column (continuous and dummy alike).
    betas = pm.Normal('betas', mu=0, sigma=1000, shape=X_full.shape[1])

    # Linear predictor. `pm.math.dot` is used instead of `theano.tensor.dot`:
    # `betas` is a PyMC random variable and has to be combined with PyMC's own
    # tensor operations, not Theano's.
    mu = alpha + pm.math.dot(X_full, betas)

    # Gaussian likelihood with a half-normal prior on the noise scale.
    sigma = pm.HalfNormal('sigma', sigma=10)
    likelihood = pm.Normal('y', mu=mu, sigma=sigma, observed=y)

    # Draw posterior samples; the result is kept in `trace`.
    trace = pm.sample(
        draws=500,
        tune=500,
        target_accept=0.8,
        chains=4,
        cores=15,  # Adjust based on available cores
        random_seed=42
    )

In [None]:

# Trace plots for the PyMC fit. The sampler output of the model cell is stored
# in `trace`; `results` is only assigned by later cells, so using it here fails
# with a NameError when the notebook is run top to bottom on a fresh kernel.
az.plot_trace(trace)

In [None]:
# Posterior summary table for the PyMC fit. It uses `trace` (the sampler
# output of the model cell) because `results` is not defined until later cells.
az.summary(trace)

- rain by naics (2 digits)
- drop null zipcodes
- break down the 72 (remove food services as its own sector)
- only the first 2 regressions (0, 1)


In [None]:
# NAICS sector identifiers accepted by `dr.regular_data`; sector 72 appears
# as two separate entries (accommodation and food).
naics_code = [
    "11", "21", "22", "23", "31-33",
    "42", "44-45", "48-49", "51", "52",
    "54", "55", "56", "61", "62",
    "71", "72-accommodation", "72-food", "81", "92",
]

In [None]:
import warnings

# Silence FutureWarning and UserWarning before looping over the sectors.
for category in (FutureWarning, UserWarning):
    warnings.filterwarnings("ignore", category=category)

# Print the sector label and the row count of its dataset, one per line.
# Note: `data` still holds the last sector's frame once the loop has finished.
for naics in naics_code:
    data = dr.regular_data(naics=naics)
    print(f"naics:{naics}", len(data), sep="\n")
    

In [None]:
# Bambi version of the regression, fitted on rows flagged as non-foreign.
# NOTE(review): `data` is whatever the most recently executed cell left behind;
# run top to bottom, that is the last sector of the preceding loop. Confirm
# this is the intended sample.
data_pr = data[data["foreign"] == 0]
model = bmb.Model(
    # No global intercept (`0 +`); `date` and `ein` enter as regular terms.
    "log_total_employment ~ 0 + date + ein"
    " + log_k_index + own_children6 + own_children17"
    " + commute_car + food_stamp + with_social_security",
    data_pr,
    dropna=True,  # rows with missing values are dropped before fitting
)
# NOTE(review): `sample_kwargs` is forwarded as an extra keyword argument;
# verify that the installed Bambi/PyMC versions accept it and really select
# the blackjax sampler.
results = model.fit(
    sample_kwargs={"nuts_sampler": "blackjax"},
    cores=10,
    random_seed=42,  # fixed seed, matching the PyMC cell, for reproducible draws
)



In [None]:
# Reload the saved PR-model inference data from disk and draw its trace plots.
pr_results_path = "data/processed/results_pr_model_21.nc"
results = az.from_netcdf(pr_results_path)
az.plot_trace(results)

In [None]:
# Reload the saved foreign-model inference data from disk and draw its trace plots.
foreign_results_path = "data/processed/results_foreign_model_21.nc"
results2 = az.from_netcdf(foreign_results_path)
az.plot_trace(results2)

In [None]:
import matplotlib.pyplot as plt

# Side-by-side posterior of the `log_k_index` coefficient: `results` on the
# left panel, `results2` on the right panel.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

panels = [
    (results, "Empresas PR"),
    (results2, "Empresas Foreaneas"),
]
for ax, (idata, title) in zip(axes, panels):
    az.plot_posterior(idata, var_names=["log_k_index"], ax=ax)
    # Set after plotting so this title is the one that ends up on the panel.
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Trace plots for `results` (run top to bottom, this is the inference data
# loaded from data/processed/results_pr_model_21.nc).
az.plot_trace(results)

In [None]:
# Posterior summary table for `results`.
az.summary(results)

In [None]:

# Trace plot for `results`, restricted to the `log_k_index` variable.
az.plot_trace(results, var_names=["log_k_index"])

In [None]:
# Posterior summary table for `results` (same call as the summary cell above).
az.summary(results)