In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from linearmodels.panel import PanelOLS
from lightgbm import LGBMRegressor, LGBMClassifier

from doubleml.data import DoubleMLPanelData
from doubleml import DoubleMLDID


import warnings

warnings.filterwarnings('ignore')

In [120]:
# Load the zip-code level panel, add the DID interaction term
# (fracked x treatment), and drop every row that has a missing value.
# Built as a single chain so re-running the cell always starts from the file.
df = (
    pd.read_stata('/Users/mcargnel/Documents/mea/tesis/data/zc_level.dta')
    .assign(frack_post=lambda panel: panel['fracked'] * panel['treatment'])
    .dropna()
)
df.head()

Unnamed: 0,year,state,zipcode,zipid,lnactionnonoil,lnstate_formal_nonoil,treatment,fracked,lnone_non_oil,lnestab,lnemp,frack_post
0,1990,PA,15001,15001,0.693147,0.0,0.0,0.0,0.693147,8.013012,10.7887,0.0
1,1991,PA,15001,15001,1.386294,0.0,0.0,0.0,1.098612,8.054205,10.779685,0.0
2,1992,PA,15001,15001,0.0,0.0,0.0,0.0,0.0,8.06054,10.794809,0.0
3,1993,PA,15001,15001,0.693147,0.0,0.0,0.0,0.559616,8.068716,10.811787,0.0
4,1994,PA,15001,15001,0.0,0.0,0.0,0.0,0.0,8.069968,10.840267,0.0


# Replication using DID regression

*** Difference-in-differences ***

* Load the data
use "Datos - González 2024/zc_level.dta", clear

* Declare the panel: zipcode is the unit identifier, year is the time variable
xtset zipcode year

* Plot used to check the identifying assumption
* (outcome over year, split by fracked, with a reference line at 2005)
lgraph lnactionnonoil year, by(fracked) xline(2005) ///
    ytitle(Actividades de regulación ambiental (Log)) xtitle("Año") ///
    name(frack_plot, replace) ///
    graphregion(color(white)) plotregion(color(white))

* Build the interaction term
generate frack_post = fracked * treatment

* Regressions replicating Table 1 of González (2024).
* Each outcome is estimated twice with zip-code fixed effects and year dummies:
*   <name>    baseline specification (no controls)
*   <name>_c  same specification plus the lnestab and lnemp controls
* Standard errors are clustered by zipcode in every model.
eststo actions: xtreg lnactionnonoil frack_post fracked treatment i.year, fe cluster(zipcode)

eststo actions_c: xtreg lnactionnonoil frack_post fracked treatment lnestab lnemp i.year, fe cluster(zipcode)

eststo facilities: xtreg lnone_non_oil frack_post fracked treatment i.year, fe cluster(zipcode)

eststo facilities_c: xtreg lnone_non_oil frack_post fracked treatment lnestab lnemp i.year, fe cluster(zipcode)

* Baseline for the formal-actions outcome: no controls, so it is not a duplicate of formal_c
eststo formal: xtreg lnstate_formal_nonoil frack_post fracked treatment i.year, fe cluster(zipcode)

eststo formal_c: xtreg lnstate_formal_nonoil frack_post fracked treatment lnestab lnemp i.year, fe cluster(zipcode)

* Results table; it was formatted in LaTeX for the submission.
est table actions actions_c facilities facilities_c formal formal_c, star(0.1 0.05 0.01) b(%9.4f) stats(N r2 r2_w) drop(_cons)


In [121]:
# PanelOLS needs an (entity, time) MultiIndex: zipcode is the entity, year the period.
df_gonzales = df.set_index(['zipcode', 'year'])

# Two-way fixed-effects regression of lnactionnonoil on the DID terms and controls.
# drop_absorbed=True lets the estimator discard regressors that the zipcode/year
# effects fully absorb instead of raising an error.
model1 = PanelOLS(
    dependent=df_gonzales.loc[:, 'lnactionnonoil'],
    exog=df_gonzales.loc[:, ['frack_post', 'fracked', 'treatment', 'lnestab', 'lnemp']],
    entity_effects=True,   # zipcode fixed effects
    time_effects=True,     # year fixed effects (i.year in the Stata code)
    drop_absorbed=True,
)

# Standard errors clustered at the entity (zipcode) level.
results = model1.fit(cov_type='clustered', cluster_entity=True)
print(results.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:         lnactionnonoil   R-squared:                        0.0127
Estimator:                   PanelOLS   R-squared (Between):              0.2995
No. Observations:              143275   R-squared (Within):               0.0325
Date:                Sun, Oct 12 2025   R-squared (Overall):              0.1926
Time:                        22:35:02   Log-likelihood                -7.764e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      591.34
Entities:                        5731   P-value                           0.0000
Avg Obs:                       25.000   Distribution:                F(3,137517)
Min Obs:                       25.000                                           
Max Obs:                       25.000   F-statistic (robust):             63.735
                            

# Replication using DML

In [123]:
# DoubleML draws its cross-fitting sample splits from NumPy's global RNG when the
# model object is built; seeding here makes the printed estimates reproducible
# under Restart & Run All.
np.random.seed(42)

# Data container for the DML estimate of the effect on lnactionnonoil.
# NOTE(review): DoubleMLDID is documented for a two-period design, while
# DoubleMLPanelData is the container used by DoubleMLDIDMulti. Confirm that
# feeding the full zipcode-year panel with the frack_post interaction as the
# treatment column is the intended estimand.
dml_data = DoubleMLPanelData(
    data=df,
    y_col="lnactionnonoil",  # total regulatory activities in the non-energy sector
    d_cols="frack_post",  # fracked x treatment interaction; starting 2005 fracking was advanced enough to be used in the industry
    id_col="zipcode",  # panel unit identifier
    t_col="year",  # time variable
    x_cols=['lnestab', 'lnemp']  # private establishments and employees
)
print(dml_data)


# Define the DID model: gradient-boosted learners for the outcome regression (ml_g)
# and for the treatment propensity (ml_m).
dml_did = DoubleMLDID(
    dml_data,
    ml_g=LGBMRegressor(n_estimators=1000, max_depth=3, verbose=-1),
    ml_m=LGBMClassifier(n_estimators=1000, max_depth=3, verbose=-1),
    score="observational"   # good default when treatment not randomized
)

# Fit the model
dml_did.fit()

# Inspect results
print(dml_did.summary)


------------------ Data summary      ------------------
Outcome variable: lnactionnonoil
Treatment variable(s): ['frack_post']
Covariates: ['lnestab', 'lnemp']
Instrument variable(s): None
Time variable: year
Id variable: zipcode
No. Unique Ids: 5731
No. Observations: 143275

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143275 entries, 0 to 143274
Columns: 12 entries, year to frack_post
dtypes: category(1), float32(8), int16(1), int32(1), object(1)
memory usage: 6.7+ MB

                coef   std err          t  P>|t|     2.5 %    97.5 %
frack_post  0.335772  0.008197  40.963447    0.0  0.319707  0.351838


The estimated effect of fracking on regulatory actions is positive and statistically significant.

In [124]:
# DoubleML draws its cross-fitting sample splits from NumPy's global RNG when the
# model object is built; seeding here makes the printed estimates reproducible
# under Restart & Run All.
np.random.seed(42)

# Data container for the DML estimate of the effect on lnone_non_oil.
# NOTE(review): DoubleMLDID is documented for a two-period design, while
# DoubleMLPanelData is the container used by DoubleMLDIDMulti. Confirm that
# feeding the full zipcode-year panel with the frack_post interaction as the
# treatment column is the intended estimand.
dml_data = DoubleMLPanelData(
    data=df,
    y_col="lnone_non_oil",  # outcome stored as `facilities` in the Stata replication; presumably log non-oil facilities with actions (TODO confirm)
    d_cols="frack_post",  # fracked x treatment interaction; starting 2005 fracking was advanced enough to be used in the industry
    id_col="zipcode",  # panel unit identifier
    t_col="year",  # time variable
    x_cols=['lnestab', 'lnemp']  # private establishments and employees
)
print(dml_data)


# Define the DID model: gradient-boosted learners for the outcome regression (ml_g)
# and for the treatment propensity (ml_m).
dml_did = DoubleMLDID(
    dml_data,
    ml_g=LGBMRegressor(n_estimators=1000, max_depth=3, verbose=-1),
    ml_m=LGBMClassifier(n_estimators=1000, max_depth=3, verbose=-1),
    score="observational"   # good default when treatment not randomized
)

# Fit the model
dml_did.fit()

# Inspect results
print(dml_did.summary)


------------------ Data summary      ------------------
Outcome variable: lnone_non_oil
Treatment variable(s): ['frack_post']
Covariates: ['lnestab', 'lnemp']
Instrument variable(s): None
Time variable: year
Id variable: zipcode
No. Unique Ids: 5731
No. Observations: 143275

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143275 entries, 0 to 143274
Columns: 12 entries, year to frack_post
dtypes: category(1), float32(8), int16(1), int32(1), object(1)
memory usage: 6.7+ MB

                coef   std err          t          P>|t|     2.5 %   97.5 %
frack_post  0.227446  0.006089  37.353019  2.264910e-305  0.215511  0.23938


In [126]:
# DoubleML draws its cross-fitting sample splits from NumPy's global RNG when the
# model object is built; seeding here makes the printed estimates reproducible
# under Restart & Run All.
np.random.seed(42)

# Data container for the DML estimate of the effect on lnstate_formal_nonoil.
# NOTE(review): DoubleMLDID is documented for a two-period design, while
# DoubleMLPanelData is the container used by DoubleMLDIDMulti. Confirm that
# feeding the full zipcode-year panel with the frack_post interaction as the
# treatment column is the intended estimand.
dml_data = DoubleMLPanelData(
    data=df,
    y_col="lnstate_formal_nonoil",  # outcome stored as `formal` in the Stata replication; presumably log formal state actions, non-oil (TODO confirm)
    d_cols="frack_post",  # fracked x treatment interaction; starting 2005 fracking was advanced enough to be used in the industry
    id_col="zipcode",  # panel unit identifier
    t_col="year",  # time variable
    x_cols=['lnestab', 'lnemp']  # private establishments and employees
)
print(dml_data)


# Define the DID model: gradient-boosted learners for the outcome regression (ml_g)
# and for the treatment propensity (ml_m).
dml_did = DoubleMLDID(
    dml_data,
    ml_g=LGBMRegressor(n_estimators=1000, max_depth=3, verbose=-1),
    ml_m=LGBMClassifier(n_estimators=1000, max_depth=3, verbose=-1),
    score="observational"   # good default when treatment not randomized
)

# Fit the model
dml_did.fit()

# Inspect results
print(dml_did.summary)


------------------ Data summary      ------------------
Outcome variable: lnstate_formal_nonoil
Treatment variable(s): ['frack_post']
Covariates: ['lnestab', 'lnemp']
Instrument variable(s): None
Time variable: year
Id variable: zipcode
No. Unique Ids: 5731
No. Observations: 143275

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143275 entries, 0 to 143274
Columns: 12 entries, year to frack_post
dtypes: category(1), float32(8), int16(1), int32(1), object(1)
memory usage: 6.7+ MB

                coef   std err          t  P>|t|     2.5 %    97.5 %
frack_post  0.395736  0.007797  50.755186    0.0  0.380455  0.411018
