In [6]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import import_ipynb
from Control_group_density_threshold import control_m, control_m2

In [7]:
nielsen15 = pd.read_csv('../../Nielsen/aggregated_nielsen_2015.csv')
nielsen16 = pd.read_csv('../..//Nielsen/aggregated_nielsen_2016.csv')
nielsen15['year'] = 2015
nielsen16['year'] = 2016

In [8]:
nielsen = pd.concat((nielsen15, nielsen16))
nielsen = nielsen[~nielsen.is_walmart]

In [9]:
## Entriy/exit dates
fandom = pd.read_csv('../data_collection/plein_de_data/fandom_traitées.csv', parse_dates=['Opening_date', 'Closing_date'])[['State', 'County_name', 'County_fips', 'Opening_date', 'Closing_date']]

# We drop the state in which we do not trust our data (some mistakes stil remain)
fandom = fandom[~np.isin(fandom.State, ('CA', 'GA', 'KS', 'LA', 'TX'))]

# We concentrate our study on the movements (entries & exits) during the fiscal years 2015 and 2016
movements = fandom[((fandom.Opening_date >= '2015-01-31') & (fandom.Opening_date <= '2017-01-31')) | ((fandom.Closing_date >= '2015-01-31') & (fandom.Closing_date <= '2017-01-31'))]
#movements['year'] = movements.Opening_date.dt.year
#movements['month'] = movements.Opening_date.dt.month

## FIRST MODEL : one month

Regression model :
$$ Price_{i, t} = \alpha + \beta \cdot treat_i + \gamma \cdot post_t + \delta \cdot treat_i \cdot post_{t} + \varepsilon_{i, t}$$

In [10]:
year = 2016
month = 1

In [11]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'FRESH PRODUCE']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in august 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index)) & (movements.Opening_date.dt.year == year) & (movements.Opening_date.dt.month == month)]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['post'] = (control.purchase_month > month) & (control.purchase_year == year)
treatment['treat'] = True
treatment['post'] = (treatment.purchase_month > month) & (treatment.purchase_year == year)


# Final dataset for the regression :

df = pd.concat((control, treatment))[['upc_price', 'treat', 'post']]
df = df[df.upc_price != 0]

Size of the control group: 2289.
Size of the treatment group: 15.


In [12]:
reg = smf.ols(formula='np.log(upc_price) ~ treat * post', data=df)
results = reg.fit()

In [13]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,2.083
Date:,"Mon, 31 Oct 2022",Prob (F-statistic):,0.1
Time:,16:35:08,Log-Likelihood:,-8959.8
No. Observations:,47443,AIC:,17930.0
Df Residuals:,47439,BIC:,17960.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7333,0.002,401.555,0.000,0.730,0.737
treat[T.True],-0.0156,0.021,-0.742,0.458,-0.057,0.026
post[T.True],0.0059,0.003,2.175,0.030,0.001,0.011
treat[T.True]:post[T.True],-0.0075,0.031,-0.242,0.809,-0.068,0.053

0,1,2,3
Omnibus:,8021.388,Durbin-Watson:,1.313
Prob(Omnibus):,0.0,Jarque-Bera (JB):,47193.952
Skew:,-0.686,Prob(JB):,0.0
Kurtosis:,7.689,Cond. No.,29.1


## SECOND MODEL : all months

Regression model :
$$ Price_{i, t} = \alpha + \beta \cdot treat_i + \delta \cdot treat_i \cdot post_{t} + \varepsilon_{i, t}$$

#### Entire control group

In [14]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'FRESH PRODUCE']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
treatment['treat'] = True
treatment['interaction'] = (treatment.purchase_month > treatment.Opening_date.dt.month) & (treatment.purchase_year >= treatment.Opening_date.dt.year)


# Final dataset for the regression :

df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction']]
df = df[df.upc_price != 0]

Size of the control group: 2289.
Size of the treatment group: 89.


In [15]:
reg = smf.ols(formula='np.log(upc_price) ~ treat + interaction', data=df)
results = reg.fit()

In [16]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,1.33
Date:,"Mon, 31 Oct 2022",Prob (F-statistic):,0.264
Time:,16:35:26,Log-Likelihood:,-8722.5
No. Observations:,49205,AIC:,17450.0
Df Residuals:,49202,BIC:,17480.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7360,0.001,552.736,0.000,0.733,0.739
treat[T.True],0.0062,0.008,0.787,0.432,-0.009,0.021
interaction[T.True],-0.0215,0.013,-1.622,0.105,-0.048,0.004

0,1,2,3
Omnibus:,8429.345,Durbin-Watson:,1.308
Prob(Omnibus):,0.0,Jarque-Bera (JB):,50995.363
Skew:,-0.689,Prob(JB):,0.0
Kurtosis:,7.793,Cond. No.,10.9


In [17]:
np.exp(results.params[2])-1

-0.021281645421718176

#### Control group with county density btw fisrt and third quartiles of treatment group density

In [18]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'FRESH PRODUCE']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
treatment['treat'] = True
treatment['interaction'] = (treatment.purchase_month > treatment.Opening_date.dt.month) & (treatment.purchase_year >= treatment.Opening_date.dt.year)


# Final dataset for the regression :

df = pd.concat((control_m2, treatment))[['upc_price', 'treat', 'interaction']]
df = df[df.upc_price != 0]

Size of the control group: 2289.
Size of the treatment group: 89.


In [19]:
reg = smf.ols(formula='np.log(upc_price) ~ treat + interaction', data=df)
results = reg.fit()

In [20]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.105
Model:,OLS,Adj. R-squared:,0.105
Method:,Least Squares,F-statistic:,1476.0
Date:,"Mon, 31 Oct 2022",Prob (F-statistic):,0.0
Time:,16:35:40,Log-Likelihood:,6557.9
No. Observations:,25227,AIC:,-13110.0
Df Residuals:,25224,BIC:,-13090.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.9645,0.001,785.738,0.000,0.962,0.967
treat[T.True],-0.2224,0.005,-43.398,0.000,-0.232,-0.212
interaction[T.True],-0.0215,0.009,-2.511,0.012,-0.038,-0.005

0,1,2,3
Omnibus:,3006.641,Durbin-Watson:,0.718
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9604.24
Skew:,-0.617,Prob(JB):,0.0
Kurtosis:,5.759,Cond. No.,7.84


In [21]:
np.exp(results.params[2])-1

-0.02128164542171851

#### Control group with county density btw min and max of treatment group density

In [22]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'FRESH PRODUCE']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
treatment['treat'] = True
treatment['interaction'] = (treatment.purchase_month > treatment.Opening_date.dt.month) & (treatment.purchase_year >= treatment.Opening_date.dt.year)


# Final dataset for the regression :

df = pd.concat((control_m, treatment))[['upc_price', 'treat', 'interaction']]
df = df[df.upc_price != 0]

Size of the control group: 2289.
Size of the treatment group: 89.


In [23]:
reg = smf.ols(formula='np.log(upc_price) ~ treat + interaction', data=df)
results = reg.fit()

In [24]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.033
Model:,OLS,Adj. R-squared:,0.033
Method:,Least Squares,F-statistic:,1262.0
Date:,"Mon, 31 Oct 2022",Prob (F-statistic):,0.0
Time:,16:35:57,Log-Likelihood:,7115.2
No. Observations:,74195,AIC:,-14220.0
Df Residuals:,74192,BIC:,-14200.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.9779,0.001,1194.096,0.000,0.976,0.979
treat[T.True],-0.2358,0.006,-39.824,0.000,-0.247,-0.224
interaction[T.True],-0.0215,0.010,-2.131,0.033,-0.041,-0.002

0,1,2,3
Omnibus:,7985.809,Durbin-Watson:,0.853
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27729.596
Skew:,-0.533,Prob(JB):,0.0
Kurtosis:,5.799,Cond. No.,13.4


In [25]:
np.exp(results.params[2])-1

-0.021281645421718842

## THIRD MODEL : adding time fixed effects

Regression model :
$$ Price_{i, t} = \alpha + \beta \cdot treat_i + \delta \cdot treat_i \cdot post_{t} + \sum_{\tau=Janv15}^{Dec16}\gamma_{\tau} \cdot \mathbb{1}(t=\tau) + \varepsilon_{i, t}$$

#### Entire control group

In [26]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'FRESH PRODUCE']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
control['time_fixed_effects'] = list(zip(control.purchase_year, control.purchase_month))
treatment['treat'] = True
treatment['interaction'] = (treatment.purchase_month > treatment.Opening_date.dt.month) & (treatment.purchase_year >= treatment.Opening_date.dt.year)
treatment['time_fixed_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))


# Final dataset for the regression :

df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'time_fixed_effects']]
df = df[df.upc_price != 0]

Size of the control group: 2289.
Size of the treatment group: 89.


In [27]:
reg = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_fixed_effects)', data=df)
results = reg.fit()

In [28]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,8.51
Date:,"Mon, 31 Oct 2022",Prob (F-statistic):,1.2600000000000002e-31
Time:,16:36:18,Log-Likelihood:,-8617.7
No. Observations:,49205,AIC:,17290.0
Df Residuals:,49179,BIC:,17520.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7266,0.006,114.432,0.000,0.714,0.739
treat[T.True],0.0058,0.008,0.737,0.461,-0.010,0.021
interaction[T.True],-0.0202,0.013,-1.517,0.129,-0.046,0.006
"C(time_fixed_effects)[T.(2015, 2)]",0.0005,0.009,0.056,0.955,-0.017,0.018
"C(time_fixed_effects)[T.(2015, 3)]",-0.0162,0.009,-1.811,0.070,-0.034,0.001
"C(time_fixed_effects)[T.(2015, 4)]",-0.0146,0.009,-1.626,0.104,-0.032,0.003
"C(time_fixed_effects)[T.(2015, 5)]",0.0035,0.009,0.385,0.700,-0.014,0.021
"C(time_fixed_effects)[T.(2015, 6)]",0.0306,0.009,3.409,0.001,0.013,0.048
"C(time_fixed_effects)[T.(2015, 7)]",0.0250,0.009,2.793,0.005,0.007,0.043

0,1,2,3
Omnibus:,8520.635,Durbin-Watson:,1.309
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52266.916
Skew:,-0.694,Prob(JB):,0.0
Kurtosis:,7.855,Cond. No.,24.9


In [29]:
np.exp(results.params[2])-1

-0.020037254547715322

#### Control group with county density btw fisrt and third quartiles of treatment group density

In [30]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'FRESH PRODUCE']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
control_m2['time_fixed_effects'] = list(zip(control_m2.purchase_year, control_m2.purchase_month))
treatment['treat'] = True
treatment['interaction'] = (treatment.purchase_month > treatment.Opening_date.dt.month) & (treatment.purchase_year >= treatment.Opening_date.dt.year)
treatment['time_fixed_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))


# Final dataset for the regression :

df = pd.concat((control_m2, treatment))[['upc_price', 'treat', 'interaction', 'time_fixed_effects']]
df = df[df.upc_price != 0]

Size of the control group: 2289.
Size of the treatment group: 89.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_m2['time_fixed_effects'] = list(zip(control_m2.purchase_year, control_m2.purchase_month))


In [31]:
reg = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_fixed_effects)', data=df)
results = reg.fit()

In [32]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.159
Model:,OLS,Adj. R-squared:,0.158
Method:,Least Squares,F-statistic:,190.3
Date:,"Mon, 31 Oct 2022",Prob (F-statistic):,0.0
Time:,16:36:33,Log-Likelihood:,7343.1
No. Observations:,25227,AIC:,-14630.0
Df Residuals:,25201,BIC:,-14420.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0639,0.006,190.418,0.000,1.053,1.075
treat[T.True],-0.2318,0.005,-46.404,0.000,-0.242,-0.222
interaction[T.True],0.0058,0.008,0.685,0.493,-0.011,0.022
"C(time_fixed_effects)[T.(2015, 2)]",-0.0344,0.008,-4.368,0.000,-0.050,-0.019
"C(time_fixed_effects)[T.(2015, 3)]",-0.0512,0.008,-6.505,0.000,-0.067,-0.036
"C(time_fixed_effects)[T.(2015, 4)]",-0.0668,0.008,-8.468,0.000,-0.082,-0.051
"C(time_fixed_effects)[T.(2015, 5)]",-0.0741,0.008,-9.395,0.000,-0.090,-0.059
"C(time_fixed_effects)[T.(2015, 6)]",-0.0709,0.008,-8.978,0.000,-0.086,-0.055
"C(time_fixed_effects)[T.(2015, 7)]",-0.0687,0.008,-8.713,0.000,-0.084,-0.053

0,1,2,3
Omnibus:,2933.806,Durbin-Watson:,0.733
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9959.049
Skew:,-0.584,Prob(JB):,0.0
Kurtosis:,5.848,Cond. No.,25.0


In [33]:
np.exp(results.params[2])-1

0.0057961689937366945

#### Control group with county density btw min and max of treatment group density

In [34]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'FRESH PRODUCE']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
control_m['time_fixed_effects'] = list(zip(control_m.purchase_year, control_m.purchase_month))
treatment['treat'] = True
treatment['interaction'] = (treatment.purchase_month > treatment.Opening_date.dt.month) & (treatment.purchase_year >= treatment.Opening_date.dt.year)
treatment['time_fixed_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))


# Final dataset for the regression :

df = pd.concat((control_m, treatment))[['upc_price', 'treat', 'interaction', 'time_fixed_effects']]
df = df[df.upc_price != 0]

Size of the control group: 2289.
Size of the treatment group: 89.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_m['time_fixed_effects'] = list(zip(control_m.purchase_year, control_m.purchase_month))


In [35]:
reg = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_fixed_effects)', data=df)
results = reg.fit()

In [36]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.072
Model:,OLS,Adj. R-squared:,0.072
Method:,Least Squares,F-statistic:,231.3
Date:,"Mon, 31 Oct 2022",Prob (F-statistic):,0.0
Time:,16:36:50,Log-Likelihood:,8659.7
No. Observations:,74195,AIC:,-17270.0
Df Residuals:,74169,BIC:,-17030.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0767,0.004,278.879,0.000,1.069,1.084
treat[T.True],-0.2442,0.006,-42.033,0.000,-0.256,-0.233
interaction[T.True],0.0031,0.010,0.312,0.755,-0.016,0.023
"C(time_fixed_effects)[T.(2015, 2)]",-0.0332,0.005,-6.091,0.000,-0.044,-0.023
"C(time_fixed_effects)[T.(2015, 3)]",-0.0484,0.005,-8.883,0.000,-0.059,-0.038
"C(time_fixed_effects)[T.(2015, 4)]",-0.0739,0.005,-13.534,0.000,-0.085,-0.063
"C(time_fixed_effects)[T.(2015, 5)]",-0.0767,0.005,-14.006,0.000,-0.087,-0.066
"C(time_fixed_effects)[T.(2015, 6)]",-0.0799,0.005,-14.615,0.000,-0.091,-0.069
"C(time_fixed_effects)[T.(2015, 7)]",-0.0766,0.005,-13.995,0.000,-0.087,-0.066

0,1,2,3
Omnibus:,8364.069,Durbin-Watson:,0.868
Prob(Omnibus):,0.0,Jarque-Bera (JB):,30262.597
Skew:,-0.547,Prob(JB):,0.0
Kurtosis:,5.931,Cond. No.,24.9


In [42]:
np.exp(results.params[2])-1

-0.024562597288311894

## THIRD MODEL : adding entity effects

Regression model :
$$ Price_{i, t} = \alpha + \beta \cdot treat_i + \delta \cdot treat_i \cdot post_{t} + \sum_{\tau=Janv15}^{Dec16}\gamma_{\tau} \cdot \mathbb{1}(t=\tau) + \sum_{s \in USStates}\gamma_{s} \cdot \mathbb{1}(state = s) + \varepsilon_{i, t}$$

#### Entire control group

In [37]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'FRESH PRODUCE']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
control['time_fixed_effects'] = list(zip(control.purchase_month, control.purchase_year))
treatment['treat'] = True
treatment['interaction'] = (treatment.purchase_month > treatment.Opening_date.dt.month) & (treatment.purchase_year >= treatment.Opening_date.dt.year)
treatment['time_fixed_effects'] = list(zip(treatment.purchase_month, treatment.purchase_year))


# Final dataset for the regression :

df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'time_fixed_effects', 'store_state']]
df = df[df.upc_price != 0]

Size of the control group: 2289.
Size of the treatment group: 89.


In [38]:
reg = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_fixed_effects) + C(store_state)', data=df)
results = reg.fit()

In [39]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.054
Model:,OLS,Adj. R-squared:,0.053
Method:,Least Squares,F-statistic:,38.78
Date:,"Mon, 31 Oct 2022",Prob (F-statistic):,0.0
Time:,16:37:12,Log-Likelihood:,-7345.7
No. Observations:,49205,AIC:,14840.0
Df Residuals:,49131,BIC:,15490.0
Df Model:,73,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6954,0.011,66.023,0.000,0.675,0.716
treat[T.True],-0.0003,0.008,-0.032,0.975,-0.016,0.015
interaction[T.True],-0.0092,0.013,-0.704,0.481,-0.035,0.016
"C(time_fixed_effects)[T.(1, 2016)]",0.0269,0.009,3.067,0.002,0.010,0.044
"C(time_fixed_effects)[T.(2, 2015)]",0.0008,0.009,0.091,0.927,-0.016,0.018
"C(time_fixed_effects)[T.(2, 2016)]",0.0334,0.009,3.801,0.000,0.016,0.051
"C(time_fixed_effects)[T.(3, 2015)]",-0.0151,0.009,-1.733,0.083,-0.032,0.002
"C(time_fixed_effects)[T.(3, 2016)]",0.0198,0.009,2.260,0.024,0.003,0.037
"C(time_fixed_effects)[T.(4, 2015)]",-0.0147,0.009,-1.674,0.094,-0.032,0.003

0,1,2,3
Omnibus:,8021.547,Durbin-Watson:,1.378
Prob(Omnibus):,0.0,Jarque-Bera (JB):,58647.009
Skew:,-0.588,Prob(JB):,0.0
Kurtosis:,8.217,Cond. No.,52.3


#### Control group with county density btw fisrt and third quartiles of treatment group density

In [40]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'FRESH PRODUCE']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
control_m2['time_fixed_effects'] = list(zip(control_m2.purchase_month, control_m2.purchase_year))
treatment['treat'] = True
treatment['interaction'] = (treatment.purchase_month > treatment.Opening_date.dt.month) & (treatment.purchase_year >= treatment.Opening_date.dt.year)
treatment['time_fixed_effects'] = list(zip(treatment.purchase_month, treatment.purchase_year))


# Final dataset for the regression :

df = pd.concat((control_m2, treatment))[['upc_price', 'treat', 'interaction', 'time_fixed_effects', 'store_state']]
df = df[df.upc_price != 0]

Size of the control group: 2289.
Size of the treatment group: 89.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_m2['time_fixed_effects'] = list(zip(control_m2.purchase_month, control_m2.purchase_year))


In [41]:
reg = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_fixed_effects) + C(store_state)', data=df)
results = reg.fit()

In [42]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.333
Model:,OLS,Adj. R-squared:,0.331
Method:,Least Squares,F-statistic:,177.1
Date:,"Mon, 31 Oct 2022",Prob (F-statistic):,0.0
Time:,16:37:25,Log-Likelihood:,10275.0
No. Observations:,25227,AIC:,-20410.0
Df Residuals:,25155,BIC:,-19820.0
Df Model:,71,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.1283,0.009,125.058,0.000,1.111,1.146
treat[T.True],-0.2537,0.005,-52.118,0.000,-0.263,-0.244
interaction[T.True],0.0167,0.008,2.207,0.027,0.002,0.032
"C(time_fixed_effects)[T.(1, 2016)]",-0.1074,0.007,-15.294,0.000,-0.121,-0.094
"C(time_fixed_effects)[T.(2, 2015)]",-0.0342,0.007,-4.874,0.000,-0.048,-0.020
"C(time_fixed_effects)[T.(2, 2016)]",-0.1294,0.007,-18.427,0.000,-0.143,-0.116
"C(time_fixed_effects)[T.(3, 2015)]",-0.0511,0.007,-7.283,0.000,-0.065,-0.037
"C(time_fixed_effects)[T.(3, 2016)]",-0.1403,0.007,-19.950,0.000,-0.154,-0.127
"C(time_fixed_effects)[T.(4, 2015)]",-0.0665,0.007,-9.459,0.000,-0.080,-0.053

0,1,2,3
Omnibus:,3598.46,Durbin-Watson:,0.924
Prob(Omnibus):,0.0,Jarque-Bera (JB):,18654.856
Skew:,-0.591,Prob(JB):,0.0
Kurtosis:,7.043,Cond. No.,55.4


#### Control group with county density btw min and max of treatment group density

In [46]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'FRESH PRODUCE']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
control_m['time_fixed_effects'] = list(zip(control_m.purchase_month, control_m.purchase_year))
treatment['treat'] = True
treatment['interaction'] = (treatment.purchase_month > treatment.Opening_date.dt.month) & (treatment.purchase_year >= treatment.Opening_date.dt.year)
treatment['time_fixed_effects'] = list(zip(treatment.purchase_month, treatment.purchase_year))


# Final dataset for the regression :

df = pd.concat((control_m, treatment))[['upc_price', 'treat', 'interaction', 'time_fixed_effects', 'store_state']]
df = df[df.upc_price != 0]

Size of the control group: 2279.
Size of the treatment group: 89.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_m['time_fixed_effects'] = list(zip(control_m.purchase_month, control_m.purchase_year))


In [47]:
reg = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_fixed_effects) + C(store_state)', data=df)
results = reg.fit()

In [48]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.179
Model:,OLS,Adj. R-squared:,0.178
Method:,Least Squares,F-statistic:,221.2
Date:,"Thu, 27 Oct 2022",Prob (F-statistic):,0.0
Time:,13:15:24,Log-Likelihood:,14419.0
No. Observations:,74189,AIC:,-28690.0
Df Residuals:,74115,BIC:,-28010.0
Df Model:,73,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.1253,0.006,187.640,0.000,1.114,1.137
treat[T.True],-0.0041,0.006,-0.750,0.453,-0.015,0.007
interaction[T.True],-0.0114,0.009,-1.238,0.216,-0.030,0.007
"C(time_fixed_effects)[T.(1, 2016)]",-0.1101,0.005,-21.798,0.000,-0.120,-0.100
"C(time_fixed_effects)[T.(2, 2015)]",-0.0336,0.005,-6.658,0.000,-0.044,-0.024
"C(time_fixed_effects)[T.(2, 2016)]",-0.1271,0.005,-25.108,0.000,-0.137,-0.117
"C(time_fixed_effects)[T.(3, 2015)]",-0.0492,0.005,-9.743,0.000,-0.059,-0.039
"C(time_fixed_effects)[T.(3, 2016)]",-0.1396,0.005,-27.585,0.000,-0.150,-0.130
"C(time_fixed_effects)[T.(4, 2015)]",-0.0737,0.005,-14.578,0.000,-0.084,-0.064

0,1,2,3
Omnibus:,10905.613,Durbin-Watson:,1.003
Prob(Omnibus):,0.0,Jarque-Bera (JB):,58891.682
Skew:,-0.602,Prob(JB):,0.0
Kurtosis:,7.195,Cond. No.,59.2


## THIRD MODEL : adding entity effects

Regression model :
$$
\begin{align}
Price_{i, t} &= \alpha + \beta \cdot treat_i + \sum_{\tau=Feb15}^{Dec16} \delta_{\tau} \cdot treat_i \cdot \mathbb{1}(t=\tau)  + \sum_{\tau=Feb15}^{Dec16}\gamma_{\tau} \cdot \mathbb{1}(t=\tau) + \sum_{s \in USStates}\gamma_{s} \cdot \mathbb{1}(state = s) + \varepsilon_{i, t}\\
&= \beta_i + \sum_{\tau=Feb15}^{Dec16} \delta_{\tau} \cdot treat_i \cdot \mathbb{1}(t=\tau)  + \rho_t + \varepsilon_{i, t}
\end{align}
$$