In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

In [2]:
nielsen15 = pd.read_csv('../../Nielsen/aggregated_nielsen_2015.csv')
nielsen16 = pd.read_csv('../..//Nielsen/aggregated_nielsen_2016.csv')
nielsen15['year'] = 2015
nielsen16['year'] = 2016

In [3]:
nielsen = pd.concat((nielsen15, nielsen16))
nielsen = nielsen[~nielsen.is_walmart]

In [4]:
## Entriy/exit dates
fandom = pd.read_csv('../data_collection/plein_de_data/fandom_traitées.csv', parse_dates=['Opening_date', 'Closing_date'])[['State', 'County_name', 'County_fips', 'Opening_date', 'Closing_date']]

# We drop the state in which we do not trust our data (some mistakes stil remain)
fandom = fandom[~np.isin(fandom.State, ('CA', 'GA', 'KS', 'LA', 'TX'))]

# We concentrate our study on the movements (entries & exits) during the fiscal years 2015 and 2016
movements = fandom[((fandom.Opening_date >= '2015-01-31') & (fandom.Opening_date <= '2017-01-31')) | ((fandom.Closing_date >= '2015-01-31') & (fandom.Closing_date <= '2017-01-31'))]
#movements['year'] = movements.Opening_date.dt.year
#movements['month'] = movements.Opening_date.dt.month

## FIRST MODEL : one month

Regression model :
$$ Price_{i, t} = \alpha + \beta \cdot treat_i + \gamma \cdot post_t + \delta \cdot treat_i \cdot post_{t} + \varepsilon_{i, t}$$

In [5]:
year = 2016
month = 1

In [6]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'MILK']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in august 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index)) & (movements.Opening_date.dt.year == year) & (movements.Opening_date.dt.month == month)]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['post'] = (control.purchase_month > month) & (control.purchase_year == year)
treatment['treat'] = True
treatment['post'] = (treatment.purchase_month > month) & (treatment.purchase_year == year)


# Final dataset for the regression :

df = pd.concat((control, treatment))[['upc_price', 'treat', 'post']]
df = df[df.upc_price != 0]

Size of the control group: 2279.
Size of the treatment group: 15.


In [7]:
reg = smf.ols(formula='np.log(upc_price) ~ treat * post', data=df)
results = reg.fit()

In [8]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.015
Method:,Least Squares,F-statistic:,220.3
Date:,"Thu, 27 Oct 2022",Prob (F-statistic):,7.49e-142
Time:,10:13:47,Log-Likelihood:,1814.8
No. Observations:,44561,AIC:,-3622.0
Df Residuals:,44557,BIC:,-3587.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0080,0.001,673.336,0.000,1.005,1.011
treat[T.True],-0.0396,0.017,-2.366,0.018,-0.072,-0.007
post[T.True],-0.0561,0.002,-25.288,0.000,-0.060,-0.052
treat[T.True]:post[T.True],-0.0121,0.025,-0.489,0.625,-0.060,0.036

0,1,2,3
Omnibus:,4270.644,Durbin-Watson:,0.957
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22451.454
Skew:,-0.319,Prob(JB):,0.0
Kurtosis:,6.418,Cond. No.,28.3


## SECOND MODEL : all months

Regression model :
$$ Price_{i, t} = \alpha + \beta \cdot treat_i + \delta \cdot treat_i \cdot post_{t} + \varepsilon_{i, t}$$

In [9]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'MILK']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
treatment['treat'] = True
treatment['interaction'] = (treatment.purchase_month > treatment.Opening_date.dt.month) & (treatment.purchase_year >= treatment.Opening_date.dt.year)


# Final dataset for the regression :

df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction']]
df = df[df.upc_price != 0]

Size of the control group: 2279.
Size of the treatment group: 89.


In [10]:
reg = smf.ols(formula='np.log(upc_price) ~ treat + interaction', data=df)
results = reg.fit()

In [11]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,12.01
Date:,"Thu, 27 Oct 2022",Prob (F-statistic):,6.08e-06
Time:,10:13:51,Log-Likelihood:,1977.3
No. Observations:,46318,AIC:,-3949.0
Df Residuals:,46315,BIC:,-3922.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.9824,0.001,890.825,0.000,0.980,0.985
treat[T.True],0.0240,0.006,3.810,0.000,0.012,0.036
interaction[T.True],-0.0502,0.011,-4.709,0.000,-0.071,-0.029

0,1,2,3
Omnibus:,4417.753,Durbin-Watson:,0.925
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21484.694
Skew:,-0.348,Prob(JB):,0.0
Kurtosis:,6.263,Cond. No.,10.6


In [12]:
np.exp(results.params[2])-1

-0.0489590882635037

## THIRD MODEL : adding time fixed effects

Regression model :
$$ Price_{i, t} = \alpha + \beta \cdot treat_i + \delta \cdot treat_i \cdot post_{t} + \sum_{\tau=Janv15}^{Dec16}\gamma_{\tau} \cdot \mathbb{1}(t=\tau) + \varepsilon_{i, t}$$

In [13]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'MILK']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
control['time_fixed_effects'] = list(zip(control.purchase_year, control.purchase_month))
treatment['treat'] = True
treatment['interaction'] = (treatment.purchase_month > treatment.Opening_date.dt.month) & (treatment.purchase_year >= treatment.Opening_date.dt.year)
treatment['time_fixed_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))


# Final dataset for the regression :

df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'time_fixed_effects']]
df = df[df.upc_price != 0]

Size of the control group: 2279.
Size of the treatment group: 89.


In [14]:
reg = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_fixed_effects)', data=df)
results = reg.fit()

In [15]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.039
Model:,OLS,Adj. R-squared:,0.038
Method:,Least Squares,F-statistic:,74.25
Date:,"Thu, 27 Oct 2022",Prob (F-statistic):,0.0
Time:,10:13:55,Log-Likelihood:,2875.8
No. Observations:,46318,AIC:,-5700.0
Df Residuals:,46292,BIC:,-5472.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0689,0.005,207.880,0.000,1.059,1.079
treat[T.True],0.0186,0.006,3.014,0.003,0.007,0.031
interaction[T.True],-0.0339,0.011,-3.216,0.001,-0.055,-0.013
"C(time_fixed_effects)[T.(2015, 2)]",-0.0281,0.007,-3.864,0.000,-0.042,-0.014
"C(time_fixed_effects)[T.(2015, 3)]",-0.0396,0.007,-5.457,0.000,-0.054,-0.025
"C(time_fixed_effects)[T.(2015, 4)]",-0.0671,0.007,-9.209,0.000,-0.081,-0.053
"C(time_fixed_effects)[T.(2015, 5)]",-0.0671,0.007,-9.193,0.000,-0.081,-0.053
"C(time_fixed_effects)[T.(2015, 6)]",-0.0754,0.007,-10.361,0.000,-0.090,-0.061
"C(time_fixed_effects)[T.(2015, 7)]",-0.0697,0.007,-9.569,0.000,-0.084,-0.055

0,1,2,3
Omnibus:,4702.472,Durbin-Watson:,0.93
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21687.701
Skew:,-0.406,Prob(JB):,0.0
Kurtosis:,6.253,Cond. No.,24.8


In [16]:
np.exp(results.params[2])-1

-0.03333556562495743

## THIRD MODEL : adding entity effects

Regression model :
$$ Price_{i, t} = \alpha + \beta \cdot treat_i + \delta \cdot treat_i \cdot post_{t} + \sum_{\tau=Janv15}^{Dec16}\gamma_{\tau} \cdot \mathbb{1}(t=\tau) + \sum_{s \in USStates}\gamma_{s} \cdot \mathbb{1}(state = s) + \varepsilon_{i, t}$$

In [17]:
# We choose to focus on milk prices
milk = nielsen[nielsen.product_group_descr == 'MILK']


# The control group is composed by all states where nothing (no entry nor exit) happened.
control = milk[~np.isin(milk.guessed_store_county_fips, movements)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]

treatment = milk[np.isin(milk.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
control['time_fixed_effects'] = list(zip(control.purchase_month, control.purchase_year))
treatment['treat'] = True
treatment['interaction'] = (treatment.purchase_month > treatment.Opening_date.dt.month) & (treatment.purchase_year >= treatment.Opening_date.dt.year)
treatment['time_fixed_effects'] = list(zip(treatment.purchase_month, treatment.purchase_year))


# Final dataset for the regression :

df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'time_fixed_effects', 'store_state']]
df = df[df.upc_price != 0]

Size of the control group: 2279.
Size of the treatment group: 89.


In [18]:
reg = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_fixed_effects) + C(store_state)', data=df)
results = reg.fit()

In [19]:
results.summary()

0,1,2,3
Dep. Variable:,np.log(upc_price),R-squared:,0.171
Model:,OLS,Adj. R-squared:,0.17
Method:,Least Squares,F-statistic:,130.7
Date:,"Thu, 27 Oct 2022",Prob (F-statistic):,0.0
Time:,10:14:01,Log-Likelihood:,6309.8
No. Observations:,46318,AIC:,-12470.0
Df Residuals:,46244,BIC:,-11820.0
Df Model:,73,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0909,0.008,132.592,0.000,1.075,1.107
treat[T.True],-0.0057,0.006,-0.952,0.341,-0.017,0.006
interaction[T.True],-0.0206,0.010,-2.097,0.036,-0.040,-0.001
"C(time_fixed_effects)[T.(1, 2016)]",-0.0917,0.007,-13.533,0.000,-0.105,-0.078
"C(time_fixed_effects)[T.(2, 2015)]",-0.0284,0.007,-4.207,0.000,-0.042,-0.015
"C(time_fixed_effects)[T.(2, 2016)]",-0.1110,0.007,-16.353,0.000,-0.124,-0.098
"C(time_fixed_effects)[T.(3, 2015)]",-0.0400,0.007,-5.925,0.000,-0.053,-0.027
"C(time_fixed_effects)[T.(3, 2016)]",-0.1259,0.007,-18.575,0.000,-0.139,-0.113
"C(time_fixed_effects)[T.(4, 2015)]",-0.0666,0.007,-9.840,0.000,-0.080,-0.053

0,1,2,3
Omnibus:,6252.543,Durbin-Watson:,1.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,43500.878
Skew:,-0.454,Prob(JB):,0.0
Kurtosis:,7.66,Cond. No.,52.4


## THIRD MODEL : adding entity effects

Regression model :
$$
\begin{align}
Price_{i, t} &= \alpha + \beta \cdot treat_i + \sum_{\tau=Feb15}^{Dec16} \delta_{\tau} \cdot treat_i \cdot \mathbb{1}(t=\tau)  + \sum_{\tau=Feb15}^{Dec16}\gamma_{\tau} \cdot \mathbb{1}(t=\tau) + \sum_{s \in USStates}\gamma_{s} \cdot \mathbb{1}(state = s) + \varepsilon_{i, t}\\
&= \beta_i + \sum_{\tau=Feb15}^{Dec16} \delta_{\tau} \cdot treat_i \cdot \mathbb{1}(t=\tau)  + \rho_t + \varepsilon_{i, t}
\end{align}
$$