In [2]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

In [3]:
# Load the aggregated Nielsen extract of each year and tag its rows with that year.
nielsen15 = pd.read_csv('../../Nielsen/aggregated_nielsen_2015.csv').assign(year=2015)
nielsen16 = pd.read_csv('../../Nielsen/aggregated_nielsen_2016.csv').assign(year=2016)

# Stack both years into one frame. ignore_index=True gives the result a fresh
# 0..n-1 index; without it the row labels of the 2015 and 2016 files would be
# duplicated, which makes any later label-based selection ambiguous.
nielsen = pd.concat((nielsen15, nielsen16), ignore_index=True)

In [4]:
# County-level average household size / population density table.
# Use a path relative to the notebook instead of a user-specific absolute
# Windows path, so the cell runs on any checkout of the project.
# NOTE(review): assumes the file sits in the same data_collection/plein_de_data
# folder the fandom CSV is read from -- confirm.
density_path = "../data_collection/plein_de_data/Average_Household_Size_and_Population_Density_-_County.csv"
density = pd.read_csv(density_path)

In [5]:
# Keep only the county identifier and the population-density column.
density = density.loc[:, ["GEOID", "B01001_calc_PopDensity"]]

In [6]:
# The study focuses on milk prices: keep only rows of the 'MILK' product group.
is_milk = nielsen["product_group_descr"].eq('MILK')
milk = nielsen.loc[is_milk]

In [7]:
# Attach the county population density to every milk row. The inner join drops
# rows whose county FIPS code has no match in the density table.
milk_d = milk.merge(
    density,
    how='inner',
    left_on='guessed_store_county_fips',
    right_on='GEOID',
)

In [8]:
## Entry/exit dates
fandom_columns = ['State', 'County_name', 'County_fips', 'Opening_date', 'Closing_date']
fandom = pd.read_csv(
    '../data_collection/plein_de_data/fandom_traitées.csv',
    parse_dates=['Opening_date', 'Closing_date'],
)[fandom_columns]

# Drop the states in which we do not trust our data (some mistakes still remain)
untrusted_states = ('CA', 'GA', 'KS', 'LA', 'TX')
fandom = fandom[~fandom.State.isin(untrusted_states)]

# Keep the movements (entries & exits) that fall within fiscal years 2015 and
# 2016; both bounds of the window are inclusive.
window_start, window_end = '2015-01-31', '2017-01-31'
opened_in_window = fandom.Opening_date.between(window_start, window_end)
closed_in_window = fandom.Closing_date.between(window_start, window_end)
movements = fandom[opened_in_window | closed_in_window]

In [9]:
# Control group: all counties where nothing (no entry nor exit) happened.
# The membership test is done against the County_fips column only; handing the
# whole `movements` frame to np.isin would flatten every cell (state codes,
# county names, dates) into the lookup set.
moved_fips = movements.County_fips
control = milk_d[~milk_d.guessed_store_county_fips.isin(moved_fips)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# Treatment group: counties with exactly one movement in the window.
# groupby().count() counts non-null cells, so the filter below keeps counties
# whose rows hold a single non-null date across Opening_date and Closing_date.
# NOTE(review): this does not check that the single date is an opening, nor
# that it falls in 2016 -- confirm this matches the intended definition.
count = movements.groupby('County_fips').count()
count = count[count.Opening_date + count.Closing_date == 1]  # No more than one movement in the treatment group
treatment_movements = movements[movements.County_fips.isin(count.index)]

treatment = milk_d[milk_d.guessed_store_county_fips.isin(treatment_movements.County_fips)].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


# Dummies for the regression
control['treat'] = False
control['interaction'] = False
treatment['treat'] = True

# Post-entry indicator: the purchase happened strictly after the opening month.
# (year, month) must be compared as an ordered pair: comparing the month alone
# would flag e.g. January 2016 as "before" an October 2015 opening.
# Rows with a missing Opening_date compare False and stay untreated.
opening_year = treatment.Opening_date.dt.year
opening_month = treatment.Opening_date.dt.month
later_year = treatment.purchase_year > opening_year
same_year_later_month = (treatment.purchase_year == opening_year) & (treatment.purchase_month > opening_month)
treatment['interaction'] = later_year | same_year_later_month
treatment.describe()

Size of the control group: 2322.
Size of the treatment group: 89.


Unnamed: 0,guessed_store_county_fips,purchase_year,purchase_month,upc_price,upc_price_std,nb_of_obs,year,GEOID,B01001_calc_PopDensity,County_fips
count,3981.0,3981.0,3981.0,3981.0,3918.0,3981.0,3981.0,3981.0,3981.0,3981.0
mean,29728.964833,2015.502386,6.500377,2.713584,0.815189,72.80005,2015.502386,29728.964833,272.163924,29728.964833
std,16549.184944,0.500057,3.454851,0.465223,0.413785,143.02871,0.500057,16549.184944,530.198716,16549.184944
min,1043.0,2015.0,1.0,1.15,0.0,1.0,2015.0,1043.0,3.691471,1043.0
25%,12103.0,2015.0,3.0,2.386122,0.628218,12.0,2015.0,12103.0,46.316148,12103.0
50%,34029.0,2016.0,7.0,2.721429,0.788764,29.0,2016.0,34029.0,116.010549,34029.0
75%,45045.0,2016.0,10.0,3.01,0.946196,73.0,2016.0,45045.0,299.58893,45045.0
max,56021.0,2016.0,12.0,7.0125,8.662295,1618.0,2016.0,56021.0,4322.952569,56021.0


In [10]:
# Population density of each treated county: average the density column per
# GEOID. agg(["mean"]) yields two-level columns keyed by
# ('B01001_calc_PopDensity', 'mean').
treatment_d = treatment.loc[:, ["GEOID", "B01001_calc_PopDensity"]]
density_per_county_t = treatment_d.groupby("GEOID").agg(["mean"])
density_per_county_t.describe()

Unnamed: 0_level_0,B01001_calc_PopDensity
Unnamed: 0_level_1,mean
count,89.0
mean,265.11416
std,523.002419
min,3.691471
25%,42.854831
50%,107.326459
75%,288.317756
max,4322.952569


In [11]:
# Criterion for the control group: density within the range spanned by the
# treated counties (both bounds inclusive).
treated_density = density_per_county_t[('B01001_calc_PopDensity', 'mean')]
min_d = treated_density.min()
max_d = treated_density.max()
print(min_d, max_d)
print(control.shape[0])

in_treated_range = control['B01001_calc_PopDensity'].between(min_d, max_d)
control_m = control[in_treated_range]
# Rows kept, and the share of the full control sample they represent.
print(control_m.shape[0], control_m.shape[0]/control.shape[0])

3.69147125917798 4322.95256868476
76333
72075 0.9442180970222578


In [12]:
# Stricter criterion: keep control rows whose density lies between the first
# and third quartiles of the treated counties' density (bounds inclusive).
treated_density_mean = density_per_county_t[('B01001_calc_PopDensity', 'mean')]
first_quartile = treated_density_mean.quantile(q=0.25)
third_quartile = treated_density_mean.quantile(q=0.75)

in_interquartile_range = control['B01001_calc_PopDensity'].between(first_quartile, third_quartile)
control_m2 = control[in_interquartile_range]
# Rows kept, and the share of the full control sample they represent.
print(control_m2.shape[0], control_m2.shape[0]/control.shape[0])

23105 0.3026869112965559


In [14]:
# Restrict control_m to the store identifiers, purchase period, price
# statistics and the two dummies; the GEOID/density columns are dropped.
control_m = control_m.loc[:, [
    "is_walmart",
    "store_state",
    "guessed_store_county",
    "guessed_store_county_fips",
    "purchase_year",
    "purchase_month",
    "product_group_descr",
    "upc_price",
    "upc_price_std",
    "nb_of_obs",
    "year",
    "treat",
    "interaction",
]]

In [15]:
# Restrict control_m2 to the store identifiers, purchase period, price
# statistics and the two dummies; the GEOID/density columns are dropped.
control_m2 = control_m2.loc[:, [
    "is_walmart",
    "store_state",
    "guessed_store_county",
    "guessed_store_county_fips",
    "purchase_year",
    "purchase_month",
    "product_group_descr",
    "upc_price",
    "upc_price_std",
    "nb_of_obs",
    "year",
    "treat",
    "interaction",
]]