In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

In [2]:
# Load the two yearly Nielsen aggregate files and tag every row with the year
# of the file it came from (the frames are concatenated in a later cell).
# The 2016 path previously contained a doubled separator ('../..//Nielsen');
# both paths now follow the same pattern.
nielsen15 = pd.read_csv('../../Nielsen/aggregated_nielsen_2015.csv').assign(year=2015)
nielsen16 = pd.read_csv('../../Nielsen/aggregated_nielsen_2016.csv').assign(year=2016)

In [3]:
# Stack the 2015 and 2016 frames on top of each other, then keep only the
# rows whose `is_walmart` flag is False.
nielsen = pd.concat([nielsen15, nielsen16]).loc[lambda frame: ~frame["is_walmart"]]

In [4]:
## Entry/exit dates
# Only the location columns and the two (parsed) date columns are kept.
fandom_columns = ['State', 'County_name', 'County_fips', 'Opening_date', 'Closing_date']
fandom = pd.read_csv(
    '../data_collection/plein_de_data/fandom_traitées.csv',
    parse_dates=['Opening_date', 'Closing_date'],
)[fandom_columns]

# We drop the states in which we do not trust our data (some mistakes still
# remain); the same states are removed from the Nielsen panel.
untrusted_states = ('CA', 'GA', 'KS', 'LA', 'TX')
fandom = fandom[~fandom.State.isin(untrusted_states)]
nielsen = nielsen[~nielsen.store_state.isin(untrusted_states)]

# We concentrate our study on the movements (entries & exits) during the
# fiscal years 2015 and 2016: a row is kept when its opening date or its
# closing date falls inside the window (both bounds included).
window_start, window_end = '2015-01-31', '2017-01-31'
opened_in_window = fandom.Opening_date.between(window_start, window_end)
closed_in_window = fandom.Closing_date.between(window_start, window_end)
movements = fandom[opened_in_window | closed_in_window]
# Disabled: year/month columns derived from the opening date.
#movements['year'] = movements.Opening_date.dt.year
#movements['month'] = movements.Opening_date.dt.month

In [9]:
# Product-group labels of interest. Presumably these are values of the
# Nielsen `product_group_descr` column (the analysis cells compare that
# column with "MILK") -- confirm against the data.
categories = [
    # fresh / refrigerated
    "FRESH PRODUCE",
    "BREAD AND BAKED GOODS",
    "MILK",
    "SNACKS",
    "PACKAGED MEATS-DELI",
    "CHEESE",
    "UNPREP MEAT/POULTRY/SEAFOOD-FRZN",
    # drinks, sauces, sweets
    "CARBONATED BEVERAGES",
    "CONDIMENTS, GRAVIES, AND SAUCES",
    "CANDY",
    "JUICE, DRINKS - CANNED, BOTTLED",
    # remaining groups
    "EGGS",
    "CEREAL",
    "PASTA",
    "COT CHEESE, SOUR CREAM, TOPPINGS",
    "PAPER PRODUCTS",
    "YOGURT",
]

In [10]:
# State code associated with each product group. Every group currently maps
# to "NC"; the mapping is kept explicit so a single group can be pointed at
# another state without touching the rest.
state_for_cat = {
    "FRESH PRODUCE": "NC",
    "BREAD AND BAKED GOODS": "NC",
    "MILK": "NC",
    "SNACKS": "NC",
    "PACKAGED MEATS-DELI": "NC",
    "CHEESE": "NC",
    "UNPREP MEAT/POULTRY/SEAFOOD-FRZN": "NC",
    "CARBONATED BEVERAGES": "NC",
    "CONDIMENTS, GRAVIES, AND SAUCES": "NC",
    "CANDY": "NC",
    "JUICE, DRINKS - CANNED, BOTTLED": "NC",
    "EGGS": "NC",
    "CEREAL": "NC",
    "PASTA": "NC",
    "COT CHEESE, SOUR CREAM, TOPPINGS": "NC",
    "PAPER PRODUCTS": "NC",
    "YOGURT": "NC",
}

In [14]:
# We choose to focus on milk prices
category = "MILK"
product_group = nielsen[nielsen.product_group_descr == category]

# Bounds of the study window (both included); same dates as the ones used to
# build `movements`.
window_start, window_end = '2015-01-31', '2017-01-31'


# The control group is composed of all counties where nothing (no entry nor
# exit) happened, i.e. counties whose fips code never appears in `movements`.
# Membership is tested against the County_fips column only: passing the whole
# `movements` frame to np.isin would compare each fips code with every cell of
# every column (state codes, county names, dates), which is slow and fragile.
counties_with_movement = movements.County_fips
control = product_group[~np.isin(product_group.guessed_store_county_fips, counties_with_movement)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed of the counties with exactly one movement,
# that movement being the closing, inside the window, of a store opened before
# the window started.
# NOTE(review): this comment used to read "one entry took place in 2016", but
# the filter below selects exits -- confirm which of the two is intended.
count = movements.groupby('County_fips').count()
count = count[count.State == 1]  # No more than one movement in the treatment group
treatment_movements = movements[np.isin(movements.County_fips, count.index)]
closed_in_window = treatment_movements.Closing_date.between(window_start, window_end)
opened_before_window = treatment_movements.Opening_date < window_start
treatment_movements = treatment_movements[closed_in_window & opened_before_window]

# Keep the price rows of the treated counties and attach the store dates.
in_treated_county = np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips)
treatment = product_group[in_treated_county].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")

# Number of distinct values of each column, per state, in the treatment group.
treatment.groupby('store_state').nunique().head(60)

Size of the control group: 1822.
Size of the treatment group: 43.


Unnamed: 0_level_0,is_walmart,guessed_store_county,guessed_store_county_fips,purchase_year,purchase_month,product_group_descr,upc_price,upc_price_std,nb_of_obs,year,State,County_name,County_fips,Opening_date,Closing_date
store_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AL,1,4,4,2,12,1,76,69,28,2,1,4,4,3,1
AR,1,3,3,2,12,1,40,35,31,2,1,3,3,3,1
CO,1,1,1,2,12,1,24,24,20,2,1,1,1,1,1
CT,1,2,2,2,12,1,48,48,43,2,1,2,2,2,2
IL,1,1,1,2,12,1,24,24,22,2,1,1,1,1,1
MD,1,1,1,2,12,1,24,24,17,2,1,1,1,1,1
MI,1,1,1,2,12,1,24,24,21,2,1,1,1,1,1
MO,1,4,4,2,12,1,78,67,41,2,1,4,4,4,3
MS,1,5,5,2,12,1,80,66,12,2,1,5,5,2,1
NC,1,9,9,2,12,1,189,187,68,2,1,9,9,8,2


In [32]:
# We choose to focus on milk prices
category = "MILK"
product_group = nielsen[nielsen.product_group_descr == category]

# Bounds of the study window (both included); same dates as the ones used to
# build `movements`.
window_start, window_end = '2015-01-31', '2017-01-31'


# The control group is composed of all counties where nothing (no entry nor
# exit) happened, i.e. counties whose fips code never appears in `movements`.
# Membership is tested against the County_fips column only: passing the whole
# `movements` frame to np.isin would compare each fips code with every cell of
# every column (state codes, county names, dates), which is slow and fragile.
counties_with_movement = movements.County_fips
control = product_group[~np.isin(product_group.guessed_store_county_fips, counties_with_movement)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


# The treatment group is composed of the counties with exactly one movement,
# that movement being the closing, inside the window, of a store opened before
# the window started.
# NOTE(review): this comment used to read "one entry took place in 2016", but
# the filter below selects exits -- confirm which of the two is intended.
count = movements.groupby('County_fips').count()
count = count[count.State == 1]  # No more than one movement in the treatment group
treatment_movements = movements[np.isin(movements.County_fips, count.index)]
closed_in_window = treatment_movements.Closing_date.between(window_start, window_end)
opened_before_window = treatment_movements.Opening_date < window_start
treatment_movements = treatment_movements[closed_in_window & opened_before_window]

# Keep the price rows of the treated counties and attach the store dates.
in_treated_county = np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips)
treatment = product_group[in_treated_county].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")

# Show every treated row located in North Carolina.
# Note: this lifts the row-display limit for the rest of the session.
pd.set_option("display.max_rows", None)
treatment[treatment.store_state == "NC"]

Size of the control group: 1822.
Size of the treatment group: 43.


Unnamed: 0,is_walmart,store_state,guessed_store_county,guessed_store_county_fips,purchase_year,purchase_month,product_group_descr,upc_price,upc_price_std,nb_of_obs,year,State,County_name,County_fips,Opening_date,Closing_date
431,False,NC,CASWELL,37033,2015,1,MILK,2.202,0.601273,10,2015,NC,Caswell,37033,2014-10-01,2016-01-28
432,False,NC,CASWELL,37033,2015,2,MILK,3.045,0.205061,2,2015,NC,Caswell,37033,2014-10-01,2016-01-28
433,False,NC,CASWELL,37033,2015,3,MILK,2.75,0.735418,6,2015,NC,Caswell,37033,2014-10-01,2016-01-28
434,False,NC,CASWELL,37033,2015,4,MILK,3.148333,0.233274,6,2015,NC,Caswell,37033,2014-10-01,2016-01-28
435,False,NC,CASWELL,37033,2015,5,MILK,2.555,1.038447,8,2015,NC,Caswell,37033,2014-10-01,2016-01-28
436,False,NC,CASWELL,37033,2015,6,MILK,3.186,0.244397,5,2015,NC,Caswell,37033,2014-10-01,2016-01-28
437,False,NC,CASWELL,37033,2015,7,MILK,3.295,0.122338,4,2015,NC,Caswell,37033,2014-10-01,2016-01-28
438,False,NC,CASWELL,37033,2015,8,MILK,3.223333,0.251661,3,2015,NC,Caswell,37033,2014-10-01,2016-01-28
439,False,NC,CASWELL,37033,2015,9,MILK,3.293333,0.253824,6,2015,NC,Caswell,37033,2014-10-01,2016-01-28
440,False,NC,CASWELL,37033,2015,10,MILK,2.9975,0.874051,8,2015,NC,Caswell,37033,2014-10-01,2016-01-28


NameError: name 'results0' is not defined