In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import datetime

In [2]:
nielsen15 = pd.read_csv('../../Nielsen/aggregated_nielsen_2015.csv')
nielsen16 = pd.read_csv('../..//Nielsen/aggregated_nielsen_2016.csv')
nielsen15['year'] = 2015
nielsen16['year'] = 2016

In [3]:
nielsen = pd.concat((nielsen15, nielsen16))
nielsen = nielsen[~nielsen.is_walmart]

In [4]:
## Entriy/exit dates
fandom = pd.read_csv('../data_collection/plein_de_data/fandom_traitées.csv', parse_dates=['Opening_date', 'Closing_date'])[['State', 'County_name', 'County_fips', 'Opening_date', 'Closing_date']]

# We drop the state in which we do not trust our data (some mistakes stillremain)
fandom = fandom[~np.isin(fandom.State, ('CA', 'GA', 'KS', 'LA', 'TX'))]
nielsen = nielsen[~np.isin(nielsen.store_state, ('CA', 'GA', 'KS', 'LA', 'TX'))]

# We concentrate our study on the movements (entries & exits) during the fiscal years 2015 and 2016
#movements = fandom[((fandom.Opening_date >= '2015-01-31') & (fandom.Opening_date <= '2017-01-31')) | ((fandom.Closing_date >= '2015-01-31') & (fandom.Closing_date <= '2017-01-31'))]
# Correction des effets de bord M12
movements = fandom[((fandom.Opening_date >= '2014-01-31') & (fandom.Opening_date <= '2018-01-31')) | ((fandom.Closing_date >= '2014-01-31') & (fandom.Closing_date <= '2018-01-31'))]

In [None]:
categories=["MILK"]
nielsen['upc_price'] = nielsen.price_per_gal

In [15]:
hey = nielsen.groupby('product_group_descr').count()
categories = hey[hey.is_walmart<25000].index

In [None]:
categories = [
    "FRESH PRODUCE",
    "BREAD AND BAKED GOODS",
    "MILK",
    "SNACKS",
    "PACKAGED MEATS-DELI",
    "CHEESE",
    "UNPREP MEAT/POULTRY/SEAFOOD-FRZN",
    "CARBONATED BEVERAGES",
    "CONDIMENTS, GRAVIES, AND SAUCES",
    "CANDY",
    "JUICE, DRINKS - CANNED, BOTTLED",
    "EGGS",
    "CEREAL",
    "PASTA",
]

In [6]:
categories =[
    "COT CHEESE, SOUR CREAM, TOPPINGS",
    "PACKAGED MILK AND MODIFIERS",
    "SPICES, SEASONING, EXTRACTS",
    "SUGAR, SWEETENERS"
]

In [None]:
categories=[
    "BAKING MIXES",
    "MILK",
    "CHEESE",
    "COT CHEESE, SOUR CREAM, TOPPINGS",
    "CRACKERS"
    "CANDY",
    "PASTA",
    
]

In [16]:
pool = pd.DataFrame(nielsen.groupby(['is_walmart', 'store_state', 'guessed_store_county', 'guessed_store_county_fips', 'purchase_year', 'purchase_month']).mean()['upc_price']).reset_index()

categories = ['_']

In [6]:
post = 0

## PREMIER MODELE

$$Prices_{i, t} = \alpha + \beta treat_i + \gamma treat_i* post_t$$

In [22]:
# We choose to focus on milk prices
for category in categories:
    #product_group = nielsen[nielsen.product_group_descr == category]
    #product_group = pool
    product_group = nielsen

    # The control group is composed by all states where nothing (no entry nor exit) happened.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()
    print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    # The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
    count = movements.groupby('County_fips').count()
    count = count[count.State == 1] # No more than one movement in the treatement group
    treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
    treatment_movements = treatment_movements[(treatment_movements.Opening_date>='2015-01-31' ) & (treatment_movements.Opening_date<='2017-01-31') & ((treatment_movements.Closing_date>'2017-01-31') | (treatment_movements.Closing_date.apply(str) == 'NaT'))]

    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


    # We create our dummies for the regression
    control['treat'] = False
    control['interaction'] = False

    treatment['treat'] = True
    #treatment['interaction'] = ((treatment.Opening_date.dt.year == treatment.purchase_year) & (treatment.Opening_date.dt.month > treatment.purchase_month)) | (treatment.purchase_year > treatment.Opening_date.dt.year)
    treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015)
    treatment['opening_0'] = treatment.Opening_date.dt.month  + 12 * (treatment.Opening_date.dt.year - 2015)
    treatment['interaction'] = treatment.purchase_0 >= treatment.opening_0 + post


    # Final dataset for the regression :

    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction']]
    df = df[df.upc_price != 0]
    reg0 = smf.ols(formula='np.log(upc_price) ~ treat + interaction', data=df)
    results0 = reg0.fit()
    #if abs(results0.params[2] / results0.bse[2]) > 2. :
    if True:
        print("=========================================================")
        print(category)
        print(f"Coef : {np.exp(results0.params[2])-1}")
        print(f"Coef/err : {abs(results0.params[2] / results0.bse[2])}")
        print(f"CI_up : {np.exp(results0.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results0.conf_int(alpha=0.05)[1][2])-1}")

Size of the control group: 1802.
Size of the treatment group: 65.
_
Coef : 0.012243573513392336
Coef/err : 3.053990985459992
CI_up : 0.0043688770208243355
CI_down : 0.020180011111512908


## DEUXIEME MODELE - monthly time fixed effects

In [19]:
# We choose to focus on milk prices
for category in categories:
    #product_group = nielsen[nielsen.product_group_descr == category]
    product_group = pool

    # The control group is composed by all states where nothing (no entry nor exit) happened.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()
    print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    # The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
    count = movements.groupby('County_fips').count()
    count = count[count.State == 1] # No more than one movement in the treatement group
    treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
    treatment_movements = treatment_movements[(treatment_movements.Opening_date>='2015-01-31' ) & (treatment_movements.Opening_date<='2017-01-31') & ((treatment_movements.Closing_date>'2017-01-31') | (treatment_movements.Closing_date.apply(str) == 'NaT'))]

    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


    # We create our dummies for the regression
    control['treat'] = False
    control['interaction'] = False
    control['time_effects'] = list(zip(control.purchase_year, control.purchase_month))

    treatment['treat'] = True
    #treatment['interaction'] = ((treatment.Opening_date.dt.year == treatment.purchase_year) & (treatment.Opening_date.dt.month > treatment.purchase_month)) | (treatment.Opening_date.dt.year > treatment.purchase_year)
    treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015)
    treatment['opening_0'] = treatment.Opening_date.dt.month  + 12 * (treatment.Opening_date.dt.year - 2015)
    treatment['interaction'] = treatment.purchase_0 >= treatment.opening_0 + post
    treatment['time_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))


    # Final dataset for the regression :

    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'store_state', 'time_effects']]
    df = df[df.upc_price != 0]


    reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_effects)', data=df)
    results1 = reg1.fit()
    if True :
    #if  abs(results1.params[2] / results1.bse[2])> 2.:
        print("=========================================================")
        print(category)
        print(f"Coef : {np.exp(results1.params[2])-1}")
        print(f"Coef/err : {abs(results1.params[2] / results1.bse[2])}")
        print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][2])-1}")

Size of the control group: 1802.
Size of the treatment group: 65.
_
Coef : 0.02223552240504789
Coef/err : 1.4577683184459918
CI_up : -0.0075483686655423154
CI_down : 0.052913240579445686


## TROISIEME MODELE - state effects

In [20]:
# We choose to focus on milk prices
for category in categories:
    #product_group = nielsen[nielsen.product_group_descr == category]
    product_group = pool

    # The control group is composed by all states where nothing (no entry nor exit) happened.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()
    #print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    # The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
    count = movements.groupby('County_fips').count()
    count = count[count.State == 1] # No more than one movement in the treatement group
    treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
    treatment_movements = treatment_movements[(treatment_movements.Opening_date>='2015-01-31' ) & (treatment_movements.Opening_date<='2017-01-31') & ((treatment_movements.Closing_date>'2017-01-31') | (treatment_movements.Closing_date.apply(str) == 'NaT'))]

    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    #print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


    # We create our dummies for the regression
    control['treat'] = False
    control['interaction'] = False
    control['time_effects'] = list(zip(control.purchase_year, control.purchase_month))

    treatment['treat'] = True
    treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015)
    treatment['opening_0'] = treatment.Opening_date.dt.month  + 12 * (treatment.Opening_date.dt.year - 2015)
    treatment['interaction'] = treatment.purchase_0 >= treatment.opening_0 + post
    treatment['time_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))


    # Final dataset for the regression :

    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'store_state', 'time_effects']]
    df = df[df.upc_price != 0]


    reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_effects) + C(store_state)', data=df)
    results1 = reg1.fit()
    print("=========================================================")
    print(category)
    if True:
    #if abs(results1.params[2] / results1.bse[2]) >= 2.:
        print(f"Coef : {np.exp(results1.params[2])-1}")
        print(f"Coef/err : {abs(results1.params[2] / results1.bse[2])}")
        print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][2])-1}")

_
Coef : 0.03341797594021911
Coef/err : 2.3002084834970784
CI_up : 0.004873365074778313
CI_down : 0.06277343007982616


## QUATRIEME MODELE - state*time

In [23]:
# We choose to focus on milk prices
for category in categories:
    #product_group = nielsen[nielsen.product_group_descr == category]
    #product_group = pool
    product_group = nielsen

    # The control group is composed by all states where nothing (no entry nor exit) happened.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()
    #print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    # The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
    count = movements.groupby('County_fips').count()
    count = count[count.State == 1] # No more than one movement in the treatement group
    treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
    treatment_movements = treatment_movements[(treatment_movements.Opening_date>='2015-01-31' ) & (treatment_movements.Opening_date<='2017-01-31') & ((treatment_movements.Closing_date>'2017-01-31') | (treatment_movements.Closing_date.apply(str) == 'NaT'))]

    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    #print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


    # We create our dummies for the regression
    control['treat'] = False
    control['interaction'] = False
    control['time_effects'] = list(zip(control.purchase_year, control.purchase_month))

    treatment['treat'] = True
    treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015)
    treatment['opening_0'] = treatment.Opening_date.dt.month  + 12 * (treatment.Opening_date.dt.year - 2015)
    treatment['interaction'] = treatment.purchase_0 >= treatment.opening_0 + post
    treatment['time_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))


    # Final dataset for the regression :

    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'store_state', 'time_effects']]
    df = df[df.upc_price != 0]


    reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_effects) * C(store_state)', data=df)
    results1 = reg1.fit()
    print("=========================================================")
    print(category)
    if abs(results1.params[2] / results1.bse[2]) >= 2.:
        print(f"Coef : {np.exp(results1.params[2])-1}")
    print(f"Coef/err : {abs(results1.params[2] / results1.bse[2])}")
    if abs(results1.params[2] / results1.bse[2]) >= 2.:
        print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][2])-1}")

: 

: 

In [35]:
# We choose to focus on milk prices
for category in ['CANDY', 'MILK', 'COFFEE', 'PAPER PRODUCTS', 'YOGURT']:
    product_group = nielsen[nielsen.product_group_descr == category]
    #product_group = pool
    #product_group = nielsen

    # The control group is composed by all states where nothing (no entry nor exit) happened.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()
    print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")

    # We keep in the control group the only counties where we have data for the entire time period (24 months)
    nb_months = control.groupby('guessed_store_county_fips').count()
    control = control[np.isin(control.guessed_store_county_fips, nb_months[nb_months.is_walmart==24].index)]
    print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")

    nb_obs = control.groupby('guessed_store_county_fips').min()
    control = control[np.isin(control.guessed_store_county_fips, nb_obs[nb_obs.nb_of_obs > 3].index)]
    print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")

    # The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
    count = movements.groupby('County_fips').count()
    count = count[count.State == 1] # No more than one movement in the treatement group
    treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
    treatment_movements = treatment_movements[(treatment_movements.Opening_date>='2015-01-31' ) & (treatment_movements.Opening_date<='2017-01-31') & ((treatment_movements.Closing_date>'2017-01-31') | (treatment_movements.Closing_date.apply(str) == 'NaT'))]

    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")



    # We create our dummies for the regression
    control['treat'] = False
    control['interaction'] = False
    control['time_effects'] = list(zip(control.purchase_year, control.purchase_month))

    treatment['treat'] = True
    treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015)
    treatment['opening_0'] = treatment.Opening_date.dt.month  + 12 * (treatment.Opening_date.dt.year - 2015)
    treatment['interaction'] = treatment.purchase_0 >= treatment.opening_0 + post
    treatment['time_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))
    treatment = treatment[abs(treatment.purchase_0-treatment.opening_0) >= 2]


    # Final dataset for the regression :

    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'store_state', 'time_effects']]
    df = df[df.upc_price != 0]


    reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_effects) * C(store_state)', data=df)
    results1 = reg1.fit()
    print("=========================================================")
    print(category)
    if abs(results1.params[2] / results1.bse[2]) >= 2.:
        print(f"Coef : {np.exp(results1.params[2])-1}")
    print(f"Coef/err : {abs(results1.params[2] / results1.bse[2])}")
    if abs(results1.params[2] / results1.bse[2]) >= 2.:
        print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][2])-1}")

Size of the control group: 1751.
Size of the control group: 701.
Size of the control group: 460.
Size of the treatment group: 65.
CANDY
Coef : 0.058063436001930535
Coef/err : 3.8703008634540486
CI_up : 0.028246807350231995
CI_down : 0.08874467355666482
Size of the control group: 1753.
Size of the control group: 974.
Size of the control group: 660.
Size of the treatment group: 65.
MILK
Coef : -0.04109512362255707
Coef/err : 5.162073822958096
CI_up : -0.05625330864197686
CI_down : -0.025693472241679283
Size of the control group: 1610.
Size of the control group: 450.
Size of the control group: 246.
Size of the treatment group: 65.
COFFEE
Coef : 0.03699238687960982
Coef/err : 2.9354635273784924
CI_up : 0.012139500147311244
CI_down : 0.0624555313667321
Size of the control group: 1699.
Size of the control group: 648.
Size of the control group: 393.
Size of the treatment group: 65.
PAPER PRODUCTS
Coef : 0.10578211515236746
Coef/err : 6.0283033645176305
CI_up : 0.07021153549074
CI_down : 0.142