In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import datetime

In [41]:
nielsen15 = pd.read_csv('../../Nielsen/aggregated_nielsen_2015.csv')
nielsen16 = pd.read_csv('../..//Nielsen/aggregated_nielsen_2016.csv')
nielsen15['year'] = 2015
nielsen16['year'] = 2016

In [42]:
nielsen = pd.concat((nielsen15, nielsen16))
nielsen = nielsen[~nielsen.is_walmart]

In [43]:
## Entriy/exit dates
fandom = pd.read_csv('../data_collection/plein_de_data/fandom_traitÃ©es.csv', parse_dates=['Opening_date', 'Closing_date'])[['State', 'County_name', 'County_fips', 'Opening_date', 'Closing_date']]

# We drop the state in which we do not trust our data (some mistakes stillremain)
fandom = fandom[~np.isin(fandom.State, ('CA', 'GA', 'KS', 'LA', 'TX'))]
nielsen = nielsen[~np.isin(nielsen.store_state, ('CA', 'GA', 'KS', 'LA', 'TX'))]

# We concentrate our study on the movements (entries & exits) during the fiscal years 2015 and 2016
#movements = fandom[((fandom.Opening_date >= '2015-01-31') & (fandom.Opening_date <= '2017-01-31')) | ((fandom.Closing_date >= '2015-01-31') & (fandom.Closing_date <= '2017-01-31'))]
# Correction des effets de bord M12
movements = fandom[((fandom.Opening_date >= '2014-01-31') & (fandom.Opening_date <= '2018-01-31')) | ((fandom.Closing_date >= '2014-01-31') & (fandom.Closing_date <= '2018-01-31'))]

In [18]:
categories=["MILK"]
nielsen['upc_price'] = nielsen.price_per_gal

In [44]:
hey = nielsen.groupby('product_group_descr').count()
categories = hey[hey.is_walmart>=25000].index

In [None]:
categories = [
    "FRESH PRODUCE",
    "BREAD AND BAKED GOODS",
    "MILK",
    "SNACKS",
    "PACKAGED MEATS-DELI",
    "CHEESE",
    "UNPREP MEAT/POULTRY/SEAFOOD-FRZN",
    "CARBONATED BEVERAGES",
    "CONDIMENTS, GRAVIES, AND SAUCES",
    "CANDY",
    "JUICE, DRINKS - CANNED, BOTTLED",
    "EGGS",
    "CEREAL",
    "PASTA",
]

In [None]:
categories =[
    "COT CHEESE, SOUR CREAM, TOPPINGS",
    "PACKAGED MILK AND MODIFIERS",
    "SPICES, SEASONING, EXTRACTS",
    "SUGAR, SWEETENERS"
]

In [None]:
categories=[
    "BAKING MIXES",

    "MILK",
    "CHEESE",
    "COT CHEESE, SOUR CREAM, TOPPINGS",
    "CRACKERS"
    "CANDY",
    "PASTA",
    
]

## PREMIER MODELE

$$Prices_{i, t} = \alpha + \beta treat_i + \gamma treat_i* post_t$$

In [45]:
# We choose to focus on milk prices
for category in categories:
    product_group = nielsen[nielsen.product_group_descr == category]


    # The control group is composed by all states where nothing (no entry nor exit) happened.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()
    print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    # The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
    count = movements.groupby('County_fips').count()
    count = count[count.State == 1] # No more than one movement in the treatement group
    treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
    treatment_movements = treatment_movements[(treatment_movements.Opening_date>='2015-01-31' ) & (treatment_movements.Opening_date<='2017-01-31') & ((treatment_movements.Closing_date>'2017-01-31') | (treatment_movements.Closing_date.apply(str) == 'NaT'))]

    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


    # We create our dummies for the regression
    control['treat'] = False
    control['interaction'] = False

    treatment['treat'] = True
    #treatment['interaction'] = ((treatment.Opening_date.dt.year == treatment.purchase_year) & (treatment.Opening_date.dt.month > treatment.purchase_month)) | (treatment.purchase_year > treatment.Opening_date.dt.year)
    treatment['purchase_0'] = treatment.purchase_month * (treatment.purchase_year == 2015) + (treatment.purchase_month +12) * (treatment.purchase_year == 2016)
    treatment['opening_0'] = treatment.Opening_date.dt.month * (treatment.Opening_date.dt.year == 2015) + (treatment.Opening_date.dt.month +12) * (treatment.Opening_date.dt.year == 2016)
    treatment['interaction'] = treatment.purchase_0 > treatment.opening_0


    # Final dataset for the regression :

    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction']]
    df = df[df.upc_price != 0]
    reg0 = smf.ols(formula='np.log(upc_price) ~ treat + interaction', data=df)
    results0 = reg0.fit()
    #if abs(results0.params[2] / results0.bse[2]) > 2. :
    if True:
        print("=========================================================")
        print(category)
        print(f"Coef : {np.exp(results0.params[2])-1}")
        print(f"Coef/err : {abs(results0.params[2] / results0.bse[2])}")
        print(f"CI_up : {np.exp(results0.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results0.conf_int(alpha=0.05)[1][2])-1}")

Size of the control group: 1638.
Size of the treatment group: 63.
BAKED GOODS-FROZEN
Coef : -0.03124946653445093
Coef/err : 1.434725418787514
CI_up : -0.07236907689261429
CI_down : 0.011692875595463947
Size of the control group: 1666.
Size of the treatment group: 65.
BAKING MIXES
Coef : -0.024175540268677187
Coef/err : 1.0790288358420572
CI_up : -0.06660521507406114
CI_down : 0.020182876086546786
Size of the control group: 1684.
Size of the treatment group: 65.
BAKING SUPPLIES
Coef : 0.03158988914590588
Coef/err : 1.7232139234289958
CI_up : -0.00426544437115417
CI_down : 0.06873633477146157
Size of the control group: 1767.
Size of the treatment group: 65.
BREAD AND BAKED GOODS
Coef : 0.004362866466676607
Coef/err : 0.2790377182764925
CI_up : -0.025884872282050053
CI_down : 0.03554984296397912


KeyboardInterrupt: 

## DEUXIEME MODELE - monthly time fixed effects

In [46]:
# We choose to focus on milk prices
for category in categories:
    product_group = nielsen[nielsen.product_group_descr == category]


    # The control group is composed by all states where nothing (no entry nor exit) happened.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()
    print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    # The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
    count = movements.groupby('County_fips').count()
    count = count[count.State == 1] # No more than one movement in the treatement group
    treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
    treatment_movements = treatment_movements[(treatment_movements.Opening_date>='2015-01-31' ) & (treatment_movements.Opening_date<='2017-01-31') & ((treatment_movements.Closing_date>'2017-01-31') | (treatment_movements.Closing_date.apply(str) == 'NaT'))]

    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


    # We create our dummies for the regression
    control['treat'] = False
    control['interaction'] = False
    control['time_effects'] = list(zip(control.purchase_year, control.purchase_month))

    treatment['treat'] = True
    #treatment['interaction'] = ((treatment.Opening_date.dt.year == treatment.purchase_year) & (treatment.Opening_date.dt.month > treatment.purchase_month)) | (treatment.Opening_date.dt.year > treatment.purchase_year)
    treatment['purchase_0'] = treatment.purchase_month * (treatment.purchase_year == 2015) + (treatment.purchase_month +12) * (treatment.purchase_year == 2016)
    treatment['opening_0'] = treatment.Opening_date.dt.month * (treatment.Opening_date.dt.year == 2015) + (treatment.Opening_date.dt.month +12) * (treatment.Opening_date.dt.year == 2016)
    treatment['interaction'] = treatment.purchase_0 > treatment.opening_0
    treatment['time_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))


    # Final dataset for the regression :

    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'store_state', 'time_effects']]
    df = df[df.upc_price != 0]


    reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_effects)', data=df)
    results1 = reg1.fit()
    #if True :
    if  abs(results1.params[2] / results1.bse[2])> 2.:
        print("=========================================================")
        print(category)
        print(f"Coef : {np.exp(results1.params[2])-1}")
        print(f"Coef/err : {abs(results1.params[2] / results1.bse[2])}")
        print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][2])-1}")

Size of the control group: 1638.
Size of the treatment group: 63.
BAKED GOODS-FROZEN
Coef : -0.048935119106123026
Coef/err : 2.2630839857961944
CI_up : -0.0893786714755106
CI_down : -0.0066953415913111325
Size of the control group: 1666.
Size of the treatment group: 65.
Size of the control group: 1684.
Size of the treatment group: 65.
Size of the control group: 1767.
Size of the treatment group: 65.
Size of the control group: 1596.
Size of the treatment group: 65.
Size of the control group: 1695.
Size of the treatment group: 65.
Size of the control group: 1751.
Size of the treatment group: 65.
Size of the control group: 1745.
Size of the treatment group: 65.
Size of the control group: 1708.
Size of the treatment group: 65.
Size of the control group: 1747.
Size of the treatment group: 65.
Size of the control group: 1610.
Size of the treatment group: 65.
Size of the control group: 1751.
Size of the treatment group: 65.
Size of the control group: 1692.
Size of the treatment group: 65.
Siz

## TROISIEME MODELE - state effects

In [47]:
# We choose to focus on milk prices
for category in categories:
    product_group = nielsen[nielsen.product_group_descr == category]


    # The control group is composed by all states where nothing (no entry nor exit) happened.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()
    #print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    # The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
    count = movements.groupby('County_fips').count()
    count = count[count.State == 1] # No more than one movement in the treatement group
    treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
    treatment_movements = treatment_movements[(treatment_movements.Opening_date>='2015-01-31' ) & (treatment_movements.Opening_date<='2017-01-31') & ((treatment_movements.Closing_date>'2017-01-31') | (treatment_movements.Closing_date.apply(str) == 'NaT'))]

    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    #print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


    # We create our dummies for the regression
    control['treat'] = False
    control['interaction'] = False
    control['time_effects'] = list(zip(control.purchase_year, control.purchase_month))

    treatment['treat'] = True
    treatment['purchase_0'] = treatment.purchase_month * (treatment.purchase_year == 2015) + (treatment.purchase_month +12) * (treatment.purchase_year == 2016)
    treatment['opening_0'] = treatment.Opening_date.dt.month * (treatment.Opening_date.dt.year == 2015) + (treatment.Opening_date.dt.month +12) * (treatment.Opening_date.dt.year == 2016)
    treatment['interaction'] = treatment.purchase_0 > treatment.opening_0
    treatment['time_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))


    # Final dataset for the regression :

    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'store_state', 'time_effects']]
    df = df[df.upc_price != 0]


    reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_effects) + C(store_state)', data=df)
    results1 = reg1.fit()
    print("=========================================================")
    print(category)
    if abs(results1.params[2] / results1.bse[2]) >= 2.:
        print(f"Coef : {np.exp(results1.params[2])-1}")
        print(f"Coef/err : {abs(results1.params[2] / results1.bse[2])}")
        print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][2])-1}")

BAKED GOODS-FROZEN
Coef : -0.061148901851209136
Coef/err : 2.8828989401717626
CI_up : -0.1005742020155922
CI_down : -0.019995438789414344
BAKING MIXES
BAKING SUPPLIES
BREAD AND BAKED GOODS
BREAKFAST FOOD
BUTTER AND MARGARINE
CANDY
Coef : 0.052723401713312246
Coef/err : 2.3178972395696547
CI_up : 0.007964144679744933
CI_down : 0.09947022060686339
CARBONATED BEVERAGES
CEREAL
CHEESE
COFFEE
Coef : 0.04351767543819429
Coef/err : 2.251662512041566
CI_up : 0.005531860421122081
CI_down : 0.08293847446652092
CONDIMENTS, GRAVIES, AND SAUCES
COOKIES
COT CHEESE, SOUR CREAM, TOPPINGS
CRACKERS
DESSERTS, GELATINS, SYRUP
DETERGENTS
DOUGH PRODUCTS
DRESSINGS/SALADS/PREP FOODS-DELI
Coef : 0.05312863593657591
Coef/err : 2.5480665095920396
CI_up : 0.01201779461585728
CI_down : 0.09590950843964041
EGGS
Coef : 0.05567401949545214
Coef/err : 3.4424056029225154
CI_up : 0.023604998942878375
CI_down : 0.08874774604327151
FRESH PRODUCE
FRUIT - CANNED
ICE CREAM, NOVELTIES
JAMS, JELLIES, SPREADS
JUICE, DRINKS - CAN

## QUATRIEME MODELE - state*time

In [23]:
# We choose to focus on milk prices
for category in categories:
    product_group = nielsen[nielsen.product_group_descr == category]


    # The control group is composed by all states where nothing (no entry nor exit) happened.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()
    #print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    # The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
    count = movements.groupby('County_fips').count()
    count = count[count.State == 1] # No more than one movement in the treatement group
    treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
    treatment_movements = treatment_movements[(treatment_movements.Opening_date>='2015-01-31' ) & (treatment_movements.Opening_date<='2017-01-31') & ((treatment_movements.Closing_date>'2017-01-31') | (treatment_movements.Closing_date.apply(str) == 'NaT'))]

    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    #print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


    # We create our dummies for the regression
    control['treat'] = False
    control['interaction'] = False
    control['time_effects'] = list(zip(control.purchase_year, control.purchase_month))

    treatment['treat'] = True
    treatment['interaction'] = ((treatment.Opening_date.dt.year == treatment.purchase_year) & (treatment.Opening_date.dt.month > treatment.purchase_month)) | (treatment.Opening_date.dt.year > treatment.purchase_year)
    treatment['time_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))


    # Final dataset for the regression :

    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'store_state', 'time_effects']]
    df = df[df.upc_price != 0]


    reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_effects) * C(store_state)', data=df)
    results1 = reg1.fit()
    print("=========================================================")
    print(category)
    if abs(results1.params[2] / results1.bse[2]) >= 2.:
        print(f"Coef : {np.exp(results1.params[2])-1}")
    print(f"Coef/err : {abs(results1.params[2] / results1.bse[2])}")
    if abs(results1.params[2] / results1.bse[2]) >= 2.:
        print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][2])-1}")

MILK
Coef/err : 0.27477684387827905
