In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import datetime

In [2]:
nielsen15 = pd.read_csv('../../Nielsen/aggregated_nielsen_2015.csv')
nielsen16 = pd.read_csv('../..//Nielsen/aggregated_nielsen_2016.csv')
nielsen15['year'] = 2015
nielsen16['year'] = 2016

In [3]:
nielsen = pd.concat((nielsen15, nielsen16))
nielsen = nielsen[~nielsen.is_walmart]

In [4]:
## Entriy/exit dates
fandom = pd.read_csv('../data_collection/plein_de_data/fandom_traitées.csv', parse_dates=['Opening_date', 'Closing_date'])[['State', 'County_name', 'County_fips', 'Opening_date', 'Closing_date']]

# We drop the state in which we do not trust our data (some mistakes stillremain)
fandom = fandom[~np.isin(fandom.State, ('CA', 'GA', 'KS', 'LA', 'TX'))]
nielsen = nielsen[~np.isin(nielsen.store_state, ('CA', 'GA', 'KS', 'LA', 'TX'))]

# We concentrate our study on the movements (entries & exits) during the fiscal years 2015 and 2016
movements = fandom[((fandom.Opening_date >= '2015-01-31') & (fandom.Opening_date <= '2017-01-31')) | ((fandom.Closing_date >= '2015-01-31') & (fandom.Closing_date <= '2017-01-31'))]
#movements['year'] = movements.Opening_date.dt.year
#movements['month'] = movements.Opening_date.dt.month

In [6]:
hey = nielsen.groupby('product_group_descr').count()
categories = hey[hey.is_walmart>=25000].index

In [None]:
categories = [
    "FRESH PRODUCE",
    "BREAD AND BAKED GOODS",
    "MILK",
    "SNACKS",
    "PACKAGED MEATS-DELI",
    "CHEESE",
    "UNPREP MEAT/POULTRY/SEAFOOD-FRZN",
    "CARBONATED BEVERAGES",
    "CONDIMENTS, GRAVIES, AND SAUCES",
    "CANDY",
    "JUICE, DRINKS - CANNED, BOTTLED",
    "EGGS",
    "CEREAL",
    "PASTA",
]

In [None]:
categories =[
    "COT CHEESE, SOUR CREAM, TOPPINGS",
    "PACKAGED MILK AND MODIFIERS",
    "SPICES, SEASONING, EXTRACTS",
    "SUGAR, SWEETENERS"
]

## PREMIER MODELE

$$Prices_{i, t} = \alpha + \beta treat_i + \gamma treat_i* post_t$$

In [12]:
# We choose to focus on milk prices
for category in categories:
    print("=========================================================")
    print(category)

    product_group = nielsen[nielsen.product_group_descr == category]


    # The control group is composed by all states where nothing (no entry nor exit) happened.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()
    print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    # The treatment group is composed by the states where one exit took place in 2016 and where this entry is the only movement
    count = movements.groupby('County_fips').count()
    count = count[count.State == 1] # No more than one movement in the treatement group
    treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
    treatment_movements = treatment_movements[(treatment_movements.Closing_date>='2015-01-31' ) & (treatment_movements.Closing_date<='2017-01-31') & (treatment_movements.Opening_date<'2015-01-31')]

    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


    # We create our dummies for the regression
    control['treat'] = False
    control['interaction'] = False

    treatment['treat'] = True
    treatment['interaction'] = (treatment.purchase_month > treatment.Closing_date.dt.month) & (treatment.purchase_year >= treatment.Closing_date.dt.year)


    # Final dataset for the regression :

    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction']]
    df = df[df.upc_price != 0]
    reg0 = smf.ols(formula='np.log(upc_price) ~ treat + interaction', data=df)
    results0 = reg0.fit()
    #if abs(results0.params[2] / results0.bse[2]) > 2. :
    if abs(results0.params[2] / results0.bse[2])>2.:
        print(f"Coef : {np.exp(results0.params[2])-1}")
        print(f"Coef/err : {abs(results0.params[2] / results0.bse[2])}")
        print(f"CI_up : {np.exp(results0.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results0.conf_int(alpha=0.05)[1][2])-1}")

BAKED GOODS-FROZEN
Size of the control group: 1706.
Size of the treatment group: 42.
BAKING MIXES
Size of the control group: 1735.
Size of the treatment group: 42.
BAKING SUPPLIES
Size of the control group: 1753.
Size of the treatment group: 43.
BREAD AND BAKED GOODS
Size of the control group: 1836.
Size of the treatment group: 43.
BREAKFAST FOOD
Size of the control group: 1665.
Size of the treatment group: 38.
BUTTER AND MARGARINE
Size of the control group: 1764.
Size of the treatment group: 42.
CANDY
Size of the control group: 1821.
Size of the treatment group: 42.
CARBONATED BEVERAGES
Size of the control group: 1815.
Size of the treatment group: 43.
CEREAL
Size of the control group: 1777.
Size of the treatment group: 41.
CHEESE
Size of the control group: 1816.
Size of the treatment group: 43.
COFFEE
Size of the control group: 1680.
Size of the treatment group: 40.
CONDIMENTS, GRAVIES, AND SAUCES
Size of the control group: 1821.
Size of the treatment group: 43.
COOKIES
Size of the co

## DEUXIEME MODELE - monthly time fixed effects

In [13]:
# We choose to focus on milk prices
for category in categories:
    product_group = nielsen[nielsen.product_group_descr == category]


    # The control group is composed by all states where nothing (no entry nor exit) happened.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()
    #print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    # The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
    count = movements.groupby('County_fips').count()
    count = count[count.State == 1] # No more than one movement in the treatement group
    treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
    treatment_movements = treatment_movements[(treatment_movements.Closing_date>='2015-01-31' ) & (treatment_movements.Closing_date<='2017-01-31') & (treatment_movements.Opening_date<'2015-01-31')]

    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    #print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


    # We create our dummies for the regression
    control['treat'] = False
    control['interaction'] = False
    control['time_effects'] = list(zip(control.purchase_year, control.purchase_month))

    treatment['treat'] = True
    treatment['interaction'] = (treatment.purchase_month > treatment.Closing_date.dt.month) & (treatment.purchase_year >= treatment.Closing_date.dt.year)
    treatment['time_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))


    # Final dataset for the regression :

    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'store_state', 'time_effects']]
    df = df[df.upc_price != 0]


    reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_effects)', data=df)
    results1 = reg1.fit()
    if abs(results1.params[2] / results1.bse[2])>2 :
        print("=========================================================")
        print(category)
        print(f"Coef : {np.exp(results1.params[2])-1}")
        print(f"Coef/err : {abs(results1.params[2] / results1.bse[2])}")
        print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][2])-1}")

EGGS
Coef : 0.08370880883884824
Coef/err : 3.41044528002763
CI_up : 0.03477939089443516
CI_down : 0.13495184837395557
NUTS
Coef : -0.084933692579375
Coef/err : 2.2640382177663776
CI_up : -0.15261533359637203
CI_down : -0.011846236809799349
PACKAGED MEATS-DELI
Coef : -0.052328250036368984
Coef/err : 2.3373390532642975
CI_up : -0.09409238106638107
CI_down : -0.008638710052687681
PIZZA/SNACKS/HORS DOEURVES-FRZN
Coef : -0.08124663130802412
Coef/err : 2.7031217066428668
CI_up : -0.13599898511988495
CI_down : -0.023024582210729894
SOUP
Coef : 0.06736942122474576
Coef/err : 2.1376839169120445
CI_up : 0.005432553829368247
CI_down : 0.13312173653668613


## TROISIEME MODELE - state effects

In [15]:
# We choose to focus on milk prices
for category in categories:
    product_group = nielsen[nielsen.product_group_descr == category]


    # The control group is composed by all states where nothing (no entry nor exit) happened.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()
    #print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    # The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
    count = movements.groupby('County_fips').count()
    count = count[count.State == 1] # No more than one movement in the treatement group
    treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
    treatment_movements = treatment_movements[(treatment_movements.Closing_date>='2015-01-31' ) & (treatment_movements.Closing_date<='2017-01-31') & (treatment_movements.Opening_date<'2015-01-31')]

    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    #print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


    # We create our dummies for the regression
    control['treat'] = False
    control['interaction'] = False
    control['time_effects'] = list(zip(control.purchase_year, control.purchase_month))

    treatment['treat'] = True
    treatment['interaction'] = (treatment.purchase_month > treatment.Closing_date.dt.month) & (treatment.purchase_year >= treatment.Closing_date.dt.year)
    treatment['time_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))


    # Final dataset for the regression :

    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'store_state', 'time_effects']]
    df = df[df.upc_price != 0]


    reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_effects) + C(store_state)', data=df)
    results1 = reg1.fit()
    if abs(results1.params[2] / results1.bse[2]) >= 2.:
        print("=========================================================")
        print(category)
        print(f"Coef : {np.exp(results1.params[2])-1}")
        print(f"Coef/err : {abs(results1.params[2] / results1.bse[2])}")
        print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][2])-1}")

EGGS
Coef : 0.06947366134762212
Coef/err : 3.0925562048098225
CI_up : 0.024901948466283352
CI_down : 0.11598374266718015
PACKAGED MEATS-DELI
Coef : -0.05590749016660068
Coef/err : 2.5966001077707728
CI_up : -0.09602915087169339
CI_down : -0.014005077726773618
PIZZA/SNACKS/HORS DOEURVES-FRZN
Coef : -0.0765301504521384
Coef/err : 2.6049658996542533
CI_up : -0.13022729572712854
CI_down : -0.019517905270565183
SOUP
Coef : 0.06987212542000099
Coef/err : 2.278690264523323
CI_up : 0.00948925523133215
CI_down : 0.13386681316227622


## QUATRIEME MODELE - state*time

In [17]:
# We choose to focus on milk prices
for category in categories:
    product_group = nielsen[nielsen.product_group_descr == category]


    # The control group is composed by all states where nothing (no entry nor exit) happened.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()
    #print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    # The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
    count = movements.groupby('County_fips').count()
    count = count[count.State == 1] # No more than one movement in the treatement group
    treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
    treatment_movements = treatment_movements[(treatment_movements.Closing_date>='2015-01-31' ) & (treatment_movements.Closing_date<='2017-01-31') & (treatment_movements.Opening_date<'2015-01-31')]

    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    #print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")


    # We create our dummies for the regression
    control['treat'] = False
    control['interaction'] = False
    control['time_effects'] = list(zip(control.purchase_year, control.purchase_month))

    treatment['treat'] = True
    treatment['interaction'] = (treatment.purchase_month > treatment.Closing_date.dt.month) & (treatment.purchase_year >= treatment.Closing_date.dt.year)
    treatment['time_effects'] = list(zip(treatment.purchase_year, treatment.purchase_month))


    # Final dataset for the regression :

    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'store_state', 'time_effects']]
    df = df[df.upc_price != 0]


    reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(time_effects) * C(store_state)', data=df)
    results1 = reg1.fit()
    if abs(results1.params[2] / results1.bse[2]) >= 2.:
        print("=========================================================")
        print(category)
        print(f"Coef : {np.exp(results1.params[2])-1}")
        print(f"Coef/err : {abs(results1.params[2] / results1.bse[2])}")
        print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][2])-1}")

BUTTER AND MARGARINE
Coef : -0.06191356908563139
Coef/err : 2.2376316457080394
CI_up : -0.11298884380694796
CI_down : -0.007897312540530055
NUTS
Coef : -0.10094124806600557
Coef/err : 2.6621235562446994
CI_up : -0.16868967937519508
CI_down : -0.02767159341700931


LinAlgError: SVD did not converge