Regression code
===

In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

In [2]:
# Read the two yearly Nielsen extracts and stack them into a single frame.
# TODO change path
nielsen15, nielsen16 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../Nielsen/aggregated_nielsen_2015.csv',
        '../..//Nielsen/aggregated_nielsen_2016.csv',
    )
)
nielsen = pd.concat([nielsen15, nielsen16])

In [3]:
# Keep only Walmart's competitors, i.e. the rows not flagged as Walmart
not_walmart = ~nielsen['is_walmart']
nielsen = nielsen[not_walmart]

In [4]:
# Load the entries/exits data, keeping only the columns used below
# TODO change path
movement_columns = ['State', 'County_name', 'County_fips', 'Opening_date', 'Closing_date']
fandom = pd.read_csv(
    '../data_collection/plein_de_data/fandom_traitées.csv',
    parse_dates=['Opening_date', 'Closing_date'],
)[movement_columns]

# Drop the states whose data we do not trust (some mistakes still remain)
untrusted_states = ('CA', 'GA', 'KS', 'LA', 'TX')
fandom = fandom[~fandom.State.isin(untrusted_states)]
nielsen = nielsen[~nielsen.store_state.isin(untrusted_states)]

# Keep the movements (entries & exits) whose opening or closing date falls
# between 2015-01-31 and 2017-01-31, bounds included.
# NOTE(review): the original note describes this as the 2014-2017 fiscal-year
# window, widened "to eliminate side effects" - confirm the intended bounds.
window_start, window_end = '2015-01-31', '2017-01-31'
opened_in_window = (fandom.Opening_date >= window_start) & (fandom.Opening_date <= window_end)
closed_in_window = (fandom.Closing_date >= window_start) & (fandom.Closing_date <= window_end)
movements = fandom[opened_in_window | closed_in_window]

In [12]:
# Half-width, in months, of the window excluded around each opening/closing month:
# the regression cells keep a treated observation only when
# abs(purchase month - event month) > dropping_period.
# Candidate values listed by the original author: 0, 1, 3, 5, 7, 9, 11.
dropping_period = 5

## 1. Entries

### A) Pooled values

In [5]:
# Average price per retailer flag x state x county x month, pooled over every
# product category, to get a first overview of the effect.
# `upc_price` is selected *before* aggregating: calling .mean() on the whole
# grouped frame also tries to average the text columns, which raises a
# TypeError on pandas >= 2.0 (numeric_only now defaults to False).
pool_keys = ['is_walmart', 'store_state', 'guessed_store_county', 'guessed_store_county_fips', 'purchase_year', 'purchase_month']
pool = nielsen.groupby(pool_keys)['upc_price'].mean().reset_index()

In [8]:
print("=========================================================")
print("ENTRY - Pooled results")
print('---------------------------------------------------------')

# Pooled ENTRY regression: OLS of log(average price) on a treated-county dummy,
# a post-entry dummy and state x month effects.
product_group = pool


# The control group is composed of all counties where nothing (no entry nor exit) happened.
# Only the County_fips column is used for the lookup: passing the whole
# `movements` frame to np.isin tests the fips against every cell of every
# column (state codes, county names, dates), which is slow and fragile.
control = product_group[~product_group.guessed_store_county_fips.isin(movements.County_fips)].copy()

# We keep in the control group only the counties where we have data for the entire time period (24 months)
nb_months = control.groupby('guessed_store_county_fips')['is_walmart'].count()
control = control[control.guessed_store_county_fips.isin(nb_months[nb_months == 24].index)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")



# The treatment group is composed of the counties whose only movement in the
# window is an entry (opened between 2015-01-31 and 2017-01-31, and either
# still open or closed after 2017-01-31).
count = movements.groupby('County_fips')['State'].count()
single_movement = movements[movements.County_fips.isin(count[count == 1].index)]  # No more than one movement in the treatment group
opened_in_window = (single_movement.Opening_date >= '2015-01-31') & (single_movement.Opening_date <= '2017-01-31')
still_open = (single_movement.Closing_date > '2017-01-31') | single_movement.Closing_date.isna()
treatment_movements = single_movement[opened_in_window & still_open]

treatment = product_group[product_group.guessed_store_county_fips.isin(treatment_movements.County_fips)].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")



# Creating the dummies
control['treat'] = False
control['interaction'] = False
control['purchase_0'] = control.purchase_month + 12 * (control.purchase_year - 2015) # One unique time variable (months since the start of 2015)
control['effects'] = list(zip(control.store_state, control.purchase_0)) # Controlling for time * state

treatment['treat'] = True
treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015) # One unique time variable (months since the start of 2015)
treatment['opening_0'] = treatment.Opening_date.dt.month  + 12 * (treatment.Opening_date.dt.year - 2015)
treatment['interaction'] = treatment.purchase_0 >= treatment.opening_0 # The post dummy equals one in the treatment group, from the opening month onwards
# .copy() so that the column assignment below writes to an independent frame
treatment = treatment[abs(treatment.purchase_0 - treatment.opening_0) > dropping_period].copy()
treatment['effects'] = list(zip(treatment.store_state, treatment.purchase_0)) # Controlling for time * state



# Final dataset for the regression :
df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'effects']]
df = df[df.upc_price != 0] # Dropping the errors in the set (log(0) is undefined)


reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(effects)', data=df)
results1 = reg1.fit()

# Look the post dummy up by its label instead of by position: positional
# access would silently return another coefficient whenever a dummy is
# dropped from the design matrix (e.g. an empty treatment group).
post_term = 'interaction[T.True]'
coef = results1.params[post_term]
print(f"Coef : {np.exp(coef)-1}")
print(f"Coef/err : {abs(coef / results1.bse[post_term])}")

Pooled results
---------------------------------------------------------
Size of the control group: 1457.
Size of the treatment group: 91.
Coef : 0.03628233615121923
Coef/err : 2.2418082026301285


### B) Other categories

In [9]:
# Restrict the study to the product groups with at least 25000 rows
categories_count = nielsen.groupby('product_group_descr').count()
has_enough_rows = categories_count['is_walmart'] >= 25000
categories = categories_count.loc[has_enough_rows].index

# Alternative, to run on every product group:
# categories = nielsen.product_group_descr.unique()

In [13]:
# The treatment definition does not depend on the product category, so it is
# computed once, before the loop.
# The treatment group is composed of the counties whose only movement in the
# window is an entry (opened between 2015-01-31 and 2017-01-31, and either
# still open or closed after 2017-01-31).
count = movements.groupby('County_fips')['State'].count()
single_movement = movements[movements.County_fips.isin(count[count == 1].index)]  # No more than one movement in the treatment group
opened_in_window = (single_movement.Opening_date >= '2015-01-31') & (single_movement.Opening_date <= '2017-01-31')
still_open = (single_movement.Closing_date > '2017-01-31') | single_movement.Closing_date.isna()
treatment_movements = single_movement[opened_in_window & still_open]

for category in categories:
    print("=========================================================")
    print(f"ENTRY - {category}")
    print('---------------------------------------------------------')

    product_group = nielsen[nielsen.product_group_descr == category]


    # The control group is composed of all counties where nothing (no entry nor exit) happened.
    # Only the County_fips column is used for the lookup: passing the whole
    # `movements` frame to np.isin tests the fips against every cell of every
    # column, which is slow and fragile.
    control = product_group[~product_group.guessed_store_county_fips.isin(movements.County_fips)].copy()

    # Keeping in the control group only the counties where we have data for the entire time period (24 months)
    nb_months = control.groupby('guessed_store_county_fips')['is_walmart'].count()
    control = control[control.guessed_store_county_fips.isin(nb_months[nb_months == 24].index)]

    # Keeping in the control group only the counties whose smallest nb_of_obs is at least 4
    nb_obs = control.groupby('guessed_store_county_fips')['nb_of_obs'].min()
    control = control[control.guessed_store_county_fips.isin(nb_obs[nb_obs > 3].index)].copy()
    print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    treatment = product_group[product_group.guessed_store_county_fips.isin(treatment_movements.County_fips)].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")



    # Creating the dummies
    control['treat'] = False
    control['interaction'] = False
    control['purchase_0'] = control.purchase_month + 12 * (control.purchase_year - 2015) # One unique time variable (months since the start of 2015)
    control['effects'] = list(zip(control.store_state, control.purchase_0)) # Controlling for time * state

    treatment['treat'] = True
    treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015) # One unique time variable (months since the start of 2015)
    treatment['opening_0'] = treatment.Opening_date.dt.month  + 12 * (treatment.Opening_date.dt.year - 2015)
    treatment['interaction'] = treatment.purchase_0 >= treatment.opening_0 # The post dummy equals one in the treatment group, from the opening month onwards
    # .copy() so that the column assignment below writes to an independent frame
    treatment = treatment[abs(treatment.purchase_0 - treatment.opening_0) > dropping_period].copy()
    treatment['effects'] = list(zip(treatment.store_state, treatment.purchase_0)) # Controlling for time * state



    # Final dataset for the regression :
    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'effects']]
    df = df[df.upc_price != 0] # Dropping the errors in the set (log(0) is undefined)

    try:
        reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(effects)', data=df)
        results1 = reg1.fit()

        # Look the post dummy up by its label instead of by position: positional
        # access would silently return another coefficient whenever a dummy is
        # dropped from the design matrix (e.g. an empty treatment group).
        post_term = 'interaction[T.True]'
        coef = results1.params[post_term]
        t_ratio = abs(coef / results1.bse[post_term])

        # The point estimate is only reported when it is significant (|t| > 2)
        if t_ratio > 2.:
            print(f"Coef : {np.exp(coef)-1}")
        print(f"Coef/err : {t_ratio}")

    except Exception as err:
        # `Exception` rather than a bare except, so that interrupting the kernel
        # still stops the loop; the actual error is reported for diagnosis.
        print(f'Method does not converge ({type(err).__name__}: {err})')

ENTRY - BAKED GOODS-FROZEN
---------------------------------------------------------
Size of the control group: 118.
Size of the treatment group: 89.
Coef/err : 0.43005065876054466
ENTRY - BAKING MIXES
---------------------------------------------------------
Size of the control group: 209.
Size of the treatment group: 91.
Coef/err : 1.1525473486290227
ENTRY - BAKING SUPPLIES
---------------------------------------------------------
Size of the control group: 256.
Size of the treatment group: 91.
Coef/err : 1.4663504380069932
ENTRY - BREAD AND BAKED GOODS
---------------------------------------------------------
Size of the control group: 833.
Size of the treatment group: 91.
Coef/err : 0.5137064845539167
ENTRY - BREAKFAST FOOD
---------------------------------------------------------
Size of the control group: 269.
Size of the treatment group: 91.
Coef/err : 0.08379756453787691
ENTRY - BUTTER AND MARGARINE
---------------------------------------------------------
Size of the control g

KeyboardInterrupt: 

## 2. Exits

### A) Pooled values

In [None]:
# Average price per retailer flag x state x county x month, pooled over every
# product category, to get a first overview of the effect.
# `upc_price` is selected *before* aggregating: calling .mean() on the whole
# grouped frame also tries to average the text columns, which raises a
# TypeError on pandas >= 2.0 (numeric_only now defaults to False).
pool_keys = ['is_walmart', 'store_state', 'guessed_store_county', 'guessed_store_county_fips', 'purchase_year', 'purchase_month']
pool = nielsen.groupby(pool_keys)['upc_price'].mean().reset_index()

In [14]:
print("=========================================================")
print("EXIT - Pooled results")
print('---------------------------------------------------------')

# Pooled EXIT regression: OLS of log(average price) on a treated-county dummy,
# a post-exit dummy and state x month effects.
product_group = pool


# The control group is composed of all counties where nothing (no entry nor exit) happened.
# Only the County_fips column is used for the lookup: passing the whole
# `movements` frame to np.isin tests the fips against every cell of every
# column (state codes, county names, dates), which is slow and fragile.
control = product_group[~product_group.guessed_store_county_fips.isin(movements.County_fips)].copy()

# We keep in the control group only the counties where we have data for the entire time period (24 months)
nb_months = control.groupby('guessed_store_county_fips')['is_walmart'].count()
control = control[control.guessed_store_county_fips.isin(nb_months[nb_months == 24].index)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")



# The treatment group is composed of the counties whose only movement in the
# window is an exit (closed between 2015-01-31 and 2017-01-31, having opened
# before 2015-01-31).
count = movements.groupby('County_fips')['State'].count()
single_movement = movements[movements.County_fips.isin(count[count == 1].index)]  # No more than one movement in the treatment group
closed_in_window = (single_movement.Closing_date >= '2015-01-31') & (single_movement.Closing_date <= '2017-01-31')
opened_before = single_movement.Opening_date < '2015-01-31'
treatment_movements = single_movement[closed_in_window & opened_before]  # Treatment group composed of the exits

treatment = product_group[product_group.guessed_store_county_fips.isin(treatment_movements.County_fips)].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")



# Creating the dummies
control['treat'] = False
control['interaction'] = False
control['purchase_0'] = control.purchase_month + 12 * (control.purchase_year - 2015) # One unique time variable (months since the start of 2015)
control['effects'] = list(zip(control.store_state, control.purchase_0)) # Controlling for time * state

treatment['treat'] = True
treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015) # One unique time variable (months since the start of 2015)
treatment['closing_0'] = treatment.Closing_date.dt.month  + 12 * (treatment.Closing_date.dt.year - 2015)
treatment['interaction'] = treatment.purchase_0 >= treatment.closing_0 # The post dummy equals one in the treatment group, from the closing month onwards
# .copy() so that the column assignment below writes to an independent frame
treatment = treatment[abs(treatment.purchase_0 - treatment.closing_0) > dropping_period].copy()
treatment['effects'] = list(zip(treatment.store_state, treatment.purchase_0)) # Controlling for time * state



# Final dataset for the regression :
df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'effects']]
df = df[df.upc_price != 0] # Dropping the errors in the set (log(0) is undefined)


reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(effects)', data=df)
results1 = reg1.fit()

# Look the post dummy up by its label instead of by position: positional
# access would silently return another coefficient whenever a dummy is
# dropped from the design matrix (e.g. an empty treatment group).
post_term = 'interaction[T.True]'
coef = results1.params[post_term]
print(f"Coef : {np.exp(coef)-1}")
print(f"Coef/err : {abs(coef / results1.bse[post_term])}")

ENTRY - Pooled results
---------------------------------------------------------
Size of the control group: 1457.
Size of the treatment group: 26.
Coef : 0.036159872219974076
Coef/err : 1.271974218395699


### B) Other categories

In [15]:
# Restrict the study to the product groups with at least 25000 rows
categories_count = nielsen.groupby('product_group_descr').count()
has_enough_rows = categories_count['is_walmart'] >= 25000
categories = categories_count.loc[has_enough_rows].index

# Alternative, to run on every product group:
# categories = nielsen.product_group_descr.unique()

In [16]:
# The treatment definition does not depend on the product category, so it is
# computed once, before the loop.
# The treatment group is composed of the counties whose only movement in the
# window is an exit (closed between 2015-01-31 and 2017-01-31, having opened
# before 2015-01-31).
count = movements.groupby('County_fips')['State'].count()
single_movement = movements[movements.County_fips.isin(count[count == 1].index)]  # No more than one movement in the treatment group
closed_in_window = (single_movement.Closing_date >= '2015-01-31') & (single_movement.Closing_date <= '2017-01-31')
opened_before = single_movement.Opening_date < '2015-01-31'
treatment_movements = single_movement[closed_in_window & opened_before]  # Treatment group composed of the exits

for category in categories:
    print("=========================================================")
    print(f"EXIT - {category}")
    print('---------------------------------------------------------')

    product_group = nielsen[nielsen.product_group_descr == category]


    # The control group is composed of all counties where nothing (no entry nor exit) happened.
    # Only the County_fips column is used for the lookup: passing the whole
    # `movements` frame to np.isin tests the fips against every cell of every
    # column, which is slow and fragile.
    control = product_group[~product_group.guessed_store_county_fips.isin(movements.County_fips)].copy()

    # Keeping in the control group only the counties where we have data for the entire time period (24 months)
    nb_months = control.groupby('guessed_store_county_fips')['is_walmart'].count()
    control = control[control.guessed_store_county_fips.isin(nb_months[nb_months == 24].index)]

    # Keeping in the control group only the counties whose smallest nb_of_obs is at least 4
    nb_obs = control.groupby('guessed_store_county_fips')['nb_of_obs'].min()
    control = control[control.guessed_store_county_fips.isin(nb_obs[nb_obs > 3].index)].copy()
    print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    treatment = product_group[product_group.guessed_store_county_fips.isin(treatment_movements.County_fips)].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")



    # Creating the dummies
    control['treat'] = False
    control['interaction'] = False
    control['purchase_0'] = control.purchase_month + 12 * (control.purchase_year - 2015) # One unique time variable (months since the start of 2015)
    control['effects'] = list(zip(control.store_state, control.purchase_0)) # Controlling for time * state

    treatment['treat'] = True
    treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015) # One unique time variable (months since the start of 2015)
    treatment['closing_0'] = treatment.Closing_date.dt.month  + 12 * (treatment.Closing_date.dt.year - 2015)
    treatment['interaction'] = treatment.purchase_0 >= treatment.closing_0 # The post dummy equals one in the treatment group, from the closing month onwards
    # .copy() so that the column assignment below writes to an independent frame
    treatment = treatment[abs(treatment.purchase_0 - treatment.closing_0) > dropping_period].copy()
    treatment['effects'] = list(zip(treatment.store_state, treatment.purchase_0)) # Controlling for time * state



    # Final dataset for the regression :
    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'effects']]
    df = df[df.upc_price != 0] # Dropping the errors in the set (log(0) is undefined)

    try:
        reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(effects)', data=df)
        results1 = reg1.fit()

        # Look the post dummy up by its label instead of by position: positional
        # access would silently return another coefficient whenever a dummy is
        # dropped from the design matrix (e.g. an empty treatment group).
        post_term = 'interaction[T.True]'
        coef = results1.params[post_term]
        t_ratio = abs(coef / results1.bse[post_term])

        # The point estimate is only reported when it is significant (|t| > 2)
        if t_ratio > 2.:
            print(f"Coef : {np.exp(coef)-1}")
        print(f"Coef/err : {t_ratio}")

    except Exception as err:
        # `Exception` rather than a bare except, so that interrupting the kernel
        # still stops the loop; the actual error is reported for diagnosis.
        print(f'Method does not converge ({type(err).__name__}: {err})')

EXIT - BAKED GOODS-FROZEN
---------------------------------------------------------
Size of the control group: 118.
Size of the treatment group: 42.
Coef/err : 0.7022280341007358
EXIT - BAKING MIXES
---------------------------------------------------------
Size of the control group: 209.
Size of the treatment group: 42.
Coef/err : 0.3759251908962499
EXIT - BAKING SUPPLIES
---------------------------------------------------------
Size of the control group: 256.
Size of the treatment group: 43.
Coef/err : 0.031220037980654362
EXIT - BREAD AND BAKED GOODS
---------------------------------------------------------
Size of the control group: 833.
Size of the treatment group: 43.
Method does not converge
EXIT - BREAKFAST FOOD
---------------------------------------------------------
Size of the control group: 269.
Size of the treatment group: 38.
Coef/err : 0.7639617207316075
EXIT - BUTTER AND MARGARINE
---------------------------------------------------------
