Regression code
===

In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

In [2]:
# TODO change path
# Yearly Nielsen aggregates for 2015 and 2016, stacked into a single frame.
# Each file keeps its own row labels, so the index of `nielsen` is not unique after the concat.
nielsen15 = pd.read_csv('../../Nielsen/aggregated_nielsen_2015.csv')
nielsen16 = pd.read_csv('../../Nielsen/aggregated_nielsen_2016.csv')  # removed the doubled '/' of the original path
nielsen = pd.concat((nielsen15, nielsen16))

In [4]:
# Summary statistics, across product groups, of the per-group column totals.
# numeric_only=True restricts the sum to numeric/boolean columns, so text columns
# (state, county name, ...) are never summed.
nielsen.groupby('product_group_descr').sum(numeric_only=True).describe()

Unnamed: 0,is_walmart,guessed_store_county_fips,purchase_year,purchase_month,upc_price,upc_price_std,nb_of_obs
count,118.0,118.0,118.0,118.0,118.0,118.0,118.0
mean,23712.974576,1506986000.0,100058100.0,322919.830508,203797.895377,81332.492577,777185.0
std,10426.284581,716999600.0,47230220.0,152046.50204,136847.473231,78607.98907,1197041.0
min,12.0,7919545.0,469621.0,1603.0,793.045,5.329678,273.0
25%,16560.0,982176500.0,65851680.0,213015.75,117776.331412,38412.09264,126956.0
50%,27509.0,1653639000.0,109709900.0,354036.5,180256.745927,61524.760399,388549.0
75%,31931.0,2105430000.0,138670900.0,448288.75,259382.773406,105286.743333,787695.0
max,37170.0,2674661000.0,177640200.0,571912.0,820665.462272,553743.029262,9750427.0


In [3]:
# Keep Walmart's competitors only: every row flagged `is_walmart` is discarded
is_competitor = ~nielsen.is_walmart
nielsen = nielsen[is_competitor]

In [4]:
# Store entries/exits data: location and opening/closing dates
# TODO change path
fandom_columns = ['State', 'County_name', 'County_fips', 'Opening_date', 'Closing_date']
fandom = pd.read_csv(
    '../data_collection/plein_de_data/fandom_traitées.csv',
    parse_dates=['Opening_date', 'Closing_date'],
)[fandom_columns]

# States in which we do not trust our data (some mistakes still remain) are dropped from both sources
untrusted_states = ('CA', 'GA', 'KS', 'LA', 'TX')
fandom = fandom[~np.isin(fandom.State, untrusted_states)]
nielsen = nielsen[~np.isin(nielsen.store_state, untrusted_states)]

# Movements (entries & exits) of interest: stores whose opening date or closing date
# falls between 2015-01-31 and 2017-01-31, both bounds included.
# NOTE(review): presumably these bounds follow the retailer's fiscal years — confirm the intended window.
opened_in_window = fandom.Opening_date.between('2015-01-31', '2017-01-31')
closed_in_window = fandom.Closing_date.between('2015-01-31', '2017-01-31')
movements = fandom[opened_in_window | closed_in_window]

In [5]:
# Number of months, on each side of the opening/closing month, whose observations
# are dropped from the treatment group (values tried: 0, 1, 3, 5, 7, 9, 11)
dropping_period = 5

## 1. Entries

### A) Pooled values

In [6]:
# Average price per county and month, over all product categories, to have a first overview of the effect.
# The price column is selected *before* averaging: calling mean() on every column raises
# TypeError on pandas >= 2.0 as soon as a text column (e.g. product_group_descr) is present.
pool_keys = ['is_walmart', 'store_state', 'guessed_store_county', 'guessed_store_county_fips', 'purchase_year', 'purchase_month']
pool = nielsen.groupby(pool_keys)['upc_price'].mean().reset_index()

In [19]:
print("=========================================================")
print("ENTRY - Pooled results")
print('---------------------------------------------------------')

product_group = pool


# Control group: counties where nothing (no entry nor exit) happened.
# Membership is tested against movements.County_fips only; passing the whole `movements`
# frame would compare the FIPS codes with every cell of it (states, names, dates).
control = product_group[~np.isin(product_group.guessed_store_county_fips, movements.County_fips)].copy()

# Keep only the control counties with exactly 24 rows, i.e. data for the entire time period (24 months)
nb_months = control.groupby('guessed_store_county_fips')['is_walmart'].count()
control = control[np.isin(control.guessed_store_county_fips, nb_months[nb_months == 24].index)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")



# Treatment group: counties whose only movement is an entry inside the window
movements_per_county = movements.groupby('County_fips')['State'].count()
single_movement_fips = movements_per_county[movements_per_county == 1].index  # No more than one movement per treated county
treatment_movements = movements[np.isin(movements.County_fips, single_movement_fips)]
opened_in_window = (treatment_movements.Opening_date >= '2015-01-31') & (treatment_movements.Opening_date <= '2017-01-31')
# A missing closing date (NaT) means that no closing is recorded for the store
still_open = (treatment_movements.Closing_date > '2017-01-31') | treatment_movements.Closing_date.isna()
treatment_movements = treatment_movements[opened_in_window & still_open]  # Entries only

treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips)].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")



# Creating the dummies
control['treat'] = False
control['interaction'] = False
control['purchase_0'] = control.purchase_month + 12 * (control.purchase_year - 2015)  # One unique time variable (months since the start of 2015)
control['effects'] = list(zip(control.store_state, control.purchase_0))  # Controlling for time * state

treatment['treat'] = True
treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015)  # One unique time variable
treatment['opening_0'] = treatment.Opening_date.dt.month + 12 * (treatment.Opening_date.dt.year - 2015)
treatment['interaction'] = treatment.purchase_0 >= treatment.opening_0  # Post dummy: True in the treatment group from the opening month onwards
# Drop the months within `dropping_period` of the opening; copy() so that the next assignment
# does not write into a slice of the previous frame
treatment = treatment[abs(treatment.purchase_0 - treatment.opening_0) > dropping_period].copy()
treatment['effects'] = list(zip(treatment.store_state, treatment.purchase_0))  # Controlling for time * state



# Final dataset for the regression :
df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'effects']]
df = df[df.upc_price != 0]  # Dropping the errors in the set (the log of a zero price is undefined)


reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(effects)', data=df)
results1 = reg1.fit()

# The post-entry coefficient is looked up by its label instead of its position: positional
# access on a label-indexed Series is deprecated in pandas and silently depends on the term order.
interaction_term = [name for name in results1.params.index if name.startswith('interaction')][0]

print(f"Coef : {np.exp(results1.params[interaction_term])-1}")
print(f"pvalue : {results1.pvalues[interaction_term]}")
print(f"Std err : {results1.bse[interaction_term]}")
print(f"R2 : {results1.rsquared}")
print(f"N : {results1.nobs}")

ENTRY - Pooled results
---------------------------------------------------------
Size of the control group: 1457.
Size of the treatment group: 91.
Coef : 0.029341900413080824
pvalue : 0.03977734141684591
Std err : 0.01406509590724859
R2 : 0.1578651149374708
N : 36234.0


### B) Other categories

In [11]:
# Restrict the per-category analysis to the product groups with the most observations
# (at least 25000 non-missing `is_walmart` rows)
categories_count = nielsen.groupby('product_group_descr').count()
has_enough_rows = categories_count.is_walmart >= 25000
categories = categories_count.loc[has_enough_rows].index

# Alternative: run the analysis on every product group
# categories = nielsen.product_group_descr.unique()

In [12]:
# Rows of [category, effect, std err, p-value, R2, N] for the categories with a significant effect
data = []

# The entry events do not depend on the product category: they are selected once, before the loop.
# Treatment group: counties whose only movement is an entry inside the window.
movements_per_county = movements.groupby('County_fips')['State'].count()
single_movement_fips = movements_per_county[movements_per_county == 1].index  # No more than one movement per treated county
treatment_movements = movements[np.isin(movements.County_fips, single_movement_fips)]
opened_in_window = (treatment_movements.Opening_date >= '2015-01-31') & (treatment_movements.Opening_date <= '2017-01-31')
# A missing closing date (NaT) means that no closing is recorded for the store
still_open = (treatment_movements.Closing_date > '2017-01-31') | treatment_movements.Closing_date.isna()
treatment_movements = treatment_movements[opened_in_window & still_open]  # Entries only

for category in categories:
    print("=========================================================")
    print(f"ENTRY - {category}")
    print('---------------------------------------------------------')

    product_group = nielsen[nielsen.product_group_descr == category]


    # Control group: counties where nothing (no entry nor exit) happened.
    # Membership is tested against movements.County_fips only, not against every cell of `movements`.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements.County_fips)].copy()

    # Keep only the control counties with exactly 24 rows, i.e. data for the entire time period (24 months)
    nb_months = control.groupby('guessed_store_county_fips')['is_walmart'].count()
    control = control[np.isin(control.guessed_store_county_fips, nb_months[nb_months == 24].index)]

    # Keep only the control counties whose smallest nb_of_obs is at least 4
    nb_obs = control.groupby('guessed_store_county_fips')['nb_of_obs'].min()
    control = control[np.isin(control.guessed_store_county_fips, nb_obs[nb_obs > 3].index)].copy()
    print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips)].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")



    # Creating the dummies
    control['treat'] = False
    control['interaction'] = False
    control['purchase_0'] = control.purchase_month + 12 * (control.purchase_year - 2015)  # One unique time variable
    control['effects'] = list(zip(control.store_state, control.purchase_0))  # Controlling for time * state

    treatment['treat'] = True
    treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015)  # One unique time variable
    treatment['opening_0'] = treatment.Opening_date.dt.month + 12 * (treatment.Opening_date.dt.year - 2015)
    treatment['interaction'] = treatment.purchase_0 >= treatment.opening_0  # Post dummy: True in the treatment group from the opening month onwards
    # Drop the months within `dropping_period` of the opening
    treatment = treatment[abs(treatment.purchase_0 - treatment.opening_0) > dropping_period].copy()
    treatment['effects'] = list(zip(treatment.store_state, treatment.purchase_0))  # Controlling for time * state



    # Final dataset for the regression :
    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'effects']]
    df = df[df.upc_price != 0]  # Dropping the errors in the set


    try:
        reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(effects)', data=df)
        results1 = reg1.fit()
        # Post-entry coefficient, looked up by label rather than by position
        interaction_term = [name for name in results1.params.index if name.startswith('interaction')][0]
    except Exception as err:
        print('Method does not converge')
        print(f"  ({type(err).__name__}: {err})")
        # Skip the category: without this, the significance test below would reuse the
        # results of a previous regression and could record them under this category.
        continue

    coef = results1.params[interaction_term]
    std_err = results1.bse[interaction_term]
    p_value = results1.pvalues[interaction_term]

    print(f"Coef : {np.exp(coef)-1}")
    print(f"Rapport: {coef/std_err} ")
    print(f"pvalue : {p_value}")
    print(f"Std err : {std_err}")
    print(f"R2 : {results1.rsquared}")
    print(f"N : {results1.nobs}")

    # Keep the categories whose t-ratio exceeds 2 in absolute value
    if abs(coef/std_err) > 2.:
        data.append([category, np.exp(coef)-1, std_err, p_value, results1.rsquared, results1.nobs])

print(data)

ENTRY - BAKED GOODS-FROZEN
---------------------------------------------------------
Size of the control group: 118.
Size of the treatment group: 89.
Coef : -0.007435005539790707
Rapport: -0.43005065876054466 
pvalue : 0.6671896897021575
Std err : 0.01735326480876145
R2 : 0.3911374760510232
N : 3910.0
ENTRY - BAKING MIXES
---------------------------------------------------------
Size of the control group: 209.
Size of the treatment group: 91.
Coef : 0.018738614544618892
Rapport: 1.1525473486290227 
pvalue : 0.24914906265559905
Std err : 0.016107979967543717
R2 : 0.35539658038577016
N : 6153.0
ENTRY - BAKING SUPPLIES
---------------------------------------------------------
Size of the control group: 256.
Size of the treatment group: 91.
Coef : -0.018782851033047776
Rapport: -1.4663504380069932 
pvalue : 0.14260246024067363
Std err : 0.012931076169971106
R2 : 0.27664711028560507
N : 7290.0
ENTRY - BREAD AND BAKED GOODS
---------------------------------------------------------
Size of th

## 2. Exits

### A) Pooled values

In [20]:
# Average price per county and month, over all product categories, to have a first overview of the effect.
# The price column is selected *before* averaging: calling mean() on every column raises
# TypeError on pandas >= 2.0 as soon as a text column (e.g. product_group_descr) is present.
pool_keys = ['is_walmart', 'store_state', 'guessed_store_county', 'guessed_store_county_fips', 'purchase_year', 'purchase_month']
pool = nielsen.groupby(pool_keys)['upc_price'].mean().reset_index()

In [21]:
print("=========================================================")
print("EXIT - Pooled results")
print('---------------------------------------------------------')

product_group = pool


# Control group: counties where nothing (no entry nor exit) happened.
# Membership is tested against movements.County_fips only; passing the whole `movements`
# frame would compare the FIPS codes with every cell of it (states, names, dates).
control = product_group[~np.isin(product_group.guessed_store_county_fips, movements.County_fips)].copy()

# Keep only the control counties with exactly 24 rows, i.e. data for the entire time period (24 months)
nb_months = control.groupby('guessed_store_county_fips')['is_walmart'].count()
control = control[np.isin(control.guessed_store_county_fips, nb_months[nb_months == 24].index)].copy()
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")



# Treatment group: counties whose only movement is an exit inside the window,
# for a store that opened before the window
movements_per_county = movements.groupby('County_fips')['State'].count()
single_movement_fips = movements_per_county[movements_per_county == 1].index  # No more than one movement per treated county
treatment_movements = movements[np.isin(movements.County_fips, single_movement_fips)]
closed_in_window = (treatment_movements.Closing_date >= '2015-01-31') & (treatment_movements.Closing_date <= '2017-01-31')
opened_before_window = treatment_movements.Opening_date < '2015-01-31'
treatment_movements = treatment_movements[closed_in_window & opened_before_window]  # Exits only

treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips)].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")



# Creating the dummies
control['treat'] = False
control['interaction'] = False
control['purchase_0'] = control.purchase_month + 12 * (control.purchase_year - 2015)  # One unique time variable (months since the start of 2015)
control['effects'] = list(zip(control.store_state, control.purchase_0))  # Controlling for time * state

treatment['treat'] = True
treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015)  # One unique time variable
treatment['closing_0'] = treatment.Closing_date.dt.month + 12 * (treatment.Closing_date.dt.year - 2015)
treatment['interaction'] = treatment.purchase_0 >= treatment.closing_0  # Post dummy: True in the treatment group from the closing month onwards
# Drop the months within `dropping_period` of the closing; copy() so that the next assignment
# does not write into a slice of the previous frame
treatment = treatment[abs(treatment.purchase_0 - treatment.closing_0) > dropping_period].copy()
treatment['effects'] = list(zip(treatment.store_state, treatment.purchase_0))  # Controlling for time * state



# Final dataset for the regression :
df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'effects']]
df = df[df.upc_price != 0]  # Dropping the errors in the set (the log of a zero price is undefined)


reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(effects)', data=df)
results1 = reg1.fit()

# The post-exit coefficient is looked up by its label instead of its position: positional
# access on a label-indexed Series is deprecated in pandas and silently depends on the term order.
interaction_term = [name for name in results1.params.index if name.startswith('interaction')][0]

print(f"Coef : {np.exp(results1.params[interaction_term])-1}")
print(f"pvalue : {results1.pvalues[interaction_term]}")
print(f"Std err : {results1.bse[interaction_term]}")
print(f"R2 : {results1.rsquared}")
print(f"N : {results1.nobs}")

EXIT - Pooled results
---------------------------------------------------------
Size of the control group: 1457.
Size of the treatment group: 43.
Coef : -0.01195424470475881
pvalue : 0.5877117693741167
Std err : 0.022182142733557227
R2 : 0.15426413736214695
N : 35499.0


### B) Other categories

In [6]:
# Only the product groups with the most observations are analysed
# (threshold: 25000 non-missing `is_walmart` rows)
categories_count = nielsen.groupby('product_group_descr').count()
large_enough = categories_count.is_walmart >= 25000
categories = categories_count.loc[large_enough].index

# Alternative: analyse every product group
# categories = nielsen.product_group_descr.unique()

In [7]:
# Rows of [category, effect, std err, p-value, R2, N] for the categories with a significant effect
data = []

# The exit events do not depend on the product category: they are selected once, before the loop.
# Treatment group: counties whose only movement is an exit inside the window,
# for a store that opened before the window.
movements_per_county = movements.groupby('County_fips')['State'].count()
single_movement_fips = movements_per_county[movements_per_county == 1].index  # No more than one movement per treated county
treatment_movements = movements[np.isin(movements.County_fips, single_movement_fips)]
closed_in_window = (treatment_movements.Closing_date >= '2015-01-31') & (treatment_movements.Closing_date <= '2017-01-31')
opened_before_window = treatment_movements.Opening_date < '2015-01-31'
treatment_movements = treatment_movements[closed_in_window & opened_before_window]  # Exits only

for category in categories:
    print("=========================================================")
    print(f"EXIT - {category}")
    print('---------------------------------------------------------')

    product_group = nielsen[nielsen.product_group_descr == category]


    # Control group: counties where nothing (no entry nor exit) happened.
    # Membership is tested against movements.County_fips only, not against every cell of `movements`.
    control = product_group[~np.isin(product_group.guessed_store_county_fips, movements.County_fips)].copy()

    # Keep only the control counties with exactly 24 rows, i.e. data for the entire time period (24 months)
    nb_months = control.groupby('guessed_store_county_fips')['is_walmart'].count()
    control = control[np.isin(control.guessed_store_county_fips, nb_months[nb_months == 24].index)]

    # Keep only the control counties whose smallest nb_of_obs is at least 4
    nb_obs = control.groupby('guessed_store_county_fips')['nb_of_obs'].min()
    control = control[np.isin(control.guessed_store_county_fips, nb_obs[nb_obs > 3].index)].copy()
    print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")


    treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips)].copy()
    treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
    print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")



    # Creating the dummies
    control['treat'] = False
    control['interaction'] = False
    control['purchase_0'] = control.purchase_month + 12 * (control.purchase_year - 2015)  # One unique time variable
    control['effects'] = list(zip(control.store_state, control.purchase_0))  # Controlling for time * state

    treatment['treat'] = True
    treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015)  # One unique time variable
    treatment['closing_0'] = treatment.Closing_date.dt.month + 12 * (treatment.Closing_date.dt.year - 2015)
    treatment['interaction'] = treatment.purchase_0 >= treatment.closing_0  # Post dummy: True in the treatment group from the closing month onwards
    # Drop the months within `dropping_period` of the closing
    treatment = treatment[abs(treatment.purchase_0 - treatment.closing_0) > dropping_period].copy()
    treatment['effects'] = list(zip(treatment.store_state, treatment.purchase_0))  # Controlling for time * state



    # Final dataset for the regression :
    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'effects']]
    df = df[df.upc_price != 0]  # Dropping the errors in the set

    try:
        reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(effects)', data=df)
        results1 = reg1.fit()
        # Post-exit coefficient, looked up by label rather than by position
        interaction_term = [name for name in results1.params.index if name.startswith('interaction')][0]
    except Exception as err:
        print('Method does not converge')
        print(f"  ({type(err).__name__}: {err})")
        # Skip the category: without this, the significance test below would reuse the
        # results of a previous regression and could record them under this category.
        continue

    coef = results1.params[interaction_term]
    std_err = results1.bse[interaction_term]
    p_value = results1.pvalues[interaction_term]

    print(f"Coef : {np.exp(coef)-1}")
    print(f"Rapport: {coef/std_err} ")
    print(f"pvalue : {p_value}")
    print(f"Std err : {std_err}")
    print(f"R2 : {results1.rsquared}")
    print(f"N : {results1.nobs}")

    # Two-sided test on the t-ratio: significant price decreases are kept as well as increases
    if abs(coef/std_err) > 2.:
        data.append([category, np.exp(coef)-1, std_err, p_value, results1.rsquared, results1.nobs])

print(data)

EXIT - BAKED GOODS-FROZEN
---------------------------------------------------------
Size of the control group: 118.
Size of the treatment group: 42.
Coef : 0.020960855087863495
Rapport: 0.7022280341007358 
pvalue : 0.4826049206960059
Std err : 0.029540544756636075
R2 : 0.4129158128833499
N : 3214.0
EXIT - BAKING MIXES
---------------------------------------------------------
Size of the control group: 209.
Size of the treatment group: 42.
Coef : 0.01007536490850991
Rapport: 0.3759251908962499 
pvalue : 0.7069899460106576
Std err : 0.02666739828402053
R2 : 0.3618792380268543
N : 5407.0
EXIT - BAKING SUPPLIES
---------------------------------------------------------
Size of the control group: 256.
Size of the treatment group: 43.
Coef : 0.0007602257260281853
Rapport: 0.031220037980654362 
pvalue : 0.9750951593152173
Std err : 0.02434131890851756
R2 : 0.2589770384443839
N : 6526.0
EXIT - BREAD AND BAKED GOODS
---------------------------------------------------------
Size of the control gr