In [5]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import datetime

In [6]:
nielsen15 = pd.read_csv('../../Nielsen/aggregated_nielsen_2015.csv')
nielsen16 = pd.read_csv('../..//Nielsen/aggregated_nielsen_2016.csv')
nielsen15['year'] = 2015
nielsen16['year'] = 2016

nielsen = pd.concat((nielsen15, nielsen16))
nielsen = nielsen[~nielsen.is_walmart]

## Entriy/exit dates
fandom = pd.read_csv('../data_collection/plein_de_data/fandom_traitées.csv', parse_dates=['Opening_date', 'Closing_date'])[['State', 'County_name', 'County_fips', 'Opening_date', 'Closing_date']]

# We drop the state in which we do not trust our data (some mistakes stillremain)
fandom = fandom[~np.isin(fandom.State, ('CA', 'GA', 'KS', 'LA', 'TX'))]
nielsen = nielsen[~np.isin(nielsen.store_state, ('CA', 'GA', 'KS', 'LA', 'TX'))]

movements = fandom[((fandom.Opening_date >= '2014-01-31') & (fandom.Opening_date <= '2018-01-31')) | ((fandom.Closing_date >= '2014-01-31') & (fandom.Closing_date <= '2018-01-31'))]

In [7]:
#For each county and each month we take the mean of the prices of all categories
pool = nielsen.groupby(['store_state','guessed_store_county','guessed_store_county_fips','purchase_month','purchase_year']).mean().reset_index()
product_group = pool

# The control group is composed of all states where nothing (no entry nor exit) happened.
control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()

# We keep in the control group the only counties where we have data for the entire time period (24 months)
nb_months = control.groupby('guessed_store_county_fips').count()
control = control[np.isin(control.guessed_store_county_fips, nb_months[nb_months.is_walmart==24].index)]
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")

# The treatment group is composed of the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.State == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
treatment_movements = treatment_movements[(treatment_movements.Opening_date>='2015-01-31' ) & (treatment_movements.Opening_date<='2017-01-31') & ((treatment_movements.Closing_date>'2017-01-31') | (treatment_movements.Closing_date.apply(str) == 'NaT'))]

treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")

# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
control['purchase_0'] = control.purchase_month + 12 * (control.purchase_year - 2015)
control['effects'] = list(zip(control.store_state, control.purchase_0))

treatment['treat'] = True
treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015)
treatment['opening_0'] = treatment.Opening_date.dt.month  + 12 * (treatment.Opening_date.dt.year - 2015)
treatment['interaction'] = treatment.purchase_0 >= treatment.opening_0
treatment['effects'] = list(zip(treatment.store_state, treatment.purchase_0))

#we take only months before 5 months before the entry and months after 5 months after the entry
treatment = treatment[abs(treatment.purchase_0-treatment.opening_0) >= 6]

#if the treatment group is not empty
if treatment.shape[0]>0 :
    
    # Final dataset for the regression :
    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'effects']]
    df = df[df.upc_price != 0]

    reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(effects)', data=df)
    results1 = reg1.fit()
    print("=========================================================")
    if True :
    #abs(results1.params[2] / results1.bse[2]) >= 2.:
        print(f"Post-t coef : {np.exp(results1.params[2])-1}")
    print(f"Coef/err : {abs(results1.params[2] / results1.bse[2])}")
    if True :
    #abs(results1.params[2] / results1.bse[2]) >= 2.:
        print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][2])-1}")
        print(f"p_value : {results1.pvalues[2]}")
        print(f"nobs : {results1.nobs}")
        print(f"R squared : {results1.rsquared}")

  pool = nielsen.groupby(['store_state','guessed_store_county','guessed_store_county_fips','purchase_month','purchase_year']).mean().reset_index()


Size of the control group: 1390.
Size of the treatment group: 65.
Post-t coef : 0.05181727564731764
Coef/err : 3.0254155812853054
CI_up : 0.01794926244929229
CI_down : 0.08681210563306951
p_value : 0.0024848156425117287
nobs : 34263.0
R squared : 0.15575449526156848


### Parallel trend test : pre-treatment interaction dummy

#### On pooled dataset (all categories)

In [11]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import datetime

In [12]:
nielsen15 = pd.read_csv('../../Nielsen/aggregated_nielsen_2015.csv')
nielsen16 = pd.read_csv('../..//Nielsen/aggregated_nielsen_2016.csv')
nielsen15['year'] = 2015
nielsen16['year'] = 2016

nielsen = pd.concat((nielsen15, nielsen16))
nielsen = nielsen[~nielsen.is_walmart]

## Entriy/exit dates
fandom = pd.read_csv('../data_collection/plein_de_data/fandom_traitées.csv', parse_dates=['Opening_date', 'Closing_date'])[['State', 'County_name', 'County_fips', 'Opening_date', 'Closing_date']]

# We drop the state in which we do not trust our data (some mistakes stillremain)
fandom = fandom[~np.isin(fandom.State, ('CA', 'GA', 'KS', 'LA', 'TX'))]
nielsen = nielsen[~np.isin(nielsen.store_state, ('CA', 'GA', 'KS', 'LA', 'TX'))]

movements = fandom[((fandom.Opening_date >= '2014-01-31') & (fandom.Opening_date <= '2018-01-31')) | ((fandom.Closing_date >= '2014-01-31') & (fandom.Closing_date <= '2018-01-31'))]

In [13]:
#For each county and each month we take the mean of the prices of all categories
pool = nielsen.groupby(['store_state','guessed_store_county','guessed_store_county_fips','purchase_month','purchase_year']).mean().reset_index()
product_group = pool

# The control group is composed of all states where nothing (no entry nor exit) happened.
control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()

# We keep in the control group the only counties where we have data for the entire time period (24 months)
nb_months = control.groupby('guessed_store_county_fips').count()
control = control[np.isin(control.guessed_store_county_fips, nb_months[nb_months.is_walmart==24].index)]
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")

# The treatment group is composed of the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.State == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
treatment_movements = treatment_movements[(treatment_movements.Opening_date>='2015-01-31' ) & (treatment_movements.Opening_date<='2017-01-31') & ((treatment_movements.Closing_date>'2017-01-31') | (treatment_movements.Closing_date.apply(str) == 'NaT'))]

treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")

# We create our dummies for the regression
control['treat'] = False
control['interaction'] = False
control['purchase_0'] = control.purchase_month + 12 * (control.purchase_year - 2015)
control['effects'] = list(zip(control.store_state, control.purchase_0))
control['p_interaction'] = False

treatment['treat'] = True
treatment['purchase_0'] = treatment.purchase_month + 12 * (treatment.purchase_year - 2015)
treatment['opening_0'] = treatment.Opening_date.dt.month  + 12 * (treatment.Opening_date.dt.year - 2015)
treatment['interaction'] = treatment.purchase_0 >= treatment.opening_0
treatment['effects'] = list(zip(treatment.store_state, treatment.purchase_0))
#we add a pre-treatment interaction
treatment['p_interaction'] = treatment.purchase_0 < (treatment.opening_0 - 9)

#we take only months before 5 months before the entry and months after 5 months after the entry
treatment = treatment[abs(treatment.purchase_0-treatment.opening_0) >= 6]

#if the treatment group is not empty
if treatment.shape[0]>0 :
    
    # Final dataset for the regression :
    df = pd.concat((control, treatment))[['upc_price', 'treat', 'interaction', 'effects', "p_interaction"]]
    df = df[df.upc_price != 0]

    reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + p_interaction + C(effects)', data=df)
    results1 = reg1.fit()
    print("=========================================================")
    if True :
    #abs(results1.params[2] / results1.bse[2]) >= 2.:
        print(f"Post-t coef : {np.exp(results1.params[2])-1}")
    print(f"Coef/err : {abs(results1.params[2] / results1.bse[2])}")
    if True :
    #abs(results1.params[2] / results1.bse[2]) >= 2.:
        print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][2])-1}")
        print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][2])-1}")
        print(f"p_value : {results1.pvalues[2]}")
        print(f"nobs : {results1.nobs}")
        print(f"R squared : {results1.rsquared}")
        print(f"Pre-t coef : {np.exp(results1.params[3])-1}")
        print(f"Coef/err : {abs(results1.params[3] / results1.bse[3])}")
        print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][3])-1}")
        print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][3])-1}")
        print(f"p_value : {results1.pvalues[3]}")

  pool = nielsen.groupby(['store_state','guessed_store_county','guessed_store_county_fips','purchase_month','purchase_year']).mean().reset_index()


Size of the control group: 1390.
Size of the treatment group: 65.
Post-t coef : 0.026605341786368975
Coef/err : 1.2084541040099148
CI_up : -0.01619791476910415
CI_down : 0.07127088222928069
p_value : 0.22688124263607964
nobs : 34263.0
R squared : 0.15583189018149124
Pre-t coef : -0.038607818970261554
Coef/err : 1.7450540584628422
CI_up : -0.08019743826145986
CI_down : 0.004862308709081997
p_value : 0.08098470726973861


### Parallel trend test : placebo treatment group

#### On pooled dataset (all categories)

In [12]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import datetime
from random import choice


nielsen15 = pd.read_csv('../../Nielsen/aggregated_nielsen_2015.csv')
nielsen16 = pd.read_csv('../..//Nielsen/aggregated_nielsen_2016.csv')
nielsen15['year'] = 2015
nielsen16['year'] = 2016

nielsen = pd.concat((nielsen15, nielsen16))
nielsen = nielsen[~nielsen.is_walmart]

## Entriy/exit dates
fandom = pd.read_csv('../data_collection/plein_de_data/fandom_traitées.csv', parse_dates=['Opening_date', 'Closing_date'])[['State', 'County_name', 'County_fips', 'Opening_date', 'Closing_date']]

# We drop the state in which we do not trust our data (some mistakes stillremain)
fandom = fandom[~np.isin(fandom.State, ('CA', 'GA', 'KS', 'LA', 'TX'))]
nielsen = nielsen[~np.isin(nielsen.store_state, ('CA', 'GA', 'KS', 'LA', 'TX'))]

movements = fandom[((fandom.Opening_date >= '2014-01-31') & (fandom.Opening_date <= '2018-01-31')) | ((fandom.Closing_date >= '2014-01-31') & (fandom.Closing_date <= '2018-01-31'))]

In [17]:
#For each county and each month we take the mean of the prices of all categories
pool = nielsen.groupby(['store_state','guessed_store_county','guessed_store_county_fips','purchase_month','purchase_year']).mean().reset_index()
product_group = pool

# The control group is composed by all states where nothing (no entry nor exit) happened.
control = product_group[~np.isin(product_group.guessed_store_county_fips, movements)].copy()

# We keep in the control group the only counties where we have data for the entire time period (24 months)
nb_months = control.groupby('guessed_store_county_fips').count()
control = control[np.isin(control.guessed_store_county_fips, nb_months[nb_months.is_walmart==24].index)]
print(f"Size of the control group: {len(control.guessed_store_county_fips.unique())}.")

# The treatment group is composed by the states where one entry took place in 2016 and where this entry is the only movement
count = movements.groupby('County_fips').count()
count = count[count.State == 1] # No more than one movement in the treatement group
treatment_movements = movements[(np.isin(movements.County_fips, count.index))]
treatment_movements = treatment_movements[(treatment_movements.Opening_date>='2015-01-31' ) & (treatment_movements.Opening_date<='2017-01-31') & ((treatment_movements.Closing_date>'2017-01-31') | (treatment_movements.Closing_date.apply(str) == 'NaT'))]

treatment = product_group[np.isin(product_group.guessed_store_county_fips, treatment_movements.County_fips )].copy()
treatment = treatment.merge(treatment_movements, left_on='guessed_store_county_fips', right_on='County_fips')
print(f"Size of the treatment group: {len(treatment.guessed_store_county_fips.unique())}.")

#if the treatment group is not empty
if treatment.shape[0] > 0 : 

    ### Creation of the placebo treatment group

        #We create an 'Opening_date' column for the control group (because we have to add an Opening_date to the
    #placebo treatment group counties)

    control['Opening_date']=0

    #We gather relevant information on the actual treatment group in an array

    treatment_info = treatment.groupby(['guessed_store_county_fips','store_state','Opening_date']).nunique()['is_walmart'].reset_index().to_numpy()


    #We select our placebo treatment group counties

    new_t_county_fips_list = []
    # error will be True if we can't find counties in the control group to be used as placebo treatment group counties
    error = False
    #total nb of counties we want in the placebo treatment group
    nb_of_t_counties = treatment_info.shape[0]
    #list of indexes in treatment_info of the actual Tg counties for which we could not find a placebo in the same state
    index_of_missing_counties = []
    for k in range(nb_of_t_counties): 
        #for each actual Tg county, we find a placebo replacement
        #we get the county state
        state = treatment_info[k][1]
        #we get the county opening date
        opening_date = treatment_info[k][2]
        #list of the candidate county_fips for our placebo (same state as the actual Tg county, not the same county as another already existing placebo county)
        c_county_list = control[(control.store_state == state)&(~np.isin(control.guessed_store_county_fips, new_t_county_fips_list))].guessed_store_county_fips.unique().copy().tolist() 
        nb_of_c_counties = len(c_county_list) #nb of candidates
        if nb_of_c_counties > 0 :
            new_t_county_fips = choice(c_county_list) #random choice of one candidate
            new_t_county_fips_list.append(new_t_county_fips)
            #we add the right opening_date to placebo county
            control.loc[control.guessed_store_county_fips==new_t_county_fips, 'Opening_date'] = opening_date 
        else : 
            #if we did not find suitable candidates in the right state
            index_of_missing_counties.append(k)

    nb_of_missing_counties = len(index_of_missing_counties)
    if nb_of_missing_counties != 0 : 
        #if we did not find suitable candidates in the right state for at least one state, we search candidates whatever the state
        for k in index_of_missing_counties:
            remaining_c_county_list = control[~np.isin(control.guessed_store_county_fips, new_t_county_fips_list)].guessed_store_county_fips.unique().copy().tolist()
            print(f'remaining_c_county_list len : {len(remaining_c_county_list)}')
            if len(remaining_c_county_list) <= 0 :
                error = True
            else :
                new_t_county_fips = choice(remaining_c_county_list)
                new_t_county_fips_list.append(new_t_county_fips)
                control.loc[control.guessed_store_county_fips==new_t_county_fips, 'Opening_date'] = opening_date
    if not error :

        #we keep in our placebo Tg only the selected placebo counties
        new_treatment = control[np.isin(control.guessed_store_county_fips, new_t_county_fips_list)].copy()
        new_control = control[~np.isin(control.guessed_store_county_fips, new_t_county_fips_list)].copy()

        ### End of the creation of the placebo treatment group

        
        
        # We create our dummies for the regression
        new_control['treat'] = False
        new_control['interaction'] = False
        new_control['purchase_0'] = new_control.purchase_month + 12 * (new_control.purchase_year - 2015)
        new_control['effects'] = list(zip(new_control.store_state, new_control.purchase_0))

        new_treatment['treat'] = True
        new_treatment['purchase_0'] = new_treatment.purchase_month + 12 * (new_treatment.purchase_year - 2015)
        new_treatment['opening_0'] = pd.DatetimeIndex(new_treatment.Opening_date).month  + 12 * (pd.DatetimeIndex(new_treatment.Opening_date).year - 2015)
        new_treatment['interaction'] = new_treatment.purchase_0 >= new_treatment.opening_0
        new_treatment['effects'] = list(zip(new_treatment.store_state, new_treatment.purchase_0))

        new_treatment = new_treatment[abs(new_treatment.purchase_0-new_treatment.opening_0) >= 6]

        #if the treatment group is not empty
        if new_treatment.shape[0]>0 :
            
            # Final dataset for the regression :

            df = pd.concat((new_control, new_treatment))[['upc_price', 'treat', 'interaction', 'effects']]
            df = df[df.upc_price != 0]

            reg1 = smf.ols(formula='np.log(upc_price) ~ treat + interaction + C(effects)', data=df)
            results1 = reg1.fit()
            print("=========================================================")
            if abs(results1.params[2] / results1.bse[2]) >= 2.:
                print(f"Interaction Coef : {np.exp(results1.params[2])-1}")
            print(f"Coef/err : {abs(results1.params[2] / results1.bse[2])}")
            print(f"CI_up : {np.exp(results1.conf_int(alpha=0.05)[0][2])-1}")
            print(f"CI_down : {np.exp(results1.conf_int(alpha=0.05)[1][2])-1}")
            print(f"p_value : {results1.pvalues[2]}")
            print(f"nobs : {results1.nobs}")
            print(f"R squared : {results1.rsquared}")

  pool = nielsen.groupby(['store_state','guessed_store_county','guessed_store_county_fips','purchase_month','purchase_year']).mean().reset_index()


Size of the control group: 1390.
Size of the treatment group: 65.
remaining_c_county_list len : 1326
Interaction Coef : 0.05682741600373542
Coef/err : 3.308891076225717
CI_up : 0.022786846981806885
CI_down : 0.0920009291408097
p_value : 0.0009377071200183043
nobs : 32705.0
R squared : 0.15359030055551126
