In [1]:
import pandas as pd
import numpy as np

### Philibert's dataset

In [2]:
# Household panel, indexed by Household_Cd, reduced to the location fields that
# are later compared with the store locations.
panelist = (
    pd.read_table("../../Nielsen/panelists_2016.tsv")
    .set_index('Household_Cd')
    [['Fips_State_Cd', 'Fips_State_Desc', 'Fips_County_Cd', 'Fips_County_Desc', 'Panelist_ZipCd']]
    .assign(
        # County FIPS = state code shifted by three decimal digits + county code.
        # Integer arithmetic keeps the code exact and vectorised, instead of
        # going through floats (1e3) and a per-element int() call.
        household_county_fips=lambda hh: (hh.Fips_State_Cd * 1000 + hh.Fips_County_Cd).astype(int),
        # Integer division by 100 drops the last two digits of the zip code.
        household_zip3=lambda hh: hh.Panelist_ZipCd // 100,
    )
    [['Fips_State_Desc', 'Fips_County_Desc', 'household_county_fips', 'household_zip3']]
    .rename(columns={'Fips_State_Desc': 'household_state', 'Fips_County_Desc': 'household_county'})
)

# Purchases: derive the price paid per unit, then keep only the trip key,
# the product code and that unit price.
purchases = (
    pd.read_csv("../../Nielsen/purchases_subset_2016.csv")
    .assign(upc_price=lambda bought: bought.total_price_paid / bought.quantity)
    [['trip_code_uc', 'upc', 'upc_price']]
)

# Shopping trips, indexed by trip_code_uc, with purchase_date parsed as a datetime.
trips = (
    pd.read_table("../../Nielsen/trips_2016.tsv", parse_dates=['purchase_date'])
    .set_index('trip_code_uc')
    [['purchase_date', 'retailer_code', 'store_code_uc', 'store_zip3', 'household_code']]
)

In [4]:
# Lookup table: 3-digit zip prefix -> state (the file's own header row is
# replaced by the names given here; only two of the four columns are kept).
zip_to_state = pd.read_table(
    'zip_prefixes.txt',
    header=0,
    names=['zip3', 'state', 'distib_center', 'towns'],
)[['zip3', 'state']]

# Trips whose store code is not 0 (presumably 0 marks an unidentified store --
# TODO confirm), labelled with the state matching the store's zip3 prefix.
non_null = trips.loc[trips.store_code_uc != 0]
non_null = (
    non_null
    .merge(zip_to_state, left_on='store_zip3', right_on='zip3')
    [['purchase_date', 'retailer_code', 'store_code_uc', 'state', 'store_zip3', 'household_code']]
    .rename(columns={'state': 'store_state'})
)

# Add the location of the household that made each trip.
non_null_merged = (
    non_null
    .merge(panelist, left_on='household_code', right_on='Household_Cd')
    [['retailer_code', 'store_code_uc', 'store_state', 'store_zip3',
      'household_state', 'household_county_fips', 'household_county', 'household_zip3']]
)

# Row count before the two location filters below are applied.
before_deletion = len(non_null_merged)

# Keep trips made in the household's own state ...
same_state = non_null_merged.household_state == non_null_merged.store_state
non_null_state = non_null_merged[same_state]  # TODO: rename everything

# ... and, among those, trips made in the household's own zip3 area.
same_zip3 = non_null_state.household_zip3 == non_null_state.store_zip3
non_null_zip3 = non_null_state[same_zip3]

df = non_null_zip3
                                                                    
from random import choice

def my_mode(self):
    """Return one most-frequent value of a Series, chosen at random among ties.

    Parameters
    ----------
    self : pandas.Series
        Values to summarise (for instance one column of one group when used
        as a ``groupby(...).agg`` function).

    Returns
    -------
    scalar
        One of the modal values of ``self``. When several values are tied,
        one of them is picked with ``random.choice``, so the result is not
        reproducible unless the ``random`` module is seeded. ``numpy.nan`` is
        returned when ``self`` has no mode at all (empty or all-missing).
    """
    modes = list(pd.Series.mode(self))
    if not modes:
        # Series.mode ignores missing values by default, so an empty or
        # all-NaN input yields no candidate; random.choice would raise
        # IndexError on the empty list and abort the whole aggregation.
        return np.nan
    return choice(modes)
                            
# One row per store: the most common retailer, state and household county among
# the retained trips to that store. The household county is used as the guess
# for the store's county, hence the renaming.
store_loc = (
    df.groupby(['store_code_uc'])
    [['retailer_code', 'store_state', 'household_county_fips', 'household_county']]
    .agg(my_mode)
    .rename(columns={
        'household_county_fips': 'guessed_store_county_fips',
        'household_county': 'guessed_store_county',
    })
)
                                                                                               
def my_max(x):
    """Return the largest element of ``x``, or ``x`` itself if none can be taken.

    Parameters
    ----------
    x : iterable or scalar
        Values to reduce.

    Returns
    -------
    ``max(x)`` when it can be computed, otherwise ``x`` unchanged.
    """
    try:
        return max(x)
    except (TypeError, ValueError):
        # TypeError: x is not iterable (e.g. a scalar) or its items cannot be
        # compared. ValueError: x is empty. Anything else (KeyboardInterrupt
        # included) is no longer swallowed.
        return x
def my_min(x):
    """Return the smallest element of ``x``, or ``x`` itself if none can be taken.

    Parameters
    ----------
    x : iterable or scalar
        Values to reduce.

    Returns
    -------
    ``min(x)`` when it can be computed, otherwise ``x`` unchanged.
    """
    try:
        return min(x)
    except (TypeError, ValueError):
        # TypeError: x is not iterable (e.g. a scalar) or its items cannot be
        # compared. ValueError: x is empty. Anything else (KeyboardInterrupt
        # included) is no longer swallowed.
        return x
def my_sum(x):
    """Return the sum of the elements of ``x``, or ``x`` itself if it cannot be summed.

    Parameters
    ----------
    x : iterable or scalar
        Values to add up.

    Returns
    -------
    ``sum(x)`` when it can be computed, otherwise ``x`` unchanged.
    """
    try:
        return sum(x)
    except TypeError:
        # Raised when x is not iterable (e.g. a scalar) or its items cannot be
        # added to a number. Anything else (KeyboardInterrupt included) is no
        # longer swallowed.
        return x
def my_len(x):
    """Return the number of elements of ``x``, counting an unsized value as 1.

    Parameters
    ----------
    x : sized container or scalar
        Value to measure.

    Returns
    -------
    int
        ``len(x)`` when ``x`` has a length, otherwise 1.
    """
    try:
        return len(x)
    except TypeError:
        # Raised when x has no length (e.g. a scalar). Anything else
        # (KeyboardInterrupt included) is no longer swallowed.
        return 1
                                                                                            
# Attach the guessed store location to every trip with a non-zero store code.
# The merge is an inner join on both keys, so a trip is kept only if its
# retailer_code equals the one recorded for that store in store_loc.
trips_loc = (
    trips.loc[trips.store_code_uc != 0]
    .reset_index()
    .merge(store_loc.reset_index(), on=['retailer_code', 'store_code_uc'])
    .set_index('trip_code_uc')
)

# One row per purchased product, with the trip's date and store location,
# plus the calendar month of the purchase.
prices = (
    purchases.merge(trips_loc, on='trip_code_uc')
    [['trip_code_uc', 'purchase_date', 'retailer_code', 'store_code_uc', 'store_state',
      'guessed_store_county', 'guessed_store_county_fips', 'upc', 'upc_price']]
    .assign(purchase_month=lambda bought: bought.purchase_date.dt.month)
)

### Trials

#### Candidates

In [6]:
# Number of distinct stores observed for each retailer.
df_test = (
    prices.groupby(['retailer_code'])['store_code_uc']
    .nunique()
    .to_frame("nb_of_stores_per_retailer")
)
# Lift pandas' row cap (a global option) so the whole ranking is displayed.
pd.set_option("display.max_rows", None)
df_test.sort_values(by="nb_of_stores_per_retailer", ascending=False)

Unnamed: 0_level_0,nb_of_stores_per_retailer
retailer_code,Unnamed: 1_level_1
6920,3289
3997,2352
4914,1761
4904,1662
6901,1332
130,1177
181,762
32,683
4901,617
79,606


#### Tests

In [62]:
# Retailer codes to inspect in the loop below.
r_code_list = [
    6920, 3997, 4914, 4904, 6901,
    130, 181, 32, 4901, 79,
    156, 24,
]

In [63]:
# Each entry is (state, county, opening_date, store_type).
SC_list = [
    ("NJ", "OCEAN", '2016-07-13', "SC"),
    ("SC", "DARLINGTON", '2016-05-11', "SC"),
    ("SD", "MINNEHAHA", '2016-08-24', "SC"),
    ("TN", "RUTHERFORD", '2016-08-24', "SC"),
    ("TX", "TOM GREEN", '2016-05-11', "SC"),
    ("VA", "ROCKINGHAM", '2016-07-13', "SC"),
]
NM_list = [
    ("FL", "MARION", '2016-08-03', "NM"),
    ("LA", "TERREBONNE", '2016-08-31', "NM"),
    ("NM", "OTERO", '2016-06-15', "NM"),
]

In [64]:
# For every retailer and every (state, county) in SC_list, report per store
# whether at least one purchase was recorded before the listed opening date.

# The display option is global, so it is set once rather than on every iteration.
pd.set_option("display.max_rows", None)

for r_code in r_code_list:
    print("_________________\n")
    print(r_code)
    # The retailer filter does not depend on the county: apply it once per retailer.
    retailer_prices = prices[prices.retailer_code == r_code]
    for state, county, opening_date, store_type in SC_list:
        print(state, county, store_type)
        in_county = (retailer_prices.store_state == state) & (retailer_prices.guessed_store_county == county)
        county_prices = retailer_prices[in_county]
        # No sorting is needed: groupby orders the result by store code and
        # any() does not depend on row order.
        prices_t = (
            county_prices
            .assign(purchase_before_opening=county_prices.purchase_date < pd.Timestamp(opening_date))
            .groupby("store_code_uc")["purchase_before_opening"]
            .any()
        )
        print(prices_t)

_________________

6920
NJ OCEAN SC
store_code_uc
4841059    True
7764479    True
Name: purchase_before_opening, dtype: bool
SC DARLINGTON SC
store_code_uc
4246331    True
8244765    True
Name: purchase_before_opening, dtype: bool
SD MINNEHAHA SC
store_code_uc
2886136    True
3757133    True
5767438    True
Name: purchase_before_opening, dtype: bool
TN RUTHERFORD SC
store_code_uc
3916789     True
4739715     True
6989482     True
8288188    False
Name: purchase_before_opening, dtype: bool
TX TOM GREEN SC
store_code_uc
465045     True
3511697    True
Name: purchase_before_opening, dtype: bool
VA ROCKINGHAM SC
Series([], Name: purchase_before_opening, dtype: bool)
_________________

3997
NJ OCEAN SC
store_code_uc
1227685    True
Name: purchase_before_opening, dtype: bool
SC DARLINGTON SC
Series([], Name: purchase_before_opening, dtype: bool)
SD MINNEHAHA SC
store_code_uc
5175473    True
7171299    True
Name: purchase_before_opening, dtype: bool
TN RUTHERFORD SC
Series([], Name: purchase_