In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
rcat = pd.read_csv('../../Nielsen/retailers.tsv', sep='\t')

### Philibert's dataset

In [3]:
panelist = pd.read_table("../../Nielsen/panelists_2016.tsv").set_index('Household_Cd')[['Fips_State_Cd', 'Fips_State_Desc', 'Fips_County_Cd', 'Fips_County_Desc', 'Panelist_ZipCd']]
panelist['household_county_fips'] = np.vectorize(int)(panelist.Fips_State_Cd * 1e3 + panelist.Fips_County_Cd)
panelist['household_zip3'] = panelist.Panelist_ZipCd // 100
panelist = panelist[['Fips_State_Desc', 'Fips_County_Desc', 'household_county_fips', 'household_zip3']].rename(columns={'Fips_State_Desc': 'household_state', 'Fips_County_Desc': 'household_county'})

purchases = pd.read_csv("../../Nielsen/purchases_subset_2016.csv")
purchases['upc_price'] = purchases.total_price_paid / purchases.quantity
purchases = purchases[['trip_code_uc', 'upc', 'upc_price', 'upc_ver_uc']]

trips = pd.read_table("../../Nielsen/trips_2016.tsv", parse_dates=['purchase_date']).set_index('trip_code_uc')[['purchase_date', 'retailer_code', 'store_code_uc', 'store_zip3', 'household_code']]

In [4]:
zip_to_state = pd.read_table('zip_prefixes.txt', header=0, names=['zip3','state', 'distib_center', 'towns'])[['zip3', 'state']]

non_null = trips[trips.store_code_uc != 0]
non_null = non_null.merge(zip_to_state, left_on='store_zip3', right_on='zip3')[['purchase_date', 'retailer_code', 'store_code_uc', 'state', 'store_zip3', 'household_code']].rename(columns={'state': 'store_state'})

non_null_merged = non_null.merge(panelist, left_on='household_code', right_on='Household_Cd')[['retailer_code', 'store_code_uc', 'store_state', 'store_zip3', 'household_state', 'household_county_fips', 'household_county', 'household_zip3']]

before_deletion = len(non_null_merged)
non_null_state = non_null_merged[non_null_merged.household_state == non_null_merged.store_state] # TOUT RENOMMER

non_null_zip3 = non_null_state[non_null_state.household_zip3 == non_null_state.store_zip3]

df = non_null_zip3
                                                                    
from random import choice

def my_mode(self):
    return choice(list(pd.Series.mode(self)))
                            
store_loc = df.groupby(['store_code_uc'])[['retailer_code', 'store_state', 'household_county_fips', 'household_county']].agg(my_mode).rename(columns={'household_county_fips': 'guessed_store_county_fips', 'household_county': 'guessed_store_county'})
                                                                                               
def my_max(x):
    try:
        return max(x)
    except:
        return x
def my_min(x):
    try:
        return min(x)
    except:
        return x
def my_sum(x):
    try:
        return sum(x)
    except:
        return x
def my_len(x):
    try:
        return len(x)
    except:
        return 1
                                                                                            
trips_loc = trips[trips.store_code_uc != 0].reset_index().merge(store_loc.reset_index(), on=['retailer_code', 'store_code_uc']).set_index('trip_code_uc')
                                                                                               
prices = purchases.merge(trips_loc, on='trip_code_uc')[['trip_code_uc','purchase_date', 'retailer_code', 'store_code_uc', 'store_state', 'guessed_store_county', 'guessed_store_county_fips', 'upc', 'upc_ver_uc', 'upc_price']]
prices['purchase_month'] = prices.purchase_date.dt.month

In [5]:
prices = pd.merge(rcat, prices, how='inner', on='retailer_code')

In [6]:
upc_descr = pd.read_table('../../Nielsen/products.tsv', encoding = "ISO-8859-1")
# Some (rare) upc do not have any group (NaN). We drop them.
upc_descr = upc_descr[upc_descr.product_group_code.apply(str)!='nan']
upc_descr = upc_descr.astype({'product_group_code': int})

# A unique product is a combination of upc and upc_ver_uc. Two products having the same upc but a different upc_ver_uc usually are very very similar, but sometimes don't belong to the same group. In this case we drop them.
upc_descr = upc_descr.merge(upc_descr.groupby('upc')['product_group_descr'].nunique(), on='upc').rename(columns={'product_group_descr_y': 'nb_of_groups_per_upc', 'product_group_descr_x': 'product_group_descr'})
upc_descr = upc_descr[upc_descr.nb_of_groups_per_upc == 1][['upc', 'upc_ver_uc', 'product_group_code', 'product_group_descr']]

In [7]:
data = pd.merge(prices, upc_descr, how='inner', on=['upc', 'upc_ver_uc'])

In [8]:
data["channel_type"].unique()

array(['Grocery', 'Discount Store', 'Drug Store'], dtype=object)

### Statistics

In [9]:
data.head()

Unnamed: 0,retailer_code,channel_type,trip_code_uc,purchase_date,store_code_uc,store_state,guessed_store_county,guessed_store_county_fips,upc,upc_ver_uc,upc_price,purchase_month,product_group_code,product_group_descr
0,2,Grocery,1068002184,2016-08-26,6663447,NJ,SALEM,34033,1821482514,1,6.41,8,4509,PET CARE
1,120,Grocery,1074384219,2016-10-20,5514028,CT,NEW HAVEN,9009,1821482514,1,6.74,10,4509,PET CARE
2,151,Grocery,1066944988,2016-10-28,7835339,OH,FRANKLIN,39049,1821482514,1,5.99,10,4509,PET CARE
3,221,Grocery,1072295724,2016-11-19,6220518,MN,BLUE EARTH,27013,1821482514,1,6.79,11,4509,PET CARE
4,257,Grocery,1074967345,2016-01-03,2514625,TN,WASHINGTON,47179,1821482514,1,6.68,1,4509,PET CARE


In [13]:
data[data.retailer_code == 848].head()

Unnamed: 0,retailer_code,channel_type,trip_code_uc,purchase_date,store_code_uc,store_state,guessed_store_county,guessed_store_county_fips,upc,upc_ver_uc,upc_price,purchase_month,product_group_code,product_group_descr
7,848,Grocery,1066885564,2016-01-11,5347082,LA,JEFFERSON,22051,1821482514,1,4.97,1,4509,PET CARE
1838,848,Grocery,1066699450,2016-01-22,4355543,AR,WASHINGTON,5143,2100064346,1,1.98,1,1015,"SALAD DRESSINGS, MAYO, TOPPINGS"
1839,848,Grocery,1071953388,2016-05-19,4001732,OK,TULSA,40143,2100064346,1,1.98,5,1015,"SALAD DRESSINGS, MAYO, TOPPINGS"
1840,848,Grocery,1075636750,2016-06-26,4001732,OK,TULSA,40143,2100064346,1,1.98,6,1015,"SALAD DRESSINGS, MAYO, TOPPINGS"
1841,848,Grocery,1065603308,2016-11-17,4001732,OK,TULSA,40143,2100064346,1,1.98,11,1015,"SALAD DRESSINGS, MAYO, TOPPINGS"


In [34]:
pd.set_option("display.max_rows", None)
data.groupby("product_group_descr").count()

Unnamed: 0_level_0,retailer_code,channel_type,trip_code_uc,purchase_date,store_code_uc,store_state,guessed_store_county,guessed_store_county_fips,upc,upc_ver_uc,upc_price,purchase_month,product_group_code
product_group_descr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AUTOMOTIVE,27,27,27,27,27,27,27,27,27,27,27,27,27
BABY NEEDS,201,201,201,201,201,201,201,201,201,201,201,201,201
BAKED GOODS-FROZEN,1544,1544,1544,1544,1544,1544,1544,1544,1544,1544,1544,1544,1544
BAKING MIXES,278,278,278,278,278,278,278,278,278,278,278,278,278
BAKING SUPPLIES,2718,2718,2718,2718,2718,2718,2718,2718,2718,2718,2718,2718,2718
BATTERIES AND FLASHLIGHTS,340,340,340,340,340,340,340,340,340,340,340,340,340
BEER,4583,4583,4583,4583,4583,4583,4583,4583,4583,4583,4583,4583,4583
BREAD AND BAKED GOODS,17876,17876,17876,17876,17876,17876,17876,17876,17876,17876,17876,17876,17876
BREAKFAST FOOD,6084,6084,6084,6084,6084,6084,6084,6084,6084,6084,6084,6084,6084
BREAKFAST FOODS-FROZEN,1765,1765,1765,1765,1765,1765,1765,1765,1765,1765,1765,1765,1765


In [40]:
cat = "BATTERIES AND FLASHLIGHTS"
data[data["product_group_descr"]== cat].groupby("channel_type").count()

Unnamed: 0_level_0,retailer_code,trip_code_uc,purchase_date,store_code_uc,store_state,guessed_store_county,guessed_store_county_fips,upc,upc_ver_uc,upc_price,purchase_month,product_group_code,product_group_descr
channel_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Grocery,340,340,340,340,340,340,340,340,340,340,340,340,340


In [41]:
data.groupby("channel_type").count()

Unnamed: 0_level_0,retailer_code,trip_code_uc,purchase_date,store_code_uc,store_state,guessed_store_county,guessed_store_county_fips,upc,upc_ver_uc,upc_price,purchase_month,product_group_code,product_group_descr
channel_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Discount Store,98563,98563,98563,98563,98563,98563,98563,98563,98563,98563,98563,98563,98563
Drug Store,9276,9276,9276,9276,9276,9276,9276,9276,9276,9276,9276,9276,9276
Grocery,763013,763013,763013,763013,763013,763013,763013,763013,763013,763013,763013,763013,763013


In [43]:
data["retailer_id"==848]

KeyError: False