In [1]:
import pandas as pd
import numpy as np
from tools_nielsen import *

## LOADING THE DATA
panelist = pd.read_table("../../Nielsen/panelists_2016.tsv").set_index('Household_Cd')[['Fips_State_Cd', 'Fips_State_Desc', 'Fips_County_Cd', 'Fips_County_Desc', 'Panelist_ZipCd']]
panelist['household_county_fips'] = np.vectorize(int)(panelist.Fips_State_Cd * 1e3 + panelist.Fips_County_Cd)
panelist['household_zip3'] = panelist.Panelist_ZipCd // 100
panelist = panelist[['Fips_State_Desc', 'Fips_County_Desc', 'household_county_fips', 'household_zip3']].rename(columns={'Fips_State_Desc': 'household_state', 'Fips_County_Desc': 'household_county'})

purchases = pd.read_csv("../../Nielsen/purchases_subset_2016.csv")
purchases['upc_price'] = purchases.total_price_paid / purchases.quantity
purchases = purchases[['trip_code_uc', 'upc', 'upc_ver_uc', 'upc_price']]

trips = pd.read_table("../../Nielsen/trips_2016.tsv", parse_dates=['purchase_date']).set_index('trip_code_uc')[['purchase_date', 'retailer_code', 'store_code_uc', 'store_zip3', 'household_code']]
initial_trips_len = len(trips)

# Some store are unnumerotated - we drop them from the data
print(f"Trips - Proportion of unnumerotated stores : {round(len(trips[trips.store_code_uc==0])/len(trips),4)*100}% (data droped, over {len(trips)} trips)")
trips_non_null = trips[trips.store_code_uc != 0]


## GETTING THE STORE STATE
zip_to_state = pd.read_table('zip_prefixes.txt', header=0, names=['zip3','state', 'distib_center', 'towns'])[['zip3', 'state']]
# From the zip3 code, the state can be determined (cf. zip_prefixes.txt)

trips_non_null = trips.merge(zip_to_state, left_on='store_zip3', right_on='zip3')[['purchase_date', 'retailer_code', 'store_code_uc', 'state', 'store_zip3', 'household_code']].rename(columns={'state': 'store_state'})


## GETTING THE STORE COUNTY - mode
# The store county is assumed to be the one where the maximum of households that visited this store comes form.

# We first merge trips and panelist to get the origin county of the households.
trips_merged = trips_non_null.merge(panelist, left_on='household_code', right_on='Household_Cd')[['retailer_code', 'store_code_uc', 'store_state', 'store_zip3', 'household_state', 'household_county_fips', 'household_county', 'household_zip3']]

# Then, we delete all the househoslds that do not come from the store state.
before_deletion = len(trips_merged)
trips_state = trips_merged[trips_merged.household_state == trips_merged.store_state] # TOUT RENOMMER
suppression_rate_state = 1-len(trips_state)/before_deletion
print(f"Trips - Suppression rate : {round(suppression_rate_state, 4)*100}% (proportion of households that do not shop in their own state.)")

# Then, we delete all the housolds that do not come from the store zip3 zone.
trips_zip3 = trips_state[trips_state.household_zip3 == trips_state.store_zip3]
print(f"Trips - Total suppression rate : {round(1 - len(trips_zip3)/initial_trips_len, 4)*100}% (proportion of households that do not shop in their own zip3 zone and unnumerotated stores)")

# Then, we select the mode :

store_loc = trips_zip3.groupby(['store_code_uc'])[['retailer_code', 'store_state', 'household_county_fips', 'household_county']].agg(my_mode).rename(columns={'household_county_fips': 'guessed_store_county_fips', 'household_county': 'guessed_store_county'})

stat = trips_zip3.groupby(['store_code_uc'])[['household_county_fips', 'household_county']].agg(pd.Series.value_counts)
stat['max_obs'] = stat.household_county.apply(my_max)
stat['nb_obs'] = stat.household_county.apply(my_sum)
stat['max_freq'] = stat.max_obs / stat.nb_obs
stat['distinct_counties'] = stat.household_county.apply(my_len)
stat['nb_min'] = stat.household_county.apply(my_min)
stat['criteria'] = ((stat.max_obs >= 3 * (stat.nb_obs - (stat.distinct_counties - 2) * stat.nb_min - stat.max_obs)) & (stat.distinct_counties!=1)) | ((stat.distinct_counties==1) & (stat.nb_obs>=4))


Trips - Proportion of unnumerotated stores : 48.68% (data droped, over 10745635 trips)
Trips - Suppression rate : 2.75% (proportion of households that do not shop in their own state.)
Trips - Total suppression rate : 57.38999999999999% (proportion of households that do not shop in their own zip3 zone and unnumerotated stores)


In [13]:
store_loc['is_walmart'] = store_loc.retailer_code == 6089
trips_loc = trips[trips.store_code_uc != 0].reset_index().merge(store_loc.reset_index(), on=['retailer_code', 'store_code_uc']).set_index('trip_code_uc')

In [14]:
trips_loc = trips_loc.reset_index().merge(stat[['criteria']], left_on='store_code_uc', right_index=True)

trips_loc = trips_loc[trips_loc.criteria][['trip_code_uc', 'purchase_date', 'retailer_code', 'is_walmart', 'store_code_uc', 'store_zip3', 'household_code', 'store_state', 'guessed_store_county_fips', 'guessed_store_county']]

print(f"Total trips suppression rate : {round(len(trips_loc)/initial_trips_len, 4) * 100}%")


## ADDING UPC GROUP (product category)

upc_descr = pd.read_table('../../Nielsen/nielsen_extracts/HMS/Master_Files/Latest/products.tsv', encoding = "ISO-8859-1")[['upc', 'upc_ver_uc', 'product_group_code', 'product_group_descr']]

# Some (rare) upc do not have any group (NaN). We drop them.
upc_descr = upc_descr[upc_descr.product_group_code.apply(str)!='nan']
upc_descr = upc_descr.astype({'product_group_code': int})

# A unique product is a combination of upc and upc_ver_uc. Two products having the same upc but a different upc_ver_uc are very very similar, but it happens they do no not belong to same group. We jus drop them then.
upc_descr = upc_descr.merge(upc_descr.groupby('upc')['product_group_descr'].nunique(), on='upc').rename(columns={'product_group_descr_y': 'nb_of_groups_per_upc', 'product_group_descr_x': 'product_group_descr'})
upc_descr = upc_descr[upc_descr.nb_of_groups_per_upc == 1][['upc', 'upc_ver_uc', 'product_group_code', 'product_group_descr']]


Total trips suppression rate : 45.57%


In [4]:
purchases = purchases.merge(upc_descr, on=['upc', 'upc_ver_uc'])

In [None]:




## AGGREGATING THE DATA

# Merging
prices = purchases.merge(trips_loc, on='trip_code_uc')[['trip_code_uc', 'purchase_date', 'retailer_code', 'store_code_uc', 'store_state', 'guessed_store_county', 'guessed_store_county_fips', 'upc', 'upc_ver_uc','product_group_descr', 'upc_price', 'is_walmart']]
prices['purchase_month'] = prices.purchase_date.dt.month

avg = pd.DataFrame(prices.groupby(['retailer_code', 'store_state', 'guessed_store_county', 'guessed_store_county_fips', 'purchase_month', 'product_group_descr']).mean()[['upc_price']])

std = pd.DataFrame(prices.groupby(['retailer_code', 'store_state', 'guessed_store_county', 'guessed_store_county_fips', 'purchase_month', 'product_group_descr']).std()[['upc_price']]).rename(columns={'upc_price': 'upc_price_std'})

print(std.head(50))

                                                                                                                          upc_price_std
retailer_code store_state guessed_store_county guessed_store_county_fips purchase_month product_group_descr                            
2             DE          KENT                 10001                     1              CARBONATED BEVERAGES                   0.091924
                                                                                        FRESH PRODUCE                          0.288675
                                                                         2              CARBONATED BEVERAGES                   0.200475
                                                                                        CRACKERS                                    NaN
                                                                                        FRESH PRODUCE                          0.542245
                                                

In [15]:




## AGGREGATING THE DATA

# Merging
prices = purchases.merge(trips_loc, on='trip_code_uc')[['trip_code_uc', 'purchase_date', 'retailer_code', 'store_code_uc', 'store_state', 'guessed_store_county', 'guessed_store_county_fips', 'upc', 'upc_ver_uc','product_group_descr', 'upc_price', 'is_walmart']]
prices['purchase_month'] = prices.purchase_date.dt.month

avg = pd.DataFrame(prices.groupby(['is_walmart', 'store_state', 'guessed_store_county', 'guessed_store_county_fips', 'purchase_month', 'product_group_descr']).mean()[['upc_price']])

std = pd.DataFrame(prices.groupby(['is_walmart', 'store_state', 'guessed_store_county', 'guessed_store_county_fips', 'purchase_month', 'product_group_descr']).std()[['upc_price']]).rename(columns={'upc_price': 'upc_price_std'})

print(std.head(50))

                                                                                                                       upc_price_std
is_walmart store_state guessed_store_county guessed_store_county_fips purchase_month product_group_descr                            
False      AL          AUTAUGA              1001                      1              BREAD AND BAKED GOODS                       NaN
                                                                                     COFFEE                                      NaN
                                                                                     CONDIMENTS, GRAVIES, AND SAUCES             NaN
                                                                                     FRESHENERS AND DEODORIZERS                  NaN
                                                                                     MILK                               1.265606e+00
                                                                     

In [17]:
avg_prices = avg.merge(std, left_index=True, right_index=True)

In [16]:
len(avg_prices)

280344

In [18]:
len(avg_prices)

217816

In [22]:
avg_prices.to_csv('aggregated_nielsen.csv')

In [23]:
hey = pd.read_csv('aggregated_nielsen.csv')

In [24]:
hey

Unnamed: 0,is_walmart,store_state,guessed_store_county,guessed_store_county_fips,purchase_month,product_group_descr,upc_price,upc_price_std
0,False,AL,AUTAUGA,1001,1,BREAD AND BAKED GOODS,0.9900,
1,False,AL,AUTAUGA,1001,1,COFFEE,3.0900,
2,False,AL,AUTAUGA,1001,1,"CONDIMENTS, GRAVIES, AND SAUCES",1.8800,
3,False,AL,AUTAUGA,1001,1,FRESHENERS AND DEODORIZERS,1.0000,
4,False,AL,AUTAUGA,1001,1,MILK,3.1475,1.265606
...,...,...,...,...,...,...,...,...
217811,False,WY,UINTA,56041,12,MILK,2.7400,0.339411
217812,False,WY,WASHAKIE,56043,1,CANDY,0.4100,
217813,False,WY,WASHAKIE,56043,12,CANDY,3.3900,
217814,False,WY,WESTON,56045,3,BREAD AND BAKED GOODS,2.1900,
