# This is a Test Report for Instacart Datasets based on Spotlight 

## Data Preprocessing (to Sequence data)

In [1]:
import numpy as np
import pandas as pd
import os
import random
import math
import gensim

from nltk.tokenize import word_tokenize

DEFAULT_USER_COL = "user_ids"
DEFAULT_ITEM_COL = "item_ids"
DEFAULT_ORDER_COL = "order_ids"
DEFAULT_RATING_COL = "ratings"
DEFAULT_LABEL_COL = "label"
DEFAULT_TIMESTAMP_COL = "timestamp"
DEFAULT_PREDICTION_COL = "prediction"
DEFAULT_FLAG_COL = "flag"
data_base_dir = '../../datasets/dunnhumby/raw/'

%matplotlib inline

In [2]:
transaction_data = data_base_dir + 'transaction_data.csv'
transaction_df = pd.read_csv(transaction_data)
transaction_df.head()

Unnamed: 0,household_key,BASKET_ID,DAY,PRODUCT_ID,QUANTITY,SALES_VALUE,STORE_ID,RETAIL_DISC,TRANS_TIME,WEEK_NO,COUPON_DISC,COUPON_MATCH_DISC
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [3]:
product_data = data_base_dir + 'product.csv'
products  = pd.read_csv(product_data)
products.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [4]:
len(products.index)

92353

In [5]:
products['MANUFACTURER'].nunique(),products['DEPARTMENT'].nunique()

(6476, 44)

In [6]:
def get_bin_rep(number):
    width = int(math.ceil(np.log2(number)))
    bin_rep = []
    for _i in range(number):
        bin_rep.append([int(x) for x in np.binary_repr(_i, width=width)])
    return np.asarray(bin_rep)

get binary repesentation for MANUFACTURER

In [7]:
manu_bin_rep_list = get_bin_rep(products['MANUFACTURER'].nunique())
manufacturer_dic = {}
index = 0
for manu in products['MANUFACTURER'].unique():
    manufacturer_dic[manu]=index
    index+=1
def get_manu_vector(manufacturer):
    return manu_bin_rep_list[manufacturer_dic[manufacturer]]

get binary repesentation for DEPARTMENT

In [8]:
department_bin_rep_list = get_bin_rep(products['DEPARTMENT'].nunique())
department_dic = {}
index = 0
for product in products['DEPARTMENT'].unique():
    department_dic[product]=index
    index+=1
def get_depart_vector(department):
    return department_bin_rep_list[department_dic[department]]

In [9]:
def my_tokenize(words):
    return word_tokenize(words.replace('-',' ').replace('/',' ').replace(':',' ').replace('&',' '))

In [10]:
COMMODITY_DESC = products['COMMODITY_DESC'].unique()
word_dic={}
for words in COMMODITY_DESC:
    for word in my_tokenize(words):
        if word in word_dic:
            word_dic[word]=word_dic[word]+1
        else:
            word_dic[word]=1
    
SUB_COMMODITY_DESC = products['SUB_COMMODITY_DESC'].unique()
for words in SUB_COMMODITY_DESC:
    for word in my_tokenize(words):
        if word in word_dic:
            word_dic[word]=word_dic[word]+1
        else:
            word_dic[word]=1
len(word_dic)

2374

In [11]:
frequent_count = 0
frequency = 5
frequent_word_dic={}
bin_position = 0
for key,value in word_dic.items():
    if value>=frequency:
        frequent_word_dic[key]=bin_position
        bin_position+=1
len(frequent_word_dic)

379

In [12]:
# get mean repesentation of word list.
def get_one_hot(words):
    words = my_tokenize(words)
    count = 0
    one_hot = np.zeros(len(frequent_word_dic))
    for word in words:
        if word in frequent_word_dic:
            one_hot[frequent_word_dic[word]]=1
            count+=1
    if count ==0:
        return np.random.normal(size=len(frequent_word_dic))
    return one_hot

In [13]:
products.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [14]:
features = {}
# count = 0
# MANUFACTURER	DEPARTMENT	BRAND	COMMODITY_DESC	SUB_COMMODITY_DESC
for index,_row in products.iterrows():
    _product_id = _row['PRODUCT_ID']
    _MANUFACTURER = _row['MANUFACTURER']
    _DEPARTMENT = _row['DEPARTMENT']
    _COMMODITY_DESC = _row['COMMODITY_DESC']
    _SUB_COMMODITY_DESC = _row['SUB_COMMODITY_DESC']
    MANUFACTURER_vec = get_manu_vector(_MANUFACTURER)
    DEPARTMENT_vec = get_depart_vector(_DEPARTMENT)
    _DESC_vec = get_one_hot(_COMMODITY_DESC+_SUB_COMMODITY_DESC)
    name_vec1 = np.concatenate((MANUFACTURER_vec, DEPARTMENT_vec))
    name_vec = np.concatenate((name_vec1,_DESC_vec))
    features[_product_id] = name_vec
len(features)

92353

In [15]:
fea_out_file = data_base_dir + 'item_feature.csv'
fea_out = open(fea_out_file,'w')
fea_out.write('item_id,feature\n')
for key,value in features.items():
    fea_out.write(str(key)+','+' '.join(map(str, value))+'\n')
fea_out.close()

In [16]:
len(features[25671])

398

In [17]:
for key,value in features.items():
    print()
    break




In [18]:
map(str, [22,3])

<map at 0x7f1a67ff72e8>

In [21]:
train,test,validate = data_util.load_dataset(data_str=data_str,percent=PERCENT)
item_fea_dict = data_util.load_item_fea(fea_type=fea_type)
features[12484608]

array([0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0.