# This is a Test Report for Instacart Datasets based on Spotlight 

## Data Preprocessing (to Sequence data)

In [15]:
import os
import random

import gensim
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize

DEFAULT_USER_COL = "user_ids"
DEFAULT_ITEM_COL = "item_ids"
DEFAULT_ORDER_COL = "order_ids"
DEFAULT_RATING_COL = "ratings"
DEFAULT_LABEL_COL = "label"
DEFAULT_TIMESTAMP_COL = "timestamp"
DEFAULT_PREDICTION_COL = "prediction"
DEFAULT_FLAG_COL = "flag"
data_base_dir = "./datasets/instacart_25/"

%matplotlib inline

In [10]:
!ls ./datasets/instacart_25/raw

__MACOSX	     instacart_25.zip		    orders.csv
aisles.csv	     item_feature_w2v.csv	    orders.csv.zip
aisles.csv.zip	     order_products__prior.csv	    products.csv
departments.csv      order_products__prior.csv.zip  products.csv.zip
departments.csv.zip  order_products__train.csv	    sample_submission.csv.zip
instacart_25	     order_products__train.csv.zip


In [16]:
products_file = data_base_dir + "raw/products.csv"
products = pd.read_csv(
    products_file, usecols=["product_id", "product_name", "aisle_id", "department_id"]
)
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [17]:
word_list = products["product_name"].unique()
len(word_list)

49688

In [18]:
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")

# get mean repesentation of word list.
def get_name_w2c(words):
    input_ids = torch.tensor(tokenizer.encode(words)).unsqueeze(0)
    outputs = model(input_ids)
    vector = torch.mean(outputs[0], 1).detach().numpy().squeeze(0)
    return vector

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=483.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [24]:
aisles_file = data_base_dir + "raw/aisles.csv"
aisles = pd.read_csv(aisles_file, usecols=["aisle_id", "aisle"])
aisle_id_list = aisles["aisle_id"].unique()
n_aisle = len(aisle_id_list)
aisles_dict = {}
aisle_one_hot = np.eye(n_aisle)
for i in range(n_aisle):
    aisles_dict[aisle_id_list[i]] = aisle_one_hot[i]
len(aisles_dict)

134

In [20]:
department_file = data_base_dir + "raw/departments.csv"
department_df = pd.read_csv(department_file, usecols=["department_id", "department"])
department_id_list = department_df["department_id"].unique()
n_department = len(department_id_list)
departments_dict = {}
department_one_hot = np.eye(n_department)
for i in range(n_department):
    departments_dict[department_id_list[i]] = department_one_hot[i]
len(departments_dict)

21

In [21]:
a = np.array([5, 6])
b = np.array([2, 6])
np.concatenate((a, b), axis=0)

array([5, 6, 2, 6])

In [22]:
product_ids = products["product_id"].unique()
filter_list = [" Chips"]
product_ids = np.array([x for x in product_ids if x not in filter_list])
product_ids.shape

(49688,)

In [25]:
features = {}
# count = 0
for index, _row in products.iterrows():
    _product_id = _row["product_id"]
    _product_name = _row["product_name"]
    _aisle_id = _row["aisle_id"]
    _department_id = _row["department_id"]
    if _product_id in product_ids:
        name_vec = get_name_w2c(_product_name)
        #         print(name_vec.shape,words)
        nama_vec = np.concatenate((name_vec, aisles_dict[_aisle_id]))
        #         print(len(name_vec))
        nama_vec = np.concatenate((name_vec, departments_dict[_department_id]))
        #         print(len(name_vec))
        #     count+=1
        features[_product_id] = name_vec
#     print(features[_product_id])
len(features)

49688

In [37]:
fea_out_file = data_base_dir + "raw/item_feature_bert.csv"
fea_out = open(fea_out_file, "w")
fea_out.write("item_id,feature\n")
for _product_id in product_ids:
    fea_out.write(str(_product_id) + "," + " ".join(map(str, features[_product_id])) + "\n")
#     break
fea_out.close()

In [38]:
len(features[3])

768