# This is a Test Report for Instacart Datasets based on Spotlight 

## Data Preprocessing (to Sequence data)

In [1]:
import os
import random

import gensim
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize

# Column-name constants. None of them is referenced by the cells visible in this
# notebook; presumably they mirror a shared naming convention - TODO confirm.
DEFAULT_USER_COL = "user_ids"
DEFAULT_ITEM_COL = "item_ids"
DEFAULT_ORDER_COL = "order_ids"
DEFAULT_RATING_COL = "ratings"
DEFAULT_LABEL_COL = "label"
DEFAULT_TIMESTAMP_COL = "timestamp"
DEFAULT_PREDICTION_COL = "prediction"
DEFAULT_FLAG_COL = "flag"
# Root directory of the dataset. Paths below are built by plain string
# concatenation, so the trailing slash is required.
data_base_dir = "./datasets/instacart_25/"

%matplotlib inline

In [30]:
# Read the product catalogue, keeping only the columns used for feature building.
product_columns = ["product_id", "product_name", "aisle_id", "department_id"]
products_file = data_base_dir + "raw/products.csv"
products = pd.read_csv(products_file, usecols=product_columns)
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [3]:
# Distinct product names in order of first appearance (despite the name, these
# are whole product names, not individual words).
word_list = pd.unique(products["product_name"])
len(word_list)

49688

In [4]:
# Path to the pretrained word2vec binary (a gzipped file, despite the `_dir` suffix).
GoogleNews_pretrain_dir = "./GoogleNews-vectors-negative300.bin.gz"
# Load the vectors in word2vec binary format.
model = gensim.models.KeyedVectors.load_word2vec_format(
    fname=GoogleNews_pretrain_dir,
    binary=True,
)

In [5]:
# Vocabulary size of the loaded embeddings.
# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# fall back to `vocab` so the cell still runs on gensim 3.x.
w2v_vocab = model.key_to_index if hasattr(model, "key_to_index") else model.vocab
len(w2v_vocab)

3000000

In [6]:
# Scratch check: average component value of a single embedding.
model["Chips"].mean()

-0.0036587396

In [7]:
# Scratch check: stacking two embeddings gives a (n_words, embedding_dim) array.
w2v_list = [model["Chips"], model["vector"]]
w2v_narray = np.array(w2v_list)
w2v_narray.shape

(2, 300)

In [8]:
# Scratch check: averaging over axis 0 collapses the word axis.
w2v_narray.mean(axis=0).shape

(300,)

In [9]:
def get_name_w2c(words, w2v=None):
    """Return the mean embedding of the tokens in ``words`` that have a vector.

    Args:
        words: Iterable of tokens (for example a tokenized product name).
        w2v: Optional embedding lookup supporting ``token in w2v`` and
            ``w2v[token]``. Defaults to the notebook-level ``model``.

    Returns:
        A 1-D array holding the element-wise mean of the vectors of all tokens
        found in ``w2v``. If no token is found, a random standard-normal vector
        is returned instead; its length is ``w2v.vector_size`` when the lookup
        exposes that attribute and 300 otherwise.
    """
    if w2v is None:
        w2v = model
    known_vectors = [w2v[word] for word in words if word in w2v]
    if not known_vectors:
        # Out-of-vocabulary fallback. The draw uses NumPy's global RNG, so seed
        # it beforehand if the generated features must be reproducible.
        return np.random.normal(size=getattr(w2v, "vector_size", 300))
    return np.mean(np.array(known_vectors), axis=0)

In [10]:
# Re-display the first five product rows for reference.
products.head(5)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [12]:
# Map every aisle id to its one-hot row (a row of the identity matrix).
aisles_file = data_base_dir + "raw/aisles.csv"
aisles = pd.read_csv(aisles_file, usecols=["aisle_id", "aisle"])
aisle_id_list = aisles["aisle_id"].unique()
n_aisle = len(aisle_id_list)
aisle_one_hot = np.eye(n_aisle)
# Pair the i-th unique id with the i-th identity row.
aisles_dict = dict(zip(aisle_id_list, aisle_one_hot))
len(aisles_dict)

134

In [13]:
# Map every department id to its one-hot row (a row of the identity matrix).
department_file = data_base_dir + "raw/departments.csv"
department_df = pd.read_csv(department_file, usecols=["department_id", "department"])
department_id_list = department_df["department_id"].unique()
n_department = len(department_id_list)
department_one_hot = np.eye(n_department)
# Pair the i-th unique id with the i-th identity row.
departments_dict = dict(zip(department_id_list, department_one_hot))
len(departments_dict)

21

In [14]:
# Scratch check: concatenating 1-D arrays appends them end to end.
a, b = np.array([5, 6]), np.array([2, 6])
np.concatenate([a, b])

array([5, 6, 2, 6])

In [15]:
# Product names to leave out of the feature table.
# NOTE(review): the entry has a leading space - confirm it matches the raw names.
filter_list = [" Chips"]
# Apply the filter to product *names*. Comparing these strings with the integer
# product ids can never match, which silently keeps every product.
is_kept = ~products["product_name"].isin(filter_list)
product_ids = products.loc[is_kept, "product_id"].unique()
product_ids.shape

(49688,)

In [16]:
# Build one feature vector per product: the mean word2vec embedding of the
# product name, followed by the aisle one-hot and the department one-hot.
features = {}
# A set gives O(1) membership tests; `in` on the NumPy array rescans it per row.
kept_product_ids = set(product_ids)
for index, _row in products.iterrows():
    _product_id = _row["product_id"]
    if _product_id not in kept_product_ids:
        continue
    _product_name = _row["product_name"]
    _aisle_id = _row["aisle_id"]
    _department_id = _row["department_id"]
    name_vec = get_name_w2c(word_tokenize(_product_name))
    # Store the concatenated vector itself, so the one-hot parts actually end
    # up in `features` rather than in a throwaway variable.
    features[_product_id] = np.concatenate(
        (name_vec, aisles_dict[_aisle_id], departments_dict[_department_id])
    )
len(features)

49688

In [28]:
# Write the features as CSV: one row per product, with the vector components
# joined by single spaces in the `feature` column.
fea_out_file = data_base_dir + "raw/item_feature_w2v.csv"
# `with` closes the file even if a lookup or write fails part-way through.
with open(fea_out_file, "w") as fea_out:
    fea_out.write("item_id,feature\n")
    for _product_id in product_ids:
        feature_text = " ".join(map(str, features[_product_id]))
        fea_out.write(str(_product_id) + "," + feature_text + "\n")

In [27]:
# Leftover inspection cell: shows the value last bound to the `_product_id`
# loop variable by the product loops in the cells above.
_product_id

1