In [None]:
import gc
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import json
import sklearn.metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from scipy.sparse import dok_matrix, coo_matrix
from sklearn.utils.multiclass import  type_of_target
from joblib import Parallel, delayed
import multiprocessing
from datetime import datetime

# Utilities

In [None]:
# Ref: https://github.com/sh1ng/imba/blob/master/Product2VecSkipGram.py
import warnings
from concurrent.futures import ThreadPoolExecutor
from functools import reduce

import tensorflow as tf
import numpy as np
import math
from sklearn.utils import shuffle

# Product2VecSkipGram below is built from tf.compat.v1 placeholders and a
# tf.compat.v1.Session, so eager execution is switched off before it is used.
tf.compat.v1.disable_eager_execution()
class Product2VecSkipGram:
    """Product embedding model trained with a skip-gram style objective.

    Every basket (a sequence of product ids) yields ``(input, label)`` pairs of
    products taken from the same basket.  The graph looks up the embedding of
    the input product and scores the whole vocabulary with a full softmax
    layer; ``predict`` returns rows of the embedding table.

    The graph is built from ``tf.compat.v1`` placeholders and a session, so
    eager execution has to be disabled before an instance is created.

    Args:
        data: Training baskets.  Must support ``len`` and positional ``.iloc``
            access (e.g. a ``pandas.Series`` of lists).  Only read while
            generating training batches.
        cv_data: Validation baskets, same layout as ``data``.
        batch_size: Number of pairs per generated batch.
        num_skips: Maximum number of labels drawn per basket visit.
        skip_window: Controls the sampling span (``2 * skip_window + 1``).
        vocabulary_size: Number of rows of the embedding table.
        embedding_size: Width of each embedding vector.
        num_negative_sampled: Stored on the instance; the graph built here
            uses a full softmax and does not read it.
        len_ratio: Stored on the instance; not read by this class.
    """

    def __init__(self, data, cv_data, batch_size, num_skips, skip_window,
                 vocabulary_size, embedding_size=32,
                 num_negative_sampled=64, len_ratio = 0.5):
        self.data = data
        self.cv_data = cv_data
        self.data_index = 0
        self.batch_size = batch_size
        self.num_skips = num_skips
        self.skip_window = skip_window
        self.embedding_size = embedding_size
        self.num_negative_sampled = num_negative_sampled
        self.vocabulary_size = vocabulary_size
        self.len_ratio = len_ratio
        assert batch_size % num_skips == 0
        assert num_skips <= 2 * skip_window
        self.build_graph()

    def predict(self, products):
        """Return the embedding vectors of ``products``.

        Args:
            products: 1-D sequence of product ids (row indices of the
                embedding table).

        Returns:
            ``numpy.ndarray`` of shape ``(len(products), embedding_size)``;
            an empty ``(0, embedding_size)`` array for empty input.
        """
        if len(products) == 0:
            # np.concatenate refuses an empty list, so answer directly.
            return np.empty((0, self.embedding_size), dtype=np.float32)

        chunks = []
        for start in range(0, len(products), self.batch_size):
            chunk = products[start:start + self.batch_size]
            # The input placeholder has a dynamic batch dimension, so the
            # last (possibly shorter) chunk can be fed as-is.
            chunks.append(
                self.sess.run(self.gathered, feed_dict={self.train_inputs: chunk}))
        return np.concatenate(chunks, axis=0)

    def train(self, num_steps, cv_every_n_steps, cv_steps, lrs):
        """Run SGD for ``num_steps`` steps with periodic checkpoints and CV.

        Args:
            num_steps: Number of optimisation steps.
            cv_every_n_steps: Every this many steps (including step 0) the
                training data is reshuffled, a checkpoint is written and the
                validation loss is printed.  ``0`` disables this.
            cv_steps: Number of validation batches to average; with ``0`` the
                validation pass is skipped.
            lrs: Mapping ``step -> learning rate`` applied from that step on.
                The learning rate starts at ``1.0``.
        """
        report_every = 2000
        with ThreadPoolExecutor(max_workers=2) as executor:
            average_loss = 0
            learning_rate = 1.0
            # The next batch is prepared on a worker thread while the
            # session is busy with the current one.
            current = executor.submit(self.generate_batch)
            for step in range(num_steps):
                if step in lrs:
                    learning_rate = lrs[step]
                batch_inputs, batch_labels = current.result()
                current = executor.submit(self.generate_batch)
                feed_dict = {self.train_inputs: batch_inputs,
                             self.train_labels: batch_labels,
                             self.learning_rate: learning_rate}

                _, loss_val = self.sess.run([self.optimizer, self.loss], feed_dict=feed_dict)
                average_loss += loss_val

                if step % report_every == 0:
                    if step > 0:
                        average_loss /= report_every
                    print('Average loss at step ', step, ': ', average_loss)
                    average_loss = 0
                # `cv_every_n_steps and ...` avoids a modulo-by-zero.
                if cv_every_n_steps and step % cv_every_n_steps == 0:
                    self.data = shuffle(self.data, random_state=0)
                    self.save_model(step)
                    if cv_steps > 0:
                        cv_loss = 0
                        for batch_inputs, batch_labels in self.generate_test(cv_steps):
                            feed_dict = {self.train_inputs: batch_inputs,
                                         self.train_labels: batch_labels,
                                         self.learning_rate: learning_rate}
                            loss_val = self.sess.run(self.loss, feed_dict=feed_dict)
                            cv_loss += loss_val
                        print('CV',cv_loss / cv_steps)

    def save_model(self, step):
        """Write a checkpoint for ``step`` under ``models/``."""
        # The saver does not create the parent directory itself.
        os.makedirs('models', exist_ok=True)
        self.saver.save(self.sess, 'models/prod2vec_skip_gram', global_step=step)

    def load_model(self, path):
        """Restore all variables of this model from the checkpoint at ``path``."""
        self.saver.restore(self.sess, path)

    def build_graph(self):
        """Create placeholders, variables, loss, optimizer, session and saver."""
        # A dynamic batch dimension lets `predict` feed a final chunk that is
        # shorter than `batch_size`.
        self.train_inputs = tf.compat.v1.placeholder(tf.int32, shape=[None])
        self.train_labels = tf.compat.v1.placeholder(tf.int32, shape=[None])
        self.learning_rate = tf.compat.v1.placeholder(tf.float32)

        # variables
        embeddings = tf.Variable(tf.random.uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0))

        softmax_weights = tf.Variable(tf.random.truncated_normal([self.embedding_size, self.vocabulary_size],
                                                          stddev=1.0 / math.sqrt(self.embedding_size)))
        softmax_biases = tf.Variable(tf.zeros([self.vocabulary_size]))

        self.gathered = tf.gather(embeddings, self.train_inputs)

        # Full softmax over the vocabulary.
        prediction = tf.matmul(self.gathered, softmax_weights) + softmax_biases
        self.loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.train_labels, logits=prediction))

        self.optimizer = tf.compat.v1.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss)

        self.sess = tf.compat.v1.Session()
        self.sess.run(tf.compat.v1.global_variables_initializer())
        self.saver = tf.compat.v1.train.Saver()


    def inc(self):
        """Advance the training cursor by one basket, wrapping at the end."""
        self.data_index = (self.data_index + 1) % len(self.data)

    def inc_cv(self, data_index):
        """Return the validation cursor following ``data_index`` (wraps)."""
        return (data_index + 1) % len(self.cv_data)

    def _sample_pairs(self, baskets, start_index):
        """Fill one batch of ``(input, label)`` pairs starting at ``start_index``.

        Baskets with fewer than two products cannot yield a pair and are
        skipped with a ``RuntimeWarning``.

        Returns:
            ``(batch, labels, next_index)`` where ``batch`` and ``labels`` are
            int32 arrays of length ``batch_size`` and ``next_index`` is the
            cursor to resume from.

        Raises:
            ValueError: if a full pass over ``baskets`` finds no basket with
                at least two products (the loop could never finish).
        """
        total = len(baskets)
        batch = np.empty(self.batch_size, dtype=np.int32)
        labels = np.empty(self.batch_size, dtype=np.int32)
        index = start_index
        filled = 0
        skipped_in_a_row = 0
        while filled < self.batch_size:
            basket = baskets.iloc[index]
            if len(basket) < 2:
                warnings.warn("lenght is one", RuntimeWarning)
                skipped_in_a_row += 1
                if skipped_in_a_row >= total:
                    raise ValueError(
                        "no basket with at least two products; cannot build a batch")
                index = (index + 1) % total
                continue
            skipped_in_a_row = 0

            span = min(2 * self.skip_window + 1, len(basket))

            # The input position is drawn from the whole basket; labels are
            # drawn from the first `span` positions, never repeating a
            # position already used for this basket.
            x = target = np.random.randint(0, len(basket))

            targets_to_avoid = [x]

            for _ in range(self.num_skips):  # target varies, x stays constant
                while target in targets_to_avoid and len(targets_to_avoid) != span:
                    target = np.random.randint(0, span)
                if len(targets_to_avoid) == span or filled == self.batch_size:
                    break
                targets_to_avoid.append(target)
                batch[filled] = basket[x]
                labels[filled] = basket[target]
                filled += 1
            index = (index + 1) % total

        return batch, labels, index

    def generate_batch(self):
        """Return the next training batch ``(inputs, labels)`` and advance the cursor."""
        batch, labels, self.data_index = self._sample_pairs(self.data, self.data_index)
        return batch, labels

    def generate_test(self, num_steps):
        """Yield ``num_steps`` validation batches, always starting at basket 0."""
        cursor = 0
        for _ in range(num_steps):
            batch, labels, cursor = self._sample_pairs(self.cv_data, cursor)
            yield batch, labels


# Get the products purchased by each user so far

In [None]:
# Raw order lines of all prior orders, plus the order headers.
order_prior = pd.read_csv("data/order_products__prior.csv")
orders = pd.read_csv("data/orders.csv")

# Only the headers of prior orders are relevant here.
orders = orders.loc[orders['eval_set'] == 'prior']

# order_id -> user_id lookup.
orders_user = orders.loc[:, ['order_id', 'user_id']]

# Attach the buyer to every prior order line, then reduce to the distinct
# (user, product) pairs, i.e. every product a user has bought at least once.
user_product = (
    order_prior
    .merge(orders_user, on='order_id')
    .loc[:, ['user_id', 'product_id']]
    .drop_duplicates()
)

# Persist the pairs for the cells below.
user_product.to_pickle('data/previous_products.pkl')

# Link each user's train/test order with the products that user has bought so far, producing the combined train/test candidate set

In [None]:
order_train = pd.read_csv("data/order_products__train.csv",
                          dtype={'order_id': np.uint32,
                                 'product_id': np.uint16,
                                 'reordered': bool})

orders = pd.read_csv("data/orders.csv")

user_product = pd.read_pickle('data/previous_products.pkl')

# Keep only the train and test order headers (everything except 'prior').
orders = orders.loc[orders.eval_set.isin(['train', 'test'])]

# Pair every previously bought product of a user with that user's train/test
# order; user_id is only needed as the join key.
user_product = pd.merge(
    user_product, orders[['order_id', 'user_id', 'eval_set']],
    on='user_id').drop(['user_id'], axis=1)

order_train = order_train.drop(columns=['add_to_cart_order'])

# Right join: every candidate (order, product) pair is kept; `reordered` is
# only present for pairs that occur in the train order lines.
current = pd.merge(order_train, user_product,
               on=['order_id', 'product_id'], how='right')
# Assign the filled column back explicitly.  An in-place fillna on
# `current.reordered` acts on a temporary under pandas Copy-on-Write and would
# leave the missing values in the frame.
current['reordered'] = current['reordered'].fillna(False).astype(bool)
print(current.shape)
# Save the candidate set for the feature cells below.
current.to_pickle('data/train_test_set.pkl')

# User Product Features

In [None]:
order_prior = pd.read_csv("data/order_products__prior.csv",
                          dtype={
                              'order_id': np.uint32,
                              'product_id': np.uint16,
                              'add_to_cart_order': np.uint8,
                              'reordered': bool}
                          )
orders = pd.read_csv("data/orders.csv")

products = pd.read_csv("data/products.csv")

# Train/test candidate set written by the cell above.  It is loaded straight
# from the pickle; parsing order_products__train.csv into the same name first
# would only be overwritten.
order_train = pd.read_pickle('data/train_test_set.pkl')

# Get the ordered products with corresponding order_id
orders_products = pd.merge(orders, order_prior, on="order_id")

# Get the department and aisle id of the respective product
orders_products_products = pd.merge(
    orders_products,
    products[['product_id', 'department_id', 'aisle_id']],
    on='product_id')


def _user_group_stats(frame, group_col, prefix):
    """Per (user, group) count of distinct products and number of reorders.

    Args:
        frame: Order lines with ``user_id``, ``group_col``, ``product_id`` and
            ``reordered`` columns.
        group_col: Column to group by together with ``user_id``.
        prefix: Prefix of the two output columns (``<prefix>_products`` and
            ``<prefix>_reordered``).

    Returns:
        DataFrame with columns ``user_id``, ``group_col`` and the two stats.
    """
    return (
        frame.groupby(['user_id', group_col])
        # The built-in 'nunique' gives the same counts as a per-group lambda
        # without calling Python once per group.
        .agg({'product_id': 'nunique', 'reordered': 'sum'})
        .rename(columns={'product_id': f'{prefix}_products',
                         'reordered': f'{prefix}_reordered'})
        .reset_index()
    )


# Unique products bought by each user per department, and reorder count.
user_dep_stat = _user_group_stats(orders_products_products, 'department_id', 'dep')
user_dep_stat.to_pickle('data/user_department_products.pkl')

# Same statistics per aisle.
user_aisle_stat = _user_group_stats(orders_products_products, 'aisle_id', 'aisle')
user_aisle_stat.to_pickle('data/user_aisle_products.pkl')

In [None]:
# NOTE: `temp` (one row per (user, product) with a `periods` array) is not
# defined by any earlier cell; it is produced by the cumulative-sum cells that
# appear further down in this notebook, so those have to be run first.


def _nth_from_end(periods, n):
    """Return the n-th most recent period, or NaN when fewer than n exist."""
    return periods[-n] if len(periods) >= n else np.nan


def _history_stat(periods, reducer):
    """Apply ``reducer`` to every period except the most recent one.

    Returns NaN when there is no such history, instead of reducing an empty
    slice (which yields NaN plus a RuntimeWarning).
    """
    history = periods[:-1]
    return reducer(history) if len(history) > 0 else np.nan


aggregated = temp.copy()
# Last, second-last and third-last purchase gap, plus the mean and median of
# the gaps preceding the last one.
aggregated['last'] = aggregated.periods.apply(lambda x: _nth_from_end(x, 1))
aggregated['prev1'] = aggregated.periods.apply(lambda x: _nth_from_end(x, 2))
aggregated['prev2'] = aggregated.periods.apply(lambda x: _nth_from_end(x, 3))
aggregated['median'] = aggregated.periods.apply(
    lambda x: _history_stat(x, np.median))
aggregated['mean'] = aggregated.periods.apply(
    lambda x: _history_stat(x, np.mean))
aggregated = aggregated.drop(columns='periods')

aggregated.to_pickle('data/product_periods_stat.pkl')

# Calculate the cumulative sum of order's days_since_prior_order

In [None]:
order_prior = pd.read_csv(
    "data/order_products__prior.csv",
    dtype=dict(order_id=np.uint32,
               product_id=np.uint16,
               add_to_cart_order=np.uint8,
               reordered=bool))
order_train = pd.read_csv(
    "data/order_products__train.csv",
    dtype=dict(order_id=np.uint32,
               product_id=np.uint16,
               add_to_cart_order=np.uint8,
               reordered=bool))

orders = pd.read_csv("data/orders.csv")

labels = pd.read_pickle('data/train_test_set.pkl')
user_product = pd.read_pickle('data/previous_products.pkl')

# Running total of days_since_prior_order per user, one row per
# (user_id, order_number).
order_cumsum = (
    orders[['user_id', 'order_number', 'days_since_prior_order']]
    .groupby(['user_id', 'order_number'])
    .agg({'days_since_prior_order': 'sum'})
    .groupby(level=0)
    .cumsum()
    .reset_index()
    .rename(columns={'days_since_prior_order': 'days_since_prior_order_cumsum'})
)

order_cumsum.to_pickle('data/order_cumsum.pkl')
print("saved order_cumsum")

# Bring in order_id so the running total can be joined onto order lines.
order_cumsum = (
    order_cumsum
    .merge(orders, on=['user_id', 'order_number'])
    .loc[:, ['user_id', 'order_number', 'days_since_prior_order_cumsum', 'order_id']]
)

# (order, product) pairs of the prior orders, tagged with their eval_set ...
order_product = (
    order_prior
    .merge(orders, on='order_id')
    .loc[:, ['order_id', 'product_id', 'eval_set']]
)

# ... and the candidate pairs of the train/test orders.
order_product_train_test = labels.loc[:, ['order_id', 'product_id', 'eval_set']]

# Stack both sets and attach user, order number and running day total.
order_product = (
    pd.concat([order_product, order_product_train_test])
    .merge(order_cumsum, on='order_id')
)

In [None]:
# Day offset of every (user, product, order_number) purchase.  The column is
# selected before aggregating so the result is a Series: the next cell calls
# `to_frame` on the applied result and runs np.diff over each group's values,
# both of which need a Series.
temp = order_product.groupby(
    ['user_id', 'product_id', 'order_number']
)['days_since_prior_order_cumsum'].sum()
print(f"1st part done. {temp}")
# float32 rather than float16: float16 cannot represent integers above 2048
# exactly, which would distort the day gaps of users with long histories.
temp = temp.astype(np.float32)
# Report the dtype of the Series itself, before it is wrapped in a groupby.
print(f"2nd part done {temp.dtype}")
temp = temp.groupby(level=[0, 1])

In [None]:
# Gaps between consecutive purchases of the same product by the same user.
# Each group is flattened to 1-D first: np.diff works along the last axis, so
# a group that arrives as a one-column DataFrame would otherwise produce an
# empty result.  A Series group is unaffected by the flattening.
temp = temp.apply(lambda x: np.diff(np.nan_to_num(np.asarray(x).ravel())))
temp = temp.to_frame('periods').reset_index()

# Prod2Vec

In [None]:
order_prior = pd.read_csv("data/order_products__prior.csv")
orders = pd.read_csv("data/orders.csv")

# One row per order with the list of its products.  Sorting on
# add_to_cart_order as well keeps each list in cart order; sorting on order_id
# alone uses an unstable sort by default and may reorder lines within an order.
data = (
    order_prior
    .sort_values(['order_id', 'add_to_cart_order'])
    .groupby('order_id')['product_id']
    .apply(lambda x: x.tolist())
    .to_frame('products')
    .reset_index()
)
# Attach the order header columns.
data = pd.merge(data, orders, on='order_id')
data.to_pickle('data/prod2vec.pkl')

# Skip gram

In [None]:
# Ref: https://github.com/sh1ng/imba/blob/master/skip_gram_train.py
np.random.seed(2017)
# Same products file as the feature cells above.
products = pd.read_csv('data/products.csv')
df = pd.read_pickle('data/prod2vec.pkl').products
print('initial size', len(df))

df_train, df_cv = train_test_split(df, test_size=0.1, random_state=2017)
batch_size = 1024
# step -> learning rate.  With 120001 steps only the first entry is reached.
rates = {
    100000: 0.5,
    200000: 0.25,
    500000: 0.1
    }

# Checkpoints are written below models/.
os.makedirs('models', exist_ok=True)

# Build and train inside a dedicated graph so the variables always get the
# same names in the checkpoint, even when this cell is re-run or another
# model has already been built in this kernel.
with tf.Graph().as_default():
    model = Product2VecSkipGram(df_train, df_cv, batch_size,
                                1, 1, np.max(products.product_id) + 1)
    model.train(120001, 20000, len(df_cv) // batch_size, rates)

In [None]:
# Ref: https://github.com/sh1ng/imba/blob/master/skip_gram_get.py
np.random.seed(2017)
# Same products file as the feature cells above.
products = pd.read_csv('data/products.csv')

# The model is rebuilt in a graph of its own.  In the shared default graph the
# variables of a second model get different auto-generated names than the ones
# stored in the checkpoint, and the restore fails.
with tf.Graph().as_default():
    # No baskets are needed to export embeddings, so none are loaded.  The
    # batch size is len(products) so all products are looked up in one batch.
    model = Product2VecSkipGram(None, None, len(products),
                                1, 1, np.max(products.product_id) + 1)
    # The constructor already created the session and saver used here.
    model.load_model('models/prod2vec_skip_gram-120000')
    embd = model.predict(products.product_id.values)

# Row i of `embd` belongs to row i of `products`; append the vectors as
# extra columns.
products = pd.concat([products, pd.DataFrame(embd)], axis=1)
products.to_pickle('data/product_embeddings.pkl')

# Prepare order streak

In [None]:
"""
@author: Faron
"""

'''
Calculates (user, product) order_streak for the last n orders.

- abs(order_streak) is length of streak
- sgn(order_streak) encodes type of streak (non-ordered vs ordered)
'''

# Directory (with trailing slash) that holds the input CSV files.
DATA_DIR = "data/"
# Base names of the input files; ".csv" is appended in load_input_data().
PRIOR_FILE = "order_products__prior"
ORDERS_FILE = "orders"


def load_input_data():
    """Read the prior order lines and the order headers from ``DATA_DIR``.

    Returns:
        tuple: ``(prior, orders)`` DataFrames loaded from
        ``PRIOR_FILE`` and ``ORDERS_FILE`` respectively.
    """
    prior_path, orders_path = (
        "{}{}{}".format(DATA_DIR, base_name, ".csv")
        for base_name in (PRIOR_FILE, ORDERS_FILE)
    )
    return pd.read_csv(prior_path), pd.read_csv(orders_path)


def apply_parallel(df_groups, _func):
    """Apply ``_func`` to a copy of every group in parallel and stack the results.

    Args:
        df_groups: Iterable of ``(key, DataFrame)`` pairs, e.g. a pandas
            groupby object.
        _func: Callable taking one group DataFrame and returning a DataFrame.

    Returns:
        The per-group results concatenated with ``pd.concat``.
    """
    # Half of the available cores, but never fewer than one: the shift gives
    # 0 on a single-core machine and joblib rejects n_jobs=0.
    nthreads = max(1, multiprocessing.cpu_count() >> 1)
    print("nthreads: {}".format(nthreads))

    res = Parallel(n_jobs=nthreads)(delayed(_func)(grp.copy()) for _, grp in df_groups)
    return pd.concat(res)


def add_order_streak(df):
    """Add an ``order_streak`` column to one user's order lines.

    ``df`` holds ``user_id``, ``product_id`` and ``order_number`` for a single
    user.  For every product the streak is the number of most recent orders
    with the same status (ordered / not ordered), signed positive when the
    product was in the last order and negative otherwise.

    ``df`` is modified in place (``product_id`` becomes the first column) and
    returned.
    """
    # product x order_number grid: 1 where the product was ordered, -1 where not.
    presence = (
        df.assign(user_id=1)
        .pivot(index="product_id", columns='order_number')
        .fillna(-1)
    )
    presence = presence.droplevel(0, axis=1)

    # Non-zero wherever the status flips between consecutive orders; the first
    # column is forced non-zero so every row has a flip.  Reversed so that
    # argmax finds the most recent flip.
    flips = presence.diff(axis=1).fillna(2).abs().values[:, ::-1]
    run_length = np.argmax(flips, axis=1) + 1

    df.set_index("product_id", inplace=True)
    # Aligned on product_id; the sign comes from the most recent order column.
    df['order_streak'] = np.multiply(run_length, presence.iloc[:, -1])
    df.reset_index(drop=False, inplace=True)
    return df


prior, orders = load_input_data()

print("orders: {}".format(orders.shape))
print("take only recent 5 orders per user:")
# Six rows are kept per user; presumably the extra one is the user's final
# train/test order, which has no lines in the prior file and drops out in the
# inner join below -- TODO confirm.
orders = orders.groupby(['user_id']).tail(5 + 1)
print("orders: {}".format(orders.shape))

# Order lines of the retained orders, reduced to the columns needed for streaks.
prior = (
    orders
    .merge(prior, how='inner', on="order_id")
    .loc[:, ['user_id', 'product_id', 'order_number']]
)
print("prior: {}".format(prior.shape))

# One add_order_streak call per user, run in parallel and timed.
user_groups = prior.groupby('user_id')
s = datetime.now()
df = apply_parallel(user_groups, add_order_streak)
e = datetime.now()
print("time elapsed: {}".format(e - s))

# Collapse to one row per (user, product) and write the result.
df = (
    df.drop(columns="order_number")
    .drop_duplicates()
    .reset_index(drop=True)
    .loc[:, ['user_id', 'product_id', 'order_streak']]
)
df.to_csv("order_streaks.csv", index=False)