# Task: Basket Completion

The recommendation task of basket completion is a key part of many online retail applications. Basket completion involves computing predictions for the next item that should be added to a shopping basket, given a collection of items that the user has already added to the basket.


# Dataset

Amazon Baby Registries - This is a public dataset that consists of registries of baby products
from 15 different categories (such as 'feeding', 'diapers', 'toys', etc.), where the item catalog and registries for each category are disjoint. Each category therefore provides a small dataset, with a maximum of 15,000 purchased baskets per category.



# Solution:

**DNN with 1-Dimensional CNNs** - Similar to what we presented in the TextCNN section of our course.

![TextCNN](./images/textCNN.png)



## About the Conv2D operator in TF


**Inputs:** 

* In_Tensor = [ batch_size; seq_length; embedding_size; 1]

* Filter = [filter_size; embedding_size; 1; num_conv_filters]

**Output:**
* Out_Tensor = [batch_size; new_length; 1; num_conv_filters]

* where: new_length = seq_length - filter_size + 1






# Metrics:

### * MPR: Mean Percentile Ranking
### * Precision@k


-----------------------------
-----------------------------



# Questions:

### Q1. Add a second convolutional layer in Gen_CNN_Model() and compare its performance against the initial architecture

### Q2. Tune the convolution parameters and compare the results!


### IMPORTS

In [None]:
import numpy as np
from tqdm import tqdm
import random
from random import shuffle
import math
import matplotlib
import matplotlib.pyplot as plt
import copy
import os
import csv
import itertools
from collections import Counter
from scipy.spatial.distance import cosine
from scipy.stats import norm
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from keras.datasets import imdb
from keras.models import Sequential, Model
from keras.utils import to_categorical
from keras.layers import Dense, LSTM, Dropout, Bidirectional, Input, Masking, Conv1D, MaxPooling1D
from keras.layers import Reshape,Conv2D, MaxPooling2D, Flatten, Concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras import losses


# Seed the three random number generators used in this notebook (Python's
# `random`, NumPy, and TensorFlow's graph-level seed) with the same fixed value
# to make runs more reproducible.
random.seed(1234)
np.random.seed(1234)
tf.set_random_seed(1234)

# Directory prefix that load_data() prepends to the dataset file name.
LOCAL_PATH = "./data/"



### Utils

In [None]:
##Various utils

def get_basket_set_size(data, batch_size, seq_length):
    """Sample ``batch_size`` random item subsets of exactly ``seq_length`` items.

    Baskets are drawn uniformly at random (with replacement) from ``data``;
    baskets shorter than ``seq_length`` are rejected and another one is drawn.
    From each accepted basket, ``seq_length`` items are picked without
    replacement, in random order.

    Args:
        data: list of baskets, each basket being a list of item ids.
        batch_size: number of subsets to return.
        seq_length: number of items in every returned subset.

    Returns:
        A NumPy array with one row per sampled subset.

    Raises:
        ValueError: if ``batch_size`` is positive and no basket in ``data``
            holds at least ``seq_length`` items.
    """
    # Without this guard the rejection-sampling loop below would spin forever
    # when no basket is long enough.
    if batch_size > 0 and not any(len(basket) >= seq_length for basket in data):
        raise ValueError(
            "no basket contains at least %d items; cannot build a batch" % seq_length
        )

    batch = []
    while len(batch) < batch_size:
        line = np.random.randint(0, len(data))
        basket = data[line]
        if len(basket) >= seq_length:
            batch.append(list(np.random.choice(basket, size=seq_length, replace=False)))
    return np.array(batch)


def counters_per_prod(data):
    """Count how often each item occurs and how many distinct items exist.

    Args:
        data: iterable of baskets (each an iterable of item ids).

    Returns:
        A ``(counter, num_distinct)`` tuple: the per-item occurrence
        ``Counter`` and the number of distinct item ids seen.
    """
    item_counts = Counter(itertools.chain.from_iterable(data))
    num_distinct = np.unique(list(item_counts)).size
    print("Vocabulary size with unique items", num_distinct)
    return item_counts, num_distinct


def get_popularity_dist(training_data, vocabulary_size):
    """Return the occurrence count of every item id in ``range(vocabulary_size)``.

    Args:
        training_data: iterable of baskets (each an iterable of item ids).
        vocabulary_size: number of item ids to report, starting from 0.

    Returns:
        A list of floats where entry ``i`` is the number of times item ``i``
        appears across all baskets (0.0 for items never seen).
    """
    occurrences = Counter(itertools.chain.from_iterable(training_data))
    return [float(occurrences[item_id]) for item_id in range(vocabulary_size)]


## data reading

def read_data(data_file_path):
    """Read baskets from a comma-separated file, one basket per row.

    Item ids in the file are shifted down by one, so ids that are 1-based on
    disk become 0-based in memory.

    Args:
        data_file_path: path of the CSV file to read.

    Returns:
        A list of baskets, each a list of ints. Rows with fewer than two
        items are dropped.

    Raises:
        ValueError: if a cell cannot be parsed as an integer.
    """
    baskets = []
    # newline="" is the mode the csv module documents for files given to
    # csv.reader.
    with open(data_file_path, newline="") as data_csv_file:
        for row in csv.reader(data_csv_file, delimiter=","):
            # Skip single-item rows and blank lines (csv.reader yields [] for
            # those): an empty basket has neither an input nor a target item.
            if len(row) < 2:
                continue
            baskets.append([int(item) - 1 for item in row])
    return baskets


def load_data(dataset):
    """Load the baskets of a named dataset and report its item-id range.

    Args:
        dataset: dataset name; only ``"Amazon"`` is supported.

    Returns:
        A ``(data, folder)`` tuple: the list of baskets read from the
        dataset's CSV file and the dataset folder path under ``LOCAL_PATH``.

    Raises:
        ValueError: if ``dataset`` is not a supported name (previously this
            surfaced as an ``UnboundLocalError``), or if the file contains no
            items (raised by ``min`` on an empty sequence).
    """
    if dataset != "Amazon":
        raise ValueError("Unknown dataset: " + repr(dataset))

    data = read_data(LOCAL_PATH + "1_100_100_100_apparel_regs.csv")
    folder = LOCAL_PATH + "Amazon/"

    # Flatten once and reuse for both extremes.
    all_items = [elem for ss in data for elem in ss]
    print("min elem = ", min(all_items), " max elem = ", max(all_items))
    return data, folder


### batching functions

def create_batch(self):
    """Draw one random training batch as ``(inputs, targets)``.

    A subset length is drawn uniformly from 2 to 9, then ``self.batch_size``
    item subsets of that length are sampled from ``self.training_data``.

    Returns:
        Two arrays sliced from the sampled batch: every column but the last
        (inputs) and every column but the first (targets).
    """
    subset_length = 2 + np.random.randint(8)
    sampled = get_basket_set_size(self.training_data, self.batch_size, subset_length)
    inputs = sampled[:, :-1]
    targets = sampled[:, 1:]
    return inputs, targets


def generate_random_bundles(list_of_sizes, number_of_products):
    """Generate one random set of distinct product ids per requested size.

    Args:
        list_of_sizes: iterable with the number of products in each bundle.
        number_of_products: catalog size; ids are drawn from
            ``0 .. number_of_products - 1``.

    Returns:
        A list of NumPy arrays, one per entry of ``list_of_sizes``, each
        holding ids drawn without replacement.
    """
    catalog = np.arange(number_of_products)
    return [
        np.random.choice(catalog, size=bundle_size, replace=False)
        for bundle_size in list_of_sizes
    ]


def get_test_list_batches(self, max_basket_size, number_of_baskets_per_size):
    """Build test batches grouped by subset size and store them on ``self``.

    The returned list is indexed by subset size: entries 0 and 1 are empty
    placeholders, and entry ``s`` (for ``2 <= s < max_basket_size``) holds
    ``number_of_baskets_per_size`` sampled subsets of ``s`` items.

    Note that ``copy.copy`` is shallow, so the shuffle below reorders the
    basket lists held in ``self.test_data`` in place.

    Args:
        max_basket_size: exclusive upper bound on the subset size.
        number_of_baskets_per_size: subsets to sample for each size.

    Returns:
        The list also stored in ``self.test_list_batches``.
    """
    baskets = copy.copy(self.test_data)
    for basket in baskets:
        random.shuffle(basket)

    # Two leading empty slots keep "list index == subset size".
    self.test_list_batches = [[], []]
    self.test_list_batches.extend(
        get_basket_set_size(baskets, number_of_baskets_per_size, size)
        for size in range(2, max_basket_size)
    )

    # Preview the first rows of the first four slots.
    for slot in range(4):
        print(self.test_list_batches[slot][:5])
    return self.test_list_batches


### performance functions

def _top_k_indices(scores, k):
    """Return the indices of the ``k`` largest entries of ``scores`` (unordered)."""
    if k >= len(scores):
        return np.arange(len(scores))
    return np.argpartition(-scores, k - 1)[:k]


def print_results_predictions(last_predictions, batch_inputs, targets, vocabulary_size):
    """Compute ranking metrics for a batch of next-item predictions.

    Despite its name, this function prints nothing; it only returns metrics.

    Args:
        last_predictions: per-example score vectors; ``last_predictions[i][j]``
            is the score of item ``j`` for example ``i``.
        batch_inputs: the evaluated inputs; only its length is used.
        targets: per-example id of the true next item.
        vocabulary_size: number of items, used for the 1% cut-off and as the
            denominator of the percentile rank.

    Returns:
        A ``(precision_at_1, precision_at_1_percent, mpr)`` tuple, each in
        percent. ``precision_at_1_percent`` counts a hit when the target is
        among the top 1% of items (at least one item). ``mpr`` is the mean
        over examples of ``1 - (#items scored strictly above the target) /
        vocabulary_size``.
    """
    num_examples = len(batch_inputs)
    # Keep at least one item in the window: with fewer than 100 items
    # int(vocabulary_size * 0.01) is 0 and the metric could never register a hit.
    one_percent = max(1, int(vocabulary_size * 0.01))

    top_one_hits = 0
    top_one_percent_hits = 0
    percentile_ranks = []
    for i in range(num_examples):
        scores = np.asarray(last_predictions[i])
        target = int(targets[i])

        if target == int(np.argmax(scores)):
            top_one_hits += 1
        if target in _top_k_indices(scores, one_percent):
            top_one_percent_hits += 1

        num_better = np.count_nonzero(scores > scores[target])
        percentile_ranks.append(1 - num_better / vocabulary_size)

    predictions_top_one = 100 * top_one_hits / num_examples
    predictions_top_one_percent = 100 * top_one_percent_hits / num_examples
    MPR_sorted = 100 * np.mean(np.array(percentile_ranks))
    return predictions_top_one, predictions_top_one_percent, MPR_sorted
   

def print_info_on_data(data):
    """Print summary statistics about a list of baskets.

    Reports the number of baskets, the longest basket, the largest item id
    (printed under the label 'vocabulary size'), the percentage of baskets
    holding between 1 and 20 items, and the first four baskets.

    Args:
        data: list of baskets, each a list of item ids.
    """
    basket_lengths = [len(basket) for basket in data]
    length_histogram = Counter(basket_lengths)
    proportion_below = 20
    short_baskets = 0
    for size in range(1, proportion_below + 1):
        short_baskets += length_histogram[size]

    print("Number of users = " + str(len(data)))
    print('max length basket= ', max(basket_lengths))
    print('vocabulary size ', max([max(basket) for basket in data]))
    print('proportion of baskets with size below ', str(proportion_below), ' ', 100 * short_baskets / len(data))
    print(data[:4])

### Actual interesting code starts here...

In [None]:
class Model(object):
    """Shared data preparation and hyper-parameters for the basket models.

    NOTE(review): this class reuses the name ``Model`` that the import cell
    also brings in from ``keras.models``; after this definition the name
    refers to this class.
    """

    def __init__(self, type_of_data, dataset):
        """Load and split the dataset, then set the default hyper-parameters.

        Args:
            type_of_data: ``"real"`` to size the vocabulary from the data;
                any other value fixes the vocabulary size at 10000.
            dataset: dataset name forwarded to ``load_data``.
        """
        self.data, self.folder = load_data(dataset)
        self.dataset = dataset
        self.type_of_data = type_of_data
        shuffle(self.data)

        # 80/20 split: the leading 20% of the shuffled baskets is the test set.
        self.proportion_training = 0.8
        self.proportion_test = 0.2
        total_baskets = len(self.data)
        self.num_training_instances = int(total_baskets * self.proportion_training)
        self.num_test_instances = int(total_baskets * self.proportion_test)
        self.test_data = self.data[:self.num_test_instances]
        self.training_data = self.data[self.num_test_instances:]
        print("Length data, train and test ", len(self.data), len(self.training_data), len(self.test_data))

        self.test_list_batches = get_test_list_batches(self, max_basket_size=10, number_of_baskets_per_size=1000)

        # Evaluate on at most 2500 test baskets, kept in their original order.
        self.test_data_size = min(2500, len(self.test_data))
        chosen_rows = sorted(random.sample(range(len(self.test_data)), self.test_data_size))
        self.test_baskets = [self.test_data[row] for row in chosen_rows]

        _, self.number_of_products = counters_per_prod(self.data)
        if type_of_data == "real":
            self.vocabulary_size = self.number_of_products
        else:
            self.vocabulary_size = 10000

        # Default hyper-parameters read by the models built on top of this class.
        self.embedding_size = 64
        self.batch_size = 100
        self.epoch = 10
        self.neg_sampled = 50
        self.seq_length = 25
        self.use_pretrained_embeddings = False

        self.popularity_distribution = np.array(get_popularity_dist(self.training_data, self.vocabulary_size))
        print("Vocabulary size", self.vocabulary_size)



In [None]:
class Gen_CNN_Model(object):
    """TextCNN-style next-item predictor built with the TensorFlow 1.x graph API.

    A basket is embedded item by item, convolved with several filter heights
    in parallel, max-pooled over the sequence axis, and the concatenated
    features are trained with a sampled-softmax loss over the item vocabulary.
    """

    def __init__(self, model):
        """Copy data and hyper-parameters from ``model`` and set the CNN settings.

        Args:
            model: object exposing ``vocabulary_size``, ``embedding_size``,
                ``seq_length``, ``batch_size``, ``epoch``,
                ``number_of_training_steps``, ``embedding_matrix``,
                ``use_pretrained_embeddings``, ``training_data``,
                ``test_data``, ``test_list_batches`` and ``test_baskets``.
        """
        self.model_params = model
        #Shared params

        #Number of products
        self.vocabulary_size = model.vocabulary_size

        #Size of embedding vector for each product
        self.embedding_size = model.embedding_size

        #Max basket size (sequence length)
        self.seq_length = model.seq_length

        self.batch_size = model.batch_size
        self.num_epochs = model.epoch
        self.number_of_training_step = model.number_of_training_steps
        self.embedding_matrix = model.embedding_matrix
        self.use_pretrained_embeddings = model.use_pretrained_embeddings
        self.training_data, self.test_data, self.test_list_batches, self.test_baskets = model.training_data, model.test_data, model.test_list_batches, model.test_baskets

        #CNN specific params
        self.neg_sampled = 50
        self.learning_rate = 1e-1

        #Convolution parameters
        self.number_of_convolutions = 2
        self.num_filters = 100
        self.filter_sizes = [2, 3, 5, 10]
        self.max_pooling_window = 4

        #Settings reserved for the second convolution (see Q1)
        self.second_filter_size = 3
        self.num_second_filters = 100
        self.second_max_pooling_window = 4

        #Evaluate and report every `printing_step` training steps
        self.printing_step = 100
        self.X_train, self.Y_train, self.X_test, self.Y_test = list(), list(), list(), list()

    def create_embedding_layer(self):
        """Create the Keras embedding layer and store it in ``self.embeddings``.

        Uses ``self.embedding_matrix`` as initial weights when
        ``self.use_pretrained_embeddings`` is true, otherwise a uniform
        random initialization.

        Returns:
            The created ``Embedding`` layer.
        """
        with tf.name_scope("model"):
            if (self.use_pretrained_embeddings):
                print("Initializing with the previous embedding matrix")
                self.embeddings = Embedding(self.vocabulary_size, self.embedding_size, weights=[self.embedding_matrix], input_length=self.seq_length, trainable=True)
            else:
                print("Initializing with a random embedding matrix")
                self.embeddings = Embedding(self.vocabulary_size, self.embedding_size, embeddings_initializer='uniform', input_length=self.seq_length)
            return self.embeddings

    def convolve_4d_matrix(self, input_matrix, filter_shape, filter_size, num_filters, pooling_window_size):
        """Apply convolution, bias + ReLU and optional max-pooling to a 4D tensor.

        Args:
            input_matrix: 4D tensor ``[batch, in_height, in_width, in_channels]``.
            filter_shape: ``[filter_height, filter_width, in_channels, out_channels]``.
            filter_size: filter height; only used to name the variables.
            num_filters: number of output channels (length of the bias vector).
            pooling_window_size: height of the max-pooling window; a value
                ``<= 0`` skips pooling.

        Returns:
            The result with its last two axes swapped, i.e.
            ``[batch, height, num_filters, width]``.
        """
        # Bias vector of size [num_filters], initialized to 0.1.
        bias = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b-%s" % filter_size,trainable=True)

        # Filter tensor of shape filter_shape, initialized from a truncated normal.
        self.filter = tf.Variable(tf.truncated_normal(filter_shape,mean= 0.0, stddev=0.1), name="filter-%s" % filter_size,trainable=True)

        # 2D convolution with unit strides and no padding ("VALID"), so each
        # spatial axis shrinks to (input_size - filter_size + 1). For the call
        # made in creating_layer:
        #   input  [batch_size, seq_length, embedding_size, 1]
        #   filter [filter_size, embedding_size, 1, num_filters]
        #   output [batch_size, seq_length - filter_size + 1, 1, num_filters]
        conv1 = tf.nn.conv2d(input_matrix, self.filter, strides=[1, 1, 1, 1], padding="VALID", name="conv1")
        self.conv1 = conv1

        # Add the per-filter bias, then apply ReLU = max(0, x).
        h1 = tf.nn.relu(tf.nn.bias_add(conv1, bias), name="relu1")

        # Max-pool along the height axis only, with stride 1 and no padding:
        # the height shrinks by (pooling_window_size - 1), other axes are kept.
        if (pooling_window_size>0):
            pooled = tf.nn.max_pool(h1, ksize= [1, pooling_window_size, 1, 1], strides=[1,1,1,1], padding='VALID', name="pool1")
        else:
            pooled = h1
        # Swap the last two axes: [batch, height, 1, filters] -> [batch, height, filters, 1].
        return tf.transpose(pooled, perm=[0,1,3,2])

    def creating_layer(self, input_tensor, dropout):
        """Build the embedding + parallel convolution feature extractor.

        Args:
            input_tensor: integer tensor of item ids, ``[batch, seq_length]``.
            dropout: value handed to ``tf.nn.dropout`` as its second
                positional argument (``keep_prob`` in the TF 1.x API).

        Returns:
            2D feature tensor, also stored in ``self.h_drop1``.
        """
        embed = self.embeddings(input_tensor)

        # Add a trailing channel axis: conv2d expects a 4D input.
        self.embedded_chars_expanded = tf.expand_dims(embed, -1)

        pooled_outputs = []

        with tf.name_scope("conv-maxpool"):

            for filter_size in self.filter_sizes:

                # Filter shape: [filter_height, filter_width, in_channels, out_channels]
                filter_shape = [filter_size, self.embedding_size, 1, self.num_filters]
                pooled = self.convolve_4d_matrix(self.embedded_chars_expanded, filter_shape, filter_size, \
                    self.num_filters, pooling_window_size=self.max_pooling_window)

                #############################
                ## Q1: 2nd conv goes here! ##
                #############################

                # Global max over the remaining sequence positions:
                # [batch, length, num_filters, 1] -> [batch, 1, num_filters, 1]
                pooled = tf.reduce_max(pooled, axis=1, keepdims=True)

                # Add the current pooled output
                pooled_outputs.append(pooled)

        # Concatenate the per-filter-size features along axis 2:
        # [batch, 1, num_filters * len(filter_sizes), 1]
        self.h_pool1 = tf.concat(pooled_outputs, 2)

        # Flatten to 2D: [batch, features].
        # NOTE(review): when number_of_convolutions != 1 the width comes from
        # num_second_filters although no second convolution is applied yet
        # (Q1); this only matches the concatenated width while
        # num_second_filters == num_filters.
        self.h_drop1 = tf.reshape(self.h_pool1, [-1, self.num_filters * len(self.filter_sizes)]) if (self.number_of_convolutions==1) \
            else tf.reshape(self.h_pool1, [-1, self.num_second_filters * len(self.filter_sizes)])

        self.h_drop1 = tf.nn.dropout(self.h_drop1, dropout)
        return self.h_drop1

    def compute_loss(self, output, label_tensor):
        """Create the output-layer variables and the sampled-softmax loss.

        Args:
            output: 2D feature tensor produced by ``creating_layer``.
            label_tensor: true item id per example, passed as ``labels`` to
                ``tf.nn.sampled_softmax_loss``.

        Returns:
            The scalar loss, also stored in ``self.loss``.
        """
        with tf.name_scope("output_layer"):
            self.nce_weights = tf.Variable(tf.truncated_normal([self.vocabulary_size, self.num_filters * len(self.filter_sizes)], stddev=1.0 / math.sqrt(self.embedding_size)))
            self.nce_biases = tf.Variable(tf.zeros([self.vocabulary_size]))

            self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
                weights=self.nce_weights, biases=self.nce_biases, labels=label_tensor,
                inputs=output, num_sampled=self.neg_sampled, num_classes=self.vocabulary_size))

            # NOTE(review): create_graph assigns self.optimizer again after
            # calling this method, so this op is superseded there.
            self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss)
        return self.loss

    def create_graph(self):
        """Build the full graph: placeholders, features, loss, optimizers, outputs."""
        with tf.name_scope("inputs"):
            self.train_words = tf.placeholder(tf.int32, shape=[None, None], name="train_inputs")
            self.label_words = tf.placeholder(tf.int32, shape=[None, None], name="train_labels")
            self.dropout = tf.placeholder(tf.float32, name="dropout")

        self.create_embedding_layer()

        # Repeat each input sequence with tf.tile and cut it at seq_length, so
        # that every input has exactly seq_length items.
        self.context_words = tf.tile(self.train_words, tf.constant([1, self.seq_length]))[:,:self.seq_length]

        self.output = self.creating_layer(self.context_words, self.dropout)

        # Only the last label column is used as the prediction target.
        self.loss = self.compute_loss(self.output, tf.reshape(self.label_words[:,-1], (-1,1)))
        with tf.name_scope('optimizer'):
            self.global_step = tf.Variable(0, name="global_step", trainable=False)
            self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss)

            # NOTE(review): train_op is the only op given global_step; the
            # training loop in this class runs self.optimizer instead.
            optimizer_plot = tf.train.GradientDescentOptimizer(self.learning_rate)
            grads_and_vars = optimizer_plot.compute_gradients(self.loss)
            self.train_op = optimizer_plot.apply_gradients(grads_and_vars, global_step=self.global_step)

        self.get_predictions(self.output)
        self.get_scores(self.output)

    def get_predictions(self, output):
        """Build the softmax over all items and store it in ``self.last_predictions``."""
        self.last_predictions = tf.nn.softmax(tf.matmul(output,tf.transpose(self.nce_weights))+self.nce_biases)
        return self.last_predictions

    def get_scores(self, output):
        """Build the raw (pre-softmax) item scores and store them in ``self.before_softmax``."""
        self.before_softmax = tf.matmul(output,tf.transpose(self.nce_weights))+self.nce_biases
        return self.before_softmax

    def train_model_with_tensorflow(self):
        """Build the graph, train for ``number_of_training_step`` steps and evaluate.

        At step 1 and then every ``printing_step`` steps, the mean training
        loss since the previous report is printed and the model is scored on
        ``self.test_baskets`` (MPR, precision@1, precision@1%). The metric
        histories are saved with ``np.save("textCNN_results", ...)`` at the end.
        The session stays available as ``self._sess``.
        """
        self.create_graph()
        self._sess = tf.Session()
        self._sess.run(tf.global_variables_initializer())
        total_loss, step, steps_since_report = 0, 0, 0
        MPRs, precision1s, precision1ps = list(), list(), list()

        # One initial training step, also used to print the feature shape.
        train_words, label_words = create_batch(self)
        _, loss, h_drop1 = self._sess.run([self.optimizer, self.loss, self.h_drop1], feed_dict={self.train_words:train_words, self.label_words:label_words, self.dropout:1})
        print(h_drop1.shape)

        while (step < self.number_of_training_step):

            train_words, label_words = create_batch(self)
            _, loss = self._sess.run([self.optimizer, self.loss], feed_dict={self.train_words:train_words, self.label_words:label_words, self.dropout:1})

            total_loss += loss
            step += 1
            steps_since_report += 1

            if (step % self.printing_step == 0) or (step==1):
                # Average over the steps actually accumulated since the last
                # report: the step-1 report covers a single step, and the
                # first full window after it covers printing_step - 1 steps.
                print("Step "+ str(step) +" and loss " + str(total_loss/steps_since_report))
                total_loss, steps_since_report = 0, 0

                input_w, target_w, last_predictions = list(), list(), list()
                for elem in self.test_baskets:
                    # Inputs are all items but the last; the target is the last item.
                    train_words, label_words = np.reshape(np.array(elem[:-1]), (1,-1)), np.reshape(np.array(elem[1:]), (-1, 1))
                    input_w.append(train_words)
                    target_w.append(label_words[-1])

                    last_predictions.append(np.reshape(self._sess.run([self.before_softmax], feed_dict={self.train_words:train_words, self.label_words:label_words, self.dropout:1})[0], [-1]))

                precision1, precision1p, MPR = print_results_predictions(np.array(last_predictions), input_w, np.array(target_w).flatten(), self.vocabulary_size)
                MPRs.append(MPR)
                precision1s.append(precision1)
                precision1ps.append(precision1p)
                print("MPR "+ str(round(MPR, 3))+ " Prec@1 "+ str(precision1)+ " Prec@1pct "+ str(precision1p))
                print("")

        data = np.array([MPRs, precision1s, precision1ps])
        np.save("textCNN_results", data)
        


In [None]:
class CNN(Model):
    """Prepare the data via ``Model`` and immediately train a ``Gen_CNN_Model``."""

    def __init__(self, type_of_data, dataset, number_of_training_steps):
        """Load the data, pick the embedding matrix and run training.

        Args:
            type_of_data: forwarded to ``Model.__init__``.
            dataset: forwarded to ``Model.__init__``.
            number_of_training_steps: number of training steps to run.
        """
        super().__init__(type_of_data, dataset)

        # Use a saved embedding matrix when one exists in the working
        # directory, otherwise a 1x1 zero placeholder.
        if os.path.isfile("embedding_matrix.npy"):
            self.embedding_matrix = np.load("embedding_matrix.npy")
        else:
            self.embedding_matrix = np.zeros((1, 1))
        self.number_of_training_steps = number_of_training_steps

        # Training starts right here, inside the constructor.
        trainer = Gen_CNN_Model(self)
        trainer.train_model_with_tensorflow()


# Entry point of the notebook: constructing CNN loads the "Amazon" dataset and
# runs the whole training loop (20000 steps) inside its constructor.
cnn = CNN(type_of_data="real", dataset="Amazon", number_of_training_steps=20000)
