# Load the imports

In [1]:
import warnings
warnings.filterwarnings("ignore")
import gc
import pandas as pd
import numpy as np
import os
import _pickle as cpickle
import arboretum
import lightgbm as lgb
from operator import itemgetter
import joblib
import json
import sklearn.metrics
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from scipy.sparse import dok_matrix, coo_matrix
from sklearn.utils.multiclass import  type_of_target

# Load the datasets

In [2]:
# Load the original files
aisles = pd.read_csv("data/aisles.csv",
                     dtype={'aisle': 'category'})
departments = pd.read_csv("data/departments.csv",
                          dtype={'department': 'category'})
order_prior = pd.read_csv("data/order_products__prior.csv",
                          dtype={'reordered': bool})
train_orders = pd.read_csv("data/order_products__train.csv",
                          dtype={'reordered': bool})
orders = pd.read_csv("data/orders.csv",
                     dtype={'eval_set': 'category'})

products = pd.read_csv("data/products.csv")

# Load the feature files created
product_embeddings = pd.read_pickle('data/product_embeddings.pkl')
order_train_chunk = pd.read_pickle("data/train_test_set.pkl")
product_periods = pd.read_pickle("data/product_periods_stat.pkl").fillna(9999)
user_dep_stat = pd.read_pickle("data/user_department_products.pkl")
user_aisle_stat = pd.read_pickle("data/user_aisle_products.pkl")
order_streaks = pd.read_csv("data/order_streaks.csv")

# Get the train and test orders

In [3]:
order_test = order_train_chunk[order_train_chunk.eval_set == "test"][['order_id',
                                                          'product_id']]
order_train = order_train_chunk[order_train_chunk.eval_set == "train"][['order_id',
                                                            'product_id',
                                                            'reordered']]
embedings = list(range(32))
product_embeddings = product_embeddings[embedings + ['product_id']]

# Compute other features

In [4]:
prob = pd.merge(order_prior, orders, on='order_id')

# Calculate the count of user ordered a given product and how many times
# the product was reorderd by the user
prob = prob.groupby(
    ['product_id', 'user_id']).agg(
    {
    'reordered':'sum',
    'user_id': 'size'
    })

# Rename the grouped columns
prob.rename(
    columns={'sum': 'reordered', 'user_id': 'total'},
    inplace=True
    )

# Calculate the ratio of reorder
prob['reorder_prob'] = prob.reordered / prob.total

# Calculate the mean of the product reordered
prob = prob.groupby('product_id').agg({'reorder_prob': 'mean'})
prob = prob.rename(columns={'mean': 'reorder_prob'}).reset_index()


## Calculate the product statistics

In [5]:
# Calculate the count of product was totally ordered and how many times it was
# reordered
prod_stat = order_prior.groupby('product_id').agg(
    {
    'reordered': ['sum', 'size'],
    'add_to_cart_order':'mean'
    })

# Set the column to level 1
prod_stat.columns = prod_stat.columns.levels[1]
# Rename the columns
prod_stat.rename(columns={'sum':'prod_reorders',
                          'size':'prod_orders',
                          'mean': 'prod_add_to_card_mean'}, inplace=True)
prod_stat.reset_index(inplace=True)

prod_stat['reorder_ration'] = prod_stat['prod_reorders'] / prod_stat['prod_orders']

prod_stat = pd.merge(prod_stat, prob, on='product_id')

## Calculate user statistics

In [6]:
# Get the max order_number for a given user_id
# calculate the sum, mean, median of days_since_prior_order
user_stat = orders[orders.eval_set == 'prior'].groupby('user_id').agg(
    {
        'order_number': 'max',
        'days_since_prior_order': ['sum','mean','median']
     })

# Drop the 0th level of column generated from groupby
user_stat.columns = user_stat.columns.droplevel(0)
user_stat.rename(columns={'max': 'user_orders',
                          'sum': 'user_order_starts_at',
                          'mean': 'user_mean_days_since_prior',
                          'median': 'user_median_days_since_prior'},
                 inplace=True)

user_stat.reset_index(inplace=True)

# Merging the orders and prior orders to get the products info for orders
orders_products = pd.merge(orders, order_prior, on="order_id")

# Compute the number of times user reordered till now and count of unique
# products ordered by the user
user_order_stat = orders_products.groupby('user_id').agg(
    {
        "user_id": "size",
        "reordered": "sum",
        "product_id": lambda x: x.nunique()
    })

user_order_stat.rename(
    columns = {
        'user_id':'user_total_products',
        'product_id': 'user_distinct_products',
        'reordered': 'user_reorder_ratio'
    },
    inplace=True)

user_order_stat.reset_index(inplace=True)

# compute the reorder ratio based on how many times the user has reordered till
# now by total products ordered by the user
user_order_stat['user_reorder_ratio'] = user_order_stat['user_reorder_ratio'] / user_order_stat['user_total_products']

user_stat = pd.merge(user_stat, user_order_stat, on='user_id')
# Calculate the avg basket size by total products bought and total user orders
user_stat['user_average_basket'] = (user_stat['user_total_products'] 
                                    / user_stat['user_orders'])

## User product Features

In [7]:
# How many users purchased a given product?
prod_usr = orders_products.groupby(['product_id']).agg(
    {'user_id': lambda x: x.nunique()})
prod_usr.rename(columns={'user_id':'prod_users_unq'}, inplace=True)
prod_usr.reset_index(inplace=True)

# How many users reordered a product?
prod_usr_reordered = orders_products[orders_products.reordered==True].groupby(
    ['product_id']).agg(
        {'user_id': lambda x: x.nunique()}
        )
prod_usr_reordered.rename(columns={'user_id': 'prod_users_unq_reordered'}, inplace=True)
prod_usr_reordered.reset_index(inplace=True)

order_stat = orders_products.groupby('order_id').agg(
    {'order_id': 'size'})
order_stat = order_stat.rename(columns={'order_id': 'order_size'}).reset_index()

orders_products = pd.merge(orders_products, order_stat, on='order_id')
orders_products['add_to_cart_order_inverted'] = orders_products['order_size'] - orders_products['add_to_cart_order']
orders_products['add_to_cart_order_relative'] = orders_products['add_to_cart_order'] / orders_products['order_size']

# Compute user product features related to orders
data = orders_products.groupby(['user_id', 'product_id']).agg(
    {
        'user_id': 'size',
        'order_number': ['min', 'max'],
        'add_to_cart_order': ['mean', 'median'],
        'days_since_prior_order': ['mean', 'median'],
        'order_dow': ['mean', 'median'],
        'order_hour_of_day': ['mean', 'median'],
        'add_to_cart_order_inverted': ['mean', 'median'],
        'add_to_cart_order_relative': ['mean', 'median'],
        'reordered': ['sum']
     })

data.columns = data.columns.droplevel(0)
data.columns = ['up_orders', 'up_first_order', 'up_last_order',
                'up_mean_cart_position', 'up_median_cart_position',
                'days_since_prior_order_mean',
                'days_since_prior_order_median', 'order_dow_mean',
                'order_dow_median', 'order_hour_of_day_mean', 
                'order_hour_of_day_median', 'add_to_cart_order_inverted_mean',
                'add_to_cart_order_inverted_median',
                'add_to_cart_order_relative_mean',
                'add_to_cart_order_relative_median',
                'reordered_sum']

# Adding 1 incae of reordered_sum is 0
data['user_product_reordered_ratio'] = (data['reordered_sum'] + 1.0) / data['up_orders']

data.reset_index(inplace=True)

data = pd.merge(data, prod_stat, on='product_id')
data = pd.merge(data, user_stat, on='user_id')

# Ratio of user ordered a particular product by total orders by the order
data['up_order_rate'] = data['up_orders'] / data['user_orders']
data['up_orders_since_last_order'] = data['user_orders'] - data['up_last_order']
# ordering rate since first order by the user
data['up_order_rate_since_first_order'] = data['user_orders'] / (data['user_orders'] - data['up_first_order'] + 1)

# Get the necessary features

In [8]:
features = [
    'user_product_reordered_ratio', 'reordered_sum',
    'add_to_cart_order_inverted_mean',
    'add_to_cart_order_relative_mean', 'reorder_prob',
    'last', 'prev1', 'prev2', 'median', 'mean',
    'dep_reordered_ratio', 'aisle_reordered_ratio',
    'aisle_products', 'aisle_reordered',
    'dep_products', 'dep_reordered',
    'prod_users_unq', 'prod_users_unq_reordered',
    'order_number', 'prod_add_to_card_mean',
    'days_since_prior_order',
    'order_dow', 'order_hour_of_day',
    'reorder_ration', 'user_orders',
    'user_order_starts_at', 'user_mean_days_since_prior',
    'user_average_basket', 'user_distinct_products',
    'user_reorder_ratio', 'user_total_products',
    'prod_orders', 'prod_reorders',
    'up_order_rate', 'up_orders_since_last_order',
    'up_order_rate_since_first_order',
    'up_orders', 'up_first_order', 'up_last_order',
    'up_mean_cart_position', 'days_since_prior_order_mean',
    'order_dow_mean', 'order_hour_of_day_mean',
    'user_id', 'order_id'
    ]
features.extend(embedings)
categories = ['product_id', 'aisle_id', 'department_id']
features.extend(categories)

In [9]:
processed_features = features.copy()
processed_features.remove('order_id')
processed_features.remove('user_id')


class CustomStackingClassifier:
    """
    This Class accepts the estimator and params to train the models
    The model is trained loop times inorder to perform bagging
    """
    def __init__(self, estimators, random_state, params, nround, 
                 version, loop=3,
                 valid_size=0.05, stratify=True, verbose=1,
                 early_stopping=60, use_probas=True):
        self.clf = estimators
        self.mod=cpickle
        self.loop = loop
        self.params = params
        self.nround = nround    
        self.version = version
        self.valid_size = valid_size
        self.verbose = verbose
        self.random_state = random_state
        self.early_stopping = early_stopping
        self.models = []


    def split_build_valid(self, train_user, X_train, y_train):
        """
        Splits the dataset based on the user and divides them in train
        and valid set
        """
        train_user['is_valid'] = np.random.choice(
            [0,1],
            size=len(train_user),
            p=[1-self.valid_size, self.valid_size])

        valid_n = train_user['is_valid'].sum()
        build_n = (train_user.shape[0] - valid_n)
        
        print('build user:{}, valid user:{}'.format(build_n, valid_n))
        valid_user = train_user[train_user['is_valid']==1].user_id
        is_valid = X_train.user_id.isin(valid_user)
        
        dbuild = lgb.Dataset(X_train[~is_valid].drop('user_id', axis=1),
                             y_train[~is_valid],
                             categorical_feature=['product_id', 'aisle_id', 'department_id'])
        dvalid = lgb.Dataset(X_train[is_valid].drop('user_id', axis=1),
                             label=y_train[is_valid],
                             categorical_feature=['product_id', 'aisle_id', 'department_id'])
        watchlist_set = [dbuild, dvalid]
        watchlist_name = ['build', 'valid']
        
        print('FINAL SHAPE')
        print('dbuild.shape:{}  dvalid.shape:{}\n'.format(
            dbuild.data.shape,
            dvalid.data.shape))
        return dbuild, dvalid, watchlist_set, watchlist_name

    def fit(self, x, y):
        np.random.seed(self.random_state)
        train_user = x[['user_id']].drop_duplicates()

        for i in range(self.loop):
            dbuild, dvalid, watchlist_set, watchlist_name = self.split_build_valid(train_user, x, y)
            gc.collect();

            # Train models
            model = lgb.train(
                self.params,
                dbuild,
                self.nround,
                watchlist_set,
                watchlist_name,
                early_stopping_rounds=self.early_stopping,
                categorical_feature=['product_id', 'aisle_id', 'department_id'],
                verbose_eval=5)
            joblib.dump(model, "lgb_models/lgb_trained_{}_{}".format(self.version, i))
            self.models.append(model)
            del [dbuild, dvalid, watchlist_set, watchlist_name];
            gc.collect();
        del train_user;
        gc.collect()
        return self


    def predict(self, x, test_data):
#         dtest  = lgb.Dataset(x)
        sub_test = test_data[['order_id', 'product_id']]
        sub_test['yhat'] = 0
        for model in self.models:
            sub_test['yhat'] += model.predict(x)
        sub_test['yhat'] /= self.loop
        return sub_test


"""
@author: Faron
"""
'''
This kernel implements the O(n²) F1-Score expectation maximization algorithm presented in
"Ye, N., Chai, K., Lee, W., and Chieu, H.  Optimizing F-measures: A Tale of Two Approaches. In ICML, 2012."
It solves argmax_(0 <= k <= n,[[None]]) E[F1(P,k,[[None]])]
with [[None]] being the indicator for predicting label "None"
given posteriors P = [p_1, p_2, ... , p_n], where p_1 > p_2 > ... > p_n
under label independence assumption by means of dynamic programming in O(n²).
'''
class F1Optimizer():
    def __init__(self):
        pass

    @staticmethod
    def get_expectations(P, pNone=None):
        expectations = []
        P = np.sort(P)[::-1]

        n = np.array(P).shape[0]
        DP_C = np.zeros((n + 2, n + 1))
        if pNone is None:
            pNone = (1.0 - P).prod()

        DP_C[0][0] = 1.0
        for j in range(1, n):
            DP_C[0][j] = (1.0 - P[j - 1]) * DP_C[0, j - 1]

        for i in range(1, n + 1):
            DP_C[i, i] = DP_C[i - 1, i - 1] * P[i - 1]
            for j in range(i + 1, n + 1):
                DP_C[i, j] = P[j - 1] * DP_C[i - 1, j - 1] + (1.0 - P[j - 1]) * DP_C[i, j - 1]

        DP_S = np.zeros((2 * n + 1,))
        DP_SNone = np.zeros((2 * n + 1,))
        for i in range(1, 2 * n + 1):
            DP_S[i] = 1. / (1. * i)
            DP_SNone[i] = 1. / (1. * i + 1)
        for k in range(n + 1)[::-1]:
            f1 = 0
            f1None = 0
            for k1 in range(n + 1):
                f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1]
                f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1]
            for i in range(1, 2 * k - 1):
                DP_S[i] = (1 - P[k - 1]) * DP_S[i] + P[k - 1] * DP_S[i + 1]
                DP_SNone[i] = (1 - P[k - 1]) * DP_SNone[i] + P[k - 1] * DP_SNone[i + 1]
            expectations.append([f1None + 2 * pNone / (2 + k), f1])

        return np.array(expectations[::-1]).T

    @staticmethod
    def maximize_expectation(P, pNone=None):
        expectations = F1Optimizer.get_expectations(P, pNone)

        ix_max = np.unravel_index(expectations.argmax(), expectations.shape)
        max_f1 = expectations[ix_max]

        predNone = True if ix_max[0] == 0 else False
        best_k = ix_max[1]

        return best_k, predNone, max_f1

    @staticmethod
    def _F1(tp, fp, fn):
        return 2 * tp / (2 * tp + fp + fn)

    @staticmethod
    def _Fbeta(tp, fp, fn, beta=1.0):
        beta_squared = beta ** 2
        return (1.0 + beta_squared) * tp / ((1.0 + beta_squared) * tp + fp + beta_squared * fn)

######################################################################

def get_best_prediction(items, preds, pNone=None):
    items_preds = sorted(list(zip(items, preds)), key=itemgetter(1), reverse=True)
    P = [p for i,p in items_preds]
    L = [i for i,p in items_preds]
    
    opt = F1Optimizer.maximize_expectation(P, pNone)
    best_prediction = ['None'] if opt[1] else []
    best_prediction += (L[:opt[0]])
    return ' '.join(list(map(str,best_prediction)))


def preprocess_predict(user_id):
    """
    The function accepts user_id and fetches the last order by that user.
    Then gets the all the products purchased by the user till now.
    Computes feature based on all the previous products purchased by the user
    with all those features it is later passed to the predicted model and probability
    of every product is calculated.
    Later, these predictions are passed to F1Optimizer class which computes which products
    are highly likely to be purchased and only return those
    """
    # Get the user latest order
    user_last_order = orders[orders['user_id']==user_id].tail(1)['order_id'].values[0]
    # get the products purchased by the user till now
    user_order = order_train_chunk[order_train_chunk['order_id'] == user_last_order]
    user_order_products = user_order.copy()

    # Generate the features
    # Merge products
    user_order = pd.merge(user_order, products, on='product_id')
    # Merge the orders
    user_order = pd.merge(user_order, orders, on='order_id')
    # Merge the User department statistics
    user_order = pd.merge(user_order, user_dep_stat, on=['user_id', 'department_id'])
    # Merge the user aisle statistics
    user_order = pd.merge(user_order, user_aisle_stat, on=['user_id', 'aisle_id'])
    # Merge the user products features
    user_order = pd.merge(user_order, prod_usr, on='product_id')
    # Merge the user product reordered stats
    user_order = pd.merge(user_order, prod_usr_reordered, on='product_id', how='left')
    user_order.prod_users_unq_reordered.fillna(0, inplace=True)

    # Merge the this prepared set with the data
    user_order = pd.merge(user_order, data, on=['product_id', 'user_id'])

    # Compute the aisle and departement reorder ratio
    user_order['aisle_reordered_ratio'] = user_order['aisle_reordered'] / user_order['user_orders']
    user_order['dep_reordered_ratio'] = user_order['dep_reordered'] / user_order['user_orders']

    user_order = pd.merge(user_order, product_periods, on=['user_id', 'product_id'])
    user_order = pd.merge(user_order, product_embeddings, on=['product_id'])
    user_order = pd.merge(user_order, order_streaks, on=['user_id', 'product_id'], how='left')

    # Get the important features
    user_order = user_order[processed_features]

    # Load the models
    cscf_1 = CustomStackingClassifier(lgb, 71, None, 10000, 1)
    cscf_1.models = [
        joblib.load("lgb_models/lgb_trained_1_0"),
        joblib.load("lgb_models/lgb_trained_1_1"),
        joblib.load("lgb_models/lgb_trained_1_2")
    ]
    cscf_2 = CustomStackingClassifier(lgb, 72, None, 10000, 2)
    cscf_2.models = [
        joblib.load("lgb_models/lgb_trained_2_0"),
        joblib.load("lgb_models/lgb_trained_2_1"),
        joblib.load("lgb_models/lgb_trained_2_2")
    ]
    cscf_3 = CustomStackingClassifier(lgb, 73, None, 10000, 3)
    cscf_3.models = [
        joblib.load("lgb_models/lgb_trained_3_0"),
        joblib.load("lgb_models/lgb_trained_3_1"),
        joblib.load("lgb_models/lgb_trained_3_2")
    ]

    # Predict with loaded models
    predict_1 = cscf_1.predict(user_order, user_order_products)
    predict_2 = cscf_2.predict(user_order, user_order_products)
    predict_3 = cscf_3.predict(user_order, user_order_products)

    # concat all the 3 predictions and compute mean
    pred_item = pd.concat([predict_1, predict_2, predict_3])
    pred_item = pred_item.groupby(['order_id','product_id']).yhat.mean().reset_index()

    items = pred_item['product_id'].tolist()
    preds = pred_item['yhat'].tolist()

    predicted_products = get_best_prediction(items, preds)

    return predicted_products

In [10]:
user_id = 9
products_predicted = preprocess_predict(user_id)
print(f"Items to be purchased by user {user_id} are {products_predicted.split()}")

Items to be purchased by 9 are ['27973', '21462', '4957', '481', '41844', '40571', '42347', '26790', '13351', '5002', '42828', '6489', '16018', '43875', '38277', '3634', '38159', '12075', '8834']
