## Building and Evaluating Deep Learning Based Hybrid Recommendation System

In this notebook, we will be building a deep learning based recommendation system using open source retail dataset. 


#### Download dataset

Dataset is available in excel format on UCI 

In [1]:
!wget http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx

--2019-03-19 21:42:41--  http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx
Resolving archive.ics.uci.edu... 128.195.10.249
Connecting to archive.ics.uci.edu|128.195.10.249|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23715344 (23M) [application/vnd.openxmlformats-officedocument.spreadsheetml.sheet]
Saving to: ‘Online Retail.xlsx.1’

Online Retail.xlsx.  12%[=>                  ]   2.90M   474KB/s    eta 90s    ^C


####  Import libraries

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm
import pickle

In [42]:
tf.__version__

'2.0.0-alpha0'

In [None]:
tf.

#### Load Dataset

In [None]:
## using pandas to read the dataset
df = pd.read_excel('Online Retail.xlsx')

## reading excel can be time-consuming, therefore converting it in binary format using Pickle 
## next time onwards we can directly read the binary file
with open('df_retail.pkl', 'wb') as out_file:
    pickle.dump(df, out_file)

In [3]:
#reading pickle file
with open('df_retail.pkl', 'rb') as in_file:
    df = pickle.load(in_file)

In [4]:
# exploring dataset
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [5]:
# get info 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      541909 non-null object
StockCode      541909 non-null object
Description    540455 non-null object
Quantity       541909 non-null int64
InvoiceDate    541909 non-null datetime64[ns]
UnitPrice      541909 non-null float64
CustomerID     406829 non-null float64
Country        541909 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


### Data Processing 

In [6]:
# converting column names to lower case
df.columns = df.columns.str.lower()

In [7]:
# return items 
df[df["invoiceno"].astype('str').str.startswith('C')].head()

Unnamed: 0,invoiceno,stockcode,description,quantity,invoicedate,unitprice,customerid,country
141,C536379,D,Discount,-1,2010-12-01 09:41:00,27.5,14527.0,United Kingdom
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311.0,United Kingdom
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,2010-12-01 10:24:00,1.65,17548.0,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom


In [8]:
# filter rows for return items
df = df[~df.invoiceno.astype('str').str.startswith('C')].reset_index(drop=True)

In [9]:
# handling customer ID with NaN value
df[df["customerid"].isnull()].head()

Unnamed: 0,invoiceno,stockcode,description,quantity,invoicedate,unitprice,customerid,country
613,536414,22139,,56,2010-12-01 11:52:00,0.0,,United Kingdom
1431,536544,21773,DECORATIVE ROSE BATHROOM BOTTLE,1,2010-12-01 14:32:00,2.51,,United Kingdom
1432,536544,21774,DECORATIVE CATS BATHROOM BOTTLE,2,2010-12-01 14:32:00,2.51,,United Kingdom
1433,536544,21786,POLKADOT RAIN HAT,4,2010-12-01 14:32:00,0.85,,United Kingdom
1434,536544,21787,RAIN PONCHO RETROSPOT,2,2010-12-01 14:32:00,1.66,,United Kingdom


In [10]:
print("Number of customers with no available customer ID : {}".format(len(df[df["customerid"].isnull()])))

Number of customers with no available customer ID : 134697


In [11]:
# fill na with some fixed value of -1. Converting the field as integer
df["customerid"] = df["customerid"].fillna(-1).astype(int)

In [12]:
# check info again
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 532621 entries, 0 to 532620
Data columns (total 8 columns):
invoiceno      532621 non-null object
stockcode      532621 non-null object
description    531167 non-null object
quantity       532621 non-null int64
invoicedate    532621 non-null datetime64[ns]
unitprice      532621 non-null float64
customerid     532621 non-null int64
country        532621 non-null object
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 32.5+ MB


In [13]:
df.head()

Unnamed: 0,invoiceno,stockcode,description,quantity,invoicedate,unitprice,customerid,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [14]:
# number of unique customers 
print("number of unique customers : {}".format(len(df.customerid.unique())))
# number of unique items 
print("number of unique items : {}".format(len(df.stockcode.astype('str').unique())))

number of unique customers : 4340
number of unique items : 4059


#### StockCode (Item) Mapping

In [15]:
unique_stockcodes = sorted(df.stockcode.astype('str').unique())
unique_stockcodes_mapping = {stockcode: index for (index, stockcode) in enumerate(unique_stockcodes)}

In [16]:
# map stockcode with mapping
df["stockcode"] = df["stockcode"].map(lambda x : unique_stockcodes_mapping[str(x)])

In [17]:
df.head()

Unnamed: 0,invoiceno,stockcode,description,quantity,invoicedate,unitprice,customerid,country
0,536365,3527,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,2791,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,3040,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,2981,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,2980,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 532621 entries, 0 to 532620
Data columns (total 8 columns):
invoiceno      532621 non-null object
stockcode      532621 non-null int64
description    531167 non-null object
quantity       532621 non-null int64
invoicedate    532621 non-null datetime64[ns]
unitprice      532621 non-null float64
customerid     532621 non-null int64
country        532621 non-null object
dtypes: datetime64[ns](1), float64(1), int64(3), object(3)
memory usage: 32.5+ MB


#### Train-Eval-Test Split

In [19]:
df["invoicedate"].min(),df["invoicedate"].max()

(Timestamp('2010-12-01 08:26:00'), Timestamp('2011-12-09 12:50:00'))

In [20]:
# last one month data in test 
# second last month data in eval 
# rest data in train
df_test = df[df["invoicedate"] >= "2011-11-09"]
df_val = df[(df["invoicedate"] >= "2011-10-09") & (df["invoicedate"] <= "2011-11-09")]
df_train = df[df["invoicedate"] < "2011-10-09"]

### Evaluation Metric

We are using  **Precision** as evaluation metric. Items that customer bought out of the recommendations provided to them. Other metrics that could have been used **MAP** , **NDCG**.

We will be recommeding **10 items** and then evaluate how many out of **10 items** the customer actually bought.

### Baseline Model ( Popularity Based )

Baseline model when we are always recommending most popular items.

In [21]:
df_train["stockcode"].value_counts().head(10)

3527    1814
3506    1667
1347    1622
2730    1527
180     1298
3300    1121
453     1104
1630    1083
1313    1077
182     1019
Name: stockcode, dtype: int64

In [22]:
# get most popular items
most_popular_items = df_train["stockcode"].value_counts().index.values[:10]

In [23]:
most_popular_items

array([3527, 3506, 1347, 2730,  180, 3300,  453, 1630, 1313,  182])

These **10 items** will be recommended to all users. 

- Group by invoiceno (this is our transaction ID)
- For each transaction make a recommendation
- Record the number of correct predictions per each group
- Calculate the overall precision


In [24]:
def calculate_precision(df_tmp):
    common_items_count = len(list(set(df_tmp["stockcodes_actual"]).intersection(df_tmp["stockcodes_recommended"])))
    return common_items_count / len(df_tmp["stockcodes_recommended"])

def recommendation_precision(df_val, recommendations):
    df_recommendations = df_val.groupby("invoiceno")["stockcode"].apply(lambda x : list(x)).reset_index()
    df_recommendations.columns = ["invoiceno","stockcodes_actual"]
    df_recommendations["stockcodes_actual"] = df_recommendations["stockcodes_actual"].map(lambda x: list(set(x)))
    df_recommendations["stockcodes_recommended"] = df_recommendations.index.map(lambda index: recommendations[index])
    df_recommendations["precision"] = df_recommendations.apply(calculate_precision, axis=1).values
    #print(df_recommendations.head())
    ## calculate average precision
    avg_precision = df_recommendations["precision"].mean()
    #print("average precision : {0:.3f}".format(avg_precision))
    return avg_precision
    

In [25]:
num_groups = len(df_val.invoiceno.drop_duplicates())
baseline = np.tile(most_popular_items, num_groups).reshape(-1, 10)

print('average precision : {0:.4f}'.format(recommendation_precision(df_val, baseline)))

average precision : 0.0579


### ALS Model 

In [26]:
df_train_user = df_train[df_train.customerid != -1].reset_index(drop=True)
customers = sorted(set(df_train_user.customerid))
customers = {c: i for (i, c) in enumerate(customers)}
df_train_user.customerid = df_train_user.customerid.map(customers)

In [27]:
df_val.customerid = df_val.customerid.apply(lambda c: customers.get(c, -1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [28]:
uid = df_train_user.customerid.values.astype('int32')
iid = df_train_user.stockcode.values.astype('int32')
ones = np.ones_like(uid, dtype='uint8')

X_train = sp.csr_matrix((ones, (uid, iid)))

In [29]:
X_train.count_nonzero() / (X_train.shape[0] * X_train.shape[1])

0.013108728461996209

In [111]:
!pip install implicit

[33mYou are using pip version 18.0, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [30]:
from implicit.als import AlternatingLeastSquares

item_user = X_train.T.tocsr()
als = AlternatingLeastSquares(factors=128, regularization=0.000001)
als.fit(item_user)

W0319 21:43:55.485456 140737207169984 utils.py:26] OpenBLAS detected. Its highly recommend to set the environment variable 'export OPENBLAS_NUM_THREADS=1' to disable its internal multithreading
100%|██████████| 15.0/15 [00:02<00:00,  7.12it/s]


In [31]:
als_U = als.user_factors
als_I = als.item_factors


In [32]:
uid_val = df_val.drop_duplicates(subset='invoiceno').customerid.values
known_mask = uid_val != -1
uid_val = uid_val[known_mask] 

In [33]:
imp_baseline = baseline.copy()
pred_all = als_U[uid_val].dot(als_I.T)
top_val = (-pred_all).argsort(axis=1)[:, :10]
imp_baseline[known_mask] = top_val

In [34]:
imp_baseline

array([[ 815,  816, 1347, ..., 2164,  663, 1425],
       [3527, 3506, 1347, ..., 1630, 1313,  182],
       [2178, 2767,  155, ..., 2762,  662, 1100],
       ...,
       [3527, 3506, 1347, ..., 1630, 1313,  182],
       [3527, 3506, 1347, ..., 1630, 1313,  182],
       [3527, 3506, 1347, ..., 1630, 1313,  182]])

In [35]:
imp_baseline.shape

(2435, 10)

In [36]:
recommendation_precision(df_val, imp_baseline)

0.12558521560574948

In [37]:
def init_variable(size, dim, name=None):
    std = np.sqrt(2 / dim)
    return tf.Variable(tf.random_uniform([size, dim], -std, std), name=name)

def embed(inputs, size, dim, name=None):
    emb = init_variable(size, dim, name)
    return tf.nn.embedding_lookup(emb, inputs)


In [38]:
# parameters of the model
num_users = uid.max() + 1
num_items = iid.max() + 1

In [39]:
def get_variable(graph, session, name):
    v = graph.get_operation_by_name(name)
    v = v.values()[0]
    v = v.eval(session=session)
    return v

def calculate_validation_precision(graph, session, uid, df_val):
    U = get_variable(graph, session, 'user_factors')
    I = get_variable(graph, session, 'item_factors')
    bi = get_variable(graph, session, 'item_bias').reshape(-1)

    pred_all = U[uid_val].dot(I.T) + bi
    top_val = (-pred_all).argsort(axis=1)[:, :10]

    imp_baseline = baseline.copy()
    imp_baseline[known_mask] = top_val
    
    return recommendation_precision(df_val, imp_baseline)

    #return precision(val_indptr, val_items, imp_baseline)

In [40]:
num_factors = 128
lambda_user = 0.0000001
lambda_item = 0.0000001
lambda_bias = 0.0000001
lr = 0.0005
graph = tf.Graph()
graph.seed = 1

with graph.as_default():
    place_user = tf.placeholder(tf.int32, shape=(None, 1))
    place_item_pos = tf.placeholder(tf.int32, shape=(None, 1))
    place_item_neg = tf.placeholder(tf.int32, shape=(None, 1))
    # no place_y
    user_factors = embed(place_user, num_users, num_factors,
        "user_factors")
    # no user bias anymore as well as no global bias

    item_factors = init_variable(num_items, num_factors, 
        "item_factors")
    item_factors_pos = tf.nn.embedding_lookup(item_factors, place_item_pos)
    item_factors_neg = tf.nn.embedding_lookup(item_factors, place_item_neg)

    item_bias = init_variable(num_items, 1, "item_bias")
    item_bias_pos = tf.nn.embedding_lookup(item_bias, place_item_pos)
    item_bias_pos = tf.reshape(item_bias_pos, [-1, 1])
    item_bias_neg = tf.nn.embedding_lookup(item_bias, place_item_neg)
    item_bias_neg = tf.reshape(item_bias_neg, [-1, 1])

    # predictions for each item are same as previously
    # but no user bias and global bias
    pred_pos = item_bias_pos + \
        tf.reduce_sum(user_factors * item_factors_pos, axis=2)
    pred_neg = item_bias_neg + \
        tf.reduce_sum(user_factors * item_factors_neg, axis=2)
    
    pred_diff = pred_pos - pred_neg
    
    loss_bpr = -tf.reduce_mean(tf.log(tf.sigmoid(pred_diff)))
    loss_reg = lambda_user * tf.reduce_sum(user_factors * user_factors) +\
        lambda_item * tf.reduce_sum(item_factors_pos * item_factors_pos)+\
        lambda_item * tf.reduce_sum(item_factors_neg * item_factors_neg)+\
        lambda_bias * tf.reduce_sum(item_bias_pos) + \
        lambda_bias * tf.reduce_sum(item_bias_neg)

    loss_total = loss_bpr + loss_reg

    opt = tf.train.AdamOptimizer(learning_rate=lr)
    step = opt.minimize(loss_total)

    init = tf.global_variables_initializer()

AttributeError: module 'tensorflow' has no attribute 'placeholder'

In [41]:
session = tf.Session(config=None, graph=graph)
session.run(init)

size_total = uid.shape[0]
size_sample = 15000

np.random.seed(0)

for i in range(100):
    for k in range(30):
        idx = np.random.randint(low=0, high=size_total, size=size_sample)

        batch_uid = uid[idx].reshape(-1, 1)
        batch_iid_pos = iid[idx].reshape(-1, 1)
        batch_iid_neg = np.random.randint(
            low=0, high=num_items, size=(size_sample, 1), dtype='int32')

        feed_dict = {
            place_user: batch_uid,
            place_item_pos: batch_iid_pos,
            place_item_neg: batch_iid_neg,
        }
        _, l = session.run([step, loss_bpr], feed_dict)

    val_precision = calculate_validation_precision(graph, session, uid_val, df_val)
    print('epoch %02d: precision: %.3f' % (i+1, val_precision))

AttributeError: module 'tensorflow' has no attribute 'Session'