In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
from sklearn.utils import shuffle

In [2]:
# Read the data from the vtc-cab repos
raw_data = pd.read_csv('./activities_201802011009.csv')

In [3]:
# Preview the ratings dataframe
print(raw_data.head())
print(raw_data.shape)

    name accountid  userid                                    id deviceid  \
0  watch   7041046     NaN  99ef7d20-f289-11e7-824b-fda2ff9f7794  Android   
1  watch   7041046     NaN  16836d10-f28a-11e7-a231-9114f613577e  Android   
2  watch   7041046     NaN  b280e9d0-f4cf-11e7-b167-f75a20dec89d  Android   
3  watch   7041046     NaN  6dd1cb50-f4d0-11e7-a231-9114f613577e  Android   
4  watch   7041046     NaN  199f1e11-f4d1-11e7-824b-fda2ff9f7794  Android   

                                    key  \
0                          LYS005228795   
1         tapchiclbvidaibayernmunich_1p   
2  tapchiderbyrealmadridvsbarcelona_lep   
3              aquayoga4tuthechienbinhp   
4                          LYS013573731   

                                            metadata               tstamp  \
0  [Synopsis=Từ 04/04/2017, Title=ON FOOTBALL, bo...  2018-01-06 09:30:46   
1  [Synopsis=, Title=Tạp chí CLB vĩ đại - Bayern ...  2018-01-06 09:34:15   
2  [Synopsis=, Title=Tạp chí Derby - Real Madri

In [4]:
raw_data['accountid'].replace('', np.nan, inplace=True)
raw_data['key'].replace('', np.nan, inplace=True)
raw_data.dropna(subset=['accountid'], inplace=True)
raw_data.dropna(subset=['key'], inplace=True)

In [5]:
# Let's see how many items and customers there are in the dataset
num_cust = len(raw_data.accountid.unique())
num_items = len(raw_data.key.unique())
print('Number of customers: ' + str(num_cust))
print('Number of items bought: ' + str(num_items))

Number of customers: 32729
Number of items bought: 13575


In [6]:
# add one quantity column to dataframe, for simple we just add 1 to everywhere
raw_data['quantity'] = 1

In [7]:
#clean up the raw data 
retail_data = raw_data.loc[pd.isnull(raw_data.accountid) == False]

In [8]:
# Let's group purchase quantities by Stock Code and CustomerID
retail_data = retail_data[['key', 'quantity', 'accountid']]
retail_grouped = retail_data.groupby(['accountid', 'key']).sum().reset_index()

In [9]:
# If the quantity sum is 0, replace with 1 to indicate that there was a purchase of that item atleast
retail_grouped.quantity.loc[retail_grouped.quantity == 0] = 1

# Filter out all negative quantities so that we can focus the recommendation of items that the customer purchased and liked
retail_grouped_final = retail_grouped[retail_grouped.quantity > 0]
print ('\nFinal Matrix of grouped purchases')
print (retail_grouped_final.head())


Final Matrix of grouped purchases
        accountid                            key  quantity
0  5038088.CD-TV1             gioithieuonfuturem         1
1  5038088.CD-TV1    holyflameofthemartialworldm         2
2  5038088.CD-TV1             move_dancecover4km         1
3  5038088.CD-TV1           swalla_dancecover4km         1
4  5038088.CD-TV1  transformer_thelastknight_4km         1


In [10]:
# Get list of unique customers
cust_list = list(np.sort(retail_grouped_final.accountid.unique()))
# Get list of unique items bought
item_list = list(np.sort(retail_grouped_final.key.unique()))
# Get list of all the purchase quantities
quantity_list = list(retail_grouped_final.quantity)

# Building the matrix....
mat_rows = retail_grouped_final.key.astype('category', categories = item_list).cat.codes
mat_cols = retail_grouped_final.accountid.astype('category', categories = cust_list).cat.codes
items_users = sparse.csr_matrix((quantity_list, (mat_rows, mat_cols)), shape = (len(item_list), len(cust_list)))

In [11]:
print ("Shape of sparse matrix " + str(items_users.shape))
items_users

Shape of sparse matrix (13575, 32729)


<13575x32729 sparse matrix of type '<class 'numpy.int64'>'
	with 116307 stored elements in Compressed Sparse Row format>

In [12]:
# Let's check how sparse the matrix is 
# Get all possible combination of purchases
items_users_size = items_users.shape[0]*items_users.shape[1]
# Get actual number of item purchased
num_purchases = len(items_users.nonzero()[0])
sparse_per = 100*(1 - (num_purchases/items_users_size))
print ('Sparsity Percentage in Ratings Matrix is ' + str(round(sparse_per,2)))

Sparsity Percentage in Ratings Matrix is 99.97


In [13]:
'''
Split into test and train by masking some values of the dataset in the training set with 0s indicating the customer did not purchase.
Then run the model on the complete matrix and see if the predicted values are equal to the original values before masking

'''

def create_train(matrix_data, mask_pct = 0.2):
    '''
    This function will take in the complete customer-item matrix and "mask" a percentage of the original purchases where a
    user-item interaction has taken place for use as a test set. The test set will contain all of the original purchases, 
    while the training set replaces the specified percentage of them with a zero in the original purchases matrix. 
    
    args:
    matrix_data - the original purchases matrix from which you want to generate a train/test set. Takes sparse csr_matrix form.
    
    mask_pct - The percentage of customer-item interactions where an interaction took place that you want to mask in the 
    training set for later comparison to the test set. 
    
    returns:
    
    training_set - The altered version of the original data with a certain percentage of the customer-item pairs 
    that originally had interaction set back to zero.
    
    test_set - A copy of the original purchase matrix converted to binary - 1 indicates purchase and 0 indicates no purchase.
    
    user_inds - From the randomly selected customer-item indices, which customer rows were altered in the training data.
    '''
    # Prepare the test set
    test_mat = matrix_data.copy()
    test_mat[test_mat != 0] = 1
    # Prepare the training set
    training_mat = matrix_data.copy()
    # Get indices of purchases in the matrix
    purchase_idx = training_mat.nonzero()
    # Get corresponding user-item indices of the purchase
    purchase_pairs = list(zip(purchase_idx[0], purchase_idx[1]))
    random.seed(0)
    # Number of samples to mask
    num_samples = int(np.ceil(mask_pct*len(purchase_pairs)))
    # Randomly sample from the purchases
    samples = random.sample(purchase_pairs, num_samples)
    item_idx = [index[0] for index in samples]
    customer_idx = [index[1] for index in samples]
    # Mask the items in the above indentified indices as 0
    training_mat[item_idx, customer_idx] = 0 
    # To save space, eliminate the zeros in the sparse matrix
    training_mat.eliminate_zeros()
    return training_mat, test_mat, list(set(customer_idx))

'\nSplit into test and train by masking some values of the dataset in the training set with 0s indicating the customer did not purchase.\nThen run the model on the complete matrix and see if the predicted values are equal to the original values before masking\n\n'

In [14]:
# Call the create_train function
train_mat, test_mat, customer_idx = create_train(items_users)
train_mat
test_mat

<13575x32729 sparse matrix of type '<class 'numpy.int64'>'
	with 93045 stored elements in Compressed Sparse Row format>

<13575x32729 sparse matrix of type '<class 'numpy.int64'>'
	with 116307 stored elements in Compressed Sparse Row format>

In [15]:
model = AlternatingLeastSquares(factors=40,iterations=30)



In [16]:
#train_mat = bm25_weight(train_mat,  B=0.9) * 5

In [17]:
model.fit(train_mat)

In [18]:
def calc_mean_auc(training_mat, altered_custs, predictions, test_mat):
    '''
    args:
    training_mat - The orginial training_mat where we masked some customers' purchases to zero. 
    
    altered_custs - The indices of the customers where atleast one customer/item pair was altered to 0.
    
    predictions - The matrix of predicted purchases. These should be stored in a list, with customer vectors as item zero and item vectors as item one.
    
    test_mat - The test matrix constructed from the create_train function
    
    returns:
    
    The mean AUC of the test set only on customer-item iteractions that were originally zero to test ranking ability in addition to most popular items as a benchmark.
    '''
    
    store_auc = []
    popularity_auc = []
    # Get sum of item interactions to get most popular items
    popular_items = np.array(test_mat.sum(axis =1)).reshape(-1)
    item_vecs = predictions[1]
    item_vecs = item_vecs.T
    for cust in altered_custs:
        # Get the training matrix row where the interactions were zero
        training_col = training_mat[:, cust].toarray().reshape(-1)
        zero_inds = np.where(training_col == 0)
        # Get the predicted values based on our customer/item vectors
        cust_vec = predictions[0][cust,:]
        pred = cust_vec.dot(item_vecs).toarray()[0, zero_inds].reshape(-1)
        # Get only items that were originally zero
        actual = test_mat[:,cust].T.toarray()[0, zero_inds].reshape(-1)
        popular = popular_items[zero_inds]
        # Calculate AUC for predicted vs actual
        fpr, tpr, thresholds = metrics.roc_curve(actual, pred)
        store_auc.append(metrics.auc(fpr, tpr))
        # Calculate AUC for popular vs actual
        fpr_pop, tpr_pop, thresholds_pop = metrics.roc_curve(actual, popular)
        popularity_auc.append(metrics.auc(fpr_pop, tpr_pop))
        
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

In [19]:
calc_mean_auc(train_mat, customer_idx, [sparse.csr_matrix(model.user_factors), sparse.csr_matrix(model.item_factors)], test_mat)

(0.604, 0.923)

In [20]:
# Sampling the Recommendations provide using the item:desc dictionary we had created earlier
customers_arr = np.array(cust_list)
items_arr = np.array(item_list)

In [21]:
def get_items_purchased(customer_id, train_mat, customers_arr, items_arr):
    '''
    Returns the items purchased by a specific customer in the training set
    
    args:
    customer_id - ID of a customer whose made atleast one purchase
    train_mat - The initial purchase matrix that we masked a percentage of
    customers_arr - Array of customers in the purchase matrix
    items_arr - Array of items in the purchase matrix
    
    returns:
    A dictionary of stock_cd and description of those items already purchased
    '''
    # Get the index of the row where that customer ID is present
    cust_ind = np.where(customers_arr == customer_id)[0][0]
    # Get all the indices of the purchases made
    purchase_ind = train_mat[:,cust_ind].nonzero()[0]
    # Retrieve the product codes for the purchase indices
    stock = items_arr[purchase_ind]
    # Look up the description for the stock code from itemDescDict
    #subdict = {x: item_lookup[x] for x in stock_codes if x in item_lookup}
    return stock

In [22]:
# Let's look at what the first 5 customers purchased
for cust in shuffle(customers_arr)[:9]:
    print('Customer ID: ', cust)
    print(get_items_purchased(cust, train_mat, customers_arr, items_arr))
    print('--------------------------------------------------')

Customer ID:  6950916
['wheninromep']
--------------------------------------------------
Customer ID:  6955129
['thachthucdanhhaimua3tap17gala3p']
--------------------------------------------------
Customer ID:  6935183
['mrscop1p']
--------------------------------------------------
Customer ID:  6658683
['youcamefromthestars1p']
--------------------------------------------------
Customer ID:  6733024
[]
--------------------------------------------------
Customer ID:  6778568
['DP27_Congaicuacachuyenthoaip' 'LYS002171878']
--------------------------------------------------
Customer ID:  6739797
['LYS011255246' 'vucthamvohinhtap21p']
--------------------------------------------------
Customer ID:  6280749
['LYS002161371' 'dienvienlanphuongp']
--------------------------------------------------
Customer ID:  7067626
['LYS002171878' 'LYS003482908']
--------------------------------------------------


In [23]:
def get_rec_item(model,customer_id,customers_arr,items_arr,items_users):
    cust_ind = np.where(customers_arr == customer_id)[0][0]
    recs =  model.recommend(cust_ind,items_users.T)
    rec_list = []
    for rec in recs:
        rec_list.append(items_arr[rec[0]])
    return rec_list

In [24]:
# Let's look at what was recommended for the 3 customers
for cust in shuffle(customers_arr)[:9]:
    print('Customer ID: ', cust)
    print(get_rec_item(model, cust, customers_arr, items_arr,items_users))
    print('--------------------------------------------------')

Customer ID:  6804220
['LYS002171878', 'LYS002161371', 'tapchiderbytottenhamvsarsenalp', 'flyingdaggerthep', 'DP23_Anvap', 'TCdvliverpoolvsmanchesterunitedp', 'tapchiclbvidaibenfica_1p', 'tapchiclbvidaibayernmunich_1p', 'madeaswitnessprotectionp', 'mininijas11p']
--------------------------------------------------
Customer ID:  6934541
['LYS003482908', 'caccauthudamdanggioinoitrop', 'hlvnguyenhuuthangp', 'tapchiderbybrazilvsargentinap', 'cacclbcotenkylap', 'buidoanquanghuyp', 'hanamutdvsliverpoolhanoifcp', 'caulongcanmothocvienp', '6flingdragons13p', 'LYS011764101']
--------------------------------------------------
Customer ID:  7013940
['LYS002161371', '5vungchienthuatp', 'tapchiderbytottenhamvsarsenalp', 'gooddoctor1p', 'TCdvliverpoolvsmanchesterunitedp', 'thelastsongp', 'hellostrangerp', 'tapchiclbvidaibayernmunich_1p', 'DP22_CactranchungketChampionsLeaguep', 'flyingdaggerthep']
--------------------------------------------------
Customer ID:  7029266
['LYS002171878', 'spiritstallion

In [25]:
'''
Let's compare the items bought vs the items recommended by customer in a dataframe so 
that we can see more clearly how the recommendation engine did
'''

def compare_purchase_rec(accountid, key, rec_list):
    '''
    This function returns a dataframe with the 10 purchased items and top 10 recommended items for each customer
    
    args:
    customer_id - The customer ID in the purchase matrix
    purchase_dict - The output of the get_items_purchased function which is a dictionary of stock_cd:description of items purchased
    rec_list - The output of the get_rec_item function which is a list of the top n stock_cd and description pairs
    
    returns:
    A dataframe with all purchased items and top n recommended items by customer
    '''
    # Create dataframes of one column each - CustomerID, Purchased items, Recommended Items
    cust_df = pd.DataFrame({'CustID': [accountid]})
    purchase_df = pd.DataFrame({'PurchasedItem': list(key)})
    rec_df = pd.DataFrame({'RecommendedItem': [pair for pair in rec_list]})

    # Column wise concatenate the dataframes
    final_df = pd.concat([cust_df, purchase_df, rec_df], ignore_index=True, axis=1)
    # Format the final dataframe
    final_df.columns = ['CustID', 'PurchasedItem', 'RecommendedItem']
    final_df['PurchasedItem'] = final_df.PurchasedItem.astype(str)
    final_df['RecommendedItem'] = final_df.RecommendedItem.astype(str)
    final_df = final_df.fillna('')
    final_df = final_df.replace('nan', '', regex=True)
    return final_df

"\nLet's compare the items bought vs the items recommended by customer in a dataframe so \nthat we can see more clearly how the recommendation engine did\n"

In [26]:
# Let's compare the same 3 customers
for cust in shuffle(customers_arr)[:9]:
    print(tabulate(compare_purchase_rec(cust, 
                               get_items_purchased(cust, train_mat, customers_arr, items_arr),
                               get_rec_item(model, cust, customers_arr, items_arr,items_users)),
                  headers= ['CustID', 'PurchasedItem', 'RecommendedItem']))

    CustID    PurchasedItem    RecommendedItem
--  --------  ---------------  ---------------------------------
 0  6874930   LYS002171878     LYS003482908
 1            LYS003482908     LYS002171878
 2            LYS005294306     LYS005294306
 3                             gapnhaucuoinam2017p
 4                             tapchiderbytottenhamvsarsenalp
 5                             remember1p
 6                             tapchiderbybrazilvsargentinap
 7                             DP26_KhagcuacaccauthutrongCasinop
 8                             theitalianjobp
 9                             golfernuquyenrup
    CustID    PurchasedItem                        RecommendedItem
--  --------  -----------------------------------  -----------------------
 0  6980929   LYS013063491                         SuQuyenRuChetNguoi1p
 1            nhungdieukithutaicacgiaivdqgchauaup  nhungduongcongnongbongp
 2            thienduongamthucmua2tap9p            kungfuinstructorthep
 3                  