In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Read the activity export (from the vtc-cab repos) into a dataframe.
# The path is relative to the notebook's working directory.
raw_data = pd.read_csv('./activities_201802011009.csv')

In [3]:
# Preview the first rows of the raw activity dataframe, then its (rows, columns) shape
print(raw_data.head())
print(raw_data.shape)

    name accountid  userid                                    id deviceid  \
0  watch   7041046     NaN  99ef7d20-f289-11e7-824b-fda2ff9f7794  Android   
1  watch   7041046     NaN  16836d10-f28a-11e7-a231-9114f613577e  Android   
2  watch   7041046     NaN  b280e9d0-f4cf-11e7-b167-f75a20dec89d  Android   
3  watch   7041046     NaN  6dd1cb50-f4d0-11e7-a231-9114f613577e  Android   
4  watch   7041046     NaN  199f1e11-f4d1-11e7-824b-fda2ff9f7794  Android   

                                    key  \
0                          LYS005228795   
1         tapchiclbvidaibayernmunich_1p   
2  tapchiderbyrealmadridvsbarcelona_lep   
3              aquayoga4tuthechienbinhp   
4                          LYS013573731   

                                            metadata               tstamp  \
0  [Synopsis=Từ 04/04/2017, Title=ON FOOTBALL, bo...  2018-01-06 09:30:46   
1  [Synopsis=, Title=Tạp chí CLB vĩ đại - Bayern ...  2018-01-06 09:34:15   
2  [Synopsis=, Title=Tạp chí Derby - Real Madri

In [4]:
# Treat empty-string account ids / content keys as missing values.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selected from the frame is chained in-place mutation and is not
# guaranteed to modify raw_data itself.
raw_data['accountid'] = raw_data['accountid'].replace('', np.nan)
raw_data['key'] = raw_data['key'].replace('', np.nan)
# Drop every row that is missing either identifier
raw_data.dropna(subset=['accountid', 'key'], inplace=True)

In [5]:
# Count the distinct account ids and distinct content keys in the dataset
# (dropna=False keeps the count identical to len(unique()))
num_cust = raw_data['accountid'].nunique(dropna=False)
num_items = raw_data['key'].nunique(dropna=False)
for label, count in (('Number of customers: ', num_cust),
                     ('Number of items bought: ', num_items)):
    print(label + str(count))

Number of customers: 32729
Number of items bought: 13575


In [6]:
# Add a quantity column: every activity row counts as one interaction
raw_data['quantity'] = 1
# Give completed videos a larger weight to denote a stronger preference.
# This must be a single .loc call: raw_data[mask]['quantity'] = 5 assigns to a
# temporary copy and leaves raw_data unchanged.
raw_data.loc[raw_data['value'] == 'complete', 'quantity'] = 5

In [7]:
# Keep only the rows that carry an account id
has_account = raw_data.accountid.notnull()
retail_data = raw_data.loc[has_account]

In [8]:
# (rows, columns) of the filtered activity data
retail_data.shape

(173364, 10)

In [9]:
# Work on the first 70000 rows only (positional slice).
# NOTE: re-running this cell is harmless, but the row limit is a hard-coded choice.
retail_data = retail_data.iloc[:70000]

In [10]:
# Confirm the shape after truncating to the first 70000 rows
retail_data.shape

(70000, 10)

In [11]:
# Sum the interaction quantity for every (accountid, key) pair
retail_data = retail_data.loc[:, ['key', 'quantity', 'accountid']]
retail_grouped = retail_data.groupby(['accountid', 'key'], as_index=False).sum()

In [12]:
# Show the first 10 (accountid, key) pairs with their summed quantity
retail_grouped.head(10)

Unnamed: 0,accountid,key,quantity
0,5129035-TV1,LYS005056949,2
1,5129035-TV1,LYS014206049,1
2,5129035-TV1,annaandthekingm,1
3,5129035-TV1,astrademovideo4km,1
4,5129035-TV1,changtraicuaem_tap2_4km,2
5,5129035-TV1,changtraicuaem_tap3_4km,3
6,5129035-TV1,garfieldm,1
7,5129035-TV1,killerconstablem,1
8,5129035-TV1,marvelstheavengersm,1
9,5129035-TV1,mousehuntm,1


In [13]:
# If the quantity sum is 0, replace it with 1 to indicate that there was at least one interaction with that item.
# A single .loc call on the frame is used: retail_grouped.quantity.loc[...] = 1 is chained
# assignment and may write to a temporary object instead of retail_grouped.
retail_grouped.loc[retail_grouped.quantity == 0, 'quantity'] = 1

# Filter out all negative quantities so that the recommendations focus on items the customer actually interacted with
retail_grouped_final = retail_grouped[retail_grouped.quantity > 0]
print ('\nFinal Matrix of grouped purchases')
print (retail_grouped_final.head())


Final Matrix of grouped purchases
     accountid                      key  quantity
0  5129035-TV1             LYS005056949         2
1  5129035-TV1             LYS014206049         1
2  5129035-TV1          annaandthekingm         1
3  5129035-TV1        astrademovideo4km         1
4  5129035-TV1  changtraicuaem_tap2_4km         2


In [14]:
# Sorted list of unique customers (defines the row order of the matrix)
cust_list = list(np.sort(retail_grouped_final.accountid.unique()))
# Sorted list of unique items (defines the column order of the matrix)
item_list = list(np.sort(retail_grouped_final.key.unique()))
# Interaction quantities, in the same row order as retail_grouped_final
quantity_list = list(retail_grouped_final.quantity)


# Building the matrix....
# Map every account id / item key to its position in cust_list / item_list.
# pd.Categorical(..., categories=...) replaces Series.astype('category', categories=...),
# a signature that current pandas no longer accepts.
mat_rows = pd.Categorical(retail_grouped_final.accountid, categories = cust_list).codes
mat_cols = pd.Categorical(retail_grouped_final.key, categories = item_list).codes

# Customers x items matrix holding the summed quantity of each interaction
purchases_mat = sparse.csr_matrix((quantity_list, (mat_rows, mat_cols)), shape = (len(cust_list), len(item_list)))

In [15]:
# Print the (customers, items) dimensions, then echo the sparse matrix summary
print ("Shape of sparse matrix " + str(purchases_mat.shape))
purchases_mat

Shape of sparse matrix (13423, 8422)


<13423x8422 sparse matrix of type '<class 'numpy.int64'>'
	with 47165 stored elements in Compressed Sparse Row format>

In [16]:
# Sparsity: percentage of customer-item cells that hold no interaction
# Total number of cells in the customer-item matrix
purchase_mat_size = purchases_mat.shape[0] * purchases_mat.shape[1]
# Number of cells with a non-zero quantity
nonzero_rows, _ = purchases_mat.nonzero()
num_purchases = nonzero_rows.size
filled_fraction = num_purchases / purchase_mat_size
sparse_per = 100*(1 - (filled_fraction))
print ('Sparsity Percentage in Ratings Matrix is ' + str(round(sparse_per,2)))

Sparsity Percentage in Ratings Matrix is 99.96


In [17]:
'''
Split into test and train by masking some values of the dataset in the training set with 0s indicating the customer did not purchase.
Then run the model on the complete matrix and see if the predicted values are equal to the original values before masking

'''

def create_train(matrix_data, mask_pct = 0.2, seed = 0):
    '''
    Split a customer-item matrix into a training matrix and a binary test matrix by "masking"
    (setting to zero) a random share of the observed interactions in the training copy.

    args:
    matrix_data - the original purchases matrix from which to generate the train/test sets. Takes sparse csr_matrix form.
    It is copied, never modified.

    mask_pct - The fraction (0 to 1) of the non-zero customer-item interactions to mask in the training set.
    The number of masked interactions is ceil(mask_pct * number of non-zero entries).

    seed - Seed of the private random generator used to pick the masked interactions. The default (0)
    selects the same interactions as seeding the global random module with 0, but the global generator
    is left untouched.

    returns:

    training_set - A copy of the original data with the sampled customer-item pairs set back to zero
    (and removed from the sparse structure).

    test_set - A copy of the original purchase matrix converted to binary - 1 indicates purchase and 0 indicates no purchase.

    user_inds - The distinct customer rows that had at least one interaction masked in the training data.

    raises:
    ValueError - (from random sampling) if mask_pct asks for fewer than 0 or more than all interactions.
    '''
    # Prepare the test set: every observed interaction becomes 1
    test_mat = matrix_data.copy()
    test_mat[test_mat != 0] = 1
    # Prepare the training set
    training_mat = matrix_data.copy()
    # (row, column) positions of every observed interaction
    purchase_idx = training_mat.nonzero()
    purchase_pairs = list(zip(purchase_idx[0], purchase_idx[1]))
    # A private generator keeps the split reproducible without reseeding
    # the module-level random state that other code may rely on
    rng = random.Random(seed)
    # Number of samples to mask
    num_samples = int(np.ceil(mask_pct*len(purchase_pairs)))
    # Randomly sample from the purchases
    samples = rng.sample(purchase_pairs, num_samples)
    customer_idx = [index[0] for index in samples]
    item_idx = [index[1] for index in samples]
    # Mask the sampled interactions; skipped when nothing was sampled so that
    # an empty index list is never passed to the sparse setter
    if samples:
        training_mat[customer_idx, item_idx] = 0
    # To save space, eliminate the zeros in the sparse matrix
    training_mat.eliminate_zeros()
    return training_mat, test_mat, list(set(customer_idx))

'\nSplit into test and train by masking some values of the dataset in the training set with 0s indicating the customer did not purchase.\nThen run the model on the complete matrix and see if the predicted values are equal to the original values before masking\n\n'

In [18]:
# Call the create_train function with the default mask_pct:
# train_mat has part of the interactions zeroed out, test_mat is the binarised original,
# customer_idx lists the customer rows that had at least one interaction masked
train_mat, test_mat, customer_idx = create_train(purchases_mat)
# Echo both matrices (the first cell sets ast_node_interactivity to "all", so every bare expression is displayed)
train_mat
test_mat

<13423x8422 sparse matrix of type '<class 'numpy.int64'>'
	with 37732 stored elements in Compressed Sparse Row format>

<13423x8422 sparse matrix of type '<class 'numpy.int64'>'
	with 47165 stored elements in Compressed Sparse Row format>

In [None]:
##### ALS Matrix Factorization
### Reference : http://yifanhu.net/PUB/cf.pdf


def _als_half_step(conf_rows, fixed_vecs, solved_vecs, lambda_eye):
    '''
    Solve, in place, every row of solved_vecs while fixed_vecs is held constant.

    conf_rows - csr matrix of (confidence - 1) values; row r holds the values linking entity r of the
    side being solved to the entities of the fixed side.
    fixed_vecs - dense (n_fixed x rank) factor array of the side held constant.
    solved_vecs - dense (n_solved x rank) factor array, overwritten row by row.
    lambda_eye - dense (rank x rank) regularisation matrix lambda * I.
    '''
    # The Gram matrix is taken from the factors as they are right now, so the item sweep
    # sees the customer factors produced by the sweep that just finished
    gram = fixed_vecs.T.dot(fixed_vecs)
    indptr, indices, data = conf_rows.indptr, conf_rows.indices, conf_rows.data
    for row in range(conf_rows.shape[0]):
        lo, hi = indptr[row], indptr[row + 1]
        stored = data[lo:hi]
        # Explicitly stored zeros carry no preference, exactly like absent entries
        observed = stored != 0
        weights = stored[observed]
        factors = fixed_vecs[indices[lo:hi][observed]]
        # Y^T Y + Y^T (C - I) Y + lambda * I ; only observed entries contribute to the middle term
        lhs = gram + (factors.T * weights).dot(factors) + lambda_eye
        # Y^T C p ; the preference p is 1 on observed entries, where the confidence is weight + 1
        rhs = factors.T.dot(weights + 1.0)
        solved_vecs[row] = np.linalg.solve(lhs, rhs)


def implicit_weighted_ALS(training_mat, lambda_val = 0.1, alpha = 40, iterations = 10, rank_size = 20, seed = 0):
    '''
    Implicit-feedback weighted Alternating Least Squares (Hu, Koren & Volinsky).

    args:
    training_mat - Matrix with shape m x n; m = number of customers, n = number of items

    lambda_val - Regularization constraint for bias-variance trade-off. Increasing lambda_val increases bias but reduces variance

    alpha - Scale of the confidence: the confidence of an interaction with quantity r is 1 + alpha * r. Decreasing this value decreases the weight given to observed interactions.

    iterations - Number of times to alternate between the customer feature vectors (U) and item feature vectors (V). More iterations give better convergence but increase computation.

    rank_size - Number of latent features in the customer/item feature vectors.

    seed - Seed of the random number generator used to initialise U and V.

    returns:
    U - sparse matrix (m x rank_size) of customer feature vectors.
    V.T - sparse matrix (rank_size x n) of item feature vectors, already transposed.
    U.dot(V.T) gives the predicted preference matrix.

    Prints "step: k" at the start of every iteration.
    '''

    # (confidence - 1) for every observed interaction, stored by customer row and by item row
    conf_by_cust = sparse.csr_matrix(alpha * training_mat, dtype = float)
    conf_by_cust.sum_duplicates()
    conf_by_item = sparse.csr_matrix(conf_by_cust.T)
    num_cust, num_item = conf_by_cust.shape

    # Random initial factors; the factors are dense, so they are kept as plain arrays while solving
    state = np.random.RandomState(seed)
    U = state.normal(size = (num_cust, rank_size))
    V = state.normal(size = (num_item, rank_size))
    lambda_eye = lambda_val * np.eye(rank_size)

    # Alternate between solving for U with V fixed and vice versa
    for step in range(iterations):
        print("step: {}".format(step+1))
        # Solve every customer vector on fixed item vectors
        _als_half_step(conf_by_cust, V, U, lambda_eye)
        # Solve every item vector on the freshly updated customer vectors
        _als_half_step(conf_by_item, U, V, lambda_eye)

    # Callers expect sparse matrices (they call .dot(...).toarray() on the results)
    return sparse.csr_matrix(U), sparse.csr_matrix(V).T

In [None]:
# Call the function with lambda_val 0.1, alpha 40, 30 iterations and 20 latent features
cust_vecs, item_vecs = implicit_weighted_ALS(train_mat, lambda_val=0.1, alpha = 40, iterations = 30, rank_size = 20)

step: 1
step: 2
step: 3
step: 4
step: 5
step: 6
step: 7
step: 8
step: 9
step: 10
step: 11
step: 12
step: 13
step: 14
step: 15
step: 16
step: 17
step: 18
step: 19
step: 20
step: 21
step: 22
step: 23
step: 24
step: 25
step: 26
step: 27
step: 28
step: 29
step: 30


In [None]:
# Predictions for the first customer across the first 5 items.
# Only the first customer's row is multiplied with the item vectors: the slice [0:,]
# selected every customer and materialised the full dense customers x items matrix
# just to read its first row.
cust_vecs[0, :].dot(item_vecs).toarray()[0, :5]

In [None]:
def calc_mean_auc(training_mat, altered_custs, predictions, test_mat):
    '''
    Mean per-customer AUC of the model, next to a "most popular items" baseline.

    args:
    training_mat - The training matrix in which some customers' purchases were masked to zero.

    altered_custs - The row indices of the customers that had at least one customer/item pair masked.

    predictions - A two-element sequence: sparse customer vectors first, sparse item vectors
    (already transposed, rank x items) second.

    test_mat - The binary test matrix constructed by the create_train function.

    returns:

    (model_auc, popularity_auc), each rounded to 3 decimals. For every altered customer only the items
    that are zero in the training row are scored, so the AUC measures how well masked purchases are ranked.
    '''

    def _area_under_roc(truth, scores):
        # ROC curve first, then its area
        false_pos, true_pos, _ = metrics.roc_curve(truth, scores)
        return metrics.auc(false_pos, true_pos)

    cust_factors, item_vecs = predictions[0], predictions[1]
    # Total interactions per item: the popularity baseline score
    popular_items = np.array(test_mat.sum(axis =0)).reshape(-1)

    store_auc, popularity_auc = [], []
    for cust in altered_custs:
        # Columns with no interaction in the training row
        training_row = training_mat[cust, :].toarray().reshape(-1)
        zero_inds = np.flatnonzero(training_row == 0)
        # Model scores and ground truth restricted to those columns
        pred = cust_factors[cust, :].dot(item_vecs).toarray()[0, zero_inds]
        actual = test_mat[cust, :].toarray()[0, zero_inds]
        store_auc.append(_area_under_roc(actual, pred))
        popularity_auc.append(_area_under_roc(actual, popular_items[zero_inds]))

    mean_model = float('%.3f'%np.mean(store_auc))
    mean_popular = float('%.3f'%np.mean(popularity_auc))
    return mean_model, mean_popular

In [None]:
calc_mean_auc(train_mat, customer_idx, [sparse.csr_matrix(cust_vecs), sparse.csr_matrix(item_vecs)], test_mat)

In [None]:
# Sampling the recommendations: array versions of the customer and item lists, used below to
# translate between ids and matrix row/column positions
# NOTE(review): the original comment mentioned an item:desc dictionary; none is created in the cells shown here

customers_arr = np.array(cust_list)
items_arr = np.array(item_list)

In [None]:
def get_items_purchased(customer_id, train_mat, customers_arr, items_arr):
    '''
    Look up which items a customer has interacted with in the training matrix.

    args:
    customer_id - ID of a customer present in customers_arr
    train_mat - Sparse customer x item training matrix
    customers_arr - Array of customer IDs, ordered like the rows of train_mat
    items_arr - Array of item IDs, ordered like the columns of train_mat

    returns:
    The entries of items_arr whose column is non-zero in the customer's row

    raises:
    IndexError if customer_id does not occur in customers_arr
    '''
    # Row position of the first entry matching the customer ID
    row_pos = np.flatnonzero(customers_arr == customer_id)[0]
    # Column positions holding a non-zero value in that row
    _, bought_cols = train_mat[row_pos, :].nonzero()
    # Translate column positions back to item IDs
    return items_arr[bought_cols]

In [None]:
# Let's look at what the first 9 customers purchased (according to the training matrix)
for cust in customers_arr[:9]:
    print('Customer ID: ', cust)
    print(get_items_purchased(cust, train_mat, customers_arr, items_arr))
    print('--------------------------------------------------')

In [None]:
# Now let's write a function to get the recommended items for each of these customers using our recommendation engine

def get_rec_item(customer_id, train_mat, customer_vecs, items_vecs, customer_arr, item_arr, num_items = 10):
    '''
    This function will return the top num_items recommended items for a customer

    args:
    customer_id - ID of a customer who we want to see the recommendations for
    train_mat - The sparse customer x item purchase matrix that we masked a percentage of
    customer_vecs - Sparse customer feature vectors (customers x rank)
    items_vecs - Sparse item feature vectors, already transposed (rank x items)
    customer_arr - Array of customer IDs, ordered like the rows of train_mat
    item_arr - Array of item IDs, ordered like the columns of train_mat
    num_items - The number of recommended items, in order of best recommendation to lowest.

    returns:
    A list with the top num_items item IDs, ranked by predicted score, for items never
    purchased/interacted with before in train_mat

    raises:
    IndexError if customer_id does not occur in customer_arr
    '''
    # Get index of customerID
    cust_ind = np.where(customer_arr == customer_id)[0][0]
    # Get purchases made by that customer
    pref_vec = train_mat[cust_ind,:].toarray()
    # Add 1 to all purchases so that items not purchased yet become equal to 1
    pref_vec = pref_vec.reshape(-1) + 1
    # Make items that were already purchased 0 (so that they don't get included in the recommendation)
    pref_vec[pref_vec > 1] = 0
    # Get dot product of the customer vector across all items.
    # The items_vecs argument is used here; the previous code read a global
    # named item_vecs and silently ignored the argument.
    rec_vector = customer_vecs[cust_ind,:].dot(items_vecs).toarray()
    # Scale the recommendations between 0 and 1 using MinMax, so that the
    # multiplication below pushes purchased items to the bottom of the ranking
    scaler = MinMaxScaler()
    rec_vector_scaled = scaler.fit_transform(rec_vector.reshape(-1,1))[:,0]
    # Multiply by the purchased vector so that items already purchased are set to 0
    recommend_vector = pref_vec*rec_vector_scaled
    # Sort the indices in order of recommendations
    item_idx = np.argsort(recommend_vector)[::-1][:num_items]
    # Translate the column positions into item IDs
    return [item_arr[index] for index in item_idx]

In [None]:
# Let's look at what was recommended for the first 9 customers
for cust in customers_arr[:9]:
    print('Customer ID: ', cust)
    print(get_rec_item(cust, train_mat, cust_vecs, item_vecs, customers_arr, items_arr))
    print('--------------------------------------------------')

In [None]:
'''
Let's compare the items bought vs the items recommended by customer in a dataframe so 
that we can see more clearly how the recommendation engine did
'''

def compare_purchase_rec(accountid, key, rec_list):
    '''
    This function returns a dataframe that puts a customer's purchased items next to the recommended items

    args:
    accountid - The customer ID in the purchase matrix
    key - The output of the get_items_purchased function: an iterable of the item IDs purchased
    rec_list - The output of the get_rec_item function: a list of the top n recommended item IDs

    returns:
    A dataframe with the columns CustID, PurchasedItem and RecommendedItem. It has as many rows as the
    longest of the inputs; shorter columns are padded with empty strings and the customer ID appears
    in the first row only.
    '''
    # Create dataframes of one column each - CustomerID, Purchased items, Recommended Items
    cust_df = pd.DataFrame({'CustID': [accountid]})
    purchase_df = pd.DataFrame({'PurchasedItem': list(key)})
    rec_df = pd.DataFrame({'RecommendedItem': list(rec_list)})

    # Column wise concatenate the dataframes; shorter columns are padded with NaN
    final_df = pd.concat([cust_df, purchase_df, rec_df], ignore_index=True, axis=1)
    final_df.columns = ['CustID', 'PurchasedItem', 'RecommendedItem']
    # Blank the padding BEFORE casting to str. Casting first turns NaN into the text 'nan',
    # and removing that text with a regex also cut "nan" out of real IDs
    # (e.g. 'banana' became 'baa').
    final_df = final_df.fillna('')
    final_df['PurchasedItem'] = final_df.PurchasedItem.astype(str)
    final_df['RecommendedItem'] = final_df.RecommendedItem.astype(str)
    return final_df

In [None]:
# Let's compare purchases and recommendations side by side for two customers
# (positions 10 and 11 of customers_arr - not the customers printed above)
for cust in customers_arr[10:12]:
    print(tabulate(compare_purchase_rec(cust, 
                               get_items_purchased(cust, train_mat, customers_arr, items_arr),
                               get_rec_item(cust, train_mat, cust_vecs, item_vecs, customers_arr, items_arr)),
                  headers= ['CustID', 'PurchasedItem', 'RecommendedItem']))