# Packages and Functions

In [19]:
import numpy as np
import pandas as pd
from scipy.stats import dirichlet
from scipy.special import logsumexp, gammaln, digamma, polygamma
from datetime import datetime
import pickle
from sklearn.preprocessing import MultiLabelBinarizer

In [20]:
def logdotexp(A, B):
    max_A = np.max(A)
    max_B = np.max(B)
    C = np.dot(np.exp(A - max_A), np.exp(B - max_B))
    np.log(C, out=C)
    C += max_A + max_B
    return C

# Load List of Sparse Matrices

In [21]:
path = '/content/drive/MyDrive/PhD/Modules/IS6101 Topics in Machine Learning and Optimization/HMM for CF/Data and Parameters/'
users_ds = pickle.load(open(path + 'users_ds.pkl','rb'))

In [22]:
U = len(users_ds)
T = users_ds[0].shape[0]
N = users_ds[0].shape[1]
U, T, N

(874, 73, 17606)

In [23]:
# test_ds = [user[-1,:] for user in users_ds]
# users_ds = [user[:-1,:] for user in users_ds]

In [24]:
t_predict = -2
test_ds = [user[t_predict,:] for user in users_ds]
users_ds = [user[47:t_predict,:] for user in users_ds]

In [25]:
len(test_ds), test_ds[0].shape[0], test_ds[0].shape[1]

(874, 1, 17606)

In [26]:
U = len(users_ds)
T = users_ds[0].shape[0]
N = users_ds[0].shape[1]
U, T, N

(874, 24, 17606)

In [27]:
# calculate and store total number of ratings per user in a period

users_Nt = []
for i in range(len(users_ds)):
    user_sum = users_ds[i].sum(axis=-1)
    user_sum = np.array([row[0,0] for row in user_sum]) # for some reason, flatten/squeeze doesn't work
    users_Nt.append(user_sum)
    
len(users_Nt), users_Nt[0].shape

(874, (24,))

# Multiple runs on same settings

In [28]:
for r in range(5):
    # Initial parameters
    K = 10 # no of latent classes
    prior_const = K-1 # affects the parameters of the Dirichlet priors
    pi_alpha = prior_const/K * np.ones((K)) # alpha hyperparams for pi
    A_alpha = prior_const/K * np.ones((K,K)) # alpha hyperparams for A
    theta_alpha = prior_const/K * np.ones((K,N)) # alpha hyperparams for theta
    a = np.random.uniform(low=2, high=5, size=(K)) # initialise parameter a randomly
    p = np.random.uniform(low=0.6, high=0.8, size=(K)) # initialise parameter p which corresponds to standard multinomial set up 
    b = p / (1-p) # derive parameter b for the gamma mixture of poisson distribution which = NBD

    pi = dirichlet.rvs(alpha=pi_alpha, size=1) # initialise pi randomly
    A = np.zeros((K,K)) 
    theta = np.zeros((K,N))
    for k in range(K):
        A[k,:] = dirichlet.rvs(alpha=A_alpha[k,:], size=1) 
        theta[k,:] = dirichlet.rvs(alpha=theta_alpha[k,:], size=1) 

    A = np.log(A)
    pi = np.log(pi)
    theta = np.log(theta)

    # EM Algorithm
    start_time = datetime.now()
    epsilon = 1e-4
    a_epsilon = 1e-3
    old_likelihood = None
    iteration = 0
    while True:
        # initialise variables to store per user calculations
        init = 0
        trans = 0
        nbd = 0
        multi = 0
        N_bar = 0
        log_N_bar = 0
        user_class = np.empty((U,K))
        for u in range(U):
            # E STEP PER USER

            dataset = np.array(users_ds[u].todense())
            N_t = np.array(users_Nt[u])

            # log prob of N given z as gamma mixture of poisson i.e. number of articles read
            p_n_ab = gammaln(N_t[..., np.newaxis] + a[np.newaxis, ...]) \
                    - gammaln(a)[np.newaxis, ...] - gammaln(N_t+1)[..., np.newaxis] \
                    + N_t[..., np.newaxis] * np.log(b)[np.newaxis, ...]  \
                    - (N_t[..., np.newaxis] + a[np.newaxis, ...]) * np.log(b+1)[np.newaxis, ...]

            # log prob of I given z and N as Multinomial(theta) i.e. which articles are read where read=1/unread=0     
            p_i_theta = (gammaln(N_t+1) - gammaln(dataset+1).sum(axis=1))[..., np.newaxis] \
                        + np.dot(dataset, theta.T)

            # log prob of joint dist of N, I given z
            p_i_z = p_n_ab + p_i_theta

            # HMM for CF paper definition of alpha and beta
            alpha = np.empty((T,K))
            p_i_i = np.empty((T))

            alpha[0] = p_i_z[0] + pi
            alpha[0] -= logsumexp(alpha[0])
            for t in range(1, T):
                alpha[t] = logdotexp(alpha[t-1], A) + p_i_z[t]
                p_i_i[t] = logsumexp(alpha[t]) # normalization constant for alpha/beta/p_zz_i
                alpha[t] -= p_i_i[t]

            beta = np.zeros((T,K))
            for t in range(-2, -1, -1):
                beta[t] = logdotexp(A, (p_i_z[t+1] + beta[t+1]))
                beta[t] -= p_i_i[t+1] # normalization

            # calculate the posterior P(Z|I) 
            p_z_i = alpha + beta
            user_class[u,:] = p_z_i[-1,:]

            # calculate the transitional posterior P(Z(t-1), Z(t)|I)
            p_zz_i = np.zeros((T-1,K,K))
            for t in range(T-1):
                p_zz_i[t,:,:] = np.tile(alpha[t,:], (K,1)).T + A + np.tile(p_i_z[t+1,:], (K,1)) + np.tile(beta[t+1,:], (K,1)) 
                p_zz_i[t,:,:] -= p_i_i[t+1][...,np.newaxis,np.newaxis] # normalization

            # CALCULATE EXPECTED LOG LIKELIHOOD PER USER    

            # intial state 
            init += np.sum(np.exp(p_z_i[0]) * pi)

            # transitional 
            trans += np.sum(np.exp(p_zz_i) * A[np.newaxis,...])

            # # of items 
            nbd += np.sum(np.exp(p_z_i) * p_n_ab)

            # specific item 
            multi += np.sum(np.exp(p_z_i) * p_i_theta)

            # M STEP PER USER

            # update pi parameters using MAP
            pi_alpha += np.exp(p_z_i[0])

            # update A parameters using MAP
            A_alpha += np.sum(np.exp(p_zz_i), axis=0)

            # update theta parameters using MAP
            for k in range(K):
                theta_alpha[k,:] += np.sum(dataset * np.exp(p_z_i[:,k][...,np.newaxis]), axis=0)

            # update a using MLE with Newton's method
            N_bar += np.sum((np.exp(p_z_i) * N_t[...,np.newaxis] + a[np.newaxis,...]) * (b / (b + 1))[np.newaxis,...], axis=0) / (U*T)
            log_N_bar += np.sum(digamma(np.exp(p_z_i) * N_t[...,np.newaxis] + a[np.newaxis,...]) + np.log(b / (b + 1))[np.newaxis,...], axis=0) / (U*T)

        # CALCULATE EXPECTED LOG LIKELIHOOD FOR ALL USERS COMBINED
        # calculation done after posterior of latent class is calculated which means KL divergence=0
        # lower bound = evidence since q(z) = p(z|x)
        # check that it is increasing at every iteration and check for convergence condition

        if old_likelihood is None:
            old_likelihood = init + trans + nbd + multi
        else:
            new_likelihood = init + trans + nbd + multi 
            if np.isnan(new_likelihood):
                print('Numerical issues in calculation of log likelihood\nPrevious calculated log likelihood = ', old_likelihood)
                break
            if new_likelihood < old_likelihood:
                print('Iteration resulted in lower log likelihood =', new_likelihood)
                break
            if np.abs((new_likelihood - old_likelihood) / old_likelihood) < epsilon:
                old_likelihood = new_likelihood
                print('Iteration', iteration,': log likelihood =', old_likelihood, '\n')
                print('Convergence attained \n')
                break
            old_likelihood = new_likelihood
        print('Iteration', iteration,': log likelihood =', old_likelihood)
        iteration += 1

        # M STEP FOR ALL USERS

        # update pi log prob using MAP with the parameters
        pi = (pi_alpha - 1) / (np.sum(pi_alpha) - K)
        pi = np.log(pi)
        
        # update A log prob using MAP with the parameters
        A = (A_alpha - 1) / (np.sum(A_alpha, axis=-1) - K)[...,np.newaxis] # to align the division
        A = np.log(A)

        # update theta log prob using MAP with the parameters
        theta = theta_alpha / (np.sum(theta_alpha, axis=-1))[...,np.newaxis]
        theta = np.log(theta)
        
        # update a using MLE with Newton's method
        for _ in range(10):
            a_new = (1/a + (log_N_bar - np.log(N_bar) + np.log(a) - digamma(a)) / (a**2 * (1/a - polygamma(1, a))))**-1
            if np.isnan(a).any():
                print('Numerical issues in calculating parameter a')
                break
            if np.sum(np.abs(a_new - a)) / np.sum(a) < a_epsilon:
                a = a_new
                break
            a = a_new
        
        # update b using MLE 
        b = N_bar / a

    run_time = datetime.now() - start_time
    np.set_printoptions(precision=3)
    print('Execution Time:', run_time)
    print('\npi = ', np.exp(pi),'\n\nA = ', np.exp(A),'\n\na = ', a,'\n\nb = ', b,'\n\np(Z(t=T, u=0:10)|I) = \n', np.exp(user_class[:3]))
    # save parameters
    threshold = '1000'
    alpha = str(prior_const)
    run = str(r+1)
    path = '/content/drive/MyDrive/PhD/Modules/IS6101 Topics in Machine Learning and Optimization/HMM for CF/Data and Parameters/'
    np.save(path + 'pi_K_' + str(K) + '_threshold_' + threshold + '_alpha_' + alpha + '_run_' + run, pi)
    np.save(path + 'mA_K_' + str(K) + '_threshold_' + threshold + '_alpha_' + alpha + '_run_' + run, A)
    np.save(path + 'va_K_' + str(K) + '_threshold_' + threshold + '_alpha_' + alpha + '_run_' + run, a)
    np.save(path + 'b_K_' + str(K) + '_threshold_' + threshold + '_alpha_' + alpha + '_run_' + run, b)
    np.save(path + 'user_class_K_' + str(K) + '_threshold_' + threshold + '_alpha_' + alpha + '_run_' + run, user_class)

    # number of items to recommend
    num_items = 5000

    # log prob of user each latent class in next period assuming user in Z(t) with log p(Z(t)|I(1:T))
    # result is multiplying transitional prob to prob of user in each latent class at time t
    # p_z = logdotexp(p_z_i[:,-1], A)
    p_z = logdotexp(user_class, A)

    # calculate probability that item i is not read in the next time period
    p_noti_z = np.power(1 + b[...,np.newaxis] * np.exp(theta), -a[...,np.newaxis])

    # calculate rank score of the items likely to appear in next time period
    rank_score = -np.exp(p_z) @ p_noti_z

    # generate indices of top num_items to recommend which will be unsorted
    rec_list = np.argpartition(rank_score, -num_items, axis=-1)[:,-num_items:]

    # sort indices by rank score
    rec_list_score = np.array([row[rec_list[i,:]] for i, row in enumerate(rank_score)]) # get the scores of items in rec_list
    sorted_rec_list = np.array([row[np.flip(np.argsort(rec_list_score[i]))] for i, row in enumerate(rec_list)]) # sort the rec_list based on the score
    sorted_rec_list[:10]

    # check if item in user history
    user_history = np.array([row[:,sorted_rec_list[i]] for i, row in enumerate(users_ds)]) # get all binary values in user_ds corresponding to the item in rec_list for each user in each time period
    user_history = np.array([np.sum(user, axis=0) for user in user_history]).squeeze() # get boolean array indicating whether each item in sorted_rec_list is in user history (assumes user only has each item at most once)
    if user_history.max() > 1: print('There are repeated ratings of a movie by at least one user')
    # print(user_history.shape)

    # filter sorted_rec_list for items not in user history
    filtered_rec_list = [row[np.logical_not(user_history[i])] for i, row in enumerate(sorted_rec_list)] # each user's list will not have the same amount of items as it depends on user history
    # get multi-hot encoding of top N recommended movies for the next period

    mlb = MultiLabelBinarizer(range(N), sparse_output=True) # prediction done on based on one hot encoding indexing i.e. starting index is 0
    # top_5_list = mlb.fit_transform(filtered_rec_list) 
    top_5_list = [mlb.fit_transform([user[:5]]) for user in filtered_rec_list] # convert top 5 list to one hot encoding
    # print(np.shape(top_5_list), top_5_list[0].sum()) # top 5 list means sum = 5
    top_10_list = [mlb.fit_transform([user[:10]]) for user in filtered_rec_list] 
    # print(np.shape(top_10_list), top_10_list[0].sum())

    # test how many of top N recommended movies appear in user's rated list of movies in the test period
    positive_top_5 = [rec_user.multiply(test_ds[i]) for i, rec_user in enumerate(top_5_list)] # get (#users,#items) boolean vectors indicating whether recommended movie was rating in test period
    users_result_top_5 = [row.sum() for row in positive_top_5] # get list of positive matches per user
    all_result_top_5 = np.sum(users_result_top_5) # total number of positive matches across all users

    positive_top_10 = [rec_user.multiply(test_ds[i]) for i, rec_user in enumerate(top_10_list)] 
    users_result_top_10 = [row.sum() for row in positive_top_10] # get list of positive matches per user
    all_result_top_10 = np.sum(users_result_top_10) # total number of positive matches across all users

    test_num_movies_rated = np.sum(test_ds).sum()

    # print(all_result_top_5, all_result_top_10, test_num_movies_rated)

    # output results to excel via pandas df
    dict_result = {'# Ratings Threshold':threshold, '# Users':U, '# Movies':N, 'K':K, 'T': T, 't_predict':t_predict,
                'Dirichlet Prior Parameter':alpha, 'Run':run, 'Convergence epsilon': epsilon,
                'Log Likelihood':old_likelihood, 'Iterations':iteration,'Time (min)':run_time*24*60,
                'Avg Time per iteration (s)':run_time*24*60*60/iteration, 
                '# movies rated in test period': test_num_movies_rated, 
                'Total +ve for top 5':all_result_top_5, 
                'Precision of top 5':all_result_top_5/(5*U),
                'Recall of top 5':all_result_top_5/test_num_movies_rated,
                'Total +ve for top 10':all_result_top_10,
                'Precision of top 10':all_result_top_10/(10*U),
                'Recall of top 10':all_result_top_10/test_num_movies_rated
                }
    df_result = pd.DataFrame(data=dict_result, index=[0])
    print(df_result)

    table_path = '/content/drive/MyDrive/PhD/Modules/IS6101 Topics in Machine Learning and Optimization/HMM for CF/Report/Table.xlsx'
    df_table = pd.read_excel(table_path)
    df_table = pd.concat([df_table ,df_result], ignore_index=True)
    df_table.to_excel(table_path, index=False)

Iteration 0 : log likelihood = -4796912.726427405
Iteration 1 : log likelihood = -3969056.0823254026
Iteration 2 : log likelihood = -3886982.5048903986
Iteration 3 : log likelihood = -3841325.657217962
Iteration 4 : log likelihood = -3815055.323419285
Iteration 5 : log likelihood = -3798422.849588177
Iteration 6 : log likelihood = -3786711.6386238015
Iteration 7 : log likelihood = -3778137.79837009
Iteration 8 : log likelihood = -3771575.7347250134
Iteration 9 : log likelihood = -3766468.7867676285
Iteration 10 : log likelihood = -3762436.327564114
Iteration 11 : log likelihood = -3759111.2267586133
Iteration 12 : log likelihood = -3756309.557142806
Iteration 13 : log likelihood = -3753945.8201048267
Iteration 14 : log likelihood = -3751921.7065121294
Iteration 15 : log likelihood = -3750165.7890330385
Iteration 16 : log likelihood = -3748619.0725070373
Iteration 17 : log likelihood = -3747239.2522660876
Iteration 18 : log likelihood = -3746005.322325865
Iteration 19 : log likelihood =