# Packages

In [1]:
import numpy as np
import pandas as pd
import pickle
import torch
from torch.distributions import Dirichlet, Multinomial, Categorical, Beta
from datetime import datetime
from sklearn.preprocessing import MultiLabelBinarizer

try:
    import google.colab
    IN_COLAB = True
    path = '/content/drive/MyDrive/PhD/Modules/CS5340 Uncertainty Modeling in AI/Project/'
except:    
    IN_COLAB = False
    path = './'

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

n_splits = int(2) # excludes final chunk
splits = int(400)

# Dataset

In [2]:
users_ds = np.load(path + 'users_ds_dense.npy', allow_pickle=True)

In [3]:
# size of training dataset
U, T, N = users_ds.shape
print('Shape of dataset:', users_ds.shape)
print('Number of users:', U)
print('Number of movie titles:', N)
print('Number of periods:', T)

Shape of dataset: (1212, 73, 17768)
Number of users: 1212
Number of movie titles: 17768
Number of periods: 73


In [4]:
t_predict = -1                                          # index for holdout period for prediction
test_ds = users_ds[:,t_predict,:]                       # extract holdout period from dataset
test_ds = test_ds[:,np.newaxis,:]                       # align # of dimensions
users_ds = users_ds[:,:t_predict,:]                     # specify training periods

In [5]:
# calculate and store total number of ratings per user in a period
users_Nt = np.sum(users_ds, axis=-1)    # number of movies rated by user in each time period
T = np.shape(users_ds)[1]               # final length of time period after trimming
users_Nt.shape 

(1212, 72)

In [6]:
users_ds = torch.tensor(users_ds, dtype=torch.float).to(device)
users_Nt = torch.tensor(users_Nt).to(device)

# Helper Functions

In [7]:
def logdotexp(A, B):
    max_A = A.max()
    max_B = B.max()
    C = torch.matmul(torch.exp(A - max_A), torch.exp(B - max_B))
    torch.log(C, out=C)
    C += max_A + max_B
    return C

In [8]:
def initialise_params_and_probs(pi_alpha, A_alpha, theta_alpha):
    K = len(pi_alpha)
    pi = Dirichlet(pi_alpha).sample().squeeze().to(device)                        # Initial state distribution
    A = torch.zeros((K,K)).to(device)                                                         # Transitional probabilities
    theta = torch.zeros((K,N)).to(device)                                                     # Multinomial probabilities
    for k in range(K):
        A[k,:] = Dirichlet(A_alpha[k,:]).sample().squeeze()
        theta[k,:] = Dirichlet(theta_alpha[k,:]).sample().squeeze()
    pi = torch.log(pi)
    A = torch.log(A)
    theta = torch.log(theta)

    # NBD parameters
    a = torch.ones((K)).to(device)
    p = torch.FloatTensor(K).uniform_(0.7, 0.9).to(device)
    b = p / (1-p)
    return pi, A, theta, a, b

# K = 5
# pi_alpha = torch.ones((K))
# A_alpha = torch.ones((K,K))
# theta_alpha = torch.ones((K,N))
# pi, A, theta, a, b = initialise_params_and_probs(pi_alpha, A_alpha, theta_alpha)

In [9]:
def initialise_latent_states(pi, A, U, T):
    # latent states of all users across the periods
    latent_states = torch.zeros((U,T), dtype=int).to(device)
    latent_states[:,0] = torch.multinomial(torch.exp(pi), U, replacement=True)    # initial latent state assignments
    sample_A = torch.tile(torch.exp(A), (U,1,1)).to(device)
    for t in range(T-1):
        previous_states = latent_states[:,t]
        pvals = sample_A[t, previous_states] 
        latent_states[:,t+1] = torch.multinomial(pvals, 1, replacement=True).squeeze()
        
    return latent_states

# latent_states = initialise_latent_states(pi, A, U, T)
# print(latent_states.shape, latent_states[0])

In [10]:
def update_b(latent_states, users_Nt, K, a):
    alpha = torch.ones((K), dtype=torch.float).to(device)
    beta = torch.tensor(users_Nt.shape[0] * users_Nt.shape[1] + 1).to(torch.float).to(device)

    b = torch.zeros((K)).to(device)
    for i in range(K):
        indices = latent_states == i
        alpha[i] += users_Nt[indices].sum()
        p = Beta(alpha[i], beta).sample()
        b[i] = p / (1-p)

    return b

# b = update_b(latent_states, users_Nt, K, 1)
# print(b)

In [11]:
def update_theta(latent_states, dataset, K):
    alpha = torch.ones((K,N), dtype=torch.float).to(device)
    U = latent_states.shape[0]
    for u in range(U):
        user_states = latent_states[u,:]
        counts = dataset[u,:,:]
        for t in range(T):
            alpha[user_states[t]] += counts[t]

    # print(alpha.sum() - K*N, dataset.sum()) # checksum

    # theta = rng.dirichlet(alpha=alpha, size=1) # dirichlet does not accept array like for alpha unlike multinomial

    theta = torch.zeros((K,N), dtype=torch.float).to(device)
    for k in range(K):
        theta[k,:] = Dirichlet(alpha[k,:]).sample().squeeze()

    return torch.log(theta)

# theta = update_theta(latent_states, users_ds, K)
# np.exp(theta).sum(axis=-1)

In [12]:
def update_A(latent_states, K):
    alpha = torch.ones((K,K), dtype=torch.float)
    T = latent_states.shape[1]

    for t in range(T - 1):
        current_states = latent_states[:,t]
        next_states = latent_states[:,t+1]
        for i in range(len(current_states)):
            alpha[current_states[i], next_states[i]] += 1
    
    # print(alpha.sum() - K*K, U*(T-1)) # checksum

    A = torch.zeros((K,K), dtype=torch.float).to(device)
    for k in range(K):
        A[k,:] = Dirichlet(A_alpha[k,:]).sample().squeeze()
    return torch.log(A)

# A = update_A(latent_states, K)
# np.exp(A)

In [13]:
def update_pi(latent_states, K):
    alpha = torch.ones((K), dtype=torch.float).to(device)

    for i in range(K):
        alpha[i] += (latent_states[:,0] == i).sum()

    # print(alpha.sum() - K, U)

    pi = Dirichlet(alpha).sample().squeeze()

    return torch.log(pi)

# pi = update_pi(latent_states, K)
# print(np.exp(pi))

In [22]:
def calculate_emission_prob(users_ds, users_Nt, nbd_first_part, multi_first_part, theta, a, b, n_splits=n_splits, splits=splits):
    
    # log prob of N given z as gamma mixture of poisson i.e. number of articles read
#     p_n_ab = gammaln(users_Nt[..., np.newaxis] + a[np.newaxis, np.newaxis, ...]) \
#             - gammaln(a)[np.newaxis, np.newaxis, ...] - gammaln(users_Nt+1)[..., np.newaxis] \
#             + users_Nt[..., np.newaxis] * np.log(b)[np.newaxis, np.newaxis, ...]  \
#             - (users_Nt[..., np.newaxis] + a[np.newaxis, np.newaxis, ...]) * np.log(b+1)[np.newaxis, np.newaxis, ...]

    second_part = users_Nt.unsqueeze(-1) * torch.log(b).unsqueeze(0).unsqueeze(0)  \
                - (users_Nt.unsqueeze(-1) + a.unsqueeze(0).unsqueeze(0)) * torch.log(b+1).unsqueeze(0).unsqueeze(0)
    
    p_n_ab = nbd_first_part + second_part

    # log prob of I given z and N as Multinomial(theta) i.e. which movies are rated=1/unrated=0
    splits_list = []
    for i in range(n_splits):
        temp_ds = users_ds[i*splits:(i+1)*splits]
        splits_list.append(torch.matmul(temp_ds, theta.T))
    temp_ds = users_ds[n_splits * splits:]
    splits_list.append(torch.matmul(temp_ds, theta.T))
    second_part = torch.cat(splits_list, dim=0)
    del temp_ds, splits

    p_i_theta = multi_first_part + second_part

    # log prob of joint dist of N, I given z
    p_i_z = p_n_ab + p_i_theta

    return p_i_z, p_n_ab, p_i_theta

In [15]:
# out1 = torch.matmul(users_ds, theta.T)
# ds1 = users_ds[:600]
# ds2 = users_ds[600:]
# out2 = torch.matmul(ds1, theta.T)
# out3 = torch.matmul(ds2, theta.T)
# out4 = torch.cat((out2, out3), dim=0)
# assert (out1 == out4).all()

In [16]:
def calculate_posterior(A, pi, p_i_z, U, T, K):

    # Calculate normalised alpha and beta
    alpha = torch.zeros((U,T,K), dtype=torch.float).to(device)
    p_i_i = torch.zeros((U,T), dtype=torch.float).to(device)

    alpha[:,0] = p_i_z[:,0] + pi
    alpha[:,0] -= torch.logsumexp(alpha[:,0], axis=-1).unsqueeze(-1)
    for t in range(1, T):
        alpha[:,t] = logdotexp(alpha[:,t-1], A) + p_i_z[:,t]
        p_i_i[:,t] = torch.logsumexp(alpha[:,t], dim=-1)
        alpha[:,t] -= p_i_i[:,t].unsqueeze(-1)

    beta = torch.zeros((U,T,K), dtype=torch.float).to(device)
    for u in range(U):
        for t in range(T-2, -1, -1):
            beta[u,t] = logdotexp(A, (p_i_z[u,t+1] + beta[u,t+1]))
            beta[u,t] -= p_i_i[u,t+1] # normalization

    # numerical issues "divide by zero encountered in log" for the vectorized code below
    # for t in range(T-2, -1, -1):
    #     beta[:,t,:] = logdotexp((p_i_z[:,t+1,:] + beta[:,t+1,:]), A.T)
    #     beta[:,t,:] -= p_i_i[:,t+1][...,np.newaxis] # normalization

    # log prob of Z(t) given I(1:T)
    p_z_i = alpha + beta

    # log prob of Z(t), Z(t+1) given I(1:T)
    p_zz_i = torch.zeros((U,T-1,K,K), dtype=torch.float).to(device)
    for u in range(U):
        for t in range(T-1):
            p_zz_i[u,t,:,:] = torch.tile(alpha[u,t,:], (K,1)).T + A + torch.tile(p_i_z[u,t+1,:], (K,1)) + torch.tile(beta[u,t+1,:], (K,1)) 
            p_zz_i[u,t,:,:] -= p_i_i[u,t+1].unsqueeze(-1).unsqueeze(-1) # normalization
    
    return p_z_i, p_zz_i

In [17]:
def update_all_states(latent_states, p_z_i, p_zz_i):
    U, T = latent_states.shape

    sample_pi = torch.exp(p_z_i[:,0,:]).mean(dim=0)    
    latent_states[:,0] = torch.multinomial(sample_pi, U, replacement=True)

    sample_A = torch.exp(p_zz_i - p_z_i[:,:-1,:].unsqueeze(-1)).mean(dim=0)
    for t in range(T-1):
        previous_states = latent_states[:,t]
        pvals = sample_A[t, previous_states]
        latent_states[:,t+1] = torch.multinomial(pvals, 1, replacement=True).squeeze()

    return latent_states

# p_i_z, p_n_ab, p_i_theta = calculate_emission_prob(users_ds, users_Nt, nbd_first_part, multi_first_part, theta, a, b)
# p_z_i, p_zz_i = calculate_posterior(A, pi, p_i_z, U, T, K)
# update_all_states(latent_states, p_z_i, p_zz_i)

In [18]:
def calculate_log_likelihood(pi, A, p_n_ab, p_i_theta, p_z_i, p_zz_i):
    # intial state 
    init = torch.sum(torch.multiply(torch.exp(p_z_i[:,0]), pi.unsqueeze(0)))

    # transitional 
    trans = torch.sum(torch.multiply(torch.exp(p_zz_i), A.unsqueeze(0).unsqueeze(0)))

    # # of items 
    nbd = torch.sum(torch.multiply(torch.exp(p_z_i), p_n_ab))

    # specific item 
    multi = torch.sum(torch.multiply(torch.exp(p_z_i), p_i_theta))
    
    return init + trans + nbd + multi

In [19]:
# out1 = torch.lgamma(users_ds + 1).sum(dim=-1)
# out2 = torch.lgamma(ds1 + 1).sum(dim=-1)
# out3 = torch.lgamma(ds2 + 1).sum(dim=-1)
# out4 = torch.cat((out2, out3), dim=0)
# assert (out1 == out4).all()

# Gibbs Sampling

In [20]:
def gibbs_sampling(users_ds, users_Nt, pi_alpha, A_alpha, theta_alpha, n_iterations, burn_in=50, print_freq=1):
    max_likelihood = None
    U, T = users_ds.shape[:-1]
    K = len(pi_alpha)

    pi, A, theta, a, b = initialise_params_and_probs(pi_alpha, A_alpha, theta_alpha)
    latent_states = initialise_latent_states(pi, A, U, T)

    # store repeated calculations in NBD and multinomial log prob for significant speed up
    splits_list = []
    for i in range(n_splits):
        temp_ds = users_ds[i*splits:(i+1)*splits]
        splits_list.append(torch.lgamma(temp_ds + 1).sum(dim=-1))
    temp_ds = users_ds[n_splits * splits:]
    splits_list.append(torch.lgamma(temp_ds + 1).sum(dim=-1))
    multi_first_part = (torch.lgamma(users_Nt + 1) - torch.cat(splits_list, dim=0)).unsqueeze(-1)
    del temp_ds, splits_list

    nbd_first_part = torch.lgamma(users_Nt.unsqueeze(-1) + a.unsqueeze(0).unsqueeze(0)) \
            - torch.lgamma(a).unsqueeze(0).unsqueeze(0) - torch.lgamma(users_Nt+1).unsqueeze(-1) 

    # initialise sum of parameters
    pi_bar = torch.zeros(pi.shape).to(device)
    A_bar = torch.zeros(A.shape).to(device)
    theta_bar = torch.zeros(theta.shape).to(device)
    b_bar = torch.zeros(b.shape).to(device)

    # GIBBS SAMPLING
    start_time = datetime.now() # for keeping track of running time
    for iteration in range(n_iterations + burn_in):

        # UPDATE PARAMETERS AND PROBABILITIES
        b = update_b(latent_states, users_Nt, K, a[0])
        theta = update_theta(latent_states, users_ds, K)
        A = update_A(latent_states, K)
        pi = update_pi(latent_states, K)

        # UPDATE LATENT STATES    
        p_i_z, p_n_ab, p_i_theta = calculate_emission_prob(users_ds, users_Nt, nbd_first_part, multi_first_part, theta, a, b)
        p_z_i, p_zz_i = calculate_posterior(A, pi, p_i_z, U, T, K)
        latent_states = update_all_states(latent_states, p_z_i, p_zz_i)

        # CALCULATE EXPECTED LOG LIKELIHOOD
        likelihood = calculate_log_likelihood(pi, A, p_n_ab, p_i_theta, p_z_i, p_zz_i)
        if max_likelihood is None: 
            max_likelihood = likelihood
        else:
            if likelihood > max_likelihood: 
                max_likelihood = likelihood
                pi_max = pi
                A_max = A
                theta_max = theta
                a_max = a
                b_max = b

        print('Iteration', iteration+1,': log likelihood =', likelihood)

        if iteration+1 > burn_in:
            pi_bar += pi
            A_bar += A
            theta_bar += theta
            b_bar += b

    pi_bar /= n_iterations
    A_bar /= n_iterations
    theta_bar /= n_iterations
    b_bar /= n_iterations

    run_time = datetime.now() - start_time    
    print('Execution time for Gibbs Sampling iterations:', run_time)

    return ((pi_bar, A_bar, theta_bar, a, b_bar), 
            (pi_max, A_max, theta_max, a_max, b_max))

In [23]:
n_iterations = 2
K = 5

prior_const = 0.9*K                                       # affects the parameters of the Dirichlet priors
pi_alpha = prior_const/K * torch.ones((K))                 # alpha hyperparams for pi
A_alpha = prior_const/K * torch.ones((K,K))                # alpha hyperparams for A
theta_alpha = prior_const/K * torch.ones((K,N))            # alpha hyperparams for theta

expected_params, max_params = gibbs_sampling(users_ds, users_Nt, pi_alpha, A_alpha, theta_alpha, n_iterations, burn_in=1)
pi_bar, A_bar, theta_bar, a_bar, b_bar = expected_params
pi_max, A_max, theta_max, a_max, b_max = max_params

Iteration 1 : log likelihood = tensor(-14983963., device='cuda:0')
Iteration 2 : log likelihood = tensor(-14855959., device='cuda:0')
Iteration 3 : log likelihood = tensor(-14609771., device='cuda:0')
Execution time for Gibbs Sampling iterations: 0:01:33.303807


In [None]:
# save parameters

np.save(path + 'pi_bar_K_' + str(K), pi_bar.cpu().numpy())
np.save(path + 'A_bar_K_' + str(K), A_bar.cpu().numpy())
np.save(path + 'theta_bar_K_' + str(K), theta_bar.cpu().numpy())
np.save(path + 'a_bar_K_' + str(K), a_bar.cpu().numpy())
np.save(path + 'b_bar_K_' + str(K), b_bar.cpu().numpy())

np.save(path + 'pi_max_K_' + str(K), pi_max.cpu().numpy())
np.save(path + 'A_max_K_' + str(K), A_max.cpu().numpy())
np.save(path + 'theta_max_K_' + str(K), theta_max.cpu().numpy())
np.save(path + 'a_max_K_' + str(K), a_max.cpu().numpy())
np.save(path + 'b_max_K_' + str(K), b_max.cpu().numpy())

# Evaluation on test data

In [None]:
# Use either the expected parameter values or the max likelihood parameteres

# pi, A, theta, a, b = pi_bar, A_bar, theta_bar, a_bar, b_bar

pi, A, theta, a, b = pi_max, A_max, theta_max, a_max, b_max

In [None]:
multi_first_part = (torch.lgamma(users_Nt + 1) - torch.lgamma(users_ds + 1).sum(dim=-1)).unsqueeze(-1)
nbd_first_part = torch.lgamma(users_Nt.unsqueeze(-1) + a.unsqueeze(0).unsqueeze(0)) \
        - torch.lgamma(a).unsqueeze(0).unsqueeze(0) - torch.lgamma(users_Nt+1).unsqueeze(-1) 
p_i_z, p_n_ab, p_i_theta = calculate_emission_prob(users_ds, users_Nt, nbd_first_part, multi_first_part, theta, a, b)
p_z_i, p_zz_i = calculate_posterior(A, pi, p_i_z, U, T, K)
likelihood = calculate_log_likelihood(pi, A, p_n_ab, p_i_theta, p_z_i, p_zz_i)
print(likelihood)

In [None]:
# number of items to recommend
num_items = 5000

# log prob of user each latent class in next period assuming user in Z(t) with log p(Z(t)|I(1:T))
# result is multiplying transitional prob to prob of user in each latent class at time t
p_z = logdotexp(p_z_i[:,-1], A).cpu().numpy()

pi, A, theta, a, b = pi.cpu().numpy(), A.cpu().numpy(), theta.cpu().numpy(), a.cpu().numpy(), b.cpu().numpy() # move back to cpu and numpy

# calculate probability that item i is not read in the next time period
p_noti_z = np.power(1 + b[...,np.newaxis] * np.exp(theta), -a[...,np.newaxis])

# calculate rank score of the items likely to appear in next time period
rank_score = -np.exp(p_z) @ p_noti_z

# generate indices of top num_items to recommend which will be unsorted
rec_list = np.argpartition(rank_score, -num_items, axis=-1)[:,-num_items:]

# sort indices by rank score
rec_list_score = np.array([row[rec_list[i,:]] for i, row in enumerate(rank_score)]) # get the scores of items in rec_list
sorted_rec_list = np.array([row[np.flip(np.argsort(rec_list_score[i]))] for i, row in enumerate(rec_list)]) # sort the rec_list based on the score

# check if item in user history
user_history = np.array([row[:,sorted_rec_list[i]] for i, row in enumerate(users_ds.cpu().numpy())]) # get all values in user_ds corresponding to the item in rec_list for each user in each time period
user_history = np.sum(user_history, axis=1) # get boolean array indicating whether each item in sorted_rec_list is in user history (assumes user only has each item at most once)
if user_history.max() > 1: print('There are repeated ratings of a movie by at least one user')
# print(user_history.shape)

# filter sorted_rec_list for items not in user history
filtered_rec_list = [row[np.logical_not(user_history[i])] for i, row in enumerate(sorted_rec_list)] # each user's list will not have the same amount of items as it depends on user history

# get multi-hot encoding of top N recommended movies for the next period
mlb = MultiLabelBinarizer(classes=range(N), sparse_output=False) # prediction done on based on one hot encoding indexing i.e. starting index is 0
top_5_list = [mlb.fit_transform([user[:5]]) for user in filtered_rec_list] # convert top 5 list to one hot encoding
top_10_list = [mlb.fit_transform([user[:10]]) for user in filtered_rec_list] 

# test how many of top N recommended movies appear in user's rated list of movies in the test period
positive_top_5 = [np.multiply(test_ds[i], rec_user) for i, rec_user in enumerate(top_5_list)] # get (#users,#items) boolean vectors indicating whether recommended movie was rating in test period
users_result_top_5 = [row.sum() for row in positive_top_5] # get list of positive matches per user
all_result_top_5 = np.sum(users_result_top_5) # total number of positive matches across all users

positive_top_10 = [np.multiply(test_ds[i], rec_user) for i, rec_user in enumerate(top_10_list)] 
users_result_top_10 = [row.sum() for row in positive_top_10] # get list of positive matches per user
all_result_top_10 = np.sum(users_result_top_10) # total number of positive matches across all users

# total number of movies rated in test period
test_num_movies_rated = np.sum(test_ds).sum()

# output results to excel via pandas df
dict_result = {'Log Likelihood':likelihood.cpu().numpy(), 'Iterations':n_iterations,            
            '# movies rated in test period': test_num_movies_rated, 
            'Total +ve for top 5':all_result_top_5, 
            'Precision of top 5':all_result_top_5/(5*U),
            'Recall of top 5':all_result_top_5/test_num_movies_rated,
            'Total +ve for top 10':all_result_top_10,
            'Precision of top 10':all_result_top_10/(10*U),
            'Recall of top 10':all_result_top_10/test_num_movies_rated
            }
df_result = pd.DataFrame(data=dict_result, index=[0])
print(df_result)
df_result.to_csv(path + 'result.csv', index=False)