In [6226]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.metrics import average_precision_score

In [6227]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [6228]:
def prepare_data(file_name='./train.csv'):
    train_df = pd.read_csv(file_name)
    train_df.drop(train_df.columns[0],axis=1,inplace=True)
    train_npa = train_df.to_numpy()
    
    train_data = []
    max_item_id = 0
    max_user_id = train_npa.shape[0]

    for i, docs in enumerate(train_npa):
        tmp_arr = np.array(list(map(int, list(docs)[0].split())))
        tmp_max = np.max(tmp_arr)
        if tmp_max > max_item_id:
            max_item_id = tmp_max
        train_data.append(tmp_arr)

    train_data = np.array(train_data, dtype=object)

    return train_data, max_item_id, max_user_id

In [6229]:
def create_rating_matrix(train_data, max_item_id, max_user_id):
    R = np.zeros((max_item_id, max_user_id))
    
    for i in range(max_item_id):
        for j in range(max_user_id):
            if i in train_data[j]:
                # user j interacted with item i
                R[i][j] = 1
    
    return R

In [6230]:
def bce_loss(data, i, j):
    hor_sum = np.sum(data, axis=1)
    vert_sum = np.sum(data, axis=0)
    
    return -(data[i, j]*np.ln(hor_sum[i]) + (1-data[i, j])*np.ln(1-vert_sum[j]))

In [6231]:
def count_occurrences(row):
    return len([x for x in row if x == 1])

In [6232]:
def gradient_descent(R, max_iter=100, learn_rate=0.001, lamb=0.001, dim=16):
    Q = np.random.uniform(0, 1, size=(R.shape[0], dim))
    P = np.random.uniform(0, 1, size=(R.shape[1], dim))
    
    for iters in range(max_iter):
        r = np.random.randint(0, Q.shape[0])
        for i in range(P.shape[0]):
            y = R[r, i]
            p = np.dot(Q[r], P[i])
            eps = (-y/p)+((1-y)/(1-p))
            eps = np.sign(p)*eps

            Q[r] = Q[r] + learn_rate*(-eps*P[i]-lamb*Q[r])
            P[i] = P[i] + learn_rate*(-eps*Q[r]-lamb*P[i])
            
            
            
        r = np.random.randint(0, P.shape[0])
        for i in range(Q.shape[0]):
            y = R[i, r]
            p = np.dot(Q[i], P[r])
            eps = (-y/p)+((1-y)/(1-p))
            
            eps = np.sign(p)*eps
    
            Q[i] = Q[i] + learn_rate*(-eps*P[r]-lamb*Q[i])
            P[r] = P[r] + learn_rate*(-eps*Q[i]-lamb*P[r])
                
    return P, Q

In [6233]:
def get_most_relevant(mat, n=50):
    res = []
    
    for i in range(mat.shape[0]):
        d = dict(enumerate(mat[i]))
        sl = sorted(d.items(), key=lambda item: item[1], reverse=True)
        res.append(sl[:n])

    return res

In [6234]:
def output_result_csv(M, output_filename):
    relevant_items = get_most_relevant(M)
    rel_items = [[x[0] for x in l] for l in relevant_items]
    rel_items = dict(enumerate(rel_items))

    for k, v in rel_items.items():
        rel_items[k] = ' '.join(str(x) for x in v)
        
    res = pd.DataFrame([{'ItemId': v} for v in rel_items.values()], index=rel_items.keys())
    res.index.name = 'UserId'
    
    res.to_csv(output_filename)  
        

In [6235]:
def _sample(self, n_users, n_items, indices, indptr):
        """sample batches of random triplets u, i, j"""
        sampled_pos_items = np.zeros(self.batch_size, dtype = np.int)
        sampled_neg_items = np.zeros(self.batch_size, dtype = np.int)
        sampled_users = np.random.choice(
            n_users, size = self.batch_size, replace = False)

        for idx, user in enumerate(sampled_users):
            pos_items = indices[indptr[user]:indptr[user + 1]]
            pos_item = np.random.choice(pos_items)
            neg_item = np.random.choice(n_items)
            while neg_item in pos_items:
                neg_item = np.random.choice(n_items)

            sampled_pos_items[idx] = pos_item
            sampled_neg_items[idx] = neg_item

        return sampled_users, sampled_pos_items, sampled_neg_items

In [6236]:
def bpr_sgd(R, max_iter=10, learn_rate=0.001, lamb=0.001):
#     Q = np.random.uniform(0, 1, size=(R.shape[1], 16))
#     P = np.random.uniform(0, 1, size=(R.shape[0], 16))

    n_users = R.shape[1]
    n_items = R.shape[0]
    batch_size = 100
    n_factors = 20

    batch_iters = n_users // batch_size

    rstate = np.random.RandomState(1234)
    Q = rstate.normal(size = (R.shape[1], n_factors))
    P = rstate.normal(size = (R.shape[0], n_factors))
    
    for it in range(max_iter):
        for _ in range(batch_iters):
            sampled_pos_items = np.zeros(batch_size, dtype = np.int)
            sampled_neg_items = np.zeros(batch_size, dtype = np.int)
            sampled_users = np.random.choice(n_users, size = batch_size, replace = False)
            
            for idx, user in enumerate(sampled_users):
                pos_items = np.where(np.array(R[:, user]) == 1)[0]
                pos_item = np.random.choice(pos_items)
                
                neg_items = np.where(np.array(R[:, user]) == 0)[0]
                neg_item = np.random.choice(neg_items)
#                 neg_item = np.random.choice(n_items)
#                 while neg_item in pos_items:
#                     neg_item = np.random.choice(n_items)

                sampled_pos_items[idx] = pos_item
                sampled_neg_items[idx] = neg_item
#             print(sampled_pos_items)
#             print(sampled_neg_items)
#             u = np.random.randint(0, R.shape[1])
#             i = np.random.randint(0, R.shape[0])
#             j = np.random.randint(0, R.shape[0])
            u = sampled_users
            i = sampled_pos_items
            j = sampled_neg_items
            
            xuij = np.sum(Q[u] * (P[i] - P[j]), axis = 1)

            sig = np.exp(-xuij) / (1.0 + np.exp(-xuij))
            sig_t = np.tile(sig, (n_factors, 1)).T

            grad_u = sig_t * (P[j] - P[i]) + lamb * Q[u]
            grad_i = sig_t * -Q[u] + lamb * P[i]
            grad_j = sig_t * Q[u] + lamb * P[j]
            Q[u] -= learn_rate * grad_u
            P[i] -= learn_rate * grad_i
            P[j] -= learn_rate * grad_j
    
    return P, Q

In [6237]:
train_data, max_item_id, max_user_id = prepare_data()

In [6238]:
# R = create_rating_matrix(train_data, max_item_id, max_user_id)

In [6239]:
M, N = R.shape[0], R.shape[1]

In [6240]:
BCE_P, BCE_Q = gradient_descent(R)

In [6241]:
BCE_M = np.matmul(BCE_Q, BCE_P.T)

In [6242]:
BPR_P, BPR_Q = bpr_sgd(R)

In [6243]:
BPR_M = np.matmul(BPR_Q, BPR_P.T)

In [6244]:
output_result_csv(BCE_M, 'BCE_output.csv')

In [6245]:
output_result_csv(BPR_M, 'BPR_output.csv')

In [6246]:
# BCE_relevant_items = get_most_relevant(BCE_M)
# BCE_rel_items = [[x[0] for x in l] for l in BCE_relevant_items]
# BCE_rel_items_dict = dict(enumerate(rel_items))
# output_result_csv(rel_items_dict)

In [6250]:
def submission_to_npa(filename):
    df = pd.read_csv(filename)
    npa = df.to_numpy()
    sa = []
    for idx, r in enumerate(npa):
        sa.append(np.array(npa[idx][1].split(' '), dtype=np.int))
    sa = np.array(sa, dtype=object)
    
    return sa

In [6251]:
def MAP(R, sa, user):
    true_labels = R.T
    output_labels = np.zeros(true_labels.shape)

    for i in range(sa.shape[0]):
        for j in range(sa.shape[1]):
            output_labels[i][sa[i][j]] = 1
    
    return average_precision_score(true_labels[user], output_labels[user])

In [6252]:
BCE_sa = submission_to_npa('BCE_output.csv')
BPR_sa = submission_to_npa('BPR_output.csv')

In [None]:
BCE_sa.sh

In [6253]:
u = np.random.randint(0, R.shape[0])

BCE_MAP = MAP(R, BCE_sa, u)
BPR_MAP = MAP(R, BPR_sa, u)

IndexError: index 4090 is out of bounds for axis 0 with size 3259

In [None]:
isec = []
for i in range(len(ta)):
    isec.append(np.intersect1d(ta[i], sa[i]).shape[0]/len(ta[i]))

In [None]:
np.mean(np.array(isec))