In [679]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [680]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    Args:
        x: A real scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an ndarray with the same
        shape as ``x``; every value lies in [0, 1].
    """
    x = np.asarray(x)
    if x.dtype.kind != "f":
        # Integers / booleans are promoted so that negation and exp behave.
        x = x.astype(float)

    # exp(-|x|) always lies in [0, 1], so it cannot overflow the way
    # exp(-x) does for large negative x.
    decay = np.exp(-np.abs(x))
    out = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    # Indexing with () turns a 0-d result back into a NumPy scalar and
    # leaves real arrays untouched.
    return out[()]

In [681]:
def prepare_data(file_name='./train.csv'):
    """Load per-row item-id lists from a CSV file.

    The first column of the file is ignored; the second column is expected to
    hold whitespace-separated integer item ids, one row per user.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A tuple ``(train_data, max_item_id, max_user_id)`` where
        ``train_data`` is a 1-D object array holding one integer array per
        row, ``max_item_id`` is the largest item id seen (0 if there is
        none), and ``max_user_id`` is the number of rows in the file.
    """
    # dtype=str stops pandas from parsing a column made only of single ids
    # as integers, which would make the whitespace split below fail.
    frame = pd.read_csv(file_name, dtype=str)
    # Empty cells are read as NaN; treat them as "no items".
    item_lists = frame.iloc[:, 1].fillna('')

    max_user_id = len(item_lists)
    # Pre-allocating a 1-D object array guarantees one entry per row even
    # when every row has the same number of items (np.array(list_of_arrays,
    # dtype=object) would build a 2-D array in that case).
    train_data = np.empty(max_user_id, dtype=object)
    max_item_id = 0

    for row, text in enumerate(item_lists):
        items = np.array([int(token) for token in text.split()], dtype=int)
        if items.size:
            max_item_id = max(max_item_id, int(items.max()))
        train_data[row] = items

    return train_data, max_item_id, max_user_id

In [682]:
def create_rating_matrix(train_data, max_item_id, max_user_id):
    """Build a binary item-by-user interaction matrix.

    Args:
        train_data: Indexable collection; ``train_data[j]`` holds the item
            ids that user ``j`` interacted with.
        max_item_id: Number of rows of the result (item ids 0 .. max_item_id - 1).
        max_user_id: Number of columns of the result (users 0 .. max_user_id - 1).

    Returns:
        Float array ``R`` of shape ``(max_item_id, max_user_id)`` with
        ``R[i, j] == 1`` when item id ``i`` appears in ``train_data[j]`` and
        0 otherwise.

    NOTE(review): only ids strictly below ``max_item_id`` get a row, so when
    the value returned by ``prepare_data`` (the largest id itself) is passed
    in, that largest id is not represented. Confirm whether ids are meant to
    be 0- or 1-based before changing the shape.
    """
    R = np.zeros((max_item_id, max_user_id))
    item_ids = np.arange(max_item_id)

    for j in range(max_user_id):
        # One vectorised membership test per user replaces the per-(item, user)
        # scan of the user's list; the boolean mask is stored as 0.0 / 1.0.
        R[:, j] = np.isin(item_ids, train_data[j])

    return R

In [683]:
def bce_loss(data, i, j):
    """Cross-entropy-style loss for entry ``(i, j)`` of a 2-D array.

    Computes ``-(y * log(row_sum_i) + (1 - y) * log(1 - col_sum_j))`` with
    ``y = data[i, j]``, where ``row_sum_i`` is the sum of row ``i`` and
    ``col_sum_j`` is the sum of column ``j``.

    Args:
        data: 2-D NumPy array.
        i: Row index.
        j: Column index.

    Returns:
        The loss value as a NumPy float.

    NOTE(review): the logarithms are only defined while the row sum is
    positive and the column sum is below 1; outside that range NumPy yields
    nan / inf. Whether row and column sums are the intended "probabilities"
    should be confirmed.
    """
    hor_sum = np.sum(data, axis=1)
    vert_sum = np.sum(data, axis=0)

    # NumPy's natural logarithm is np.log; np.ln does not exist.
    return -(data[i, j]*np.log(hor_sum[i]) + (1-data[i, j])*np.log(1-vert_sum[j]))

In [684]:
# def gradient_descent(start, learn_rate=0.01, n_iter=50, tolerance=1e-06):
#     vector = start
#     for _ in range(n_iter):
# #         diff = -learn_rate * np.gradient(vector)
#         if np.all(np.abs(diff) <= tolerance):
#             break
#         vector += diff
#     return vector

In [685]:
def count_occurrences(row):
    """Return how many elements of ``row`` compare equal to 1."""
    total = 0
    for value in row:
        if value == 1:
            total += 1
    return total

In [686]:
def gradient_descent(R, P, Q, max_iter=1, learn_rate=0.01, n_iter=50, tolerance=1e-06, lamb1=0.001, lamb2=0.001):
    """Fit logistic matrix-factorisation factors with per-entry gradient steps.

    Every entry ``R[i, x]`` is modelled as ``sigmoid(Q[i] . P[x])`` and the
    factors are moved along the negative gradient of the L2-regularised binary
    cross-entropy. The derivative of that loss with respect to the score
    ``Q[i] . P[x]`` is ``p - y``, so the step uses the residual ``y - p``
    directly; no logarithm is evaluated and no clamping of ``p`` is needed.

    Args:
        R: 2-D array of targets; rows are indexed by ``i``, columns by ``x``.
        P: Array with one factor row per column of ``R``. Updated in place.
        Q: Array with one factor row per row of ``R``. Updated in place.
        max_iter: Number of full passes over ``R``.
        learn_rate: Step size.
        n_iter: Unused; kept so existing calls keep working.
        tolerance: Unused; kept so existing calls keep working.
        lamb1: L2 penalty applied to ``P``.
        lamb2: L2 penalty applied to ``Q``.

    Returns:
        The tuple ``(P, Q)`` (the same array objects that were passed in).
    """
    n_rows, n_cols = R.shape

    for iters in range(max_iter):
        print("iteration: {}".format(iters))
        for x in range(n_cols):
            for i in range(n_rows):
                p = sigmoid(np.dot(Q[i], P[x]))
                if math.isnan(p):
                    # Report the offending factors and skip the step so the
                    # NaN is not written back into P / Q.
                    print("({}, {}) --- Q[i]: {}, P[x]: {}".format(x, i, Q[i], P[x]))
                    continue

                residual = R[i, x] - p

                # Both updates must be computed from the same (old) Q[i];
                # copy it before it is overwritten.
                q_prev = Q[i].copy()
                Q[i] = q_prev + learn_rate*(residual*P[x] - lamb2*q_prev)
                P[x] = P[x] + learn_rate*(residual*q_prev - lamb1*P[x])

    return P, Q

In [687]:
def get_most_relevant(mat, n=50):
    """Rank the columns of every row of ``mat`` by descending value.

    Args:
        mat: 2-D array-like exposing ``.shape``; one row per entity to rank for.
        n: Maximum number of entries kept per row.

    Returns:
        A list with one entry per row of ``mat``; each entry is a list of up
        to ``n`` ``(column_index, value)`` tuples, highest value first. Equal
        values keep their original column order.
    """
    def top_n(scores):
        # sorted() is stable, so ties stay in ascending column order.
        ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
        return ranked[:n]

    return [top_n(mat[row]) for row in range(mat.shape[0])]

In [688]:
def output_result_csv(rel_items, file_name='./submission.csv'):
    """Write per-user recommendations to a CSV file.

    Args:
        rel_items: Mapping from user id to an iterable of item ids. It is
            not modified, so the call can safely be repeated.
        file_name: Destination path of the CSV file.

    The file has an index column ``UserId`` and a column ``ItemId`` that
    holds each user's item ids joined by single spaces.
    """
    user_ids = list(rel_items.keys())
    # Build the joined strings in a fresh list instead of overwriting the
    # caller's dict values; a second call would otherwise join characters.
    joined = [' '.join(str(item) for item in items) for items in rel_items.values()]

    res = pd.DataFrame({'ItemId': joined}, index=pd.Index(user_ids, name='UserId'))
    res.to_csv(file_name)
        

In [689]:
# Load the interaction lists from the default path ('./train.csv').
train_data, max_item_id, max_user_id = prepare_data()

In [690]:
# Build the interaction matrix here; the cells below read R, which would
# otherwise be undefined after a kernel restart.
R = create_rating_matrix(train_data, max_item_id, max_user_id)

In [691]:
# M = number of rows of R, N = number of columns of R
# (items and users respectively when R comes from create_rating_matrix).
M, N = R.shape[0], R.shape[1]

In [692]:
# u, s, vh = np.linalg.svd(R, )

# # P = u
# # Q = np.matmul(s, vh)

In [693]:
# P = np.matmul(u, s)
# Q = vh

In [694]:
# Seed NumPy's global generator so the random starting factors, and therefore
# the fitted model, are the same on every run.
np.random.seed(42)

N_FACTORS = 2  # number of latent dimensions per row of Q / P

initial_Q = np.random.rand(M, N_FACTORS)
initial_P = np.random.rand(N, N_FACTORS)
P, Q = gradient_descent(R, initial_P, initial_Q)

iteration: 0


In [695]:
M = np.matmul(P, np.transpose(Q))

In [696]:
M.shape

(4454, 3259)

In [697]:
relevant_items = get_most_relevant(M)
rel_items = [[x[0] for x in l] for l in relevant_items]

In [698]:
# Key each recommendation list by its position in rel_items.
rel_items_dict = {position: items for position, items in enumerate(rel_items)}

In [699]:
# Write the recommendations to './submission.csv'.
output_result_csv(rel_items_dict)