In [1]:
import numpy as np
import pandas as pd
from scipy.optimize import fmin_cg

In [2]:
plays = pd.io.json.read_json("json/lastfm_plays.json", lines = True)
artists = pd.io.json.read_json("json/lastfm_artists.json", lines = True)

num_users = len(plays.user_id.value_counts())
num_artists = len(artists.artist_id)
num_features = 10

lambd = 0 # regularization param

print("number of users :" + str(num_users))
print("number of artists :" + str(num_artists))

user_dict = {}   # map of user_id to an index value
i = 0
for id in sorted(list(set(plays.user_id))):
    user_dict[id] = i
    i += 1

artist_dict = {}   # map of artist_id to an index value
i = 0
for id in sorted(list(set(artists.artist_id))):
    artist_dict[id] = i
    i += 1
    
assert(len(user_dict) == num_users)
assert(len(artist_dict) == num_artists)

number of users :1892
number of artists :17632


In [3]:
# R = np.zeros((num_artists, num_users))
# Y = np.zeros((num_artists, num_users))
# for i in range(plays.shape[0]):
#     if i % 1000 == 0:
#         print(i)
#     R[artist_dict[plays.loc[i]["artist_id"]], user_dict[plays.loc[i]["user_id"]]] = 1
#     Y[artist_dict[plays.loc[i]["artist_id"]], user_dict[plays.loc[i]["user_id"]]] = plays.loc[i]["plays"]

# np.save("temp/r", R)
# np.save("temp/y", Y)

R = np.load("temp/R.npy")
Y = np.load("temp/Y.npy")

assert(Y[artist_dict[1109], user_dict[2100]] == 1333.0)

In [4]:
X = np.random.randn(num_artists, num_features)
theta = np.random.randn(num_users, num_features)

# print(X)

param = np.ndarray.flatten(np.concatenate((np.reshape(X, (num_artists*num_features, 1)), np.reshape(theta, (num_users*num_features, 1)))))

X, theta = np.split(param, [num_artists*num_features])    

# print(X)
# print(np.reshape(X, (num_artists, num_features)))
# param = np.asarray((X, theta))
# X, theta = np.split(param, [num_artists])

In [5]:
def costFunc(param, *args):
    
    Y, R, lambd, num_artists, num_users, num_features = args

    X, theta = np.split(param, [num_artists*num_features])    
#     X, theta = param
    X = np.reshape(X, (num_artists, num_features))
    theta = np.reshape(theta, (num_users, num_features))
    
    temp = np.multiply(np.transpose(np.matmul(theta, np.transpose(X))) - Y, R)

    J = np.sum(np.sum(np.square(temp)))/2
    # adding the regularization term
    J += (lambd / 2)*(np.sum(np.sum(np.square(X))) + np.sum(np.sum(np.square(theta))))
    return J

def gradsFunc(param, *args):
    
    Y, R, lambd, num_artists, num_users, num_features = args

    X, theta = np.split(param, [num_artists*num_features])
#     X, theta = param
    X = np.reshape(X, (num_artists, num_features))
    theta = np.reshape(theta, (num_users, num_features))
    
    c = np.multiply(np.transpose(np.matmul(theta, np.transpose(X))) - Y, R)
    
    X_grad = np.matmul(c, theta) + lambd*X
    theta_grad = np.matmul(np.transpose(c), X) + lambd*theta
    
    return np.ndarray.flatten(np.concatenate((np.reshape(X_grad, (num_artists*num_features, 1)), np.reshape(theta_grad, (num_users*num_features, 1)))))

In [6]:
def normalize(Y, R):
    m, n = Y.shape
    Ymean = np.zeros((m, 1), dtype='float32')
    Ynorm = np.zeros((m, n), dtype='float32')
    for i in range(m):
        idx = np.nonzero(R[i,:])
        Ymean[i, 0] = np.mean(np.ndarray.flatten(Y[i, idx]))
        Ynorm[i, idx] = Y[i, idx] - Ymean[i, 0]
    return Ynorm, Ymean

In [8]:
Ynorm, Ymean = normalize(Y, R)
J = costFunc(param, Ynorm, R, lambd, num_artists, num_users, num_features)
grads = gradsFunc(param, Y, R, lambd, num_artists, num_users, num_features)
# X_grad, theta_grad = np.split(grads, [num_artists])
X_grad, theta_grad = np.split(grads, [num_artists*num_features])
X_grad = np.reshape(X_grad, (num_artists, num_features))
theta_grad = np.reshape(theta_grad, (num_users, num_features))
print(J)
print(X.shape)
print(X_grad.shape)
# assert(X.shape == X_grad.shape)
# assert(theta.shape == theta_grad.shape)

626644724166.1815
(176320,)
(17632, 10)


In [12]:
print(Ynorm.shape)

(17632, 1892)


In [20]:
# Ynorm, Ymean = normalize(Y, R)
# args = Ynorm, R, lambd, num_artists, num_users, num_features
loaded_param = np.load("temp/params.npy")
optimised_param = fmin_cg(costFunc, x0 = loaded_param, fprime = gradsFunc, args = args, maxiter = 1000)

         Current function value: 6847680101.122496
         Iterations: 1000
         Function evaluations: 1476
         Gradient evaluations: 1476


In [24]:
print(optimised_param)
print(param)

[  0.16253402   2.18776473  -5.43684085 ...  -8.4580799  -17.04482914
   5.50054983]
[-0.27042525  0.4475314  -0.92153093 ... -0.6506772   0.62353092
 -0.40155027]


In [25]:
np.save("temp/params", optimised_param)

In [26]:
X, theta = np.split(loaded_param, [num_artists*num_features])
X = np.reshape(X, (num_artists, num_features))
theta = np.reshape(theta, (num_users, num_features))

In [27]:
predictions = np.matmul(X, theta.T) + Ymean

In [37]:
print(Y[artist_dict[51],user_dict[2]])
print(predictions[artist_dict[51],user_dict[2]])

13883.0
13932.229345555032


In [38]:
artist_idx_to_id = {}

for k, v in artist_dict.items():
    artist_idx_to_id[v] = k


In [95]:
# top 10 other artists a user might like

top_p_ids = []

user = user_dict[315]

for i in np.argsort(np.ndarray.flatten(predictions[:, user]))[::-1]:
    if R[i, user] == 0:
        top_p_ids.append(artist_idx_to_id[i])
    if len(top_p_ids) >= 10:
        break

top_pred_artists = [list(artists.loc[artists.artist_id == i]["name"])[0] for i in top_p_ids]
print("recommended artists were are never listened to")
print(top_pred_artists)


top_a_ids = []
for i in np.argsort(Y[:, user])[::-1]:
    top_a_ids.append(artist_idx_to_id[i])
    if len(top_a_ids) >= 10:
        break

top_actual_artists = [list(artists.loc[artists.artist_id == i]["name"])[0] for i in top_a_ids]
print("actual most played artists")
print(top_actual_artists)

recommended artists were are never listened to
['Viking Quest', 'Tyler Adam', 'Rytmus', 'Thalía', 'Wanessa', 'Johnny Hallyday', 'Dicky Dixon', 'Taylor Swift', 'RICHARD DIXON-COMPOSER', '浜崎あゆみ']
actual most played artists
['Coldplay', 'Muse', 'Radiohead', 'The Beatles', 'Daft Punk', 'Kings of Leon', 'The Killers', 'Florence + the Machine', 'Arctic Monkeys', 'David Guetta']
