In [1]:
import gzip
import json
import numpy
import scipy
import scipy.optimize
import random
from collections import defaultdict

def readCSV(path):
    """Stream rated interactions from a gzip-compressed JSON-lines file.

    Despite the name, the file is not CSV: every line is parsed with
    ``json.loads`` and must be an object holding the keys ``user_id``,
    ``item_id`` and ``rating``.

    Parameters
    ----------
    path : str
        Location of the ``.json.gz`` file.

    Yields
    ------
    tuple
        ``(user_id, item_id, rating, record)`` where ``rating`` has been
        converted with ``int`` and ``record`` is the full decoded object.
        Records whose ``rating`` is ``None`` are skipped.
    """
    # The context manager closes the gzip handle once the generator is
    # exhausted or garbage-collected; the original left it open.
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for l in f:
            d = json.loads(l)
            # Identity check: the only value being filtered out is None itself.
            if d['rating'] is not None:
                yield d['user_id'], d['item_id'], int(d['rating']), d

# Keep only the (user, item, rating) triple of every rated record.
data = [
    (user, item, rating)
    for user, item, rating, _ in readCSV('renttherunway_final_data.json.gz')
]

# First 80% of the records (in file order) train the model, the rest test it.
splitIndex = int(len(data) * 0.8)
train = data[:splitIndex]
test = data[splitIndex:]

In [2]:
# Training ratings, in the same order as `train`.
labels = [rating for _, _, rating in train]

# Adjacency lists: which items each user rated and which users rated each item.
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)
for user, item, rating in train:
    usersPerItem[item].append(user)
    itemsPerUser[user].append(item)

# Mean training rating, used as the starting value of the global offset.
mu = sum(labels) / len(train)
alpha = mu

# Bias terms start at zero for everyone.
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

# Fixed orderings of users and items; they define the layout of the
# flat parameter vector handed to the optimiser.
users = list(itemsPerUser)
items = list(usersPerItem)
nUsers = len(itemsPerUser)
nItems = len(usersPerItem)

def MSE(predictions, labels):
    """Mean squared error between two equally ordered sequences.

    Pairs are formed with ``zip``, so any surplus elements of the longer
    sequence are ignored. Empty input raises ``ZeroDivisionError``.
    """
    squaredErrors = []
    for predicted, actual in zip(predictions, labels):
        squaredErrors.append((predicted - actual) ** 2)
    return sum(squaredErrors) / len(squaredErrors)

def prediction(user, item):
    """Bias-only rating estimate: global offset plus user and item biases.

    Reads the module-level ``alpha``, ``userBiases`` and ``itemBiases``.
    """
    userOffset = userBiases[user]
    itemOffset = itemBiases[item]
    return alpha + userOffset + itemOffset

def unpack(theta):
    """Load a flat parameter vector into the module-level model state.

    Layout of ``theta``: ``[alpha, <nUsers user biases>, <item biases>]``,
    with biases ordered like the module-level ``users`` and ``items`` lists.
    Rebinds the globals ``alpha``, ``userBiases`` and ``itemBiases``.
    """
    global alpha, userBiases, itemBiases
    itemStart = 1 + nUsers
    alpha = theta[0]
    userBiases = dict(zip(users, theta[1:itemStart]))
    itemBiases = dict(zip(items, theta[itemStart:]))
    
def cost(theta, labels, lamb):
    """Regularised objective of the bias-only model.

    Unpacks ``theta`` into the module-level parameters, prints the training
    MSE and returns that MSE plus ``lamb`` times the sum of squared user and
    item biases. Predictions are made for the module-level ``train`` set and
    compared against ``labels``.
    """
    unpack(theta)
    mse = MSE([prediction(u, i) for u, i, _ in train], labels)
    print("MSE = " + str(mse))
    # Add the penalty one term at a time, users first and then items.
    total = mse
    for bias in userBiases.values():
        total += lamb * bias ** 2
    for bias in itemBiases.values():
        total += lamb * bias ** 2
    return total

def derivative(theta, labels, lamb):
    """Gradient of the bias-only objective with respect to ``theta``.

    Unpacks ``theta`` first, then returns a numpy array laid out like
    ``theta``: ``[d/dalpha, user-bias gradients, item-bias gradients]``.
    ``labels`` is accepted for signature parity with ``cost`` but the ratings
    are read from the module-level ``train`` triples.
    """
    unpack(theta)
    N = len(train)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)

    # Data term: each sample contributes the same amount to alpha, to its
    # user's bias and to its item's bias.
    for u, i, r in train:
        residual = prediction(u, i) - r
        step = 2 / N * residual
        dalpha += step
        dUserBiases[u] += step
        dItemBiases[i] += step

    # Penalty term on the biases.
    for u, bias in userBiases.items():
        dUserBiases[u] += 2 * lamb * bias
    for i, bias in itemBiases.items():
        dItemBiases[i] += 2 * lamb * bias

    dtheta = [dalpha]
    dtheta.extend(dUserBiases[u] for u in users)
    dtheta.extend(dItemBiases[i] for i in items)
    return numpy.array(dtheta)

# Fit the bias-only model with L-BFGS-B. The starting point is the current
# alpha followed by zero biases; lamb is fixed at 0.0001.
thetaOpt, objective, optInfo = scipy.optimize.fmin_l_bfgs_b(
    cost,
    [alpha] + [0.0] * (nUsers + nItems),
    derivative,
    args = (labels, 0.0001),
    maxiter = 100,
)

# cost/derivative leave the module-level parameters at whichever theta they
# were called with last, which is not guaranteed to be the returned solution
# (e.g. a rejected line-search trial). Load the returned optimum explicitly
# so the evaluation cells use it.
unpack(thetaOpt)

# Keep the optimiser's result as the cell output.
thetaOpt, objective, optInfo

MSE = 2.0431834174343386
MSE = 2.022511115992889
MSE = 3.2565160754555667
MSE = 2.020439133364178
MSE = 1.9950716232141874
MSE = 1.978697426678984
MSE = 1.9137088292212052
MSE = 1.8875497832759613
MSE = 1.8172718129366447
MSE = 1.796065837037931
MSE = 1.7896293904865057
MSE = 1.7626443511465093
MSE = 1.740906099940967
MSE = 1.7204614947906003
MSE = 1.6939399481280126
MSE = 1.6872980109013358
MSE = 1.7146273564995262
MSE = 1.68724290968428
MSE = 1.6857334590065314
MSE = 1.6827638369770999
MSE = 1.6775170048243706
MSE = 1.6742370122184442
MSE = 1.6683866315221618
MSE = 1.6644094299484673
MSE = 1.6618367539416499
MSE = 1.6622507908990298
MSE = 1.66106280221897
MSE = 1.658870244152181
MSE = 1.6583170388348623
MSE = 1.6586027733344224
MSE = 1.6582091065511246
MSE = 1.65765002921178
MSE = 1.6569622806415933
MSE = 1.6567988120123511
MSE = 1.6563553306115173
MSE = 1.6570313124191012
MSE = 1.656609232594068
MSE = 1.6562337235875964
MSE = 1.6563512156858657
MSE = 1.6565048593910296
MSE = 1.65667

(array([ 9.04405322, -0.35316339,  0.05278849, ...,  0.05726395,
         0.03603514,  0.05498117]),
 1.7927873164261876,
 {'funcalls': 70,
  'grad': array([-4.26682771e-05, -2.53873883e-08, -5.26516068e-09, ...,
         -1.96391436e-09, -1.53137206e-08,  7.21105210e-09]),
  'nit': 60,
  'task': b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH',
  'warnflag': 0})

In [3]:
def predict(user, item):
    """Bias-only rating estimate that tolerates unseen users and items.

    A user or item missing from the fitted bias tables contributes 0.0, so an
    entirely unknown pair falls back to the global offset ``alpha``.
    """
    userBias = userBiases.get(user, 0.0)
    itemBias = itemBiases.get(item, 0.0)
    return alpha + userBias + itemBias

# Score the held-out interactions with the bias-only model.
preds = [predict(user, item) for user, item, _ in test]
results = [rating for _, _, rating in test]

MSE(preds, results)

1.9228015920444799

In [5]:
# Number of latent factors per user and per item.
K = 2

# Reset the biases so the latent-factor model starts from zero offsets.
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

# Small random factors drawn uniformly from [-0.05, 0.05), users first.
userGamma = {
    u: [random.random() * 0.1 - 0.05 for _ in range(K)]
    for u in itemsPerUser
}
itemGamma = {
    i: [random.random() * 0.1 - 0.05 for _ in range(K)]
    for i in usersPerItem
}

def unpack(theta):
    """Load a flat parameter vector into the latent-factor model state.

    Layout of ``theta``::

        [alpha,
         nUsers user biases, nItems item biases,
         K factors for each user, K factors for each item]

    with users and items ordered like the module-level ``users`` and
    ``items`` lists. Rebinds ``alpha``, ``userBiases`` and ``itemBiases`` and
    overwrites the entries of the existing ``userGamma`` / ``itemGamma``.
    """
    global alpha, userBiases, itemBiases, userGamma, itemGamma
    alpha = theta[0]

    itemBiasStart = 1 + nUsers
    gammaStart = itemBiasStart + nItems
    userBiases = dict(zip(users, theta[1:itemBiasStart]))
    itemBiases = dict(zip(items, theta[itemBiasStart:gammaStart]))

    # Factor vectors are stored back to back: all users, then all items.
    cursor = gammaStart
    for table, keys in ((userGamma, users), (itemGamma, items)):
        for key in keys:
            table[key] = theta[cursor:cursor + K]
            cursor += K

def inner(x, y):
    """Dot product of two sequences, truncated to the shorter one by ``zip``."""
    total = 0
    for a, b in zip(x, y):
        total += a * b
    return total

def prediction(user, item):
    """Latent-factor rating estimate for a known user/item pair.

    Global offset plus user bias plus item bias plus the inner product of the
    user's and the item's factor vectors, all read from module-level state.
    """
    biasPart = alpha + userBiases[user] + itemBiases[item]
    return biasPart + inner(userGamma[user], itemGamma[item])

def cost(theta, labels, lamb):
    """Regularised objective of the latent-factor model.

    Unpacks ``theta`` into the module-level parameters, prints the training
    MSE and returns that MSE plus ``lamb`` times the sum of squares of every
    bias and every latent factor. Predictions are made for the module-level
    ``train`` set and compared against ``labels``.
    """
    unpack(theta)
    mse = MSE([prediction(u, i) for u, i, _ in train], labels)
    print("MSE = " + str(mse))

    # Penalty, accumulated term by term: each user's bias followed by its K
    # factors, then the same for each item.
    total = mse
    for keys, biases, gammas in ((users, userBiases, userGamma),
                                 (items, itemBiases, itemGamma)):
        for key in keys:
            total += lamb * biases[key] ** 2
            for k in range(K):
                total += lamb * gammas[key][k] ** 2
    return total

def derivative(theta, labels, lamb):
    """Gradient of the latent-factor objective with respect to ``theta``.

    Unpacks ``theta`` first, then returns a numpy array laid out like
    ``theta``: ``d/dalpha``, user-bias gradients, item-bias gradients, K
    factor gradients per user, K factor gradients per item. ``labels`` is
    accepted for signature parity with ``cost`` but the ratings are read from
    the module-level ``train`` triples.
    """
    unpack(theta)
    N = len(train)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dUserGamma = {u: [0.0] * K for u in itemsPerUser}
    dItemGamma = {i: [0.0] * K for i in usersPerItem}

    # Data term.
    for u, i, r in train:
        diff = prediction(u, i) - r
        biasStep = 2 / N * diff
        dalpha += biasStep
        dUserBiases[u] += biasStep
        dItemBiases[i] += biasStep
        gammaU = userGamma[u]
        gammaI = itemGamma[i]
        userGrad = dUserGamma[u]
        itemGrad = dItemGamma[i]
        for k in range(K):
            # A user's factor gradient is driven by the item's factor and
            # vice versa.
            userGrad[k] += 2 / N * gammaI[k] * diff
            itemGrad[k] += 2 / N * gammaU[k] * diff

    # Penalty term on biases and factors.
    for u, bias in userBiases.items():
        dUserBiases[u] += 2 * lamb * bias
        for k in range(K):
            dUserGamma[u][k] += 2 * lamb * userGamma[u][k]
    for i, bias in itemBiases.items():
        dItemBiases[i] += 2 * lamb * bias
        for k in range(K):
            dItemGamma[i][k] += 2 * lamb * itemGamma[i][k]

    # Flatten in the same order unpack() reads the vector.
    dtheta = [dalpha]
    dtheta.extend(dUserBiases[u] for u in users)
    dtheta.extend(dItemBiases[i] for i in items)
    for u in users:
        dtheta.extend(dUserGamma[u])
    for i in items:
        dtheta.extend(dItemGamma[i])
    return numpy.array(dtheta)

# Fit the latent-factor model with L-BFGS-B. The starting point is the
# current alpha, zero biases and small random factors in [-0.05, 0.05);
# lamb is fixed at 0.0001.
thetaOpt, objective, optInfo = scipy.optimize.fmin_l_bfgs_b(
    cost,
    [alpha] + [0.0]*(nUsers+nItems) + [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))],
    derivative,
    args = (labels, 0.0001),
    maxiter = 100,
)

# cost/derivative leave the module-level parameters at whichever theta they
# were called with last, which is not guaranteed to be the returned solution
# (e.g. a rejected line-search trial). Load the returned optimum explicitly
# so the evaluation cells use it.
unpack(thetaOpt)

# Keep the optimiser's result as the cell output.
thetaOpt, objective, optInfo

MSE = 2.0456327923981648
MSE = 2.8903817450310627
MSE = 2.0429095807950097
MSE = 2.0426144393744816
MSE = 2.0414433190299444
MSE = 2.0369099630467193
MSE = 2.0211945451220257
MSE = 1.9595469338225868
MSE = 1.913437531425012
MSE = 1.857315236786392
MSE = 1.8253507388329604
MSE = 1.7993755088036556
MSE = 1.7636106597728634
MSE = 1.7207682044509014
MSE = 1.6937969453617427
MSE = 1.694270034132174
MSE = 1.6864209456054526
MSE = 1.670655380695687
MSE = 1.6771741077657718
MSE = 1.7017001497152968
MSE = 1.676249237816988
MSE = 1.6712923926125862
MSE = 1.6667355017633154
MSE = 1.6640736607952733
MSE = 1.660991289313356
MSE = 1.657243668189552
MSE = 1.6538217595303792
MSE = 1.6553578041774855
MSE = 1.6545192627148702
MSE = 1.6536735253231443
MSE = 1.6516331295009237
MSE = 1.6468686042982756
MSE = 1.7114002716847199
MSE = 1.6467305471049347
MSE = 1.6404251566450272
MSE = 1.6310145175866475
MSE = 1.6316128755183297
MSE = 1.627934857321712
MSE = 1.6148481565611497
MSE = 1.6304883646167267
MSE = 1.

(array([ 9.04290223e+00, -3.53363882e-01,  3.03389781e-02, ...,
        -3.49276950e-05, -1.02054363e-04, -1.02468335e-04]),
 1.6830060052426299,
 {'funcalls': 122,
  'grad': array([-3.31188044e-03, -5.07870324e-07, -1.03195546e-06, ...,
          5.41185808e-09, -1.88467713e-08, -1.91810059e-08]),
  'nit': 101,
  'task': b'STOP: TOTAL NO. of ITERATIONS EXCEEDS LIMIT',
  'warnflag': 1})

In [7]:
def predict(user, item):
    """Latent-factor rating estimate that tolerates unseen users and items.

    A missing user or item bias contributes 0.0. The factor interaction is
    only added when both the user and the item have factor vectors;
    otherwise it contributes 0.0 as well.
    """
    userBias = userBiases.get(user, 0.0)
    itemBias = itemBiases.get(item, 0.0)
    if user in userGamma and item in itemGamma:
        gamma = inner(userGamma[user], itemGamma[item])
    else:
        gamma = 0.0
    return alpha + userBias + itemBias + gamma

# Score the held-out interactions with the latent-factor model.
preds = [predict(user, item) for user, item, _ in test]
results = [rating for _, _, rating in test]

MSE(preds, results)

1.9358281274238103