In [1]:

import gzip
import math
import numpy as  np
import random
import sklearn
import string
from collections import defaultdict
import json
def parseData(file):
    for l in open(file,'r'):
        yield json.loads(l)

In [2]:


dataset2 = []
dataset2 = list(parseData('renttherunway_final_data.json'))

dataset2[0]

{'fit': 'fit',
 'user_id': '420272',
 'bust size': '34d',
 'item_id': '2260466',
 'weight': '137lbs',
 'rating': '10',
 'rented for': 'vacation',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': 'hourglass',
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': '5\' 8"',
 'size': 14,
 'age': '28',
 'review_date': 'April 20, 2016'}

In [3]:
cases = set()
for i in dataset2:
    cases.add(i['fit'])
cases

{'fit', 'large', 'small'}

In [4]:
fittingPerUser = defaultdict(list)
fittingPerItem = defaultdict(list)
allFitting = []
for entry in dataset2:
    item_id = entry['item_id']
    user_id = entry['user_id']
    fit = entry['fit']
    if fit == "fit":
        fit = 3
    elif fit == "small":
        fit = 1
    elif fit == 'large':
        fit = 5
        
    fittingPerUser[user_id].append((item_id,fit))
    fittingPerItem[item_id].append((user_id,fit))
    allFitting.append((user_id, item_id, fit ))

In [5]:

print('Size of the dataset :', len(dataset2))
lengths = []
for u in fittingPerUser:
    lengths.append(len(fittingPerUser[u]))
print("average fitting per user :", np.array(lengths).mean())
print('number of unique users: ', len(lengths))
lengths = []
for i in fittingPerItem:
    lengths.append(len(fittingPerItem[i]))

print("average fitting per item :", np.array(lengths).mean())
print('number of unique items: ', len(lengths))

print('-----------------------------')
print("Remove users with too few reviews")
print('-----------------------------')
removed_user_set = set()
lengths = []
for u in fittingPerUser:
    if len(fittingPerUser[u])>2:
        lengths.append(len(fittingPerUser[u]))
    else:
        removed_user_set.add(u)
print("average fitting per user post filterng :", np.array(lengths).mean())
print("total number of users post filtering :", len(lengths))

removed_item_set = set()
lengths = []
for i in fittingPerItem:
    if len(fittingPerItem[i])>2:
        lengths.append(len(fittingPerItem[i]))
    else:
        removed_item_set.add(i)
print("average fitting per item post filterng :", np.array(lengths).mean())
print("total number of items post filtering :", len(lengths))


Size of the dataset : 192544
average fitting per user : 1.8238341968911918
number of unique users:  105571
average fitting per item : 32.91350427350427
number of unique items:  5850
-----------------------------
Remove users with too few reviews
-----------------------------
average fitting per user post filterng : 5.3576835730507195
total number of users post filtering : 15852
average fitting per item post filterng : 36.93733127651369
total number of items post filtering : 5186


In [8]:
users = list(fittingPerUser.keys())
items = list(fittingPerItem.keys())
userIDs = {}
itemIDs = {}
for u,i,r in allFitting:
    if u in removed_user_set or i in removed_item_set :
        continue
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    
nTrain = int(len(allFitting)*0.8)
nTest = int(len(allFitting)-nTrain)
interactionsTrain =  allFitting[:nTrain]
interactionsTest = allFitting[nTrain:]

# %%
import tensorflow as tf
optimizer = tf.keras.optimizers.Adam(0.0050)
mean_rating = np.array([i[2] for i in allRatings]).mean()
print(mean_rating)

class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb1, lamb2):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu,dtype='float32')
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001),dtype='float32')
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001),dtype='float32')
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001),dtype='float32')
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001),dtype='float32')
        self.lamb1 = lamb1
        self.lamb2 = lamb2

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb1 * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaI**2)) +\
                            self.lamb2*(tf.reduce_sum(self.gammaU**2) +\
                            tf.reduce_sum(self.gammaI**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

# %%
def trainingStep(model, interactions):
    Nsamples =65000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)

        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

# %%
modelLFM = LatentFactorModel(mean_rating, 5, 0.000008,0.000038)
for i in range(210):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))



NameError: name 'allRatings' is not defined

In [7]:
!pip install tensorflow

Collecting tensorflow
  Using cached tensorflow-2.11.0-cp39-cp39-win_amd64.whl (1.9 kB)
Collecting tensorflow-intel==2.11.0
  Using cached tensorflow_intel-2.11.0-cp39-cp39-win_amd64.whl (266.3 MB)
Collecting libclang>=13.0.0
  Using cached libclang-14.0.6-py2.py3-none-win_amd64.whl (14.2 MB)
Collecting protobuf<3.20,>=3.9.2
  Using cached protobuf-3.19.6-cp39-cp39-win_amd64.whl (895 kB)
Collecting tensorflow-estimator<2.12,>=2.11.0
  Using cached tensorflow_estimator-2.11.0-py2.py3-none-any.whl (439 kB)
Collecting absl-py>=1.0.0
  Using cached absl_py-1.3.0-py3-none-any.whl (124 kB)
Collecting flatbuffers>=2.0
  Using cached flatbuffers-22.11.23-py2.py3-none-any.whl (26 kB)
Collecting opt-einsum>=2.3.2
  Using cached opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting google-pasta>=0.1.1
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1
  Using cached tensorflow_io_gcs_filesystem-0.28.0-cp39-cp39-win_amd64.whl (1.5 MB)
Collecti