In [1]:
#similarity

import gzip
import json
import numpy
import random
from collections import defaultdict
from gensim.models import Word2Vec

def readCSV(path):
    f = gzip.open(path, 'rt')
    for l in f:
        d = json.loads(l)
        if d['rating'] != None:
            yield d['user_id'], d['item_id'], int(d['rating']), d

data = []
for user, item, rating, d in readCSV('renttherunway_final_data.json.gz'):
    d = {'user_id':user, 'item_id':item, 'rating':rating}
    data.append(d)


In [2]:
data[0]

{'user_id': '420272', 'item_id': '2260466', 'rating': 10}

In [3]:
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
ratingDict = {} # To retrieve a rating for a specific user/item pair

for d in data:
    user,item = d['user_id'], d['item_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    ratingDict[(user,item)] = d['rating']

In [4]:
ratingMean = sum([d['rating'] for d in data]) / len(data)
ratingMean

9.092371481123546

In [5]:
userAverages = {}
itemAverages = {}

for u in itemsPerUser:
    rs = [ratingDict[(u,i)] for i in itemsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)
    
for i in usersPerItem:
    rs = [ratingDict[(u,i)] for u in usersPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)

In [10]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

In [11]:
for d in data:
    user,item = d['user_id'], d['item_id']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)

In [13]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [17]:
def predictRating1(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['item_id']
        if i2 == item: continue
        ratings.append(d['rating'])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        return ratingMean

In [18]:
def predictRating2(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerItem[item]:
        u2 = d['user_id']
        if u2 == user: continue
        ratings.append(d['rating'])
        similarities.append(Jaccard(itemsPerUser[user],itemsPerUser[u2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        return ratingMean

In [19]:
def predictRating3(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['item_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        return ratingMean

In [20]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [21]:
alwaysPredictMean = [ratingMean for d in data]

In [22]:
labels = [d['rating'] for d in data]

In [23]:
MSE(alwaysPredictMean, labels)

2.0450157423723705

In [25]:
simPredictions1 = [predictRating1(d['user_id'], d['item_id']) for d in data]
simPredictions2 = [predictRating2(d['user_id'], d['item_id']) for d in data]
simPredictions3 = [predictRating3(d['user_id'], d['item_id']) for d in data]

In [26]:
MSE(simPredictions1, labels)

2.5007756551257687

In [27]:
MSE(simPredictions2, labels)

2.0091934934607543

In [28]:
MSE(simPredictions3, labels)

2.3346205467721846

In [41]:
#item2vec

import gzip
import json
import numpy
import random
from collections import defaultdict
from gensim.models import Word2Vec

def readCSV(path):
    f = gzip.open(path, 'rt')
    for l in f:
        d = json.loads(l)
        if d['rating'] != None:
            yield d['user_id'], d['item_id'], d['review_text'], int(d['rating']), d

dataset = []
for user, item, review_text, rating, d in readCSV('renttherunway_final_data.json.gz'):
    d = {'user_id':user, 'item_id':item, 'review':review_text, 'rating':rating}
    dataset.append(d)


In [20]:
dataset[0]

{'user_id': '420272',
 'item_id': '2260466',
 'review': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'rating': 10}

In [78]:
itemAverages = defaultdict(list)
reviewsPerUser = defaultdict(list)
    
for d in dataset:
    i = d['item_id']
    u = d['user_id']
    itemAverages[i].append(d['rating'])
    reviewsPerUser[d['user_id']].append((d['review'], d['item_id']))
    
for i in itemAverages:
    itemAverages[i] = sum(itemAverages[i]) / len(itemAverages[i])

In [79]:
reviewLists = []
for u in reviewsPerUser:
    rl = list(reviewsPerUser[u])
    rl.sort()
    reviewLists.append([x[1] for x in rl])

In [80]:
reviewLists[0]

['1083818', '2260466', '348662', '2431951', '2340996', '2363191']

In [114]:
model10 = Word2Vec(reviewLists,
                 min_count=8, # Words/items with fewer instances are discarded
                 #size=10, # Model dimensionality
                 window=3, # Window size
                 sg=1) # Skip-gram model

In [115]:
itemAverages = defaultdict(list)
reviewsPerUser = defaultdict(list)
    
for d in dataset:
    i = d['item_id']
    u = d['user_id']
    itemAverages[i].append(d['rating'])
    reviewsPerUser[u].append(d)
    
for i in itemAverages:
    itemAverages[i] = sum(itemAverages[i]) / len(itemAverages[i])

In [116]:
def predictRating(user,item):
    ratings = []
    similarities = []
    if not str(item) in model10.wv:
        return ratingMean
    for d in reviewsPerUser[user]:
        i2 = d['item_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        if str(i2) in model10.wv:
            similarities.append(model10.wv.distance(str(item), str(i2)))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        return ratingMean

In [117]:
predictions = [predictRating(d['user_id'],d['item_id']) for d in dataset]

In [118]:
MSE(predictions, labels)

2.3824298115265954