In [1]:
import gzip
import json
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model

In [2]:
# Load the gzipped dataset: each line of the file is parsed as one JSON object.
# The `with` block guarantees the file handle is closed once loading finishes
# (or if parsing raises part-way through).
dataset1 = []
with gzip.open("renttherunway_final_data.json.gz") as f:
    for l in f:
        dataset1.append(json.loads(l))

In [3]:
# Peek at one parsed record to see which fields are available and how they are encoded.
dataset1[1]

{'fit': 'fit',
 'user_id': '273551',
 'bust size': '34b',
 'item_id': '153475',
 'weight': '132lbs',
 'rating': '10',
 'rented for': 'other',
 'review_text': 'I rented this dress for a photo shoot. The theme was "Hollywood Glam and Big Beautiful Hats". The dress was very comfortable and easy to move around in. It is definitely on my list to rent again for another formal event. ',
 'body type': 'straight & narrow',
 'review_summary': 'I felt so glamourous!!!',
 'category': 'gown',
 'height': '5\' 6"',
 'size': 12,
 'age': '36',
 'review_date': 'June 18, 2013'}

In [4]:
# Total number of records loaded from the file.
len(dataset1)

192544

## Baseline Logistic Regression Model

Note: this baseline uses only per-item fit statistics; it does not use any user information or temporal features.

In [5]:
# Positional hold-out split: the first 180000 records are used for training,
# everything after that for testing (no shuffling is applied here).
train, test = dataset1[:180000], dataset1[180000:]

In [6]:
def accuracy(predictions, y):
    """Return the fraction of positions where `predictions` and `y` agree.

    Both arguments are treated as boolean sequences: a position counts as
    wrong when exactly one of the two values is truthy (logical XOR).
    """
    mismatches = np.logical_xor(predictions, y).sum()
    n_samples = len(y)
    return (n_samples - mismatches) / n_samples

In [7]:
def categorization_accuracy(prediction, y):
    """Return the share of labels in `y` that `prediction` matches exactly.

    Elements are compared position by position over the length of `y`.
    """
    n_samples = len(y)
    hits = np.sum([prediction[idx] == y[idx] for idx in range(n_samples)])
    return hits / n_samples

In [8]:
# useful data structures
# Lookup tables over the training split only:
#   reviewsPerUser[user_id] -> list of that user's training records
#   reviewsPerItem[item_id] -> list of that item's training records
reviewsPerUser, reviewsPerItem = defaultdict(list), defaultdict(list)

for d in train:
    u, i = d['user_id'], d['item_id']
    reviewsPerUser[u].append(d)
    reviewsPerItem[i].append(d)

In [9]:
def features(u,i):
    """Build the feature vector for a (user, item) pair from item-level fit history.

    Parameters
    ----------
    u : user id. Currently unused; kept so callers can pass (user, item) pairs.
    i : item id, used as the key into `reviewsPerItem`.

    Returns
    -------
    list
        [1, n_reviews, frac_fit, frac_small, frac_large] where the leading 1
        is a constant bias term and the fractions are computed over the
        item's training reviews.
    """
    # Use .get() rather than indexing: `reviewsPerItem` is a defaultdict, so
    # `reviewsPerItem[i]` would insert an empty list for every unseen item and
    # silently grow the training index while features are being extracted.
    fits = [d['fit'] for d in reviewsPerItem.get(i, [])]
    # The +0.1 keeps the ratios defined (and equal to 0) for items with no reviews.
    denom = len(fits) + 0.1
    # NOTE(review): for rows taken from `train`, these counts include the row's
    # own label, since `reviewsPerItem` is built from the same training records.
    fs = [
        len(fits),
        fits.count('fit') / denom,
        fits.count('small') / denom,
        fits.count('large') / denom,
    ]
    return [1] + fs

In [10]:
# Feature matrices: one row per record, in the same order as `train` / `test`.
X_train = [features(row['user_id'], row['item_id']) for row in train]
X_test = [features(row['user_id'], row['item_id']) for row in test]

In [11]:
# Boolean targets, one pair of train/test vectors per fit label.
# Each vector is True exactly where the record's 'fit' field equals that label.
y_train_fit = [row['fit'] == "fit" for row in train]
y_test_fit = [row['fit'] == "fit" for row in test]

y_train_small = [row['fit'] == "small" for row in train]
y_test_small = [row['fit'] == "small" for row in test]

y_train_large = [row['fit'] == "large" for row in train]
y_test_large = [row['fit'] == "large" for row in test]

In [12]:
# Binary model: does the record's label equal 'fit'?
# fit_intercept=False because features() already prepends a constant 1.
mod_fit = linear_model.LogisticRegression(
    C=1, max_iter=200, fit_intercept=False
).fit(X_train, y_train_fit)
y_test_pred = mod_fit.predict(X_test)
print(accuracy(y_test_pred, y_test_fit))

0.7502391581632653


In [13]:
# Binary model: does the record's label equal 'small'?
# fit_intercept=False because features() already prepends a constant 1.
mod_small = linear_model.LogisticRegression(
    C=1, max_iter=200, fit_intercept=False
).fit(X_train, y_train_small)
y_test_pred = mod_small.predict(X_test)
print(accuracy(y_test_pred, y_test_small))

0.8696588010204082


In [14]:
# Binary model: does the record's label equal 'large'?
# fit_intercept=False because features() already prepends a constant 1.
mod_large = linear_model.LogisticRegression(
    C=1, max_iter=200, fit_intercept=False
).fit(X_train, y_train_large)
y_test_pred = mod_large.predict(X_test)
print(accuracy(y_test_pred, y_test_large))

0.8783482142857143


In [15]:
def class_prediction(X):
    """Combine the three binary models into one label per row of `X`.

    A row is labelled 'large' or 'small' only when that model's score is
    strictly greater than both of the others; every other case, including
    ties, is labelled 'fit'.

    Returns a list of 'fit' / 'small' / 'large' strings, one per row.
    """
    # Column 1 of predict_proba is used as each model's score
    # (presumably the probability of the True class — confirm via classes_).
    p_fit = mod_fit.predict_proba(X)[:, 1]
    p_small = mod_small.predict_proba(X)[:, 1]
    p_large = mod_large.predict_proba(X)[:, 1]

    def pick(f, s, l):
        # Strict comparisons on purpose: ties fall through to 'fit'.
        if l > s and l > f:
            return 'large'
        if s > f and s > l:
            return 'small'
        return 'fit'

    return [pick(f, s, l) for f, s, l in zip(p_fit, p_small, p_large)]

In [16]:
# Three-way evaluation: compare the combined prediction with the true
# 'fit' label of every held-out record.
y = [row['fit'] for row in test]
preds = class_prediction(X_test)
print("Categorization Accuracy of Model: ", categorization_accuracy(preds, y))

Categorization Accuracy of Model:  0.75
