In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
DATA_DIR = Path("./data")
DATA_FILEPATH = DATA_DIR / "renttherunway_final_data.json"
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_json(DATA_FILEPATH, lines=True)
df = df[["fit", "user_id", "item_id", "size", "review_date", "category"]].rename(columns={"fit":"result"})
df.head()


Unnamed: 0,result,user_id,item_id,size,review_date,category
0,fit,420272,2260466,14,"April 20, 2016",romper
1,fit,273551,153475,12,"June 18, 2013",gown
2,fit,360448,1063761,4,"December 14, 2015",sheath
3,fit,909926,126335,8,"February 12, 2014",dress
4,fit,151944,616682,12,"September 26, 2016",gown


In [3]:
def reindex_column(df, column_name):
    result_df = df.sort_values(column_name)
    result_column = []
    current_new, current_old = 0, result_df[column_name][0]
    for i, row in enumerate(result_df[column_name]):
        if row != current_old:
            current_old = row
            if i != 0:
                current_new+=1
        result_column.append(current_new)
    result_df = result_df.rename(columns = {column_name: column_name+"_old"})
    result_df[column_name]= result_column
    return result_df
    


In [4]:
df = reindex_column(df, "user_id")
df = reindex_column(df, "item_id")
df = reindex_column(df, "result")

In [5]:
df.sort_values("item_id").head()

Unnamed: 0,result_old,user_id_old,item_id_old,size,review_date,category,user_id,item_id,result
104062,small,812181,123373,9,"March 17, 2014",gown,85603,0,2
63099,fit,229133,123373,24,"May 6, 2014",gown,24037,0,0
138915,fit,584548,123373,3,"November 28, 2015",gown,61609,0,0
123878,fit,584611,123373,21,"March 10, 2014",gown,61619,0,0
89481,fit,741127,123373,21,"September 17, 2012",gown,78026,0,0


In [6]:
df["size"].mean()

12.245175128801728

In [63]:
class HierarchicalSizeSimplified:
    def __init__(self, train_data):
        #self.train_original = train_data
        self.train = train_data
        self.customers = train_data["user_id"].sort_values().unique()
        self.articles = train_data["item_id"].sort_values().unique()
        self.number_of_customers = self.customers.size
        self.number_of_articles = self.articles.size
        self.KEPT, self.BIG, self.SMALL = 0,1,2
        self._init_const()
        self._init_variables()
        self.fill_variables_in_train()

    def _init_const(self):
        self.Nc = self.train.groupby("user_id")["user_id"].count().values
        self.mu_0 = df["size"].mean()
        self.sigma_0 = df["size"].std()
        self.sigma_0_inverse_square = 1/(self.sigma_0**2)
        self.eta_kept = 0
        self.alpha_sigma_c = (self.Nc / 2 ) + 1
    
    def _init_variables(self):
        self.beta_sigma_c = np.ones_like(self.Nc)*2
        self.mean_mu_c = np.ones_like(self.Nc)*self.mu_0
        self.variance_mu_c = np.ones_like(self.Nc)*self.sigma_0
        self.mean_mu_a = np.zeros(self.number_of_articles)
        self.variance_mu_a = np.ones(self.number_of_articles)
        self.mean_eta_small = -1
        self.mean_eta_big = 1
        self.variance_eta_small = 1
        self.variance_eta_big = 1
        

    def fill_variables_in_train(self):
        self.fill_variables_sigma_c()
        self.fill_variables_mu_c()
        self.fill_variables_mu_a()
        self.train["Nc"] = self.Nc[self.train["user_id"]]
        self.fill_variables_eta_r()

    def fill_variables_sigma_c(self):
        self.train["alpha_to_beta"] = self.alpha_sigma_c[self.train["user_id"]] / self.beta_sigma_c[self.train["user_id"]]
    def fill_variables_mu_c(self):
        self.train["mean_mu_c"] = self.mean_mu_c[self.train["user_id"]]
        self.train["variance_mu_c"]= self.variance_mu_c[self.train["user_id"]]
    def fill_variables_mu_a(self):
        self.train["mean_mu_a"] = self.mean_mu_a[self.train["item_id"]]
        self.train["variance_mu_a"] = self.variance_mu_a[self.train["item_id"]]
    def fill_variables_eta_r(self):
        self.train["mean_eta_r"] = self.train["result"].map({self.SMALL: self.mean_eta_small, self.BIG: self.mean_eta_big, self.KEPT: self.eta_kept})
        self.train["variance_eta_r"] = self.train["result"].map({self.SMALL: self.variance_eta_small, self.BIG: self.variance_eta_big, self.KEPT: 0})

    def all_converged(self):
        return (self.converged_beta_sigma_c and
                self.converged_mean_mu_a and self.converged_variance_mu_a and 
                self.converged_variance_mu_c and self.converged_mean_mu_c and
                self.converged_mean_eta_r and self.converged_variance_eta_r)

    def _update_and_check_variance_mu_c(self, variance_mu_c):
        self.converged_variance_mu_c = np.allclose(self.variance_mu_c, variance_mu_c)
        self.variance_mu_c = variance_mu_c  
    def _update_and_check_beta_sigma_c(self, beta_sigma_c):
        self.converged_beta_sigma_c = np.allclose(self.beta_sigma_c, beta_sigma_c)
        self.beta_sigma_c = beta_sigma_c
    def _update_and_check_mean_mu_c(self, mean_mu_c):
        self.converged_mean_mu_c = np.allclose(self.mean_mu_c, mean_mu_c)
        self.mean_mu_c = mean_mu_c
    def _update_and_check_mean_mu_a(self, mean_mu_a):
        self.converged_mean_mu_a = np.allclose(self.mean_mu_a, mean_mu_a)
        self.mean_mu_a = mean_mu_a
    def _update_and_check_variance_mu_a(self, variance_mu_a):
        self.converged_variance_mu_a = np.allclose(self.variance_mu_a, variance_mu_a)
        self.variance_mu_a = variance_mu_a  
    def _update_and_check_variance_eta_r(self, small, big):
        self.converged_variance_eta_r = np.isclose(self.variance_eta_small, small) and np.isclose(self.variance_eta_big, big)
        self.variance_eta_small = small
        self.variance_eta_big = big
    def _update_and_check_mean_eta_r(self, small, big):
        self.converged_mean_eta_r = np.isclose(self.mean_eta_small, small) and np.isclose(self.mean_eta_big, big)
        self.mean_eta_small = small
        self.mean_eta_big = big

    def update_sigma_c(self):
        self.train["expected_sigma_c"] = ((self.train["size"] 
                                           -self.train["mean_mu_c"] -self.train["mean_mu_a"] - self.train["mean_eta_r"])**2
                                          + (self.train["variance_mu_c"]+self.train["variance_mu_a"]+self.train["variance_eta_r"]))
        beta_sigma_c = 2 + 0.5*self.train.groupby('user_id')["expected_sigma_c"].sum().values
        self._update_and_check_beta_sigma_c(beta_sigma_c)
        self.fill_variables_sigma_c()

    def update_mu_c(self):
        variance_mu_c = 1/(self.Nc*self.alpha_sigma_c/self.beta_sigma_c + self.sigma_0_inverse_square)
        self._update_and_check_variance_mu_c(variance_mu_c)

        self.train["expected_for_mu_c"] = self.train["mean_mu_a"]+self.train["mean_eta_r"]-self.train["size"]
        sum_over_c = self.train.groupby('user_id')["expected_for_mu_c"].sum().values
        mean_mu_c = (sum_over_c*self.alpha_sigma_c/self.beta_sigma_c + self.mu_0/self.sigma_0) * self.variance_mu_c ##CHANGED added + self.mu_0/self.sigma_0
        self._update_and_check_mean_mu_c(mean_mu_c)

        self.fill_variables_mu_c()

    def update_mu_a(self):
        variance_mu_a = 1/ (1 + self.train.groupby('item_id')["alpha_to_beta"].sum().values)
        self._update_and_check_variance_mu_a(variance_mu_a)

        self.train["expected_for_mu_a"] = (self.train["mean_mu_c"]+self.train["mean_eta_r"]-self.train["size"])*self.train["alpha_to_beta"]
        sum_over_a = self.train.groupby('item_id')["expected_for_mu_a"].sum().values
        mean_mu_a = sum_over_a * self.variance_mu_a
        self._update_and_check_mean_mu_a(mean_mu_a)

        self.fill_variables_mu_a()

    def update_eta_r(self):
        variance_eta_small = 1/(1+self.train[self.train["result"]==self.SMALL]["alpha_to_beta"].sum())
        variance_eta_big   = 1/(1+self.train[self.train["result"]==self.BIG]["alpha_to_beta"].sum())
        self._update_and_check_variance_eta_r(variance_eta_small, variance_eta_big)
        self.train["expected_for_eta_r"] =  (self.train["mean_mu_c"]+self.train["mean_mu_a"]-self.train["size"])*self.train["alpha_to_beta"]
        mean_eta_small = self.variance_eta_small * (-1+ self.train[self.train["result"]==self.SMALL]["expected_for_eta_r"].sum())
        mean_eta_big = self.variance_eta_big * (1+ self.train[self.train["result"]==self.BIG]["expected_for_eta_r"].sum())
        self._update_and_check_mean_eta_r(mean_eta_small, mean_eta_big)
        self.fill_variables_eta_r()

        
    def update(self):
        self.update_sigma_c()
        self.update_mu_c()
        self.update_mu_a()
        self.update_eta_r()




In [64]:
hss_test = HierarchicalSizeSimplified(df)

In [65]:
# 100 iterations ~ 12s
for i in range(1000):
    hss_test.update()
    if hss_test.all_converged():
        print(i)
        break

In [66]:
hss_test.mean_eta_small

4.731064030852345

In [67]:
hss_test.mean_eta_big

0.8453725741898405

In [68]:
hss_test.train

Unnamed: 0,result_old,user_id_old,item_id_old,size,review_date,category,user_id,item_id,result,alpha_to_beta,...,variance_mu_c,mean_mu_a,variance_mu_a,Nc,mean_eta_r,variance_eta_r,expected_for_mu_c,expected_for_mu_a,expected_for_eta_r,expected_sigma_c
28348,fit,847302,948396,4,"December 19, 2016",gown,89443,1703,0,0.000183,...,69.412779,0.790537,0.952812,3,0.000000,0.000000,-3.559194,0.017545,0.017690,9358.582184
189680,fit,567719,982932,20,"December 5, 2017",dress,59854,1766,0,0.000173,...,68.728660,1.415800,0.953624,4,0.000000,0.000000,-18.092525,0.013634,0.013879,6575.520295
109227,fit,311059,982932,20,"October 24, 2016",dress,32823,1766,0,0.000440,...,69.944058,1.415800,0.953624,1,0.000000,0.000000,-18.092525,0.035286,0.035908,6820.236088
48601,fit,177396,982932,4,"June 14, 2016",dress,18649,1766,0,0.000115,...,68.189683,1.415800,0.953624,7,0.000000,0.000000,-2.092525,0.010917,0.011081,9384.017799
126581,fit,97385,982932,16,"November 15, 2017",dress,10257,1766,0,0.000396,...,70.159213,1.415800,0.953624,1,0.000000,0.000000,-14.092525,0.033538,0.034098,7576.254288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55521,small,603604,916639,1,"July 23, 2016",dress,63587,1636,2,0.000130,...,65.987803,-1.892863,0.759374,10,4.731064,0.029466,5.255031,0.013147,0.011968,10120.816663
84291,small,590895,916639,20,"August 14, 2016",dress,62274,1636,2,0.000223,...,67.804129,-1.892863,0.759374,4,4.731064,0.029466,-13.744969,0.018713,0.016689,6943.617429
83688,small,855337,1962198,8,"May 18, 2015",dress,90294,3798,2,0.000148,...,62.774681,4.585302,0.941187,14,4.731064,0.029466,3.790102,0.013228,0.012842,8830.646270
99447,small,566967,916639,16,"May 27, 2016",dress,59769,1636,2,0.008444,...,3.628072,-1.892863,0.759374,31,4.731064,0.029466,-9.744969,-0.105870,-0.182617,117.790596
