In [2]:
from pathlib import Path
import pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from data_preparation import get_train_runttherunway_data, get_test_runttherunway_data, get_processed_renttherunway_data


In [9]:
import time

In [90]:
import pickle
import re
import time
from os import listdir, makedirs, remove
from os.path import exists, join
def question(question_text):
    """Ask a yes/no question on stdin until a recognisable answer is given.

    Parameters
    ----------
    question_text : str
        Text shown to the user; the suffix ``"[y/n] "`` is appended verbatim.

    Returns
    -------
    bool
        True for "yes"/"y"/"ye", False for "no"/"n" (case-insensitive,
        surrounding whitespace ignored).
    """
    # Loop instead of recursing so that repeated invalid answers cannot grow
    # the call stack.
    while True:
        answer = input(question_text + "[y/n] ").strip().lower()
        if answer in ("yes", "y", "ye"):
            return True
        if answer in ("no", "n"):
            return False
        print("Please enter yes or no.")

def current_timestamp():
    """Return the current local time as a ``YYYYMMDD-HHMMSS`` string."""
    now = time.localtime()
    return time.strftime("%Y%m%d-%H%M%S", now)

def save_model(model, filename, dir="models/", add_date=True, extension="model"):
    """Pickle ``model`` to ``<dir>/<filename>[timestamp].<extension>``.

    Parameters
    ----------
    model : object or None
        Object to pickle. Nothing is written when it is None.
    filename : str
        Base file name without extension.
    dir : str
        Target directory, with or without a trailing separator. It is created
        when missing. An empty string means the current working directory.
    add_date : bool
        When True, the result of ``current_timestamp()`` is appended to
        ``filename``.
    extension : str
        File extension without the leading dot.

    Notes
    -----
    If the target file already exists the user is asked for confirmation via
    ``question``; on a negative answer nothing is written.
    """
    if model is None:
        return

    if add_date:
        filename += current_timestamp()
    # join() tolerates a missing trailing separator, unlike plain concatenation.
    filepath = join(dir, f"{filename}.{extension}")

    if exists(filepath):
        if not question(f"file {filepath} already exists. Do you want to overwrite it?"):
            print("Exiting. Model was not saved.")
            return
        print("Model will be overwritten.")
    elif dir:
        # open(..., "wb") does not create parent directories.
        makedirs(dir, exist_ok=True)

    with open(filepath, "wb") as file:
        pickle.dump(model, file)

def get_matching_models(filename, dir="models/", extension="model"):
    """List timestamped model files for ``filename`` found in ``dir``.

    A file matches when its whole name is
    ``<filename><8 digits>-<6 digits>.<extension>``, i.e. the naming scheme
    produced by ``save_model(..., add_date=True)``.

    Parameters
    ----------
    filename : str
        Base model name. It is matched literally (regex metacharacters are
        escaped).
    dir : str
        Directory to scan. An empty string means the current working directory.
    extension : str
        File extension without the leading dot, matched literally.

    Returns
    -------
    list of str
        Matching file names (without directory), sorted ascending. Because the
        timestamp is zero-padded, this is also chronological order.
    """
    pattern = re.compile(re.escape(filename) + r"\d{8}-\d{6}\." + re.escape(extension))
    # fullmatch() rejects names with trailing junk such as "*.model.bak".
    return sorted(name for name in listdir(dir or ".") if pattern.fullmatch(name))

def delete_files(file_list, dir=""):
    """Delete the given files after an interactive confirmation.

    Parameters
    ----------
    file_list : list of str
        File names, relative to ``dir``.
    dir : str
        Directory the names are relative to, with or without a trailing
        separator. An empty string leaves the names unchanged.
    """
    file_list = [join(dir, name) for name in file_list]
    if not file_list:
        # Nothing to confirm; avoid prompting about an empty list.
        print("No files to delete.")
        return
    if question(f"Following files will be deleted : {file_list}. Continue?"):
        for filename in file_list:
            remove(filename)

def delete_models(filename, leave="none", dir="models/", extension="model"):
    """Delete saved models for ``filename``, optionally keeping one of them.

    Parameters
    ----------
    filename : str
        Base model name as passed to ``save_model``.
    leave : str or None
        Which file to keep:
        ``"none"``/None keeps nothing;
        ``"latest"``/``"last"``/``"newest"``/``"new"`` keeps the timestamped
        file with the greatest name;
        ``"old"``/``"oldest"``/``"first"`` keeps the one with the smallest name;
        ``"no_date"`` keeps the undated ``<filename>.<extension>`` file.
    dir : str
        Directory containing the models.
    extension : str
        File extension without the leading dot.

    Raises
    ------
    ValueError
        If ``leave`` is not one of the values above. A misspelled value would
        otherwise be treated as "delete everything".
    """
    keep_newest = ("latest", "last", "newest", "new")
    keep_oldest = ("old", "oldest", "first")
    if leave is not None and leave not in ("none", "no_date") + keep_newest + keep_oldest:
        raise ValueError(
            f"Unrecognised leave value {leave}. Leave can be: 'none', 'latest', 'oldest', 'no_date'"
        )

    models_to_delete = get_matching_models(filename, dir, extension)
    # max()/min() raise on an empty list, so only pick a survivor when there
    # is at least one timestamped file.
    if models_to_delete:
        if leave in keep_newest:
            models_to_delete.remove(max(models_to_delete))
        elif leave in keep_oldest:
            models_to_delete.remove(min(models_to_delete))

    undated_name = f"{filename}.{extension}"
    if leave != "no_date" and exists(join(dir, undated_name)):
        models_to_delete.append(undated_name)
    delete_files(models_to_delete, dir)


def load_model(filename, dir="models/", select="latest", extension="model"):
    """Unpickle a model previously written by ``save_model``.

    Parameters
    ----------
    filename : str
        Base model name.
    dir : str
        Directory containing the models, with or without a trailing separator.
    select : str or None
        ``None``/``"none"`` loads the undated ``<filename>.<extension>``;
        ``"latest"``/``"last"``/``"newest"``/``"new"`` loads the timestamped
        file with the greatest name;
        ``"first"``/``"oldest"``/``"old"`` loads the one with the smallest name.
    extension : str
        File extension without the leading dot.

    Returns
    -------
    object
        The unpickled object.

    Raises
    ------
    ValueError
        If ``select`` is not recognised, or if a timestamped model was
        requested but none exists in ``dir``.
    """
    newest = ("latest", "last", "newest", "new")
    oldest = ("first", "oldest", "old")

    if select is None or select == "none":
        chosen = f"{filename}.{extension}"
    elif select in newest or select in oldest:
        # Only scan the directory when a timestamped file is actually needed.
        candidates = get_matching_models(filename, dir, extension)
        if not candidates:
            raise ValueError(
                f"No timestamped models named '{filename}' with extension "
                f"'{extension}' found in '{dir}'."
            )
        chosen = max(candidates) if select in newest else min(candidates)
    else:
        raise ValueError(f"Unrecognised select value {select}. Select can be: 'none', 'latest', 'first'")

    filepath = join(dir, chosen)
    with open(filepath, "rb") as file:
        print(f"Loading model {filepath}")
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that were produced by a trusted source.
        return pickle.load(file)


In [95]:
# Pickle the model under a fixed name, without a timestamp suffix.
# NOTE(review): `hss_test` is only created in a later cell of this notebook,
# so this cell depends on out-of-order execution and fails on a fresh
# Restart & Run All.
save_model(hss_test, "hss_test", add_date=False)

In [97]:
# Remove the saved "hss_test" models (timestamped ones and, if present, the
# undated file); the helper asks for confirmation before deleting.
delete_models("hss_test", leave="none")

In [None]:
# Load the training frame through the project-local data_preparation helper.
# The model class below reads the columns "user_id", "item_id", "size" and
# "result" from the frame it is given.
df = get_processed_renttherunway_data()

In [4]:
class HierarchicalSizeSimplified:
    """Iteratively fitted size model with per-customer, per-article and per-outcome terms.

    The model keeps its working state both in numpy arrays on the instance and
    in extra columns on the training DataFrame. One call to ``update()``
    refreshes, in order, the per-customer ``beta_sigma_c``, the per-customer
    mean/variance ``mu_c``, the per-article mean/variance ``mu_a`` and the
    per-outcome mean/variance ``eta`` for the SMALL and BIG outcomes.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain the columns ``user_id``, ``item_id``, ``size`` and
        ``result``. ``result`` is mapped through the codes KEPT=0, BIG=1,
        SMALL=2. The frame is NOT copied: working columns are added to it in
        place.

    Notes
    -----
    ``user_id`` and ``item_id`` are used directly as positions into arrays
    whose rows come from ``groupby(...)`` results.
    NOTE(review): this presumably requires the ids to be contiguous integers
    ``0..max`` with every id present in ``train_data`` - confirm against the
    data preparation step.
    """

    def __init__(self, train_data):
        self.train = train_data
        self.iterations = 0
        # Sizes of the id spaces, derived from the largest id rather than from
        # the number of distinct ids.
        self.number_of_customers = train_data["user_id"].max() + 1
        self.number_of_articles = train_data["item_id"].max() + 1
        self.KEPT, self.BIG, self.SMALL = 0, 1, 2
        self._init_const()
        self._init_variables()
        self.fill_variables_in_train()

    def _init_const(self):
        """Compute the quantities that stay fixed across updates."""
        # Number of training rows per customer, ordered by user_id.
        self.Nc = self.train.groupby("user_id")["user_id"].count().values
        # Taken from the training frame itself rather than from a
        # notebook-global `df`, so the class works with any frame passed in.
        self.mu_0 = self.train["size"].mean()
        self.sigma_0 = self.train["size"].std()
        self.sigma_0_inverse_square = 1 / (self.sigma_0 ** 2)
        self.eta_kept = 0
        self.alpha_sigma_c = (self.Nc / 2) + 1

    def _init_variables(self):
        """Set the starting values of every quantity refreshed by ``update()``."""
        self.beta_sigma_c = np.ones_like(self.Nc) * 2
        self.mean_mu_c = np.ones_like(self.Nc) * self.mu_0
        # NOTE(review): initialised with sigma_0 (a standard deviation), not
        # sigma_0**2 - confirm whether a variance was intended.
        self.variance_mu_c = np.ones_like(self.Nc) * self.sigma_0
        self.mean_mu_a = np.zeros(self.number_of_articles)
        self.variance_mu_a = np.ones(self.number_of_articles)
        self.mean_eta_small = -1
        self.mean_eta_big = 1
        self.variance_eta_small = 1
        self.variance_eta_big = 1
        # Convergence flags start as False so that all_converged() is safe to
        # call before the first update().
        self.converged_beta_sigma_c = False
        self.converged_mean_mu_c = False
        self.converged_variance_mu_c = False
        self.converged_mean_mu_a = False
        self.converged_variance_mu_a = False
        self.converged_mean_eta_r = False
        self.converged_variance_eta_r = False

    def fill_variables_in_train(self):
        """Copy every per-customer/article/outcome value onto the training rows."""
        self.fill_variables_sigma_c()
        self.fill_variables_mu_c()
        self.fill_variables_mu_a()
        self.train["Nc"] = self.Nc[self.train["user_id"]]
        self.fill_variables_eta_r()

    def fill_variables_sigma_c(self):
        """Write the per-row ratio alpha_sigma_c / beta_sigma_c of the row's customer."""
        self.train["alpha_to_beta"] = (
            self.alpha_sigma_c[self.train["user_id"]] / self.beta_sigma_c[self.train["user_id"]]
        )

    def fill_variables_mu_c(self):
        """Write the customer mean/variance of each row's customer."""
        self.train["mean_mu_c"] = self.mean_mu_c[self.train["user_id"]]
        self.train["variance_mu_c"] = self.variance_mu_c[self.train["user_id"]]

    def fill_variables_mu_a(self):
        """Write the article mean/variance of each row's article."""
        self.train["mean_mu_a"] = self.mean_mu_a[self.train["item_id"]]
        self.train["variance_mu_a"] = self.variance_mu_a[self.train["item_id"]]

    def fill_variables_eta_r(self):
        """Write the outcome mean/variance selected by each row's ``result`` code."""
        self.train["mean_eta_r"] = self.train["result"].map(
            {self.SMALL: self.mean_eta_small, self.BIG: self.mean_eta_big, self.KEPT: self.eta_kept}
        )
        # The KEPT outcome is a fixed constant, hence zero variance.
        self.train["variance_eta_r"] = self.train["result"].map(
            {self.SMALL: self.variance_eta_small, self.BIG: self.variance_eta_big, self.KEPT: 0}
        )

    def all_converged(self):
        """Return a truthy value when every quantity passed its last closeness check."""
        return (self.converged_beta_sigma_c and
                self.converged_mean_mu_a and self.converged_variance_mu_a and
                self.converged_variance_mu_c and self.converged_mean_mu_c and
                self.converged_mean_eta_r and self.converged_variance_eta_r)

    # Each _update_and_check_* helper compares the new value with the stored
    # one (numpy default tolerances), records the outcome in the matching
    # converged_* flag and then stores the new value.

    def _update_and_check_variance_mu_c(self, variance_mu_c):
        self.converged_variance_mu_c = np.allclose(self.variance_mu_c, variance_mu_c)
        self.variance_mu_c = variance_mu_c

    def _update_and_check_beta_sigma_c(self, beta_sigma_c):
        self.converged_beta_sigma_c = np.allclose(self.beta_sigma_c, beta_sigma_c)
        self.beta_sigma_c = beta_sigma_c

    def _update_and_check_mean_mu_c(self, mean_mu_c):
        self.converged_mean_mu_c = np.allclose(self.mean_mu_c, mean_mu_c)
        self.mean_mu_c = mean_mu_c

    def _update_and_check_mean_mu_a(self, mean_mu_a):
        self.converged_mean_mu_a = np.allclose(self.mean_mu_a, mean_mu_a)
        self.mean_mu_a = mean_mu_a

    def _update_and_check_variance_mu_a(self, variance_mu_a):
        self.converged_variance_mu_a = np.allclose(self.variance_mu_a, variance_mu_a)
        self.variance_mu_a = variance_mu_a

    def _update_and_check_variance_eta_r(self, small, big):
        self.converged_variance_eta_r = (
            np.isclose(self.variance_eta_small, small) and np.isclose(self.variance_eta_big, big)
        )
        self.variance_eta_small = small
        self.variance_eta_big = big

    def _update_and_check_mean_eta_r(self, small, big):
        self.converged_mean_eta_r = (
            np.isclose(self.mean_eta_small, small) and np.isclose(self.mean_eta_big, big)
        )
        self.mean_eta_small = small
        self.mean_eta_big = big

    def update_sigma_c(self):
        """Refresh ``beta_sigma_c`` from the per-customer sum of expected squared residuals."""
        # Squared residual of size against the three means, plus the sum of
        # their variances.
        self.train["expected_sigma_c"] = (
            (self.train["size"]
             - self.train["mean_mu_c"] - self.train["mean_mu_a"] - self.train["mean_eta_r"]) ** 2
            + (self.train["variance_mu_c"] + self.train["variance_mu_a"] + self.train["variance_eta_r"])
        )
        beta_sigma_c = 2 + 0.5 * self.train.groupby('user_id')["expected_sigma_c"].sum().values
        self._update_and_check_beta_sigma_c(beta_sigma_c)
        self.fill_variables_sigma_c()

    def update_mu_c(self):
        """Refresh the per-customer mean and variance."""
        variance_mu_c = 1 / (self.Nc * self.alpha_sigma_c / self.beta_sigma_c + self.sigma_0_inverse_square)
        self._update_and_check_variance_mu_c(variance_mu_c)

        # NOTE(review): this residual is (mu_a + eta_r - size), the negative of
        # the (size - mu_c - mu_a - eta_r) residual used in update_sigma_c -
        # confirm the intended sign. The same pattern appears in update_mu_a
        # and update_eta_r.
        self.train["expected_for_mu_c"] = self.train["mean_mu_a"] + self.train["mean_eta_r"] - self.train["size"]
        sum_over_c = self.train.groupby('user_id')["expected_for_mu_c"].sum().values
        # NOTE(review): the variance above uses 1/sigma_0**2 while this term
        # uses mu_0/sigma_0 - confirm whether mu_0 * sigma_0_inverse_square
        # was intended.
        mean_mu_c = (sum_over_c * self.alpha_sigma_c / self.beta_sigma_c
                     + self.mu_0 / self.sigma_0) * self.variance_mu_c
        self._update_and_check_mean_mu_c(mean_mu_c)

        self.fill_variables_mu_c()

    def update_mu_a(self):
        """Refresh the per-article mean and variance."""
        variance_mu_a = 1 / (1 + self.train.groupby('item_id')["alpha_to_beta"].sum().values)
        self._update_and_check_variance_mu_a(variance_mu_a)

        self.train["expected_for_mu_a"] = (
            (self.train["mean_mu_c"] + self.train["mean_eta_r"] - self.train["size"])
            * self.train["alpha_to_beta"]
        )
        sum_over_a = self.train.groupby('item_id')["expected_for_mu_a"].sum().values
        mean_mu_a = sum_over_a * self.variance_mu_a
        self._update_and_check_mean_mu_a(mean_mu_a)

        self.fill_variables_mu_a()

    def update_eta_r(self):
        """Refresh the mean and variance of the SMALL and BIG outcome terms."""
        is_small = self.train["result"] == self.SMALL
        is_big = self.train["result"] == self.BIG

        variance_eta_small = 1 / (1 + self.train[is_small]["alpha_to_beta"].sum())
        variance_eta_big = 1 / (1 + self.train[is_big]["alpha_to_beta"].sum())
        self._update_and_check_variance_eta_r(variance_eta_small, variance_eta_big)

        self.train["expected_for_eta_r"] = (
            (self.train["mean_mu_c"] + self.train["mean_mu_a"] - self.train["size"])
            * self.train["alpha_to_beta"]
        )
        # The constants -1 and +1 match the starting means of the SMALL and
        # BIG terms set in _init_variables.
        mean_eta_small = self.variance_eta_small * (-1 + self.train[is_small]["expected_for_eta_r"].sum())
        mean_eta_big = self.variance_eta_big * (1 + self.train[is_big]["expected_for_eta_r"].sum())
        self._update_and_check_mean_eta_r(mean_eta_small, mean_eta_big)
        self.fill_variables_eta_r()

    def update(self):
        """Run one full sweep over all quantities and count it in ``iterations``."""
        self.iterations += 1
        self.update_sigma_c()
        self.update_mu_c()
        self.update_mu_a()
        self.update_eta_r()




In [None]:
# Build the model on the loaded frame. The constructor keeps a reference to
# `df` (no copy) and adds its working columns to it in place.
hss_test = HierarchicalSizeSimplified(df)

In [None]:
# 100 iterations ~ 12s
# Run at most 100 update sweeps; stop early as soon as every tracked quantity
# passed its closeness check, printing the zero-based index of that sweep.
# Nothing is printed when the loop runs out without converging.
for i in range(100):
    hss_test.update()
    if hss_test.all_converged():
        print(i)
        break

In [6]:
# Inspect the fitted mean of the term applied to rows whose result is SMALL.
hss_test.mean_eta_small

7.119018449959658

In [74]:
# Pickle the fitted model with a timestamp appended to the file name.
save_model(hss_test, "hss_test", add_date=True)

In [68]:
# Load the timestamped "hss_test" model with the greatest file name.
# NOTE(review): the returned object is not assigned, so it is only displayed
# and `hss_test` keeps referring to the in-memory instance.
load_model("hss_test", select="latest")

Loading model models/hss_test20220715-103733.model


<__main__.HierarchicalSizeSimplified at 0x7f5bd8234fa0>

In [7]:
# Inspect the fitted mean of the term applied to rows whose result is BIG.
hss_test.mean_eta_big

3.346381647110297

In [8]:
# Display the training frame, including the working columns added by the model.
hss_test.train

Unnamed: 0,result_original,user_id_original,item_id_original,size,review_date,category,user_id,item_id,result,alpha_to_beta,...,variance_mu_c,mean_mu_a,variance_mu_a,Nc,mean_eta_r,variance_eta_r,expected_sigma_c,expected_for_mu_c,expected_for_mu_a,expected_for_eta_r
0,fit,420272,2260466,14,"April 20, 2016",romper,44334,4396,0,0.000188,...,66.740319,0.148828,0.956167,6,0.0,0.0,6617.741781,-14.325771,0.015265,0.015293
1,fit,273551,153475,12,"June 18, 2013",gown,28835,65,0,0.000312,...,70.576094,10.354689,0.848392,1,0.0,0.0,9624.550840,-4.008021,0.027931,0.031157
2,fit,360448,1063761,4,"December 14, 2015",sheath,37976,1945,0,0.000319,...,70.539116,1.140773,0.890603,1,0.0,0.0,9400.358697,-5.125536,0.031123,0.031487
3,fit,909926,126335,8,"February 12, 2014",dress,96080,7,0,0.000206,...,71.105221,24.516869,0.617642,1,0.0,0.0,14549.615920,17.632552,0.019532,0.024586
4,fit,151944,616682,12,"September 26, 2016",gown,15959,1032,0,0.000205,...,65.389103,2.574426,0.961189,7,0.0,0.0,7024.076948,-9.419857,0.016596,0.017124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192539,fit,66386,2252812,8,"May 18, 2016",jumpsuit,7026,4382,0,0.000173,...,66.366476,-0.161312,0.887071,7,0.0,0.0,7614.283171,-7.975646,0.015032,0.015004
192540,fit,118398,682043,4,"September 30, 2016",dress,12494,1164,0,0.000157,...,69.793032,-1.624764,0.702435,3,0.0,0.0,9214.119204,-4.925670,0.015162,0.014908
192541,fit,47002,683251,8,"March 4, 2016",dress,5019,1166,0,0.009237,...,4.245548,-3.663509,0.648229,24,0.0,0.0,117.206876,-8.231496,-0.082414,-0.116253
192542,fit,961120,126335,16,"November 25, 2015",dress,101534,7,0,0.000238,...,70.945454,24.516869,0.617642,1,0.0,0.0,12611.361685,9.632552,0.020553,0.026383
