In [1]:
from pathlib import Path
import pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from data_preparation import get_train_runttherunway_data, get_test_runttherunway_data, get_processed_renttherunway_data


In [2]:
from data_preparation import *

In [3]:
import time

In [4]:
import time
import pickle
from os.path import exists
from os import listdir, remove
import re
def question(question_text):
    answer = input(question_text+ "[y/n] ").lower()
    if answer in ["yes", "y", "ye"]:
        return True
    if answer in ["no", "n"]:
        return False
    else:
        print("Please enter yes or no.")
        return question(question_text)

def current_timestamp():
    return time.strftime("%Y%m%d-%H%M%S")

def save_model(model, filename, dir = "models/", add_date=True, extension = "model"):
    if model is not None:
        if add_date:
            filename += current_timestamp()
        filepath = f"{dir}{filename}.{extension}"
        if exists(filepath):
            if not question(f"file {filepath} already exists. Do you want to overwrite it?"):
                print("Exiting. Model was not saved.")
                return
            else:
                print("Model will be overwritten.")
        with open(filepath, "wb") as file:
            pickle.dump(model, file)

def get_matching_models(filename, dir ="models/", extension = "model"):
    return [f for f in listdir("models/") if re.match(filename+"\d{8}-\d{6}\."+extension, f)]

def delete_files(file_list, dir = ""):
    file_list = [dir+f for f in file_list]
    if question(f"Following files will be deleted : {file_list}. Continue?"):
        for filename in file_list:
            remove(filename)

def delete_models(filename, leave="none" ,dir="models/", extension = "model"):
    models_to_delete = get_matching_models(filename, dir, extension)
    if leave in ["latest", "last", "newest", "new"]:
        models_to_delete.remove(max(models_to_delete))
    if leave in ["old", "oldest", "first"]:
        models_to_delete.remove(min(models_to_delete))
    if leave not in ["no_date"] and exists(f"{dir}{filename}.{extension}"):
        models_to_delete.append(f"{filename}.{extension}")
    delete_files(models_to_delete, dir)


def load_model(filename, dir ="models/", select = "latest", extension = "model"):
    models_with_date = get_matching_models(filename, dir, extension)
    if select is None or select == "none":
        filepath = f"{dir}{filename}.{extension}"
    elif select in ["latest", "last", "newest", "new"]:
        filepath = f"{dir}{max(models_with_date)}"
    elif select in ["first", "oldest", "old"]:
        filepath = f"{dir}{min(models_with_date)}"
    else:
        raise ValueError(f"Unrecognised select value {select}. Select can be: 'none', 'latest', 'first'")
    with open(filepath, "rb") as file:
        print(f"Loading model {filepath}")
        return pickle.load(file)


In [5]:
df = get_processed_renttherunway_data()

In [6]:
def normal_pdf(mu = 0, sigma = 1, x = 0):
    return (np.exp(-0.5*
                   ((x-mu)/sigma)**2) /
            (sigma* np.sqrt(2*np.pi)))

In [7]:
class HierarchicalSizeSimplified:
    def __init__(self, train_data):
        #self.train_original = train_data
        self.train = train_data
        self.iterations = 0 
        #self.customers = train_data["user_id"].sort_values().unique()
        #self.articles = train_data["item_id"].sort_values().unique()
        self.number_of_customers = train_data["user_id"].max()+1#self.customers.size
        self.number_of_articles = train_data["item_id"].max()+1#self.articles.size
        self.KEPT, self.BIG, self.SMALL = 0,1,2
        self._init_const()
        self._init_variables()
        self.fill_variables_in_train()

    def _init_const(self):
        self.Nc = self.train.groupby("user_id")["user_id"].count().values
        self.mu_0 = df["size"].mean()
        self.sigma_0 = df["size"].std()
        self.sigma_0_inverse_square = 1/(self.sigma_0**2)
        self.eta_kept = 0
        self.alpha_sigma_c = (self.Nc / 2 ) + 1
    
    def _init_variables(self):
        self.beta_sigma_c = np.ones_like(self.Nc)*2
        self.mean_mu_c = np.ones_like(self.Nc)*self.mu_0
        self.variance_mu_c = np.ones_like(self.Nc)*self.sigma_0
        self.mean_mu_a = np.zeros(self.number_of_articles)
        self.variance_mu_a = np.ones(self.number_of_articles)
        self.mean_eta_small = -1
        self.mean_eta_big = 1
        self.variance_eta_small = 1
        self.variance_eta_big = 1

    def fill_variables_in_train(self):
        self.fill_variables_sigma_c()
        self.fill_variables_mu_c()
        self.fill_variables_mu_a()
        self.train["Nc"] = self.Nc[self.train["user_id"]]
        self.fill_variables_eta_r()

    def fill_variables_sigma_c(self):
        self.train["alpha_to_beta"] = self.alpha_sigma_c[self.train["user_id"]] / self.beta_sigma_c[self.train["user_id"]]
    def fill_variables_mu_c(self):
        self.train["mean_mu_c"] = self.mean_mu_c[self.train["user_id"]]
        self.train["variance_mu_c"]= self.variance_mu_c[self.train["user_id"]]
    def fill_variables_mu_a(self):
        self.train["mean_mu_a"] = self.mean_mu_a[self.train["item_id"]]
        self.train["variance_mu_a"] = self.variance_mu_a[self.train["item_id"]]
    def fill_variables_eta_r(self):
        self.train["mean_eta_r"] = self.train["result"].map({self.SMALL: self.mean_eta_small, self.BIG: self.mean_eta_big, self.KEPT: self.eta_kept})
        self.train["variance_eta_r"] = self.train["result"].map({self.SMALL: self.variance_eta_small, self.BIG: self.variance_eta_big, self.KEPT: 0})

    def all_converged(self):
        return (self.converged_beta_sigma_c and
                self.converged_mean_mu_a and self.converged_variance_mu_a and 
                self.converged_variance_mu_c and self.converged_mean_mu_c and
                self.converged_mean_eta_r and self.converged_variance_eta_r)

    def _update_and_check_variance_mu_c(self, variance_mu_c):
        self.converged_variance_mu_c = np.allclose(self.variance_mu_c, variance_mu_c)
        self.variance_mu_c = variance_mu_c  
    def _update_and_check_beta_sigma_c(self, beta_sigma_c):
        self.converged_beta_sigma_c = np.allclose(self.beta_sigma_c, beta_sigma_c)
        self.beta_sigma_c = beta_sigma_c
    def _update_and_check_mean_mu_c(self, mean_mu_c):
        self.converged_mean_mu_c = np.allclose(self.mean_mu_c, mean_mu_c)
        self.mean_mu_c = mean_mu_c
    def _update_and_check_mean_mu_a(self, mean_mu_a):
        self.converged_mean_mu_a = np.allclose(self.mean_mu_a, mean_mu_a)
        self.mean_mu_a = mean_mu_a
    def _update_and_check_variance_mu_a(self, variance_mu_a):
        self.converged_variance_mu_a = np.allclose(self.variance_mu_a, variance_mu_a)
        self.variance_mu_a = variance_mu_a  
    def _update_and_check_variance_eta_r(self, small, big):
        self.converged_variance_eta_r = np.isclose(self.variance_eta_small, small) and np.isclose(self.variance_eta_big, big)
        self.variance_eta_small = small
        self.variance_eta_big = big
    def _update_and_check_mean_eta_r(self, small, big):
        self.converged_mean_eta_r = np.isclose(self.mean_eta_small, small) and np.isclose(self.mean_eta_big, big)
        self.mean_eta_small = small
        self.mean_eta_big = big

    def update_sigma_c(self):
        self.train["expected_sigma_c"] = ((self.train["size"] 
                                           -self.train["mean_mu_c"] -self.train["mean_mu_a"] - self.train["mean_eta_r"])**2
                                          + (self.train["variance_mu_c"]+self.train["variance_mu_a"]+self.train["variance_eta_r"]))
        beta_sigma_c = 2 + 0.5*self.train.groupby('user_id')["expected_sigma_c"].sum().values
        self._update_and_check_beta_sigma_c(beta_sigma_c)
        self.fill_variables_sigma_c()

    def update_mu_c(self):
        variance_mu_c = 1/(self.Nc*self.alpha_sigma_c/self.beta_sigma_c + self.sigma_0_inverse_square)
        self._update_and_check_variance_mu_c(variance_mu_c)

        self.train["expected_for_mu_c"] = self.train["mean_mu_a"]+self.train["mean_eta_r"]-self.train["size"]
        sum_over_c = self.train.groupby('user_id')["expected_for_mu_c"].sum().values
        mean_mu_c = (sum_over_c*self.alpha_sigma_c/self.beta_sigma_c + self.mu_0/self.sigma_0) * self.variance_mu_c ##CHANGED added + self.mu_0/self.sigma_0
        self._update_and_check_mean_mu_c(mean_mu_c)

        self.fill_variables_mu_c()

    def update_mu_a(self):
        variance_mu_a = 1/ (1 + self.train.groupby('item_id')["alpha_to_beta"].sum().values)
        self._update_and_check_variance_mu_a(variance_mu_a)

        self.train["expected_for_mu_a"] = (self.train["mean_mu_c"]+self.train["mean_eta_r"]-self.train["size"])*self.train["alpha_to_beta"]
        sum_over_a = self.train.groupby('item_id')["expected_for_mu_a"].sum().values
        mean_mu_a = sum_over_a * self.variance_mu_a
        self._update_and_check_mean_mu_a(mean_mu_a)

        self.fill_variables_mu_a()

    def update_eta_r(self):
        variance_eta_small = 1/(1+self.train[self.train["result"]==self.SMALL]["alpha_to_beta"].sum())
        variance_eta_big   = 1/(1+self.train[self.train["result"]==self.BIG]["alpha_to_beta"].sum())
        self._update_and_check_variance_eta_r(variance_eta_small, variance_eta_big)
        self.train["expected_for_eta_r"] =  (self.train["mean_mu_c"]+self.train["mean_mu_a"]-self.train["size"])*self.train["alpha_to_beta"]
        mean_eta_small = self.variance_eta_small * (-1+ self.train[self.train["result"]==self.SMALL]["expected_for_eta_r"].sum())
        mean_eta_big = self.variance_eta_big * (1+ self.train[self.train["result"]==self.BIG]["expected_for_eta_r"].sum())
        self._update_and_check_mean_eta_r(mean_eta_small, mean_eta_big)
        self.fill_variables_eta_r()
        
    def update(self):
        self.iterations+=1
        self.update_sigma_c()
        self.update_mu_c()
        self.update_mu_a()
        self.update_eta_r()

    def pdf(self, article, customer, return_status, customer_size, n_samples=1000):
        mu_a_samples = np.random.normal(self.mean_mu_a[article], self.variance_mu_a[article], size=n_samples)
        mu_c_samples = np.random.normal(self.mean_mu_c[customer], self.variance_mu_c[customer], size=n_samples)
        if return_status==FIT_LABEL:
            eta_r_samples = np.zeros(n_samples)
        elif return_status==LARGE_LABEL:
            eta_r_samples = np.random.normal(self.mean_eta_big, self.variance_eta_big, size=n_samples)
        elif return_status==SMALL_LABEL:
            eta_r_samples = np.ranodm.normal(self.mean_eta_small, self.variance_eta_small, size=n_samples)
        else:
            ValueError("unknown return status")
        sigma_c_samples = 1/np.random.gamma(self.alpha_sigma_c[customer], 1/self.beta_sigma_c[customer], size=n_samples)

        mu_samples = mu_a_samples+mu_c_samples+eta_r_samples
        pdf_values = normal_pdf(mu_samples, sigma_c_samples, customer_size)
        return pdf_values.mean()

In [9]:
hss_test = HierarchicalSizeSimplified(df)

In [None]:
# 100 iterations ~ 12s
for i in range(100):
    hss_test.update()
    if hss_test.all_converged():
        print(i)
        break

In [13]:
from tqdm import tqdm

In [15]:
for i in range(10):
    for j in tqdm(range(10000)):
        hss_test.update()
        if hss_test.all_converged():
            print(f"converged after {i*10000+j} iterations")
            break
    save_model(hss_test, "hss_full", add_date=True)
    

100%|██████████| 10000/10000 [25:18<00:00,  6.59it/s]
100%|██████████| 10000/10000 [24:41<00:00,  6.75it/s]
100%|██████████| 10000/10000 [21:57<00:00,  7.59it/s]
100%|██████████| 10000/10000 [21:29<00:00,  7.75it/s]
100%|██████████| 10000/10000 [21:22<00:00,  7.80it/s]
100%|██████████| 10000/10000 [20:39<00:00,  8.07it/s]
100%|██████████| 10000/10000 [20:40<00:00,  8.06it/s]
100%|██████████| 10000/10000 [20:06<00:00,  8.29it/s]
100%|██████████| 10000/10000 [20:10<00:00,  8.26it/s]
100%|██████████| 10000/10000 [19:59<00:00,  8.34it/s]


In [16]:
for i in range(10):
    for j in tqdm(range(10000)):
        hss_test.update()
        if hss_test.all_converged():
            print(f"converged after {i*10000+j} iterations")
            break
    save_model(hss_test, "hss_full", add_date=True)

100%|██████████| 10000/10000 [19:43<00:00,  8.45it/s]
100%|██████████| 10000/10000 [19:51<00:00,  8.39it/s]
100%|██████████| 10000/10000 [19:50<00:00,  8.40it/s]
100%|██████████| 10000/10000 [19:49<00:00,  8.41it/s]
100%|██████████| 10000/10000 [19:48<00:00,  8.42it/s]
100%|██████████| 10000/10000 [19:53<00:00,  8.38it/s]
100%|██████████| 10000/10000 [19:49<00:00,  8.41it/s]
100%|██████████| 10000/10000 [19:58<00:00,  8.34it/s]
100%|██████████| 10000/10000 [20:00<00:00,  8.33it/s]
100%|██████████| 10000/10000 [19:57<00:00,  8.35it/s]


In [20]:
print(hss_test.converged_beta_sigma_c,
        hss_test.converged_mean_mu_a, hss_test.converged_variance_mu_a,
        hss_test.converged_variance_mu_c, hss_test.converged_mean_mu_c,
        hss_test.converged_mean_eta_r, hss_test.converged_variance_eta_r)

False False False False False False False


In [None]:
customer = 1
n_samples = 10
1/np.random.gamma(hss_test.alpha_sigma_c, 
                  1/hss_test.beta_sigma_c[customer],
                   size=n_samples)


In [None]:
hss_test.pdf(0,0,0,20)

In [21]:
hss_test.mean_eta_small

7.267423361921556

In [None]:
save_model(hss_test, "hss_test", add_date=True)

In [None]:
load_model("hss_test", select="latest")

In [22]:
hss_test.mean_eta_big

3.7757170706561882

In [None]:
hss_test.train