In [None]:
import numpy as np
import pandas as pd


# Load the training transactions; article_id is kept as text rather than parsed as a number.
df = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv", dtype={"article_id": str})
print(df.shape)
df.head()

In [None]:
# Convert the transaction date column to datetimes, then display the latest date in the data.
df["t_dat"] = pd.to_datetime(df["t_dat"])
df["t_dat"].max()

In [None]:
# Most recent purchase date of every article.
active_articles = df.groupby("article_id")["t_dat"].max().reset_index()
# Keep articles whose last purchase is on or after 2019-09-01.
# NOTE: this reset_index() has no drop=True, so the old index is kept as an extra "index" column.
active_articles = active_articles[active_articles["t_dat"] >= "2019-09-01"].reset_index()
active_articles.shape

In [None]:
# Keep only transactions whose article appears in active_articles (df is overwritten in place).
df = df[df["article_id"].isin(active_articles["article_id"])].reset_index(drop=True)
df.shape

In [None]:
# Week offset counted backwards from the latest transaction date: 0 covers the most recent 7 days.
df["week"] = (df["t_dat"].max() - df["t_dat"]).dt.days // 7
df["week"].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder


# Class list for the encoder: an extra "placeholder" entry plus every article id still present in df.
article_ids = np.concatenate([["placeholder"], np.unique(df["article_id"].values)])

# Replace the string ids in df with integer codes; le_article is kept so codes can be mapped back.
# NOTE(review): LabelEncoder assigns codes in sorted class order, so "placeholder" is not
# necessarily code 0 even though it is listed first - confirm against how padding is encoded.
le_article = LabelEncoder()
le_article.fit(article_ids)
df["article_id"] = le_article.transform(df["article_id"])

In [None]:
# Number of weeks of purchase history collected before a target week.
WEEK_HIST_MAX = 5

def create_dataset(df, week):
    """Pair each customer's purchases in `week` with their purchases from the preceding weeks.

    Args:
        df: transactions with at least "customer_id", "article_id" and "week" columns.
        week: week offset used as the prediction target.

    Returns:
        DataFrame with one row per customer who bought something in `week`, with columns
        "customer_id", "target" (articles bought in `week`), "week", "article_id"
        (articles bought in the history window) and "week_history" (the week of each of
        those purchases). Customers without history get missing values in the last two.
    """
    # History window: strictly after `week` in offset terms, up to WEEK_HIST_MAX weeks back.
    in_window = (df["week"] <= week + WEEK_HIST_MAX) & (df["week"] > week)
    history = (
        df[in_window]
        .groupby("customer_id")
        .agg({"article_id": list, "week": list})
        .reset_index()
        .rename(columns={"week": 'week_history'})
    )

    targets = (
        df[df["week"] == week]
        .groupby("customer_id")
        .agg({"article_id": list})
        .reset_index()
        .rename(columns={"article_id": "target"})
    )
    targets["week"] = week

    # Left join keeps every target customer, including those with no history.
    return targets.merge(history, on="customer_id", how="left")

# Week offsets used as targets: the most recent week validates, the four before it train.
val_weeks = [0]
train_weeks = [1, 2, 3, 4]


# One create_dataset frame per target week, stacked into a single frame per split.
val_df = pd.concat([create_dataset(df, w) for w in val_weeks]).reset_index(drop=True)
train_df = pd.concat([create_dataset(df, w) for w in train_weeks]).reset_index(drop=True)
train_df.shape, val_df.shape

In [None]:
def adjust_lr(optimizer, epoch):
    """Set the learning rate of every parameter group according to a fixed epoch schedule.

    Schedule: 5e-5 for epoch < 1, 1e-3 for epoch < 6, 1e-4 for epoch < 9, 1e-5 afterwards.

    Args:
        optimizer: object exposing `param_groups`, a list of dicts with an 'lr' entry.
        epoch: zero-based epoch number.

    Returns:
        The learning rate that was applied.
    """
    # (exclusive upper epoch bound, learning rate), checked in order.
    schedule = ((1, 5e-5), (6, 1e-3), (9, 1e-4))
    lr = next((rate for bound, rate in schedule if epoch < bound), 1e-5)

    for group in optimizer.param_groups:
        group['lr'] = lr
    return lr
    
def get_optimizer(net):
    """Create an Adam optimizer over the parameters of `net` that require gradients.

    Args:
        net: a torch module.

    Returns:
        torch.optim.Adam configured with lr=3e-4, betas=(0.9, 0.999), eps=1e-08.
    """
    trainable = [param for param in net.parameters() if param.requires_grad]
    return torch.optim.Adam(trainable, lr=3e-4, betas=(0.9, 0.999), eps=1e-08)

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm


In [None]:
class TreeStructure(nn.Module):
    """Two-level tree of embeddings: intermediate ("top") nodes and leaf ("bottom") nodes.

    Leaf position `p` belongs to intermediate node `p // ntokens_per_class`, so every
    intermediate node owns a contiguous block of `ntokens_per_class` leaves.

    Args:
        middle_index: array of intermediate-node ids in tree order (used when not first_train).
        item_index: array of leaf ids in tree order (used when not first_train).
        layer_top_emb: tensor of intermediate-node embeddings (used when not first_train).
        layer_bottom_emb: tensor of leaf embeddings (used when not first_train).
        first_train: if True, ignore the four arguments above and start from randomly
            initialised embeddings with identity index lists.
        ntokens: number of real leaves (outputs).
        nhid: embedding dimension; must match the customer embedding dimension.
        ntokens_per_class: number of leaves under each intermediate node.
    """

    def __init__(self, middle_index, item_index, layer_top_emb, layer_bottom_emb, first_train,
                 ntokens=72582, nhid=512, ntokens_per_class=20):
        super(TreeStructure, self).__init__()

        self.first_train = first_train
        self.ntokens = ntokens
        self.nhid = nhid
        self.ntokens_per_class = ntokens_per_class

        # Number of intermediate nodes, rounded up so that every leaf has a parent.
        self.nclasses = int(np.ceil(self.ntokens * 1. / self.ntokens_per_class))
        # Leaf table size after padding to a whole number of blocks.
        self.ntokens_actual = self.nclasses * self.ntokens_per_class

        if self.first_train:
            self.layer_top_emb = nn.Parameter(
                torch.empty(self.nclasses, self.nhid, dtype=torch.float32), requires_grad=True)
            self.layer_bottom_emb = nn.Parameter(
                torch.empty(self.ntokens_actual, self.nhid, dtype=torch.float32), requires_grad=True)
            self.init_weights()
            # Identity ordering: tree position i initially holds node / leaf i.
            self.middle_index = np.arange(self.nclasses).tolist()
            self.item_index = np.arange(self.ntokens_actual).tolist()
        else:
            # Inherit ordering and embeddings from a previous clustering step.
            self.middle_index = middle_index.tolist()
            self.item_index = item_index.tolist()
            self.layer_top_emb = nn.Parameter(layer_top_emb, requires_grad=True)
            self.layer_bottom_emb = nn.Parameter(layer_bottom_emb, requires_grad=True)

    def init_weights(self):
        """Fill both embedding tables with values drawn uniformly from [-0.1, 0.1)."""
        initrange = 0.1
        self.layer_top_emb.data.uniform_(-initrange, initrange)
        self.layer_bottom_emb.data.uniform_(-initrange, initrange)

    def forward(self, purchase_hist_npos):
        """Look up positive embeddings and draw random negatives at both tree levels.

        Args:
            purchase_hist_npos: integer tensor of leaf positions, one per sample.

        Returns:
            [positive_leaf_emb, negative_leaf_emb, positive_nonleaf_emb, negative_nonleaf_emb],
            each with one embedding row per sample.
        """
        hist = purchase_hist_npos

        # Parent of a leaf is its block number.
        parent_index = (hist / self.ntokens_per_class).long()

        positive_leaf_emb = self.layer_bottom_emb[hist]
        n_samples = positive_leaf_emb.shape[0]

        # Negatives are drawn uniformly over the actual table sizes, so they stay in range
        # whatever the tree dimensions are. NumPy's global RNG is used, so np.random.seed
        # controls the draws.
        negative_leaf_sample = torch.LongTensor(
            np.random.choice(self.layer_bottom_emb.shape[0], n_samples))
        negative_leaf_emb = self.layer_bottom_emb[negative_leaf_sample]

        positive_nonleaf_emb = self.layer_top_emb[parent_index]
        negative_nonleaf_sample = torch.LongTensor(
            np.random.choice(self.layer_top_emb.shape[0], n_samples))
        negative_nonleaf_emb = self.layer_top_emb[negative_nonleaf_sample]

        return [positive_leaf_emb, negative_leaf_emb, positive_nonleaf_emb, negative_nonleaf_emb]


In [None]:
class HMModel(nn.Module):
    """Customer encoder (max-pooled article embeddings) scored against a TreeStructure.

    Args:
        article_shape: (number of articles, embedding dimension); used when first_train.
        first_train: True to start from scratch, False to resume from a previous clustering.
        middle_index, item_index, layer_top_emb, layer_bottom_emb: tree state handed to
            TreeStructure; replaced by dummy tensors when first_train is True.
        pre_emb: NumPy array of article embeddings to load when first_train is False.
    """

    def __init__(self, article_shape, first_train, middle_index, item_index, layer_top_emb, layer_bottom_emb, pre_emb):
        super().__init__()

        self.first_train = first_train
        if not self.first_train:
            # NOTE(review): Embedding.from_pretrained freezes its weights by default - confirm intended.
            self.article_emb = torch.nn.Embedding.from_pretrained(torch.from_numpy(pre_emb).float())
            self.middle_index = middle_index
            self.item_index = item_index
            self.layer_top_emb = layer_top_emb
            self.layer_bottom_emb = layer_bottom_emb
        else:
            num_articles, emb_dim = article_shape[0], article_shape[1]
            self.article_emb = torch.nn.Embedding(num_articles, embedding_dim=emb_dim)
            # Dummy values: the incoming tree state is discarded on the first training run.
            middle_index, item_index, layer_top_emb, layer_bottom_emb = (
                torch.ones(1) for _ in range(4)
            )

        self.Tree = TreeStructure(
            middle_index, item_index, layer_top_emb, layer_bottom_emb,
            first_train=self.first_train,
        )

    def forward(self, inputs):
        """Return customer embeddings (test mode) or four logits per sample (training).

        Args:
            inputs: sequence whose items 0, 1, 2 are the article history, the week history
                (not used here) and the leaf position of the last purchase.

        Returns:
            If the module-level flag `is_test` is truthy: the customer embedding, one row
            per sample. Otherwise a tensor with four columns per sample: dot products of
            the customer embedding with the positive leaf, negative leaf, positive
            intermediate and negative intermediate embeddings, in that order.
        """
        global is_test

        article_hist, _week_hist, purchase_hist_npos = inputs[0], inputs[1], inputs[2]

        # Unit-normalise each article vector, then max-pool over the sequence axis.
        item_vectors = F.normalize(self.article_emb(article_hist), dim=2)
        customer_emb, _ = item_vectors.max(axis=1)

        if is_test:
            return customer_emb

        # Two positive and two negative embeddings per sample from the tree.
        tree_nodes = self.Tree(purchase_hist_npos)
        scores = [(customer_emb * node).sum(dim=1).unsqueeze(0) for node in tree_nodes]

        return torch.cat(scores, 0).T

# Build the model for the first training run.
# The tree arguments below are dummies: HMModel replaces them when first_train is True.
middle_index = torch.ones(1)
item_index = torch.ones(1)
layer_top_emb = torch.ones(1)
layer_bottom_emb = torch.ones(1)
first_train = True
# `global` at module scope has no effect; the names below are ordinary module-level variables.
global first
global is_test
# is_test is read by HMModel.forward: False makes it return training logits.
is_test = False
first = True
article_emb = torch.ones(1)

# Embedding table: one row per encoded article class, 512 dimensions.
model = HMModel((len(le_article.classes_), 512),first_train,middle_index, item_index,layer_top_emb,layer_bottom_emb,article_emb)
model = model.cuda()

In [None]:
# Module-level state shared between k_means (writes find_index / item_index) and
# HMDataset.__getitem__ (reads all three). `global` at module scope has no effect.
global find_index
global item_index
global first_time
# While first_time is True, HMDataset uses the article code directly as the leaf position.
first_time = True
find_index = torch.ones(1)
item_index = torch.ones(1)

In [None]:
from sklearn.cluster import KMeans
def k_means(model):
    """Re-order both tree levels so that nodes in the same K-means cluster become adjacent.

    Each level's embeddings are clustered into 50 groups (fixed random_state=0) and the
    nodes are sorted by cluster label. The same ordering is applied to the id list and to
    the embedding rows, so row j of a returned embedding matrix still belongs to entry j
    of the matching returned id array.

    Side effects:
        Sets the module-level `find_index` (ordering of the intermediate level) and
        `item_index` (leaf ids in their order before re-clustering).

    Args:
        model: object exposing `Tree.layer_top_emb`, `Tree.layer_bottom_emb`,
            `Tree.middle_index`, `Tree.item_index` and `article_emb.weight`.

    Returns:
        (middle_cluster, item_cluster, top_embeddings, bottom_embeddings, article_emb):
        re-ordered id arrays, re-ordered embedding tensors (CPU) and the article embedding
        matrix as a NumPy array.
    """
    global find_index, item_index

    # Current embeddings as NumPy arrays (converted once and reused below).
    top_emb = model.Tree.layer_top_emb.detach().cpu().numpy()
    bottom_emb = model.Tree.layer_bottom_emb.detach().cpu().numpy()
    article_emb = model.article_emb.weight.detach().cpu().numpy()

    # Ids currently stored at each tree position.
    middle_index = np.array(model.Tree.middle_index)
    item_index = np.array(model.Tree.item_index)

    kmeans1 = KMeans(n_clusters=50, random_state=0).fit(top_emb)
    find_index = np.argsort(kmeans1.labels_)
    middle_cluster = middle_index[find_index]
    print('middle nodes clustering by K-means finished.')

    kmeans2 = KMeans(n_clusters=50, random_state=0).fit(bottom_emb)
    leaf_order = np.argsort(kmeans2.labels_)
    item_cluster = item_index[leaf_order]
    print('leaf nodes clustering by K-means finished.')

    # Rows are permuted by position (the argsort result), not by node id: the two only
    # coincide while the id lists are still the identity ordering.
    suc_intermediate_emb = torch.from_numpy(top_emb[find_index])
    suc_bottom_emb = torch.from_numpy(bottom_emb[leaf_order])

    return middle_cluster, item_cluster, suc_intermediate_emb, suc_bottom_emb, article_emb
    

In [None]:
class HMDataset(Dataset):
    """Per-customer samples: padded purchase history plus the leaf position of the last purchase.

    Args:
        df: frame with "article_id" (list of article codes or missing), "week_history"
            (list of weeks) and "week" columns.
        seq_len: fixed length of the returned history tensors.
        model: model owning the tree; stored but not read when building a sample.
        is_test: stored flag; sample construction does not depend on it.
    """

    def __init__(self, df, seq_len, model,is_test=False):
        self.df = df.reset_index(drop=True)
        self.seq_len = seq_len
        self.is_test = is_test
        self.model = model

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        """Build one sample.

        Returns:
            (article_hist, week_hist, purchase_hist_npos, target) where
            article_hist is a long tensor of length seq_len (left-padded with 0),
            week_hist is a float tensor of length seq_len (left-padded with 1),
            purchase_hist_npos is a scalar tensor with the tree position of the last
            history entry, and target is the fixed label vector [1, 0, 1, 0] matching the
            (positive, negative, positive, negative) logits produced by the model.
        """
        global first_time, find_index, item_index

        row = self.df.iloc[index]

        # Defaults used when the customer has no history (article_id is not a list).
        article_hist = torch.zeros(self.seq_len).long()
        week_hist = torch.ones(self.seq_len).float()

        if isinstance(row.article_id, list):
            if len(row.article_id) >= self.seq_len:
                # Keep only the last seq_len purchases.
                article_hist = torch.LongTensor(row.article_id[-self.seq_len:])
                week_hist = (torch.LongTensor(row.week_history[-self.seq_len:]) - row.week)/WEEK_HIST_MAX/2
            else:
                # Right-align the shorter history; the left part keeps the padding values.
                article_hist[-len(row.article_id):] = torch.LongTensor(row.article_id)
                week_hist[-len(row.article_id):] = (torch.LongTensor(row.week_history) - row.week)/WEEK_HIST_MAX/2

        # Labels are the same for every sample: columns 0 and 2 are positives.
        target = torch.tensor([1,0,1,0]).float()

        # The last history entry is the item whose tree position is looked up.
        purchase_hist_item = article_hist[-1].numpy()

        if first_time:
            # Identity tree ordering: the article code is its own leaf position.
            purchase_hist_npos = torch.tensor(purchase_hist_item)
        else:
            # item_index may be a list or a NumPy array (k_means stores an array), so the
            # lookup must not rely on list.index.
            # NOTE(review): find_index is produced from the intermediate-level clustering
            # in k_means - confirm it is the intended mapping for leaf positions.
            position = np.where(np.asarray(item_index) == purchase_hist_item)[0][0]
            purchase_hist_npos = torch.tensor(find_index[position])

        return article_hist, week_hist, purchase_hist_npos, target
    
# Smoke check: build and display a single sample (row 100 of val_df, sequence length 64).
HMDataset(val_df, 64,model)[100]

In [None]:
import sys

def calc_map(topk_preds, target_array, k=12):
    """Average precision at k for a single ranked prediction list.

    Args:
        topk_preds: ranked sequence of predicted item indices (best first); only the first
            `k` entries are scored.
        target_array: array indexed by item; a truthy entry marks a relevant item.
        k: cut-off rank.

    Returns:
        Sum of precision@rank over the ranks that hit a relevant item, divided by
        min(k, number of relevant items). 0.0 when there are no relevant items.
    """
    n_relevant = target_array.sum()
    if n_relevant <= 0:
        # Nothing to find: define the score as 0 instead of dividing by zero.
        return 0.0

    hits = 0
    precision_sum = 0.0
    for rank, pred in enumerate(topk_preds[:k], start=1):
        if target_array[pred]:
            hits += 1
            precision_sum += hits / rank

    return float(precision_sum / min(k, n_relevant))

def read_data(data, device=None):
    """Split a batch into (inputs, target) and move every tensor to one device.

    Args:
        data: sequence of tensors; the last one is the target, the rest are model inputs.
        device: destination device. Defaults to "cuda" when CUDA is available, else "cpu".

    Returns:
        (tuple of input tensors, target tensor), all on `device`.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    return tuple(d.to(device) for d in data[:-1]), data[-1].to(device)


def validate(model, val_loader, k=12):
    """Mean of calc_map over every sample served by `val_loader`.

    For each batch the model output is reduced to its top-`k` column indices per row, and
    each row is scored against the matching target row.

    NOTE(review): calc_map is called with its own default k, not the `k` passed here, and
    torch.topk needs at least `k` columns in the model output - confirm both against the
    model in use.

    Args:
        model: module called as model(inputs).
        val_loader: iterable of batches accepted by read_data.
        k: number of top-scoring columns kept per row.

    Returns:
        np.mean of the per-sample scores.
    """
    model.eval()

    progress = tqdm(val_loader, file=sys.stdout)
    scores = []

    with torch.no_grad():
        for batch in progress:
            inputs, target = read_data(batch)

            _, top_idx = torch.topk(model(inputs), k, dim=1)
            top_idx = top_idx.detach().cpu().numpy()
            target_np = target.detach().cpu().numpy()

            scores.extend(
                calc_map(top_idx[i], target_np[i]) for i in range(top_idx.shape[0])
            )

    return np.mean(scores)

# History length fed to the model.
SEQ_LEN = 16

# Batch size and number of DataLoader worker processes.
BS = 256
NW = 8

# Validation batches are served in a fixed order and the last partial batch is kept.
val_dataset = HMDataset(val_df, SEQ_LEN,model)
val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)

### Train and validate

In [None]:
def dice_loss(y_pred, y_true):
    """Overlap-based loss between sigmoid(y_pred) and y_true, averaged over rows.

    Per row: overlap / (overlap + sum(y_true) + sum(probabilities)), where overlap is
    sum(y_true * probabilities). The loss is 1 minus the mean of that ratio.

    Args:
        y_pred: logits, with samples along axis 0.
        y_true: targets with the same shape as y_pred.

    Returns:
        Scalar tensor.
    """
    probs = y_pred.sigmoid()
    overlap = (probs * y_true).sum(axis=1)
    denominator = overlap + y_true.sum(axis=1) + probs.sum(axis=1)

    return 1 - (overlap / denominator).mean()


def train(model, train_loader, val_loader, epochs):
    """Train `model` with BCE-with-logits loss under CUDA mixed precision.

    NumPy's global RNG is re-seeded with SEED at the start. Each epoch the learning rate
    comes from adjust_lr and the running mean loss (x100, rounded to 4 decimals) is shown
    in the progress bar.

    Args:
        model: module called as model(inputs) and returning logits shaped like the target.
        train_loader: iterable of batches accepted by read_data.
        val_loader: accepted for interface compatibility; not used by this function.
        epochs: number of passes over train_loader.

    Returns:
        The trained model (the same object that was passed in).
    """
    np.random.seed(SEED)

    optimizer = get_optimizer(model)
    scaler = torch.cuda.amp.GradScaler()
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs):
        model.train()
        progress = tqdm(train_loader, file=sys.stdout)

        lr = adjust_lr(optimizer, epoch)
        step_losses = []

        for batch in progress:
            inputs, target = read_data(batch)

            optimizer.zero_grad()

            with torch.cuda.amp.autocast():
                loss = criterion(model(inputs), target.float())

            # Scaled backward pass and optimizer step for mixed precision.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            step_losses.append(loss.detach().cpu().item())
            avg_loss = np.round(100*np.mean(step_losses), 4)

            progress.set_description(f"Epoch {epoch+1} Loss: {avg_loss} lr: {lr}")

    return model


# Experiment name and the seed that train() passes to np.random.seed.
MODEL_NAME = "exp001"
SEED = 0

# Training batches are shuffled and the last partial batch is dropped.
train_dataset = HMDataset(train_df, SEQ_LEN,model)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=NW,
                          pin_memory=False, drop_last=True)

# First training run, starting from the randomly initialised tree.
model = train(model, train_loader, val_loader, epochs=5) 

In [None]:
# After the first training run, rebuild the interest tree and retrain (possibly several times).
# `global` at module scope has no effect; `first` is an ordinary module-level variable.
global first
first = False
def train_tree(model,train_loader,val_loader,epochs):
    """Re-cluster the tree of `model`, build a new model from the result and train it.

    Args:
        model: trained model whose tree and article embeddings seed the new model.
        train_loader, val_loader, epochs: forwarded unchanged to train().

    Returns:
        The newly built and trained model.
    """
    middle_index, item_index, layer_top_emb, layer_bottom_emb, article_emb = k_means(model)

    # first_train=False makes the new model inherit the clustered tree and the embeddings.
    rebuilt = HMModel(
        (len(le_article.classes_), 512),
        False,
        middle_index,
        item_index,
        layer_top_emb,
        layer_bottom_emb,
        article_emb,
    )
    rebuilt = rebuilt.cuda()
    return train(rebuilt, train_loader, val_loader, epochs)

# Training epochs per reconstruction round, and number of reconstruction rounds.
epochs = 5
train_tree_epochs = 1
for _ in range(train_tree_epochs):
    # The dataset is built with the model from before this round's reconstruction.
    train_dataset = HMDataset(train_df, SEQ_LEN,model)
    train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=NW,
                          pin_memory=False, drop_last=True)
    model = train_tree(model,train_loader,val_loader,epochs)


# Inference

In [None]:
# Load the sample submission and drop its "prediction" column; the remaining columns are kept.
test_df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv').drop("prediction", axis=1)
print(test_df.shape)
test_df.head()

In [None]:
def create_test_dataset(test_df):
    """Attach each test customer's most recent purchase history.

    The target week is -1, so the history window covers week offsets 0 up to
    WEEK_HIST_MAX - 1 of the module-level transaction frame `df`.

    Side effects:
        Adds a "week" column (value -1) to the frame that is passed in.

    Args:
        test_df: frame with a "customer_id" column.

    Returns:
        test_df left-joined with "article_id" and "week_history" list columns; customers
        without purchases in the window get missing values there.
    """
    week = -1
    test_df["week"] = week

    in_window = (df["week"] <= week + WEEK_HIST_MAX) & (df["week"] > week)
    history = (
        df[in_window]
        .groupby("customer_id")
        .agg({"article_id": list, "week": list})
        .reset_index()
        .rename(columns={"week": 'week_history'})
    )

    return test_df.merge(history, on="customer_id", how="left")

# Replace test_df with the version that carries each customer's history columns.
test_df = create_test_dataset(test_df)
test_df.head()

In [None]:
# Fraction of test customers with no purchase history in the window.
test_df["article_id"].isnull().mean()

In [None]:
# Switch HMModel.forward to return customer embeddings instead of training logits.
# `global` at module scope has no effect; the assignment alone updates the flag.
global is_test
is_test = True

In [None]:
# Test batches keep the row order of test_df (no shuffle) and keep the last partial batch.
test_ds = HMDataset(test_df, SEQ_LEN, model,is_test=True)
test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)


def inference(model, loader, k=12):
    """Predict `k` articles per customer by a two-step search through the tree.

    For every customer the `k` best-scoring intermediate nodes are selected, and from each
    of those the single best-scoring leaf among the node's children is taken. Leaf
    positions are translated to article codes through the tree's item index and then to
    article ids through `le_article`.

    The model must return customer embeddings when called, i.e. the module-level flag
    `is_test` must be truthy before calling this function.

    Args:
        model: trained model exposing `Tree` (embeddings, item_index, ntokens,
            ntokens_per_class).
        loader: iterable of batches accepted by read_data.
        k: number of intermediate nodes searched, and therefore predictions per customer.

    Returns:
        List with one space-separated string of article ids per customer, in loader order.
    """
    model.eval()

    tree = model.Tree
    fanout = tree.ntokens_per_class

    # Leaf position -> article code. Padding leaves (codes >= ntokens) fall back to code 0.
    # Built once: it does not change between batches.
    leaf_to_article = np.array(tree.item_index)
    leaf_to_article = np.where(leaf_to_article >= tree.ntokens, 0, leaf_to_article)

    preds = []

    with torch.no_grad():
        top_emb = tree.layer_top_emb
        # View the leaf table as (intermediate node, child slot, embedding).
        children_emb = tree.layer_bottom_emb.reshape(-1, fanout, tree.layer_bottom_emb.shape[1])

        for data in tqdm(loader, file=sys.stdout):
            inputs, _ = read_data(data)

            customer_emb = model(inputs)

            # Step 1: best k intermediate nodes per customer.
            top_scores = torch.matmul(customer_emb, top_emb.T)
            _, parents = torch.topk(top_scores, k, dim=1)

            # Step 2: best child under each selected node, for the whole batch at once.
            candidates = children_emb[parents]  # (batch, k, fanout, embedding)
            child_scores = torch.einsum("bd,bkfd->bkf", customer_emb, candidates)
            best_child = child_scores.argmax(dim=2)

            leaf_pos = (parents * fanout + best_child).detach().cpu().numpy()
            article_codes = leaf_to_article[leaf_pos]

            for row in article_codes:
                preds.append(" ".join(list(le_article.inverse_transform(row))))

    return preds


# One prediction string per row of test_df; relies on test_loader serving rows in order.
test_df["prediction"] = inference(model, test_loader)

In [None]:
# Write only the customer id and prediction columns, without the index.
test_df.to_csv("submission.csv", index=False, columns=["customer_id", "prediction"])