In [1]:
import numpy as np
import pandas as pd


# Load the transaction log. article_id is read as a string instead of being parsed as a number.
df = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv", dtype={"article_id": str})
print(df.shape)
df.head()

In [2]:
# Parse the purchase dates into datetimes (needed for the date arithmetic below)
# and display the latest transaction date.
df["t_dat"] = pd.to_datetime(df["t_dat"])
df["t_dat"].max()

In [3]:
# Last purchase date of every article; keep only articles bought on or after 2019-09-01.
active_articles = df.groupby("article_id")["t_dat"].max().reset_index()
# drop=True: the old row labels of the filtered frame carry no information, and keeping
# them would add a stray "index" column to active_articles.
active_articles = active_articles[active_articles["t_dat"] >= "2019-09-01"].reset_index(drop=True)
active_articles.shape

In [5]:
# Keep only the transactions whose article is in active_articles.
df = df[df["article_id"].isin(active_articles["article_id"])].reset_index(drop=True)
df.shape

In [5]:
# Week index counted backwards from the latest transaction date:
# week 0 covers the most recent 7 days, week 1 the 7 days before that, and so on.
df["week"] = (df["t_dat"].max() - df["t_dat"]).dt.days // 7
df["week"].value_counts()

In [6]:
from sklearn.preprocessing import LabelEncoder


# Encode article IDs as integer indices. An extra "placeholder" class is added, so the
# encoder has one more class than there are distinct article IDs in df.
# NOTE(review): LabelEncoder orders its classes by sorting, so "placeholder" presumably
# does not receive index 0 (digit strings sort before letters), while the datasets below
# pad histories with index 0 — confirm the padding index is what was intended.
article_ids = np.concatenate([["placeholder"], np.unique(df["article_id"].values)])

le_article = LabelEncoder()
le_article.fit(article_ids)
df["article_id"] = le_article.transform(df["article_id"])

In [7]:
# Inspect the fitted encoder classes (the original article ID strings plus "placeholder").
le_article.classes_

In [7]:
# Number of distinct encoded article indices that actually occur in df.
len(np.unique(df["article_id"].values))


In [8]:
# Number of encoder classes; this is the article dimension used for the models below.
print(len(le_article.classes_))

In [None]:
# Number of weeks of purchase history (before the target week) given to the model.
WEEK_HIST_MAX = 5


def create_dataset(df, week):
    """Build one row per customer who purchased in ``week``.

    Args:
        df: transactions with ``customer_id``, ``article_id`` and ``week`` columns.
        week: week index whose purchases become the prediction target.

    Returns:
        DataFrame with columns ``customer_id``, ``target`` (articles bought in
        ``week``), ``week``, ``article_id`` and ``week_history`` (articles and
        week indices from the ``WEEK_HIST_MAX`` weeks before ``week``). Customers
        without purchases in that window get NaN in the two history columns.
    """
    # History window: strictly older than the target week, at most WEEK_HIST_MAX weeks back.
    in_window = df["week"].gt(week) & df["week"].le(week + WEEK_HIST_MAX)
    history = (
        df.loc[in_window]
        .groupby("customer_id")
        .agg({"article_id": list, "week": list})
        .reset_index()
        .rename(columns={"week": "week_history"})
    )

    targets = (
        df.loc[df["week"].eq(week)]
        .groupby("customer_id")
        .agg({"article_id": list})
        .reset_index()
        .rename(columns={"article_id": "target"})
        .assign(week=week)
    )

    # Left join: keep every target customer, with or without history.
    return targets.merge(history, on="customer_id", how="left")

# Week 0 (the most recent week) is held out for validation; weeks 1-4 are used for training.
val_weeks = [0]
train_weeks = [1, 2, 3, 4]


val_df = pd.concat([create_dataset(df, w) for w in val_weeks]).reset_index(drop=True)
train_df = pd.concat([create_dataset(df, w) for w in train_weeks]).reset_index(drop=True)
# Only the last expression of a cell is displayed, so the next two lines produce no output.
train_df.shape, val_df.shape
train_df.head()
val_df.head()

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

class HMDataset(Dataset):
    """Per-customer samples for the retrieval model.

    Each item is ``(article_hist, week_hist, target)``:
      * ``article_hist``: the last ``seq_len`` purchased article indices,
        left-padded with 0.
      * ``week_hist``: for each history item, ``(week_history - week) /
        WEEK_HIST_MAX / 2``; padded positions hold 1.
      * ``target``: multi-hot vector over ``len(article_ids)`` entries, or a
        2-element dummy tensor when ``is_test`` is set.
    """

    def __init__(self, df, seq_len, is_test=False):
        self.df = df.reset_index(drop=True)
        self.seq_len = seq_len
        self.is_test = is_test

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        row = self.df.iloc[index]

        if self.is_test:
            # No labels at inference time; a small dummy keeps the tuple shape.
            target = torch.zeros(2).float()
        else:
            target = torch.zeros(len(article_ids)).float()
            for article in row.target:
                target[article] = 1.0

        # Defaults for customers without any purchase history.
        article_hist = torch.zeros(self.seq_len).long()
        week_hist = torch.ones(self.seq_len).float()

        history = row.article_id
        if not isinstance(history, list):
            # Missing history (NaN from the left merge): return the padded defaults.
            return article_hist, week_hist, target

        recent_articles = torch.LongTensor(history[-self.seq_len:])
        recent_weeks = (torch.LongTensor(row.week_history[-self.seq_len:]) - row.week)/WEEK_HIST_MAX/2

        if len(history) >= self.seq_len:
            article_hist, week_hist = recent_articles, recent_weeks
        else:
            # Short history: right-align it and leave the padding on the left.
            article_hist[-len(history):] = recent_articles
            week_hist[-len(history):] = recent_weeks

        return article_hist, week_hist, target
    
# Smoke test: fetch one validation sample using a history length of 64.
HMDataset(val_df,64)[1]

In [None]:
def adjust_lr(optimizer, epoch):
    """Set the learning rate for ``epoch`` on every parameter group and return it.

    Schedule: 5e-5 for epoch 0 (warm-up), 1e-3 for epochs 1-5, 1e-4 for
    epochs 6-8, and 1e-5 afterwards.
    """
    # (exclusive upper epoch bound, learning rate), checked in order.
    schedule = ((1, 5e-5), (6, 1e-3), (9, 1e-4))
    lr = next((rate for bound, rate in schedule if epoch < bound), 1e-5)

    for group in optimizer.param_groups:
        group['lr'] = lr
    return lr
    
def get_optimizer(net):
    """Return an Adam optimizer over the parameters of ``net`` that require gradients."""
    trainable = (param for param in net.parameters() if param.requires_grad)
    return torch.optim.Adam(trainable, lr=3e-4, betas=(0.9, 0.999), eps=1e-08)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class HMModel(nn.Module):
    """Retrieval model scoring every article against a customer's purchase history.

    For each article, three features are computed and mixed by a small stack of
    1x1 convolutions:
      1. the logit of the highest cosine similarity between the article and any
         item in the history,
      2. the week feature of the history item that produced that maximum,
      3. a learned per-article bias (``article_likelihood``).

    Args:
        article_shape: ``(number_of_articles, embedding_dim)``.
    """

    def __init__(self, article_shape):
        super(HMModel, self).__init__()

        self.article_emb = nn.Embedding(article_shape[0], embedding_dim=article_shape[1])

        self.article_likelihood = nn.Parameter(torch.zeros(article_shape[0]), requires_grad=True)
        self.top = nn.Sequential(nn.Conv1d(3, 8, kernel_size=1), nn.LeakyReLU(),
                                 nn.Conv1d(8, 3, kernel_size=1), nn.LeakyReLU(),
                                 nn.Conv1d(3, 1, kernel_size=1), nn.LeakyReLU(),)

    def forward(self, inputs):
        """Return one score per article.

        Args:
            inputs: sequence whose first two items are ``article_hist``
                ([batch, seq_len] article indices) and ``week_hist``
                ([batch, seq_len] week features).

        Returns:
            Tensor of shape [batch, n_articles].
        """
        article_hist, week_hist = inputs[0], inputs[1]

        # [batch, seq_len, emb_dim], unit-normalised so the product below is a cosine.
        x = self.article_emb(article_hist)
        x = F.normalize(x, dim=2)

        # Cosine similarity of every history item with every article:
        # [batch, seq_len, n_articles].
        x = x @ F.normalize(self.article_emb.weight).T

        # Best-matching history item per article: values and positions are [batch, n_articles].
        x, indices = x.max(axis=1)

        # Treat the similarity as a probability and convert it to a logit;
        # the clamp keeps the log finite.
        x = x.clamp(1e-3, 0.999)
        x = -torch.log(1/x - 1)

        # Week feature of the best-matching history item for each article.
        # A single gather on the 2-D week tensor yields the same values as
        # broadcasting both operands to [batch, seq_len, n_articles] and averaging
        # identical copies, without allocating those large temporaries.
        max_week = week_hist.gather(1, indices).unsqueeze(1)

        # Learned per-article bias, broadcast over the batch: [batch, 1, n_articles].
        likelihood = self.article_likelihood[None, None, :].expand(x.shape[0], -1, -1)

        # Stack the three features as channels: [batch, 3, n_articles].
        x = torch.cat([x.unsqueeze(1), max_week, likelihood], axis=1)

        # 1x1 convolutions mix the channels independently for every article.
        x = self.top(x).squeeze(1)
        return x
    
#         [[-0.1248,  0.2905,  0.1383,  ...,  0.0856,  0.2905,  0.1722],
#         [-0.1248, -0.0590,  0.0184,  ...,  0.0856, -0.0077, -0.0311],
#         [-0.1248, -0.0867, -0.0757,  ..., -0.0688, -0.0009, -0.0469],
#         ...,
#         [-0.1248,  0.2905,  0.1383,  ...,  0.0856,  0.2905,  0.1722],
#         [-0.1248,  0.2905,  0.1383,  ...,  0.0856,  0.2905,  0.1722],
#         [-0.1248,  0.2905,  0.1383,  ...,  0.0856,  0.2905,  0.1722]]
    
# Build the retrieval model with one 512-dimensional embedding per encoder class
# and move it to the GPU.
model = HMModel((len(le_article.classes_), 512))
model = model.cuda()
print(len(le_article.classes_))

In [None]:
import sys

def calc_map(topk_preds, target_array, k=12):
    """Average precision at ``k`` for one customer.

    Args:
        topk_preds: predicted article indices, best first. Only the first ``k``
            are scored.
        target_array: array indexed by article index; truthy entries mark the
            articles that were actually purchased.
        k: cut-off rank.

    Returns:
        Sum of precision@rank over the ranks that are hits, divided by
        ``min(k, number of purchased articles)``. Returns 0.0 when nothing was
        purchased, instead of dividing by zero.
    """
    n_relevant = float(target_array.sum())
    if n_relevant == 0:
        return 0.0

    hits = 0
    precision_sum = 0.0
    for rank, pred in enumerate(topk_preds[:k], start=1):
        if target_array[pred]:
            hits += 1
            # Precision at this rank = hits so far / predictions seen so far.
            precision_sum += hits / rank

    return precision_sum / min(k, n_relevant)

def read_data1(data):
    """Move a batch to the GPU and split it into ``(inputs, target)``.

    The last element of ``data`` is the target; everything before it is
    returned as a tuple of model inputs.
    """
    inputs = tuple(map(lambda tensor: tensor.cuda(), data[:-1]))
    target = data[-1].cuda()
    return inputs, target


def validate(model, val_loader, k=12):
    """Return the mean of ``calc_map`` over every sample in ``val_loader``.

    The model is switched to eval mode and scored without gradients. For each
    sample the ``k`` highest-scoring articles are compared with its multi-hot
    target.
    """
    model.eval()

    progress = tqdm(val_loader, file=sys.stdout)
    maps = []

    with torch.no_grad():
        for batch in progress:
            inputs, target = read_data1(batch)
            logits = model(inputs)  # [batch, n_articles]

            # Indices of the k best-scoring articles per row: [batch, k].
            top_idx = torch.topk(logits, k, dim=1).indices.detach().cpu().numpy()
            target_np = target.detach().cpu().numpy()  # [batch, n_articles]

            maps.extend(calc_map(row_pred, row_target)
                        for row_pred, row_target in zip(top_idx, target_np))

    return np.mean(maps)
# History length fed to the model, batch size, and number of DataLoader workers.
SEQ_LEN = 16

BS = 256
NW = 8

# Validation loader: fixed order, and the last partial batch is kept.
val_dataset = HMDataset(val_df, SEQ_LEN)
val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)

### Train and validate

In [None]:
def dice_loss(y_pred, y_true):
    """Dice-style overlap loss between sigmoid(``y_pred``) and ``y_true``.

    Both inputs are [batch, n]. For each row the overlap is
    ``sum(y_true * p)`` and the loss is one minus the batch mean of
    ``overlap / (overlap + sum(y_true) + sum(p))``.
    """
    probs = torch.sigmoid(y_pred)
    overlap = torch.sum(y_true * probs, dim=1)
    denom = overlap + torch.sum(y_true, dim=1) + torch.sum(probs, dim=1)
    return 1 - torch.mean(overlap / denom)


def train(model, train_loader, val_loader, epochs):
    """Train the retrieval model with mixed precision and validate after each epoch.

    Args:
        model: retrieval model taking ``(article_hist, week_hist)``.
        train_loader: loader yielding ``(article_hist, week_hist, target)`` batches.
        val_loader: loader passed to ``validate`` after every epoch.
        epochs: number of passes over ``train_loader``.

    Returns:
        The trained model (the same object that was passed in).
    """
    # Seed both NumPy and PyTorch so that runs with the same SEED are repeatable;
    # seeding NumPy alone leaves PyTorch's random state unseeded.
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    optimizer = get_optimizer(model)
    scaler = torch.cuda.amp.GradScaler()

    criterion = torch.nn.BCEWithLogitsLoss()

    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)

        lr = adjust_lr(optimizer, e)

        loss_list = []
        # Defined up front so the epoch log still works if the loader yields no batch.
        avg_loss = float("nan")
        for data in tbar:
            inputs, target = read_data1(data)

            optimizer.zero_grad()

            with torch.cuda.amp.autocast():
                logits = model(inputs)  # [batch, n_articles]
                # Element-wise BCE plus a per-row overlap term.
                loss = criterion(logits, target) + dice_loss(logits, target)

            # Scale the loss before backward and step through the scaler, as required
            # for mixed-precision training.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            loss_list.append(loss.detach().cpu().item())

            # Running mean of the epoch's batch losses, shown multiplied by 100.
            avg_loss = np.round(100*np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e+1} Loss: {avg_loss} lr: {lr}")

        val_map = validate(model, val_loader)

        log_text = f"Epoch {e+1}\nTrain Loss: {avg_loss}\nValidation MAP: {val_map}\n"

        print(log_text)

    return model


# Experiment name and the random seed that train() reads as a global.
MODEL_NAME = "exp001"
SEED = 0

# Training loader: shuffled, and the last partial batch is dropped.
train_dataset = HMDataset(train_df, SEQ_LEN)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=NW,
                          pin_memory=False, drop_last=True)

model = train(model, train_loader, val_loader, epochs=10)

**Train the retrieval model again with the most recent data and use this data for ranking**

In [None]:
# Keep only the most recent training week(s) (week < 2). .copy() makes this an independent
# frame, so adding the "candidates" column to it later does not trigger pandas'
# SettingWithCopyWarning.
train_df = train_df[train_df["week"] < 2].copy()
train_dataset = HMDataset(train_df, SEQ_LEN)
# No shuffling and no dropped batch: this loader is also used to generate candidates,
# which are assigned back to train_df row by row, so the order must match.
train_loader = DataLoader(train_dataset, batch_size=BS, num_workers=NW,
                          pin_memory=False)

model = train(model, train_loader, val_loader, epochs=10)

In [None]:
# Only the printed shape is shown: head() is not the last expression of the cell.
train_df.head()
print(train_df.shape)

**To train the ranking model, first generate candidates with the retrieval model**

In [None]:
def generate_candidates(model, loader, k=500):
    """Return the ``k`` best-scoring article indices for every sample in ``loader``.

    Args:
        model: trained retrieval model.
        loader: loader yielding ``(article_hist, week_hist, target)`` batches. It
            must not shuffle if the result is to be aligned with the source rows.
        k: number of candidates kept per sample.

    Returns:
        List with one list of ``k`` article indices (plain ints, best first) per
        sample, in loader order.
    """
    # Inference only, so run the model in eval mode rather than train mode.
    model.eval()

    tbar = tqdm(loader, file=sys.stdout)

    candidates = []

    with torch.no_grad():
        for data in tbar:
            # Targets are not needed to pick candidates, so only the inputs are
            # moved to the GPU.
            inputs = tuple(d.cuda() for d in data[:-1])

            logits = model(inputs)

            _, indices = torch.topk(logits, k, dim=1)

            candidates.extend(indices.detach().cpu().tolist())

    return candidates
# Retrieve candidates (default k=500) for every row served by train_loader.
candidates = generate_candidates(model,train_loader)

In [None]:
# Attach the candidate lists to the training rows; the assignment is positional, so
# candidates must be in the same order as the rows of train_df.
train_df['candidates'] = candidates
train_df.head()

**Transform into standard data to feed the ranking model**

In [None]:
class HMRankDataset(Dataset):
    """Per-customer samples for the ranking model.

    Each item is ``(article_hist, candidates, target_candidates, candidates)``:
      * ``article_hist``: the last ``seq_len`` purchased article indices,
        left-padded with 0.
      * ``candidates``: the row's candidate article indices (returned twice).
      * ``target_candidates``: 1.0 at the positions of candidates that appear
        in the row's ``target``; a 2-element dummy tensor when ``is_test`` is set.
    """

    def __init__(self, df, seq_len, is_test=False):
        self.df = df.reset_index(drop=True)
        self.seq_len = seq_len
        self.is_test = is_test

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        row = self.df.iloc[index]
        candidates = row.candidates

        if self.is_test:
            # No labels at inference time; a small dummy keeps the tuple shape.
            target_candidates = torch.zeros(2).float()
        else:
            target_candidates = torch.zeros(len(candidates)).float()
            for article in row.target:
                if article in candidates:
                    target_candidates[candidates.index(article)] = 1.0

        article_hist = torch.zeros(self.seq_len).long()
        history = row.article_id
        if isinstance(history, list):
            recent = torch.LongTensor(history[-self.seq_len:])
            if len(history) >= self.seq_len:
                article_hist = recent
            else:
                # Short history: right-align it and leave the padding on the left.
                article_hist[-len(history):] = recent

        return article_hist, torch.LongTensor(candidates), target_candidates, torch.LongTensor(candidates)

    
# Smoke test: print one ranking sample using a history length of 8.
print(HMRankDataset(train_df,8)[1])

**Train to rank**

In [None]:
class HMRankModel(nn.Module):
    """Ranking model that re-scores a customer's retrieved candidates.

    The customer is represented by the mean embedding of the purchase history;
    each candidate is scored by its dot product with that vector, and the scores
    are passed through a small stack of 1x1 convolutions.

    Args:
        article_shape: ``(number_of_articles, embedding_dim)``.
    """

    def __init__(self, article_shape):
        super(HMRankModel, self).__init__()

        self.article_emb = nn.Embedding(article_shape[0], embedding_dim=article_shape[1])

        self.top = nn.Sequential(nn.Conv1d(1, 8, kernel_size=1), nn.LeakyReLU(),
                                 nn.Conv1d(8, 3, kernel_size=1), nn.LeakyReLU(),
                                 nn.Conv1d(3, 1, kernel_size=1), nn.LeakyReLU())

    def forward(self, inputs):
        """Return one score per candidate.

        Args:
            inputs: sequence whose first two items are ``article_hist``
                ([batch, seq_len] article indices) and ``candidates``
                ([batch, n_candidates] article indices).

        Returns:
            Tensor of shape [batch, n_candidates].
        """
        article_hist, candidates = inputs[0], inputs[1]

        x = self.article_emb(article_hist)  # [batch, seq_len, emb_dim]
        y = self.article_emb(candidates)    # [batch, n_candidates, emb_dim]
        customer_emb = x.mean(axis=1)       # [batch, emb_dim]

        # Dot product of every candidate with the customer vector. unsqueeze(1)
        # broadcasts over the candidates for any embedding size, not just 512.
        score = (y * customer_emb.unsqueeze(1)).sum(2)  # [batch, n_candidates]

        score = F.normalize(score, dim=1)    # shrink each row to unit L2 norm
        score = score.clamp(1e-4, 0.9999)    # negatives collapse to 1e-4; keeps the log finite
        score = -torch.log(1/score - 1)      # logit of the clamped score

        score = torch.unsqueeze(score, 1)    # add a channel dim for Conv1d: [batch, 1, n_candidates]
        score = self.top(score)
        score = torch.squeeze(score, 1)      # [batch, n_candidates]

        return score
    
# Build the ranking model with one 512-dimensional embedding per encoder class
# and move it to the GPU.
Rankmodel = HMRankModel((len(le_article.classes_), 512))
Rankmodel = Rankmodel.cuda()

In [None]:
def adjust_lr(optimizer, epoch):
    """Apply the step learning-rate schedule for ``epoch`` and return the rate.

    Epoch 0 uses 5e-5, epochs 1-5 use 1e-3, epochs 6-8 use 1e-4, and every
    later epoch uses 1e-5. The rate is written to all parameter groups.
    """
    # Start from the final rate and walk the bounds from latest to earliest;
    # the last bound the epoch falls under wins.
    lr = 1e-5
    for upper_bound, rate in ((9, 1e-4), (6, 1e-3), (1, 5e-5)):
        if epoch < upper_bound:
            lr = rate

    for group in optimizer.param_groups:
        group['lr'] = lr
    return lr
    
def get_optimizer(net):
    """Build an Adam optimizer for the parameters of ``net`` that require gradients."""
    def _is_trainable(param):
        return param.requires_grad

    adam_kwargs = dict(lr=3e-4, betas=(0.9, 0.999), eps=1e-08)
    return torch.optim.Adam(filter(_is_trainable, net.parameters()), **adam_kwargs)

In [None]:
def read_data2(data):
    """Move a ranking batch to the GPU and split it into ``(inputs, target, candidates)``.

    The last two elements of ``data`` are the target and the candidates;
    everything before them is returned as a tuple of model inputs.
    """
    inputs = tuple([tensor.cuda() for tensor in data[:-2]])
    target = data[-2].cuda()
    candidates = data[-1].cuda()
    return inputs, target, candidates


def Ranktrain(model, train_loader, epochs):
    """Train the ranking model with mixed precision.

    Args:
        model: ranking model taking ``(article_hist, candidates)``.
        train_loader: loader yielding
            ``(article_hist, candidates, target_candidates, candidates)`` batches.
        epochs: number of passes over ``train_loader``.

    Returns:
        The trained model (the same object that was passed in).
    """
    # Seed both NumPy and PyTorch so that runs with the same SEED are repeatable;
    # seeding NumPy alone leaves PyTorch's random state unseeded.
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    optimizer = get_optimizer(model)
    scaler = torch.cuda.amp.GradScaler()

    criterion = torch.nn.BCEWithLogitsLoss()

    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)

        lr = adjust_lr(optimizer, e)

        loss_list = []
        # Defined up front so the epoch log still works if the loader yields no batch.
        avg_loss = float("nan")
        for data in tbar:
            # The trailing copy of the candidates is only needed at inference time.
            inputs, target, _ = read_data2(data)

            optimizer.zero_grad()

            with torch.cuda.amp.autocast():
                logits = model(inputs)  # [batch, n_candidates]
                loss = criterion(logits, target)

            # Scale the loss before backward and step through the scaler, as required
            # for mixed-precision training.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            loss_list.append(loss.detach().cpu().item())

            # Running mean of the epoch's batch losses, shown multiplied by 100.
            avg_loss = np.round(100*np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e+1} Loss: {avg_loss} lr: {lr}")

        log_text = f"Epoch {e+1}\nTrain Loss: {avg_loss}\n"

        print(log_text)

    return model


# Same experiment name and seed as for the retrieval model; Ranktrain() reads SEED as a global.
MODEL_NAME = "exp001"
SEED = 0

# train_dataset / train_loader are rebound here to the ranking dataset (shuffled).
train_dataset = HMRankDataset(train_df, SEQ_LEN)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=NW,
                          pin_memory=False)

Rankmodel = Ranktrain(Rankmodel, train_loader, epochs=10)

# **Prediction**

In [None]:
# The sample submission lists the customers to predict for; its "prediction" column is
# dropped here and filled in again at the end of the notebook.
test_df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv').drop("prediction", axis=1)
print(test_df.shape)
test_df.head()

In [None]:
def create_test_dataset(test_df):
    """Attach recent purchase history to every customer to predict for.

    The target week is -1 (one week after the most recent week, 0), so the
    history covers weeks 0 to ``WEEK_HIST_MAX - 1`` of the global ``df``.

    Args:
        test_df: frame with a ``customer_id`` column. It is not modified.

    Returns:
        A new frame: ``test_df`` plus ``week`` (-1), ``article_id`` and
        ``week_history``. Customers without purchases in the window get NaN in
        the two history columns.
    """
    week = -1

    in_window = (df["week"] > week) & (df["week"] <= week + WEEK_HIST_MAX)
    hist_df = (
        df[in_window]
        .groupby("customer_id")
        .agg({"article_id": list, "week": list})
        .reset_index()
        .rename(columns={"week": 'week_history'})
    )

    # assign() returns a new frame, so the caller's test_df is left untouched.
    return test_df.assign(week=week).merge(hist_df, on="customer_id", how="left")

# Add history to the test customers and wrap them in an unshuffled loader, so that
# predictions come out in the same order as the rows of test_df.
test_df = create_test_dataset(test_df)
test_ds = HMDataset(test_df, SEQ_LEN,is_test=True)
test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False)

In [None]:
# Smoke test: fetch the first test sample.
test_ds[0]

**Retrieval and ranking model**

In [None]:
# One space-separated string of predicted article IDs per test customer, in test_df order.
preds = []


def _rank_batch(rank_model, rank_loader, k=12):
    """Re-rank each row's candidates and append its top-``k`` article IDs to ``preds``."""
    rank_model.eval()

    with torch.no_grad():
        for data in rank_loader:
            inputs, _, candidates = read_data2(data)

            logits = rank_model(inputs)  # [batch, n_candidates]

            # Positions of the k best candidates, mapped back to article indices.
            _, top_pos = torch.topk(logits, k, dim=1)
            top_articles = candidates.gather(1, top_pos).detach().cpu().numpy()

            for row in top_articles:
                preds.append(" ".join(le_article.inverse_transform(row)))


def predict(model, Rankmodel, loader, k=500):
    """Run retrieval followed by ranking and fill the global ``preds`` list.

    For every batch of ``loader`` the retrieval model proposes ``k`` candidates
    per customer; the ranking model then re-scores them and the best 12 are
    decoded to article ID strings.

    Args:
        model: trained retrieval model.
        Rankmodel: trained ranking model.
        loader: unshuffled loader over the rows of the global ``test_df``.
        k: number of candidates retrieved per customer.
    """
    model.eval()

    # Start from a clean list so that calling predict() again does not append duplicates.
    preds.clear()

    tbar = tqdm(loader, file=sys.stdout)

    # Row position in test_df of the first sample of the current batch.
    offset = 0

    with torch.no_grad():
        for data in tbar:
            inputs, _ = read_data1(data)
            logits = model(inputs)

            _, indices = torch.topk(logits, k, dim=1)
            indices = indices.detach().cpu().numpy()

            # Slice by row offset, not by batch number, so each batch is paired
            # with the history of its own customers.
            part = test_df.iloc[offset:offset + len(indices)].copy(deep=True)
            offset += len(indices)
            part['candidates'] = indices.tolist()

            # Same history length as the ranking model was trained with.
            rank_ds = HMRankDataset(part, SEQ_LEN, is_test=True)

            # The slice is a single small batch; worker processes would only add
            # start-up cost on every iteration.
            rank_loader = DataLoader(rank_ds, batch_size=BS, num_workers=0, pin_memory=False)

            _rank_batch(Rankmodel, rank_loader)

        

In [None]:
# Run retrieval + ranking over all test customers; results are collected in preds.
predict(model,Rankmodel,test_loader, k=500)

In [None]:
# Spot-check one prediction string.
preds[20]

In [None]:
# preds is in loader order, which is the row order of test_df (the test loader is not shuffled).
test_df["prediction"] = preds

In [None]:
# Write only the two columns required for the submission file.
test_df.to_csv("submission.csv", index=False, columns=["customer_id", "prediction"])