In [1]:
import torch
import random
import numpy as np

def seed_all(seed_value):
    """Seed every random number generator this notebook relies on.

    Args:
        seed_value: integer seed applied to Python's ``random`` module, NumPy's
            global RNG and PyTorch (CPU, plus the current GPU when CUDA is
            available).
    """
    random.seed(seed_value)  # Python stdlib RNG
    # Use the argument here: reading the module-level `seed` instead made the
    # function ignore its parameter (and fail if `seed` was not yet defined).
    np.random.seed(seed_value)  # NumPy global RNG
    torch.manual_seed(seed_value)  # PyTorch CPU RNG

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)  # PyTorch RNG of the current GPU

seed = 56
seed_all(seed)

In [2]:
import pandas as pd
import os
import copy
from zipfile import ZipFile
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.nn import functional as F
import math
from torch.utils.data import Dataset, DataLoader
from torch import optim
from functools import partial

import sklearn.metrics as metrics
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.impute import KNNImputer, SimpleImputer

In [5]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Load train & test csv files

In [None]:
%%time
# Read every training shard, then concatenate once. Concatenating inside the
# loop re-copies all previously loaded rows on each iteration (quadratic cost).
# The shard names are sorted so the row order does not depend on the order in
# which the filesystem happens to list the directory.
train_dir = '../data/sharechat_recsys2023_data/train'
train_parts = []
for f in sorted(os.listdir(train_dir)):
    train_parts.append(pd.read_csv(os.path.join(train_dir, f), delimiter='\t'))

# An empty directory yields an empty frame, as the original loop did.
final_df = pd.concat(train_parts) if train_parts else pd.DataFrame()

In [4]:
# The path has no placeholder, so the former `.format(f)` call changed nothing
# and only made this cell depend on a loop variable leaked by the training
# load cell.
final_test = pd.read_csv('../data/sharechat_recsys2023_data/test/000000000000.csv',
                         delimiter='\t')

## Data Preprocessing

1. Categorical Data Preprocessing
    * Assign 0 index to NAN values.
    * Assign 1 index to Unknown category values observed during test time.
    * Assign further indices to unique categories in train data.
    
    
2. Binary Features Preprocessing
    * same as categorical features
    
    
3. Numerical Features Processing
    * We calculated the correlation between `is_installed` and the numerical columns and found that certain columns have a very low correlation score, so we removed those columns from the train and test sets.
    ```
    ['f_7','f_5', 'f_9', 'f_13', 'f_15', 'f_18', 'f_26', 
                  'f_27', 'f_28', 'f_29', 'f_31', 'f_30', 'f_60', 'f_67',
                  'f_44', 'f_46', 'f_75', 'f_76',
                 ]
    ```
    * We imputed NaN values in numerical columns by the mean of the column
    * We performed numerical columns normalization using MinMaxScaler
    * Numerical columns with very few unique values are treated as categorical features during training and testing:
    ```
    ['f_79', 'f_78', 'f_77', 'f_76', 'f_75', 'f_74', 'f_73', 'f_72']
    ```
    

In [6]:
# Placeholder value substituted for NaN before the category -> id maps are built.
NAN_CATEG = -1
# Id reserved for missing (NaN) categorical values (see "Data Preprocessing" notes).
NAN_IDX = 0
# Id assigned to categories that are not present in the category -> id maps.
UNK_IDX = 1
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [77]:
# Columns dropped from both train and test (low correlation with the label,
# see the "Data Preprocessing" notes).
cols_to_remove = ['f_7','f_5', 'f_9', 'f_13', 'f_15', 'f_18', 'f_26', 
                  'f_27', 'f_28', 'f_29', 'f_31', 'f_30', 'f_60', 'f_67',
                  'f_44', 'f_46', 'f_75', 'f_76',
                 ]

# Numerical columns that are handled as categorical features instead.
num_to_categ_col = ['f_79', 'f_78', 'f_77', 'f_76', 'f_75', 'f_74', 'f_73', 'f_72']

# Column groups of the raw data, by position in the f_<n> naming scheme.
original_id_cols = ['f_0']
original_date_cols = ['f_1']
original_categ_cols = [f'f_{i}' for i in range(2, 32+1)]
original_binary_cols = [f'f_{i}' for i in range(33, 41+1)]
original_num_cols = [f'f_{i}' for i in range(42, 79+1)]
original_label_cols = ['is_clicked', 'is_installed']


# Column groups actually fed to the models. Every group is sorted: the lists
# are derived from sets, whose iteration order for strings can change between
# interpreter sessions, and the models address columns by position. Leaving the
# numerical group unsorted would let its column order differ from the one a
# saved checkpoint was trained with.
revised_id_cols = ['f_0']
revised_date_cols = ['f_1']
revised_categ_cols = sorted(
    (set(original_categ_cols) | set(num_to_categ_col)) - set(cols_to_remove)
)
revised_binary_cols = sorted(set(original_binary_cols) - set(cols_to_remove))
revised_num_cols = sorted(
    set(original_num_cols) - set(cols_to_remove) - set(num_to_categ_col)
)
revised_label_cols = ['is_clicked', 'is_installed']

def _feat_to_id_map(series):
    """Build the value -> embedding id map for one training column.

    NaN is replaced by NAN_CATEG before the sorted unique values are
    enumerated. Values that are not greater than -1 map to id 0; every other
    value gets its sorted position plus an offset, chosen so that the first
    regular value receives id 2 whether or not the column contains NaN.
    """
    offset = 2 if series.isna().sum() == 0 else 1
    ordered_values = sorted(series.fillna(NAN_CATEG).unique())
    return {
        value: (position + offset if value > -1 else 0)
        for position, value in enumerate(ordered_values)
    }


# Categorical columns first, then the binary columns (used when binary
# features are fed through embeddings).
categ_feat_to_id = {
    col: _feat_to_id_map(final_df[col])
    for col in [*revised_categ_cols, *revised_binary_cols]
}
    

    
def get_feat_ids(df, cols):
    """
    Convert categorical feature values to embedding ids.

    Args:
        df: frame holding the raw categorical columns; it is not modified.
        cols: names of the columns to convert. Each must have an entry in
            `categ_feat_to_id`.

    Returns:
        A copy of `df` in which every column in `cols` holds ids:
        NAN_IDX for NaN, the mapped id for known values, UNK_IDX otherwise.
    """
    _df = df.copy()
    for col in cols:
        mapping = categ_feat_to_id[col]

        def to_id(value, mapping=mapping):
            # NaN always gets NAN_IDX. Looking up mapping[NAN_CATEG] instead
            # raised KeyError for columns that had no NaN when the map was
            # built, because that key is only created when NaN was present.
            if math.isnan(value):
                return NAN_IDX
            return mapping.get(value, UNK_IDX)

        _df[col] = _df[col].apply(to_id)
    return _df

def rem_nan_values(train_df, test_df, cols):
    """
    Fill NaN in `cols` with the per-column mean of the training frame.

    The training means are used for both frames. Both frames are modified in
    place and also returned as `(train_df, test_df)`.
    """
    fill_values = train_df[cols].mean()
    for frame in (train_df, test_df):
        frame[cols] = frame[cols].fillna(fill_values)
    return train_df, test_df

def binary_cols_processing(_df, cols):
    """
    Recode the value 0 as -1 in the given columns.

    `_df` is modified in place and returned.
    """
    recoded = _df[cols].replace(to_replace=0, value=-1)
    _df[cols] = recoded
    return _df

def num_cols_processing(train_df, test_df, cols):
    """
    Min-max scale the numerical columns.

    The scaler is fitted on the training frame only and then applied to both
    frames, which are modified in place and returned as `(train_df, test_df)`.
    """
    scaler = MinMaxScaler().fit(train_df[cols])
    for frame in (train_df, test_df):
        frame[cols] = scaler.transform(frame[cols])

    return train_df, test_df

def imput_nan_values(train_df, test_df, cols):
    """
    Impute NaN in `cols`, choosing the strategy by column cardinality.

    Columns with fewer than 20 distinct training values (NaN counted as one
    value) are imputed with a 20-neighbour KNN imputer; all other columns are
    imputed with the column mean. Both imputers are fitted on the training
    frame only. A group that turns out to be empty is skipped instead of
    fitting an imputer on a frame with no columns.

    Both frames are modified in place and returned as `(train_df, test_df)`.
    """
    cardinality_threshold = 20
    n_unique = {col: train_df[col].nunique(dropna=False) for col in cols}
    low_card_cols = [col for col, n in n_unique.items() if n < cardinality_threshold]
    high_card_cols = [col for col, n in n_unique.items() if n >= cardinality_threshold]

    if high_card_cols:
        mean_imputer = SimpleImputer(strategy='mean')
        mean_imputer.fit(train_df[high_card_cols])
        train_df[high_card_cols] = mean_imputer.transform(train_df[high_card_cols])
        test_df[high_card_cols] = mean_imputer.transform(test_df[high_card_cols])

    if low_card_cols:
        knn_imputer = KNNImputer(n_neighbors=20, weights="uniform")
        knn_imputer.fit(train_df[low_card_cols])
        train_df[low_card_cols] = knn_imputer.transform(train_df[low_card_cols])
        test_df[low_card_cols] = knn_imputer.transform(test_df[low_card_cols])

    return train_df, test_df
    

def cap_values(train_df, test_df, cols):
    """
    Cap the values of `cols` at their 85th percentile in the training frame.

    The thresholds are computed on `train_df` and applied to both frames.
    Only the selected columns are clipped, so unrelated (for example
    non-numeric) columns in the frames are never touched by the comparison.

    Both frames are modified in place and returned as `(train_df, test_df)`.
    """
    clips = pd.Series({col: train_df[col].quantile(0.85) for col in cols})
    train_df[cols] = train_df[cols].clip(upper=clips, axis=1)
    test_df[cols] = test_df[cols].clip(upper=clips, axis=1)

    return train_df, test_df

def remove_unnecessary_columns(df, cols):
    """Return a new frame without the columns named in `cols`; `df` is left untouched."""
    return df.drop(columns=cols)

def power_num_cols(df, cols):
    """
    Append log1p and square-root versions of the given numerical columns.

    For every column `c` in `cols` the returned frame gains `c_log` and
    `c_sqRoot`. `df` itself is not modified.

    NOTE: `cols` is extended IN PLACE with the new column names (all `_log`
    names first, then all `_sqRoot` names), so the caller's list grows.
    """
    base_cols = list(cols)  # snapshot of the names before the list is extended
    source = df.copy()
    log_part = np.log1p(source[base_cols]).add_suffix('_log')
    sqrt_part = np.sqrt(source[base_cols]).add_suffix('_sqRoot')
    augmented = pd.concat([source, log_part, sqrt_part], axis=1)

    derived_names = [name + '_log' for name in base_cols]
    derived_names += [name + '_sqRoot' for name in base_cols]
    cols.extend(derived_names)
    return augmented


def preprocess_df(final_df, final_test):
    """
    Preprocess train and test frames for the model that feeds binary features
    through a dense layer.

    Steps, in order: drop the low-correlation columns, map the categorical
    columns to ids, mean-impute the numerical columns, recode 0 as -1 in the
    binary columns, append log1p/sqrt versions of the numerical columns and
    min-max scale the numerical columns.

    Args:
        final_df: raw training frame.
        final_test: raw test frame.

    Returns:
        `(_final_df, _test_df)`, the processed train and test frames.

    NOTE(review): `power_num_cols` extends the list it is given in place. The
    second call below is given the module-level `revised_num_cols`, so calling
    this function changes that global list. Calling it a second time in the
    same session therefore starts from an already extended list.
    """
    
    # Local copy of the module-level `cols_to_remove` list.
    cols_to_remove = ['f_7','f_5', 'f_9', 'f_13', 'f_15', 'f_18', 'f_26', 
                      'f_27', 'f_28', 'f_29', 'f_31', 'f_30', 'f_60', 'f_67',
                      'f_44', 'f_46', 'f_75', 'f_76'
                     ]
    
    _final_df = remove_unnecessary_columns(final_df, cols_to_remove)
    _test_df = remove_unnecessary_columns(final_test, cols_to_remove)

    _final_df = get_feat_ids(_final_df, revised_categ_cols)
    _test_df = get_feat_ids(_test_df, revised_categ_cols)
        
    # Imputes both frames with the training-column means.
    _final_df, _test_df = rem_nan_values(_final_df, _test_df, revised_num_cols)
    _final_df = binary_cols_processing(_final_df, revised_binary_cols)


#     _test_df = rem_nan_values(_test_df, revised_num_cols)
    _test_df = binary_cols_processing(_test_df, revised_binary_cols)

    
    # The train call works on a deep copy, so only the copy is extended here.
    _revised_num_cols = copy.deepcopy(revised_num_cols)
    _final_df = power_num_cols(_final_df, _revised_num_cols)
    print(revised_num_cols)
    # The test call extends the module-level list itself, so the scaling step
    # below also covers the derived `_log` / `_sqRoot` columns.
    _test_df = power_num_cols(_test_df, revised_num_cols)
    
    _final_df, _test_df = num_cols_processing(_final_df, _test_df, revised_num_cols)
    
    return _final_df, _test_df


def preprocess_df_bin(final_df, final_test):
    """
    Preprocess train and test frames for the models that embed binary features.

    Drops the low-correlation columns, maps the categorical and the binary
    columns to ids, fills NaN in the numerical columns with the training mean
    and min-max scales the numerical columns.

    Returns:
        `(train, test)`, the processed frames. The inputs are not modified.
    """
    low_corr_cols = [
        'f_7', 'f_5', 'f_9', 'f_13', 'f_15', 'f_18', 'f_26',
        'f_27', 'f_28', 'f_29', 'f_31', 'f_30', 'f_60', 'f_67', 'f_44',
        'f_46', 'f_75', 'f_76',
    ]
    id_encoded_groups = (revised_categ_cols, revised_binary_cols)

    train_out = remove_unnecessary_columns(final_df, low_corr_cols)
    test_out = remove_unnecessary_columns(final_test, low_corr_cols)

    for group in id_encoded_groups:
        train_out = get_feat_ids(train_out, group)
    for group in id_encoded_groups:
        test_out = get_feat_ids(test_out, group)

    train_out, test_out = rem_nan_values(train_out, test_out, revised_num_cols)
    train_out, test_out = num_cols_processing(train_out, test_out, revised_num_cols)

    return train_out, test_out


In [78]:
%%time
_final_df, _test_df = preprocess_df_bin(final_df, final_test)

CPU times: user 1min 22s, sys: 1min 58s, total: 3min 20s
Wall time: 3min 20s


In [10]:
# Fixed column layout: id, date, categorical, binary, numerical, then labels.
rearranged_cols = [
    'f_0', 'f_1',
    *revised_categ_cols,
    *revised_binary_cols,
    *revised_num_cols,
    'is_clicked', 'is_installed',
]
_final_df = _final_df[rearranged_cols]
# The test frame takes the same layout without the two trailing label columns.
_test_df = _test_df[rearranged_cols[:-2]]

## Pytorch Dataset

In [14]:
class AdsDataset(Dataset):
    """Row-wise dataset over a frame whose values are converted to float32.

    In training mode the last two columns are returned as the target and the
    remaining columns as the features. In test mode the whole row is returned.
    """

    def __init__(self, df, test=False):
        self.df = df
        self.test = test
        # One float32 tensor holding every column of the frame.
        self.data = torch.from_numpy(df.values).float()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.data[idx]
        if self.test:
            return row
        return row[:-2], row[-2:]


# Training rows yield (features, labels); test rows yield the features only.
train_dataset = AdsDataset(_final_df)
test_dataset = AdsDataset(_test_df, test=True)

# Mini-batch size used for training.
BS = 2048
trainloader = DataLoader(train_dataset, batch_size=BS, shuffle=True)
# The batch size equals the number of test rows, so the loader yields the
# whole test set as a single, unshuffled batch.
testloader = DataLoader(test_dataset, batch_size=len(_test_df))

## Model

    
    1. ModelV1
    * Process categorical features through embedding -> concat the output
    * Process binary features and numerical features through separate MLP layers
    * Concatenate output of Embedding, binary and numerical MLP layers.
    * Process concatenated output through separate MLP layers for `is_clicked` and `is_installed` logit output.
    
    2. ModelV1_BIN (Best Result Model) 
    Same as ModelV1, except binary features are also processed through Embeddings Instead of Dense layer.
    
    3. ModelV1_DEP
    Same as ModelV1_BIN, except that instead of making separate, independent predictions of `is_installed` and `is_clicked`, this model predicts `is_installed` conditioned on `is_clicked`.

In [15]:
class ModelV1(nn.Module):
    """Two-headed click / install model with a dense layer for binary features.

    Categorical columns go through one embedding each, binary and numerical
    columns through their own Linear-BatchNorm-ReLU block. The three outputs
    are concatenated and fed to two separate MLP heads, one per label.
    """
    CATEG_COLS = revised_categ_cols
    BINARY_COLS = revised_binary_cols
    NUM_COLS = revised_num_cols

    # Position of the first categorical column in an input row.
    START_COL_IDX = 2

    # Column positions of the binary block, which follows the categorical block.
    BINARY_COLS_IDX = range(
        START_COL_IDX + len(CATEG_COLS),
        START_COL_IDX + len(CATEG_COLS) + len(BINARY_COLS),
    )

    # Column positions of the numerical block, which follows the binary block.
    NUM_COLS_IDX = range(
        BINARY_COLS_IDX[-1] + 1,
        BINARY_COLS_IDX[-1] + 1 + len(NUM_COLS),
    )

    CATEG_EMB_DIM = 32
    BINARY_FC_OUT = 32
    NUM_FC_OUT = 32

    COMB_FC_OUT_1 = 256
    NUM_CLASSES = 1

    def __init__(self, df):
        super().__init__()

        # Categorical features: one embedding per column. The table size comes
        # from the largest id in the column's map, the width grows with it.
        self.emb_sizes = []
        embeddings = []
        for col in self.CATEG_COLS:
            n_emb = max(categ_feat_to_id[col].values()) + 1
            emb_size = int(6 * (n_emb ** 0.25))
            embeddings.append(nn.Embedding(n_emb, emb_size).to(DEVICE))
            self.emb_sizes.append(emb_size)
        self.cat_emb = nn.ModuleList(embeddings)

        # Binary features: single dense block.
        self.bin_fc = nn.Sequential(
            nn.Linear(len(self.BINARY_COLS), self.BINARY_FC_OUT),
            nn.BatchNorm1d(self.BINARY_FC_OUT),
            nn.ReLU(),
        )

        # Numerical features: single dense block.
        self.num_fc = nn.Sequential(
            nn.Linear(len(self.NUM_COLS), self.NUM_FC_OUT),
            nn.BatchNorm1d(self.NUM_FC_OUT),
            nn.ReLU(),
        )

        # Output heads; both consume the same concatenated representation.
        comb_dim = sum(self.emb_sizes) + self.BINARY_FC_OUT + self.NUM_FC_OUT
        self.fc_out_click = self._make_head(comb_dim)
        self.fc_out_install = self._make_head(comb_dim)

    def _make_head(self, in_features):
        """Build one Dropout -> Linear -> ReLU -> Linear output head."""
        return nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(in_features, self.COMB_FC_OUT_1),
            nn.ReLU(),
            nn.Linear(self.COMB_FC_OUT_1, self.NUM_CLASSES),
        )

    def forward(self, x, return_click = True):
        """Return `(click, install)` sigmoid outputs for a batch of rows `x`.

        When `return_click` is false the click head is skipped and a zero
        tensor shaped like the install output is returned in its place.
        """
        embedded = [
            emb(x[:, self.START_COL_IDX + pos].long())
            for pos, emb in enumerate(self.cat_emb)
        ]
        cat_emb_out = torch.concat(embedded, axis=1)

        bin_out = self.bin_fc(x[:, self.BINARY_COLS_IDX])
        num_out = self.num_fc(x[:, self.NUM_COLS_IDX])

        comb = torch.concat([cat_emb_out, bin_out, num_out], axis=1)

        # The install head is evaluated first, as before.
        install_out = torch.sigmoid(self.fc_out_install(comb))
        if return_click:
            click_out = torch.sigmoid(self.fc_out_click(comb))
        else:
            click_out = torch.zeros_like(install_out)
        return click_out, install_out

In [32]:
class ModelV1_BIN(nn.Module):
    """Two-headed click / install model that embeds the binary features.

    Categorical columns get one embedding each, binary columns get one
    width-1 embedding each, and numerical columns go through a
    Linear-BatchNorm-ReLU block. The concatenated result feeds two separate
    MLP heads, one per label.
    """
    CATEG_COLS = revised_categ_cols
    BINARY_COLS = revised_binary_cols
    NUM_COLS = revised_num_cols

    # Position of the first categorical column in an input row.
    START_COL_IDX = 2

    # Column positions of the binary block, which follows the categorical block.
    BINARY_COLS_IDX = range(
        START_COL_IDX + len(CATEG_COLS),
        START_COL_IDX + len(CATEG_COLS) + len(BINARY_COLS),
    )

    # Column positions of the numerical block, which follows the binary block.
    NUM_COLS_IDX = range(
        BINARY_COLS_IDX[-1] + 1,
        BINARY_COLS_IDX[-1] + 1 + len(NUM_COLS),
    )

    CATEG_EMB_DIM = 32
    BINARY_FC_OUT = 32
    NUM_FC_OUT = 32

    COMB_FC_OUT_1 = 256
    NUM_CLASSES = 1

    def __init__(self, df):
        super().__init__()

        # Categorical features: one embedding per column. The table size comes
        # from the largest id in the column's map, the width grows with it.
        self.emb_sizes = []
        cat_tables = []
        for col in self.CATEG_COLS:
            n_emb = max(categ_feat_to_id[col].values()) + 1
            emb_size = int(6 * (n_emb ** 0.25))
            cat_tables.append(nn.Embedding(n_emb, emb_size).to(DEVICE))
            self.emb_sizes.append(emb_size)
        self.cat_emb = nn.ModuleList(cat_tables)

        # Binary features: one width-1 embedding per column.
        self.bin_emb_sizes = []
        bin_tables = []
        for col in self.BINARY_COLS:
            n_emb = max(categ_feat_to_id[col].values()) + 1
            self.bin_emb_sizes.append(1)
            bin_tables.append(nn.Embedding(n_emb, 1).to(DEVICE))
        self.bin_emb = nn.ModuleList(bin_tables)

        # Numerical features: single dense block.
        self.num_fc = nn.Sequential(
            nn.Linear(len(self.NUM_COLS), self.NUM_FC_OUT),
            nn.BatchNorm1d(self.NUM_FC_OUT),
            nn.ReLU(),
        )

        # Output heads; both consume the same concatenated representation.
        comb_dim = sum(self.emb_sizes) + sum(self.bin_emb_sizes) + self.NUM_FC_OUT
        self.fc_out_click = self._make_head(comb_dim)
        self.fc_out_install = self._make_head(comb_dim)

    def _make_head(self, in_features):
        """Build one Dropout -> Linear -> ReLU -> Linear output head."""
        return nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(in_features, self.COMB_FC_OUT_1),
            nn.ReLU(),
            nn.Linear(self.COMB_FC_OUT_1, self.NUM_CLASSES),
        )

    def forward(self, x, return_click = True):
        """Return `(click, install)` sigmoid outputs for a batch of rows `x`.

        When `return_click` is false the click head is skipped and a zero
        tensor shaped like the install output is returned in its place.
        """
        cat_parts = [
            emb(x[:, self.START_COL_IDX + pos].long())
            for pos, emb in enumerate(self.cat_emb)
        ]
        cat_emb_out = torch.concat(cat_parts, axis=1)

        bin_start = self.START_COL_IDX + len(self.CATEG_COLS)
        bin_parts = [
            emb(x[:, bin_start + pos].long())
            for pos, emb in enumerate(self.bin_emb)
        ]
        bin_emb_out = torch.concat(bin_parts, axis=1)

        num_out = self.num_fc(x[:, self.NUM_COLS_IDX])

        comb = torch.concat([cat_emb_out, bin_emb_out, num_out], axis=1)

        # The install head is evaluated first, as before.
        install_out = torch.sigmoid(self.fc_out_install(comb))
        if return_click:
            click_out = torch.sigmoid(self.fc_out_click(comb))
        else:
            click_out = torch.zeros_like(install_out)
        return click_out, install_out

In [57]:
class ModelV1_DEP(nn.Module):
    """Click / install model in which the install head sees the click features.

    The feature encoders match ModelV1_BIN (embeddings for categorical and
    binary columns, a dense block for numerical columns). The click branch
    produces an intermediate representation that is concatenated to the shared
    features before the install head, so the install prediction is conditioned
    on the click branch.
    """
    CATEG_COLS = revised_categ_cols
    BINARY_COLS = revised_binary_cols
    NUM_COLS = revised_num_cols
    
    # Position of the first categorical column in an input row.
    START_COL_IDX = 2
    
    # Column positions of the binary block, which follows the categorical block.
    BINARY_COLS_IDX = range(START_COL_IDX + len(CATEG_COLS), START_COL_IDX + len(CATEG_COLS) + len(BINARY_COLS))
    
    # Column positions of the numerical block, which follows the binary block.
    NUM_COLS_IDX = range(BINARY_COLS_IDX[-1] + 1, BINARY_COLS_IDX[-1] + 1 + len(NUM_COLS))
    
    CATEG_EMB_DIM = 32
    BINARY_FC_OUT = 32
    NUM_FC_OUT = 32

    COMB_FC_OUT_1 = 128
    NUM_CLASSES = 1

    def __init__(self, df):
        super(ModelV1_DEP, self).__init__()
        # Categorical features: one embedding per column. The table size comes
        # from the largest id in the column's map, the width grows with it.
        cat_emb = [None]*len(self.CATEG_COLS)
        self.emb_sizes = []
        for col_idx, col in enumerate(self.CATEG_COLS):
            n_emb = max(categ_feat_to_id[col].values())+1
            emb_size = int(6 * (n_emb ** 0.25))
            cat_emb[col_idx] = nn.Embedding(n_emb, emb_size).to(DEVICE)
            self.emb_sizes.append(emb_size)
            
        self.cat_emb = nn.ModuleList(cat_emb)
        
        # Binary features: one width-1 embedding per column.
        bin_emb = [None]*len(self.BINARY_COLS)
        self.bin_emb_sizes = []
        for col_idx, col in enumerate(self.BINARY_COLS):
            n_emb = max(categ_feat_to_id[col].values())+1
            self.bin_emb_sizes.append(1)
            bin_emb[col_idx] = nn.Embedding(n_emb, 1).to(DEVICE)
        self.bin_emb = nn.ModuleList(bin_emb)

        # Numerical features. The batch norm must match the width of the
        # Linear layer before it (NUM_FC_OUT). It was sized with BINARY_FC_OUT,
        # which only worked because both constants are currently 32.
        self.num_fc = nn.Sequential(
            nn.Linear(len(self.NUM_COLS), self.NUM_FC_OUT),
            nn.BatchNorm1d(self.NUM_FC_OUT),
            nn.ReLU()
        )

        # Click branch: intermediate representation plus a final linear layer.
        self.fc_intm_click = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(
                sum(self.emb_sizes) + sum(self.bin_emb_sizes) + self.NUM_FC_OUT,
                self.COMB_FC_OUT_1
            ),
            nn.ReLU(),
        )
        self.fc_out_click = nn.Linear(self.COMB_FC_OUT_1, self.NUM_CLASSES)
        
        # Install head: its input is the shared features concatenated with the
        # click branch's intermediate representation.
        self.fc_out_install = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(
                sum(self.emb_sizes) + sum(self.bin_emb_sizes) + self.NUM_FC_OUT + self.COMB_FC_OUT_1,
                self.COMB_FC_OUT_1
            ),
            nn.ReLU(),
            nn.Linear(self.COMB_FC_OUT_1, self.NUM_CLASSES)
        )

    def forward(self, x, return_click=True):
        """Return `(click, install)` sigmoid outputs for a batch of rows `x`.

        The click branch's intermediate layer is always computed because the
        install head consumes it. When `return_click` is false only the final
        click layer is skipped and a zero tensor shaped like the install
        output is returned in its place.
        """
        cat_emb_out = [None]*len(self.CATEG_COLS)
        for col_idx in range(len(self.CATEG_COLS)):
            col_x = x[:, self.START_COL_IDX + col_idx].long()
            cat_emb_out[col_idx] = self.cat_emb[col_idx](col_x)
        cat_emb_out = torch.concat(cat_emb_out, axis=1)

        bin_out = [None]*len(self.BINARY_COLS)
        for col_idx in range(len(self.BINARY_COLS)):
            col_x = x[:, self.START_COL_IDX + len(self.CATEG_COLS) + col_idx].long()
            bin_out[col_idx] = self.bin_emb[col_idx](col_x)
        bin_out = torch.concat(bin_out, axis=1)
        
        num_out = self.num_fc(x[:, self.NUM_COLS_IDX])

        comb = torch.concat([cat_emb_out, bin_out, num_out], axis=1)
        
        click_fc_intm = self.fc_intm_click(comb)
        
        install_fc_inp = torch.cat([comb, click_fc_intm], axis=1)
        install_fc_out = self.fc_out_install(install_fc_inp)
        
        install_logits = torch.sigmoid(install_fc_out)
        
        if return_click:
            click_fc_out = self.fc_out_click(click_fc_intm)
            click_logits = torch.sigmoid(click_fc_out)
        else:
            click_logits = torch.zeros_like(install_logits)
        
        return click_logits, install_logits

In [59]:
# Initialize model

# model_v1 = ModelV1(_final_df).to(DEVICE)
# model_v1_dep = ModelV1_DEP(_final_df).to(DEVICE)
model_v1_bin = ModelV1_BIN(_final_df).to(DEVICE)

In [60]:
# Helper function for model 

def _ckpt_location(exp_name, ckpt_no, ckpt_base='./models/'):
    """Return `(experiment_dir, checkpoint_path)` for an experiment name.

    Leading path separators are stripped from `exp_name`: `os.path.join`
    discards every earlier component when a later one is absolute, so the
    default name '/rs23' used to resolve to '/rs23' at the filesystem root
    instead of './models/rs23'.
    """
    exp_dir = os.path.join(ckpt_base, exp_name.lstrip('/\\'))
    return exp_dir, os.path.join(exp_dir, f'model_{ckpt_no}.pth.tar')


def save_ckpt(model, optim, hist, ckpt_no, exp_name='/rs23'):
    """Save the model state, optimizer state and history as checkpoint `ckpt_no`.

    Args:
        model: object exposing `state_dict()`.
        optim: optimizer exposing `state_dict()`.
        hist: metric history stored alongside the weights.
        ckpt_no: number used in the checkpoint file name.
        exp_name: experiment name; the file is written below './models/<exp_name>/'.
    """
    data = {
        'model_state_dict': model.state_dict(),
        'optim_state_dict': optim.state_dict(),
        'hist': hist
    }
    exp_dir, save_path = _ckpt_location(exp_name, ckpt_no)
    os.makedirs(exp_dir, exist_ok=True)

    torch.save(data, save_path)
    print('Saved model ckpt - {}'.format(save_path), end='\n\n')

def resume_ckpt(model, optim, ckpt_no, exp_name='/rs23'):
    """Load checkpoint `ckpt_no` of an experiment into `model` and `optim`.

    Args:
        model: model whose weights are overwritten via `load_state_dict`.
        optim: optimizer restored when the checkpoint holds an optimizer state.
        ckpt_no: number of the checkpoint to load.
        exp_name: experiment name, resolved the same way as in `save_ckpt`.

    Returns:
        `(ckpt_no, model, optim, hist)`; `hist` is None when the checkpoint
        holds no history.
    """
    _, load_path = _ckpt_location(exp_name, ckpt_no)
    print('loading model ckpt - {}'.format(load_path), end='\n\n')

    data = torch.load(load_path)

    model.load_state_dict(data['model_state_dict'])

    if data.get('optim_state_dict', None) is not None:
        optim.load_state_dict(data['optim_state_dict'])
    hist = data.get('hist', None)

    return ckpt_no, model, optim, hist

def log_loss_metric(model_out, target, idx=1):
    """Mean binary log loss of one model output against its target column.

    Args:
        model_out: sequence of per-label prediction tensors; `model_out[idx]`
            is scored.
        target: 2-D target tensor; column `idx` holds the true labels.
        idx: which label to score (defaults to 1).

    Returns:
        The per-sample average log loss computed by `sklearn.metrics.log_loss`.
    """
    y_true = target[:, idx].cpu().detach().numpy().reshape(-1)
    y_pred = model_out[idx].cpu().detach().numpy().reshape(-1)

    # Clip explicitly instead of passing `eps=`: that keyword was deprecated
    # and later removed from `sklearn.metrics.log_loss`. float64 is used so the
    # 1e-7 bounds are represented accurately.
    eps = 1e-7
    y_pred = np.clip(y_pred.astype(np.float64), eps, 1 - eps)

    return metrics.log_loss(y_true, y_pred, labels=[0, 1], normalize=True)


def loss_fn(model_out, target, criterion, idx=None):
    """
    Combined or single-output loss.

    With `idx=None` the loss is the sum over all outputs,
        L = L_{is_clicked} + L_{is_installed}
    otherwise only the output at position `idx` contributes.

    Args:
        model_out: sequence of model outputs, one per label
        target: target tensor; column i holds the labels for output i
        criterion: criterion applied to each (output, target column) pair
        idx: (optional) position of the single output to score; None scores
                every output and sums the results
    """
    if idx is not None:
        return criterion(model_out[idx], target[:, idx].reshape(-1, 1))

    total = 0.
    for position, output in enumerate(model_out):
        total += criterion(output, target[:, position].reshape(-1, 1))
    return total

## Training

1. Use of BinaryCrossEntropy criterion for `is_installed` and `is_clicked` predictions
2. We utilize two training approaches
    
    a. only train on `is_installed` 
        $$ \mathcal{L}=\mathcal{L}_{is\_installed} $$
        
    b. Optimize jointly on `is_installed` and `is_clicked`.
        $$ \mathcal{L}=\mathcal{L}_{is\_installed} + \mathcal{L}_{is\_clicked} $$

In [61]:
# Experiment Config class
class Config:
    # Experiment name; the Trainer passes it to the checkpoint helpers.
    exp_name = 'None'
    
    # When True the Trainer loads checkpoint `ckpt_no` on construction.
    resume = False
    # Checkpoint number to resume from; also offsets the epoch counter in Trainer.train.
    ckpt_no = 0
    # Initial value of the Trainer's epoch counter.
    epoch = 0
    # Number of epochs run by Trainer.train.
    n_epochs = 40
    
    # NOTE(review): not read by the Trainer in this notebook, which uses
    # `only_is_install` below; looks like a leftover duplicate, confirm before removing.
    only_install = True
    # Learning rate handed to the Adam optimizer.
    lr = 0.005
    # Loss callable with the criterion pre-bound to binary cross-entropy.
    loss_fn = partial(loss_fn, criterion=nn.BCELoss())
    
    # When True only the `is_installed` output (index 1) is optimized.
    only_is_install = True
    # When True a validation loop runs after every training epoch.
    use_valid = True

In [62]:
class Trainer:
    """
    Trainer class to run the experiments using passed config
    
    Args:
        config: exp config (Config object)
        model: pytorch model
        trainloader: training dataloader
        validloader: validation dataloader; ignored (may be None) when
            `config.use_valid` is false
    """
    def __init__(self, config, model, trainloader, validloader):
        self.config = config
        self.trainloader = trainloader
        self.validloader = validloader if config.use_valid else None
        
        self.exp_name = config.exp_name
        self.epoch = config.epoch
        self.n_epoch = config.n_epochs
        self.ckpt_no = config.ckpt_no
        
        self.hist = self.init_hist()
        self.model = model
        self.optim = optim.Adam(self.model.parameters(), lr=config.lr)
        
        self.loss_fn = config.loss_fn
        
        if config.resume:
            self.load(config.ckpt_no)
            
    def load(self, ckpt_no):
        "Load checkpoint from ckpt_no"
        self.epoch, self.model, self.optim, self.hist = resume_ckpt(
            self.model, self.optim, ckpt_no, self.exp_name)
        if self.hist is None:
            # Keep the fresh history: calling init_hist() without assigning its
            # result left self.hist as None, which broke every later log() call.
            self.hist = self.init_hist()
         
    def init_hist(self):
        "initialize history to track metrics"
        modes = ['train', 'valid', 'test']
        metric_names = ['iter', 'loss', 'loss-std', 'log_loss', 'log_loss-std']
        return {mode: {name: [] for name in metric_names} for mode in modes}
    
    def log(self, mode, loss, log_loss):
        "log mean and std of the per-batch metrics of the current epoch"
        self.hist[mode]['iter'].append(self.epoch)
        self.hist[mode]['loss'].append(np.mean(loss))
        self.hist[mode]['loss-std'].append(np.std(loss))
        self.hist[mode]['log_loss'].append(np.mean(log_loss))
        self.hist[mode]['log_loss-std'].append(np.std(log_loss))
        
    def save(self):
        "Save model checkpoint"
        save_ckpt(self.model, self.optim, self.hist, self.epoch, self.exp_name)
    
    def _score_batch(self, batch):
        "Run the model on one (features, labels) batch; return (loss, log loss)"
        batch_x, batch_y = batch
        batch_x = batch_x.to(DEVICE)
        batch_y = batch_y.to(DEVICE)

        only_install = self.config.only_is_install
        out = self.model(batch_x, not only_install)
        # idx=1 restricts the loss to the is_installed output.
        loss = self.loss_fn(out, batch_y, idx=1 if only_install else None)
        return loss, log_loss_metric(out, batch_y)

    def train(self, ):
        "Training Loop"
        for self.epoch in range(1 + self.ckpt_no, self.n_epoch + 1 + self.ckpt_no):
            self.model.train()
            losses = []
            log_losses = []

            pbar = tqdm(self.trainloader, leave=False, total=len(self.trainloader))
            for batch in pbar:
                self.optim.zero_grad()

                loss, batch_log_loss = self._score_batch(batch)
                losses.append(loss.item())
                log_losses.append(batch_log_loss)

                loss.backward()
                self.optim.step()

            self.log('train', losses, log_losses)
            
            if self.config.use_valid:
                valid_losses, valid_log_losses = self.valid_loop()
                print("epoch: {} | loss: {:0.5f} | log_loss: {:0.5f} | "
                      "valid_loss: {:0.5f} | valid_log_loss: {:0.5f}"
                      .format(self.epoch, np.mean(losses), np.mean(log_losses), 
                              np.mean(valid_losses), np.mean(valid_log_losses)))

                # The history already includes this epoch, so the condition
                # holds exactly when this epoch is the best one so far.
                if np.mean(valid_losses) <= min(self.hist['valid']['loss']):
                    self.save()
            else:
                print("epoch: {} | loss: {:0.5f} | log_loss: {:0.5f}"
                      .format(self.epoch, np.mean(losses), np.mean(log_losses)))
                
                if np.mean(losses) <= min(self.hist['train']['loss']):
                    self.save()
        # Always keep a checkpoint of the last epoch as well.
        self.save()

    @torch.no_grad()
    def valid_loop(self,):
        "Validation loop"
        self.model.eval()
        losses = []
        log_losses = []

        pbar = tqdm(self.validloader, leave=False, total=len(self.validloader))
        for batch in pbar:
            loss, batch_log_loss = self._score_batch(batch)
            losses.append(loss.item())
            log_losses.append(batch_log_loss)

        self.log('valid', losses, log_losses)
        return losses, log_losses
    
    def get_best_model(self):
        """Load the checkpoint of the epoch with the lowest recorded loss.

        The validation history is used when it has entries. Runs without
        validation record none, so the training history is used instead; this
        matches the rule `train` applies when deciding which epochs to save.

        Raises:
            ValueError: if no epoch has been recorded yet.
        """
        mode = 'valid' if self.hist['valid']['loss'] else 'train'
        recorded = self.hist[mode]['loss']
        if not recorded:
            raise ValueError('no recorded epochs to pick the best model from')
        best_ckpt_no = self.hist[mode]['iter'][recorded.index(min(recorded))]
        self.load(best_ckpt_no)

In [63]:
def plot_metrics(hist, metrics = ['loss', 'log_loss']):
    "Plot the train and valid curves (mean with a +/- std band) of each metric"
    n_panels = len(metrics)
    fig, _ax = plt.subplots(1, n_panels, figsize=(5*n_panels, 3))
    mode_colors = (('train', 'grey'), ('valid', 'blue'))

    for i, metric in enumerate(metrics):
        # plt.subplots hands back a bare Axes when there is a single panel.
        ax = _ax[i] if n_panels > 1 else _ax
        std_key = metric+'-std'

        for mode, color in mode_colors:
            series = hist[mode]
            x_axis = series['iter']
            y_axis = np.array(series[metric])

            if std_key in series:
                spread = np.array(series[std_key])
                ax.fill_between(x_axis, y_axis-spread, y_axis+spread,
                                alpha=0.2, color=color)
            ax.plot(x_axis, y_axis, alpha=0.9, color=color, label=mode)

        ax.set_xlabel('epoch')
        ax.set_ylabel(metric)
        ax.grid()
        ax.legend()

    if n_panels > 1:
        plt.subplots_adjust(wspace=0.3)
    plt.show()

In [None]:
# best res cell copy
# Experiment on the full training data: validation is disabled, so the Trainer
# is given no validation loader and checkpoints are chosen by training loss.
config_binEmb_allData = Config()
config_binEmb_allData.exp_name = "sub_model_v1_emb_binEmb_removal_low_corr_custom_col_nanMeanImpute_only_install_num_to_cat_dynEmb_bin32_num32_alldata_copy"
config_binEmb_allData.n_epochs = 41
config_binEmb_allData.lr = 5e-4
config_binEmb_allData.use_valid = False

# Uncomment to continue training from a saved checkpoint.
# config_binEmb_allData.resume=True
# config_binEmb_allData.ckpt_no = 46

trainer_binEmb_allData = Trainer(config_binEmb_allData, model_v1_bin, trainloader, None)
trainer_binEmb_allData.train()

## Gen Submission file

In [None]:
# load best model
# trainer_binEmb_allData.get_best_model()

In [None]:
@torch.no_grad()
def get_preds(testloader, model):
    """Get predictions from the model on the test data.

    Every batch of `testloader` is scored, so the loader no longer has to
    deliver the whole test set as one batch. With a single-batch loader the
    result is the same as before.

    Args:
        testloader: iterable yielding feature tensors.
        model: model called as `model(batch, False)`, returning a sequence of
            output tensors.

    Returns:
        `(testout, testit)`: a tuple with one tensor per model output and the
        input rows, each concatenated over all batches along dimension 0.

    Raises:
        ValueError: if `testloader` yields no batches.
    """
    model.eval()
    # Run on the device the model lives on; fall back to the notebook-wide
    # DEVICE for models that expose no parameters.
    try:
        device = next(model.parameters()).device
    except (StopIteration, AttributeError):
        device = torch.device(DEVICE)

    inputs, outputs = [], []
    for batch in testloader:
        batch = batch.to(device)
        inputs.append(batch)
        outputs.append(model(batch, False))
    if not inputs:
        raise ValueError('testloader yielded no batches')

    testit = torch.cat(inputs, dim=0)
    # Regroup the per-batch outputs by head, then join each head's batches.
    testout = tuple(torch.cat(parts, dim=0) for parts in zip(*outputs))
    return testout, testit

testout, testit = get_preds(testloader, trainer_binEmb_allData.model)

In [67]:
# format submission columns
# `_test_df` already holds the `f_0` ids in loader order (the test loader does
# not shuffle). They are no longer read back from `testit`: that tensor is
# float32, which cannot represent every integer above 2**24 exactly, so large
# ids could come back altered.
_test_df = _test_df.copy()  # work on an independent frame, not a slice
_test_df['f_0'] = _test_df['f_0'].astype(int)
# Flatten the (N, 1) model outputs to one value per row.
_test_df['is_clicked'] = testout[0].cpu().detach().numpy().reshape(-1)
_test_df['is_installed'] = testout[1].cpu().detach().numpy().reshape(-1)

In [69]:
def get_sub(df):
    "Build the submission frame: RowId (renamed from f_0) plus both predictions"
    submission_cols = ['f_0', 'is_clicked', 'is_installed']
    return df[submission_cols].rename(columns={'f_0':'RowId'})

# save submission csv
# Name the file after the trainer that produced the predictions. The previous
# reference, `_trainer_pow_alldata`, is not defined anywhere in this notebook
# and fails on a fresh kernel.
sub_path = os.path.join('./sub', trainer_binEmb_allData.exp_name + '10epochs' + '.csv')
os.makedirs('./sub', exist_ok=True)
get_sub(_test_df).to_csv(sub_path, sep='\t', index=False)