In [1]:
import albumentations as A
import cv2
import pandas as pd
from pathlib import Path
import numpy as np
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from efficientnet_pytorch import EfficientNet
from catalyst import dl
from catalyst.dl import utils
from catalyst.callbacks.scheduler import SchedulerCallback
import catalyst.metrics.functional as CF
from catalyst.contrib.callbacks.neptune_logger import NeptuneLogger
from catalyst.contrib.nn.schedulers.onecycle import OneCycleLRWithWarmup
from catalyst.callbacks.metric import LoaderMetricCallback
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from catalyst.utils import set_global_seed
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
set_global_seed(12345786)

In [3]:
freq_map = {
    'BI-Monthly': 24/12,
    'Monthly': 12/12,
    'Quatrly': 4/12,
    'Half Yearly': 2/12
}

def bureausplit_dates(x):
    if x == '':
        return None
    if isinstance(x, float):
        return np.nan
    dates = x.split(',')
    if len(dates) == 1:
        return None
    else:
        d = dates[-2]
        return f"{d[:4]}-{d[4:6]}-{d[6:8]}"        
    

def correct_bureau_data(df):
    df["DISBURSED-DT2"] = pd.to_datetime(df["REPORTED DATE - HIST"].apply(bureausplit_dates), errors='ignore')
    return df

def get_min_date(x):
    loc = x["max_diff_loc"] + 1
    dates = str(x["REPORTED DATE - HIST"])
    if dates == 'nan':
        return "1970-01-01"
    else:
        dates = dates.split(",")[:-1][::-1]
        if len(dates) - 1 < loc:
            return dates[-1]
        min_date = dates[loc]
        return f"{min_date[:4]}-{min_date[4:6]}-{min_date[6:8]}"


def process_bal(x):
    if x == 'nan':
        return [0]
    xl = x.split(",")[:-1][::-1]
    prev_val = 0
    vals = []
    for val in xl:
        if val == '':
            val = prev_val
            vals.append(val)
        else:
            val = int(val)
            vals.append(val)
            prev_val = val
    return np.array(vals)


def get_loan_frequency(bur):
    mapping = {
        "Monthly": 12/12,
        "Quarterly": 3/12,
        "Weekly": 12*4.33/12,
        "BiWeekly": 2.16,
        "Other": -1,
        "BiMonthly": 2/12,
        "Annually": 1/12,
        "Semi annually": 6/12,
        "Single Payment Loan (bullet": 0
    }
    bur["frequency"] = bur["INSTALLMENT-AMT"].str.split("/").str.get(1).map(mapping)
    bur["installment"] = bur["INSTALLMENT-AMT"].str.split("/").str.get(0).str.replace(",", "").astype(float)
    bur["installment_pm"] = bur["installment"] * bur["frequency"]

    bur["interest"] = bur["TENURE"] * bur["installment"] * bur["frequency"]/ bur["DISBURSED-AMT/HIGH CREDIT"] 
    return bur


def read_files(data_path):
    train = pd.read_csv(Path(DATA_FOLDER) / Files.train_csv, thousands=",")
    test = pd.read_csv(Path(DATA_FOLDER) / Files.test_csv, thousands=",")
    
    train["DisbursalDate"] = pd.to_datetime(train["DisbursalDate"], errors='coerce')
    test["DisbursalDate"] = pd.to_datetime(test["DisbursalDate"], errors='coerce')

    train["MaturityDAte"] = pd.to_datetime(train["MaturityDAte"], errors='coerce')
    test["MaturityDAte"] = pd.to_datetime(test["MaturityDAte"], errors='coerce')

    train["AuthDate"] = pd.to_datetime(train["AuthDate"], errors='coerce')
    test["AuthDate"] = pd.to_datetime(test["AuthDate"], errors='coerce')

    train["Frequency"] = train["Frequency"].map(freq_map)
    test["Frequency"] = test["Frequency"].map(freq_map)

    train["days_to_enddate"] = (pd.to_datetime("2020-12-01") - train["DisbursalDate"]).dt.days
    test["days_to_enddate"] = (pd.to_datetime("2020-12-01") - test["DisbursalDate"]).dt.days

    train["days_from_start"] =  (train["DisbursalDate"] - pd.to_datetime("2010-12-01")).dt.days
    test["days_from_start"] = (test["DisbursalDate"] - pd.to_datetime("2010-12-01")).dt.days

    train_bureau = pd.read_csv(Path(DATA_FOLDER) / Files.train_bureau_csv, thousands=",")
    test_bureau = pd.read_csv(Path(DATA_FOLDER) / Files.test_bureau_csv, thousands=",")
    
    data = pd.concat([train, test])
    
    bureau = pd.concat([train_bureau, test_bureau]).reset_index(drop=True)
    bureau["DISBURSED-DT"] = pd.to_datetime(bureau["DISBURSED-DT"])
    # bureau = bureau.drop_duplicates(subset=["ID", "DISBURSED-DT"])
    bureau = bureau.sort_values(by=["ID", "DISBURSED-DT"])
    bureau = pd.merge(bureau, data[["ID", "DisbursalDate", "Tenure", "DisbursalAmount", "AssetCost", "EMI", "MonthlyIncome", "MaturityDAte", "ZiPCODE", "Area", "State"]], on="ID", how="left")
    bureau = correct_bureau_data(bureau)
    bureau.loc[bureau["DISBURSED-DT"].isnull(), "DISBURSED-DT"] = pd.to_datetime(bureau.loc[bureau["DISBURSED-DT"].isnull(), "DISBURSED-DT2"])
    bureau["post_days"] = (bureau["DISBURSED-DT"] - bureau["DisbursalDate"]).dt.days
    bureau["close_days"] = (pd.to_datetime(bureau["CLOSE-DT"], errors='coerce') - pd.to_datetime(bureau["DisbursalDate"])).dt.days
    bureau["days_to_enddate"] = (pd.to_datetime("2020-12-01") - bureau["DISBURSED-DT"]).dt.days
    bureau["post_end_days_diff"] = bureau["days_to_enddate"] - bureau["post_days"]
    bureau["post_days_diff"] = bureau.groupby("ID")["post_days"].transform(lambda x: x.duplicated(keep=False))
    bureau["cur_bal_int"] = bureau["CUR BAL - HIST"].astype(str).apply(process_bal)  # str.split(',').apply(lambda x: [int(val) if val != '' else np.nan for w, val in enumerate(x)])
    bureau["max_diff"] = bureau["cur_bal_int"].apply(lambda x:  np.diff(x).max() if len(x) > 1 else 0)
    bureau["max_diff_loc"] = bureau["cur_bal_int"].apply(lambda x:  np.argmax(np.diff(x)) if len(x) > 1 else 0)
    bureau["max_diff_date"] = pd.to_datetime(bureau[["REPORTED DATE - HIST", "max_diff_loc"]].apply(get_min_date, axis=1), errors="coerce")
    bureau["days_to_max_diff"] = (bureau["max_diff_date"] - bureau["DisbursalDate"]).dt.days
    bureau = get_loan_frequency(bureau)

    train["ACCT-TYPE"] = train.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["ACCT-TYPE"].first())
    test["ACCT-TYPE"] = test.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["ACCT-TYPE"].first())

    train["ownership"] = train.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["OWNERSHIP-IND"].first())
    test["ownership"] = test.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["OWNERSHIP-IND"].first())

    train["status"] = train.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["ACCOUNT-STATUS"].first())
    test["status"] = test.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["ACCOUNT-STATUS"].first())
    
    train["disbursed_bur"] = train.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["DISBURSED-AMT/HIGH CREDIT"].first())
    test["disbursed_bur"] = test.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["DISBURSED-AMT/HIGH CREDIT"].first())

    train["CLOSE-DT"] = pd.to_datetime(train.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["CLOSE-DT"].first()), errors='coerce')
    test["CLOSE-DT"] = pd.to_datetime(test.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["CLOSE-DT"].first()), errors='coerce')

    train["DATE-REPORTED"] = pd.to_datetime(train.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["DATE-REPORTED"].first()), errors='coerce')
    test["DATE-REPORTED"] = pd.to_datetime(test.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["DATE-REPORTED"].first()), errors='coerce')

    train["DISBURSED-DT"] = pd.to_datetime(train.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["DISBURSED-DT"].first()), errors='coerce')
    test["DISBURSED-DT"] = pd.to_datetime(test.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["DISBURSED-DT"].first()), errors='coerce')

    train["max_diff"] = train.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["max_diff"].first())
    test["max_diff"] = test.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["max_diff"].first())

    train["days_to_max_diff"] = train.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["days_to_max_diff"].first())
    test["days_to_max_diff"] = test.ID.map(bureau.loc[(bureau["SELF-INDICATOR"] == True) & (bureau["post_days"] >= 0)].groupby("ID")["days_to_max_diff"].first())

    return train, test, bureau

In [4]:
DATA_FOLDER = "data"


class Files:
    """Easy retrieval of file paths."""
    train = "Train/train_Data.xlsx"
    test = "test_Data.xlsx"
    data_dict = "Train/data_dict.xlsx"
    train_csv = "train_Data.csv"
    test_csv = "test_Data.csv"
    train_bureau = "Train/train_bureau.xlsx"
    test_bureau = "test_bureau.xlsx"
    train_bureau_csv = "train_bureau.csv"
    test_bureau_csv = "test_bureau.csv"


TargetMap = {
    "No Top-up Service": 6,
    " > 48 Months": 5,
    "36-48 Months": 4,
    "30-36 Months": 3,
    "24-30 Months": 2,
    "18-24 Months": 1,
    "12-18 Months": 0
}

TargetRevMap = {v: k for k, v in TargetMap.items()}

#train, test, bureau = read_files("data")
# train.to_parquet("data/train_v1.pq",)
# test.to_parquet("data/test_v1.pq",)
# bureau.to_parquet("data/bureau_v1.pq",)

train = pd.read_parquet("data/train_v1.pq")
test = pd.read_parquet("data/test_v1.pq")
bureau = pd.read_parquet("data/bureau_v1.pq")

In [5]:
def more_cat_features(df):
    df["close_date_diff"] = (df["CLOSE-DT"] - df["MaturityDAte"]).dt.days
    df["close_disburse_diff"] = (df["CLOSE-DT"] - df["DisbursalDate"]).dt.days
    df["reported_maturity_diff"] = (df["DATE-REPORTED"] - df["MaturityDAte"]).dt.days
    return df

train = more_cat_features(train)
test = more_cat_features(test)

In [6]:
from sklearn.preprocessing import QuantileTransformer
def prepare_bureau_data(bur_df, max_len=15):
    bur_df = bur_df.copy()
    cat_cols = ["ACCT-TYPE", "OWNERSHIP-IND", "CONTRIBUTOR-TYPE", "ACCOUNT-STATUS"]
    cont_cols = ["DISBURSED-AMT/HIGH CREDIT", "post_days", "close_days", "TENURE", "CURRENT-BAL"]
    cat_maps = {col: {val: i+1 for i, val in enumerate(bur_df[col].unique())} for col in cat_cols}
    cat_levels = {cat: len(cat_d) for cat, cat_d in cat_maps.items()}
    
    medians = bur_df[cont_cols].median()
    bur_qnt = QuantileTransformer(output_distribution="normal")
    bur_qnt.fit(bur_df[cont_cols].fillna(medians))
    for col in cat_cols:
        bur_df[f"{col}_enc"] = bur_df[col].map(cat_maps[col])
    
    bur_cats = []
    bur_conts = []
    bur_df = bur_df.loc[bur_df.post_days > -732]
    bur_ids = set(bur_df.ID.unique())
    for bid in tqdm(range(bur_df.ID.max())):
        if (bid in bur_ids):
            sub_df = bur_df.loc[bur_df.ID == bid]
            cat_encs = []
            for col in cat_cols:
                enc = sub_df[f"{col}_enc"].values
                cat_encs.append(enc)
            cat_encs = pad_sequences(np.vstack(cat_encs), max_len, 'int64', 'post', 'pre', 0)
            bur_cats.append(cat_encs)
            conts = pad_sequences(bur_qnt.transform(sub_df[cont_cols].fillna(medians)).T, max_len, 'float32', 'post', 'pre', 0)/2.0
            bur_conts.append(conts)
        else:
            cat_encs = np.zeros(shape=(len(cat_cols), max_len), dtype='int64')
            bur_cats.append(cat_encs)
            conts = np.zeros(shape=(len(cont_cols), max_len), dtype='float32')
            bur_conts.append(conts)
        
    return np.moveaxis(np.moveaxis(np.dstack(bur_cats), 2, 0), 1, 2), np.moveaxis(np.moveaxis(np.dstack(bur_conts), 2, 0), 1, 2)

In [7]:
from sklearn.preprocessing import PowerTransformer
def prepare_data_tr_te(train, test):
    cont_features = ['Tenure', 'AssetCost', 'AmountFinance', 'DisbursalAmount', 'EMI', 'MonthlyIncome', 'AGE', 'LTV',
                            "Frequency", "days_to_enddate", 'disbursed_bur', 'days_from_start', 'close_date_diff', 'close_disburse_diff',
                     'reported_maturity_diff']
    cat_features = ["InstlmentMode", "LoanStatus", "PaymentMode", "SEX", "State", "Area", "ManufacturerID", "BranchID", "SupplierID"]
    
    data = pd.concat([train, test]).reset_index(drop=True)
    medians = data[cont_features].median()
    cont_enc = PowerTransformer()
    data[cont_features] = cont_enc.fit_transform(data[cont_features].fillna(medians))/2.0

    cat_maps = {col: {val: i+1 for i, val in enumerate(data[col].astype(str).unique())} for col in cat_features}
    for col in cat_features:
        data[col] = data[col].astype(str).map(cat_maps[col])
    data = data.set_index("ID")
    data.to_parquet("data/data_prepared.pq")

In [8]:
#bureau_cats, bureau_conts = prepare_bureau_data(bureau, 15)

In [9]:
#np.save("data/bureau_cats.npy", bureau_cats)
#np.save("data/bureau_conts.npy", bureau_conts)

In [10]:
#prepare_data_tr_te(train, test)

In [11]:
def ordinal_encode(x):
    if (x["Top-up Month"] == 'No Top-up Service'):
        return [0, 0, 0, 0, 0, 0]
    elif x["Top-up Month"] == '12-18 Months':
        return [1, 0, 0, 0, 0, 0]
    elif x["Top-up Month"] == '18-24 Months':
        return [1, 1, 0, 0, 0, 0]
    elif x["Top-up Month"] == '24-30 Months':
        return [1, 1, 1, 0, 0, 0]
    elif x["Top-up Month"] == '30-36 Months':
        return [1, 1, 1, 1, 0, 0]
    elif x["Top-up Month"] == '36-48 Months':
        return [1, 1, 1, 1, 1, 0]
    elif x["Top-up Month"] == ' > 48 Months':
        return [1, 1, 1, 1 ,1, 1]
    else:
        return [0, 0, 0, 0, 0, 0]

class LoanData(Dataset):
    def __init__(self, df, cont_features, cat_features, training=True, maxlen=20):
        self.df = df
        self.cont_features = cont_features
        self.cat_features = cat_features
        self.training = training
        self.maxlen = maxlen
        self.y = None
        self.data = None
        self.prepare_data()
        
    def prepare_data(self):
        self.ids = self.df.ID.values
        self.data = pd.read_parquet("data/data_prepared.pq")
        if self.training:
            self.y = np.vstack(self.df.apply(ordinal_encode, axis=1)).astype(int)
        self.bur_cats = np.load("data/bureau_cats.npy")
        self.bur_conts = np.load("data/bureau_conts.npy")
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        ID = self.ids[idx]
        bur_cat, bur_cont = self.bur_cats[ID], self.bur_conts[ID]
        if self.training:
            return {
                "x_cat": {cat: self.data.loc[ID, cat] for i, cat in enumerate(self.cat_features)},
                "x_cont": self.data.loc[ID, self.cont_features].astype(np.float32).values,
                "bur_cat": bur_cat,
                "bur_cont": bur_cont.astype(np.float32)
            }, self.y[idx].astype(np.float32)
        else:
            return {
                "x_cat": {cat: self.data.loc[ID, cat] for i, cat in enumerate(self.cat_features)},
                "x_cont": self.data.loc[ID, self.cont_features].astype(np.float32).values,
                    "bur_cat": bur_cat,
                    "bur_cont": bur_cont.astype(np.float32)
                   },
        None

In [12]:
from sklearn.preprocessing import QuantileTransformer, OrdinalEncoder, PowerTransformer
from sklearn.model_selection import KFold

cvlist = list(KFold(5, shuffle=True, random_state=123995786).split(train))
cat_features = ["InstlmentMode", "LoanStatus", "PaymentMode", "SEX", "State", "Area", "ManufacturerID", "BranchID", "SupplierID"]


In [13]:
tr, vl = train.iloc[cvlist[0][0]], train.iloc[cvlist[0][1]]

In [14]:
tr.shape, vl.shape

((102924, 40), (25731, 40))

In [15]:
cont_features = ['Tenure', 'AssetCost', 'AmountFinance', 'DisbursalAmount', 'EMI', 'MonthlyIncome', 'AGE', 'LTV',
                        "Frequency", "days_to_enddate", 'disbursed_bur', 'days_from_start', 'close_date_diff', 'close_disburse_diff',
                 'reported_maturity_diff']
cat_features = ["InstlmentMode", "LoanStatus", "PaymentMode", "SEX", "State", "Area", "ManufacturerID", "BranchID", "SupplierID"]

len(cat_features), len(cont_features)

(9, 15)

In [16]:
class LoanModel(nn.Module):
    def __init__(self, hparams, device):
        super(LoanModel, self).__init__()
        self.hparams = hparams
        # [2, 2, 11, 3, 22, 93, 10, 10, 5]
        embeds = [8, 8, 8, 8, 16, 32, 16, 128, 128]
        embeds2 = [100, 16, 16, 16]
        bur_cont_dim = 512
        cont_dim = 1024
        max_len = 15
        gru_dim = 128
        compress_dim=4096
        bidirectional=True
        self.install_embed = nn.Embedding(4, embeds[0])
        self.status1_embed = nn.Embedding(4, embeds[1])
        self.payment_embed = nn.Embedding(13, embeds[2])
        self.sex_embed = nn.Embedding(4, embeds[3])
        self.state_embed = nn.Embedding(24, embeds[4])
        self.area_embed = nn.Embedding(95, embeds[5], max_norm=1)
        self.manu_embed = nn.Embedding(12, embeds[6])
        self.branch_embed = nn.Embedding(192, embeds[7])
        self.supp_embed = nn.Embedding(4542, embeds[8])
    
        self.acct_embed = nn.Embedding(52, embeds2[0])
        self.own_embed = nn.Embedding(6, embeds2[1])
        self.con_embed = nn.Embedding(13, embeds2[2])
        self.status2_embed = nn.Embedding(12, embeds2[3])


        self.cont_fc = nn.Linear(hparams["num_cont_features"], cont_dim)
        self.bur_fc = nn.Linear(5, bur_cont_dim)
        self.num_features = cont_dim + sum(embeds[:-2])
        self.tanh = nn.Tanh()
        self.attn = nn.Linear(self.num_features, self.num_features)
        self.compress = nn.Linear(max_len*(gru_dim * (1 + bidirectional)), compress_dim)
        self.attn2 = nn.Linear(compress_dim, compress_dim)
        self.fc1 = nn.Linear(self.num_features + compress_dim, 1024)
        self.fc2 = nn.Linear(1024, 2048)
        self.fc3 = nn.Linear(1024, 512)
        self.fc4 = nn.Linear(2048, 6)
        # self.gru = nn.Tra
        self.gru = nn.GRU(sum(embeds2) + bur_cont_dim, gru_dim, bidirectional=bidirectional, num_layers=2, dropout=0.0)
        self.relu = nn.GELU()
        self.drop = nn.Dropout(0.0)
        self.bnorm1 = nn.BatchNorm1d(5)
        self.bnorm2 = nn.BatchNorm1d(hparams["num_cont_features"])
        self.batch_norm = nn.BatchNorm1d(self.num_features + compress_dim)
        self.batch_norm2 = nn.BatchNorm1d(2048)

        
    def forward(self, x):
        cats = x["x_cat"]
        cat_layers = []
        c1 = self.install_embed(cats["InstlmentMode"])
        c2 = self.status1_embed(cats["LoanStatus"])
        c3 = self.payment_embed(cats["PaymentMode"])
        c4 = self.sex_embed(cats["SEX"])
        c5 = self.state_embed(cats["State"])
        c6 = self.area_embed(cats["Area"])
        c7 = self.manu_embed(cats["ManufacturerID"])
        c8 = self.branch_embed(cats["BranchID"])
        c9 = self.supp_embed(cats["SupplierID"])
        c12 = self.acct_embed(x["bur_cat"][:, :, 0])
        c13 = self.own_embed(x["bur_cat"][:, :, 1])
        c14 = self.con_embed(x["bur_cat"][:, :, 2])
        c15 = self.status2_embed(x["bur_cat"][:, :, 3])
        c16 = self.bur_fc(self.bnorm1(x["bur_cont"].permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous())
        bur_out = torch.cat((c12, c13, c14, c15, c16), -1)
        b1, _ = self.gru(bur_out)
        bs = b1.size()[0]
        b1 = b1.view(bs, -1).contiguous()
        b1 = self.compress(b1)
        b1 = b1 + self.tanh(b1) * self.attn2(b1)
        conts = x["x_cont"]
        conts = self.cont_fc(self.bnorm2(conts))
        out = torch.cat((c1, c2, c3, c4, c5, c6, c7, conts), -1)
        out1 = self.tanh(out)
        out2 = self.attn(out)
        out = out + out1 * out2
        out = torch.cat((out, b1), -1)
#        out = self.batch_norm(self.relu(out))
        out = self.relu(out)
        out = self.relu(self.fc1(out))
        out = self.drop(self.relu(self.fc2(out)))
        #out = self.drop(self.relu(self.fc3(out)))
        out = self.fc4(out)
        return out
        

In [17]:
def predict(model, loader, cuda=True):
    if cuda:
        model.cuda()
    model.eval()
    preds = []
    images = []
    with torch.no_grad():
        for batch in tqdm(loader):
            x, y = batch
            if cuda:
                outputs = nn.Sigmoid()(model(x))
                preds.append(outputs.cpu().numpy())

            else:
                outputs = nn.Sigmoid()(model.cpu()(x))
                preds.append(outputs.numpy())
        preds = np.concatenate(preds)
    return preds 


def train_folds(train, test, cvlist, hparams, device, logger, cont_features, cat_features):
    val_preds = []
    test_preds = []
    for tr_idx, vl_idx in cvlist:
        tr, vl = train.iloc[tr_idx], train.iloc[vl_idx]
        tr_ds = LoanData(tr, cont_features, cat_features, training=True)
        tr_dl = DataLoader(tr_ds, shuffle=True, drop_last=False, batch_size=hparams["batch_size"], num_workers=8)

        vl_ds = LoanData(vl, cont_features, cat_features, training=True)
        vl_dl = DataLoader(vl_ds, shuffle=False, drop_last=False, batch_size=hparams["batch_size"], num_workers=8)

        te_ds = LoanData(test, cont_features, cat_features, training=False)
        te_dl = DataLoader(te_ds, shuffle=False, drop_last=False, batch_size=hparams["batch_size"], num_workers=8)

        hparams["num_cont_features"] = len(cont_features)    
        # hparams["cat_features"] = [(cat, len(cat_cats[i])+1, 8) for i, cat in enumerate(cat_features)]

        model = LoanModel(hparams, device)
        if hparams.get("optimizer", "adam") == "adam":
            optimizer = torch.optim.Adam(
                model.parameters(), lr=hparams.get("lr", 1e-3), weight_decay=hparams.get("wd", 0), amsgrad=True,
            )
        else:
            optimizer = torch.optim.SGD(
                model.parameters(),
                lr=hparams.get("lr", 1e-2),
                weight_decay=hparams.get("wd", 0),
                momentum=0.9,
                nesterov=True,
            )
        if hparams.get("scheduler", "reducelrplateau") == "reducelrplateau":
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True, factor=0.5, min_lr=1e-6)
        if hparams.get("scheduler", "reducelrplateau") == "onecycle":
            total_steps = hparams.get("num_epochs") * int(np.ceil(len(tr) / hparams.get("batch_size")))
            max_lr = hparams.get("lr", 1e-3)
            scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr, total_steps)
        if hparams.get("scheduler", "reducelrplateau") == "steplr":
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 3, 0.1)

        logdir = Path("test") / f"fold_0"
        logdir.mkdir(exist_ok=True, parents=True)

        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(1.0), reduction='mean')

        runner = dl.SupervisedRunner(device=device)
        runner.train(
            loaders={"train": tr_dl, "valid": vl_dl},
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            num_epochs=hparams.get("num_epochs", 10),
            logdir=logdir,
            verbose=True,
            callbacks=[logger, SchedulerCallback(mode="epoch"), LoaderMetricCallback("f1", ordinal_f1_score)],
            load_best_on_end=True,
            main_metric='f1',
            minimize_metric=True
        )
        vpreds = predict(model, vl_dl, False)
        tpreds = predict(model, te_dl, False)
        val_preds.append(vpreds)
        test_preds.append(tpreds)
    return model, val_preds, test_preds

In [18]:
BATCH_SIZE = 10000
hparams = {
    "scheduler": "reducelrplateau",
    "batch_size": BATCH_SIZE,
    "optimizer": "adam",
    "num_epochs": 20,
    "lr": 5e-4,
}

In [19]:
from catalyst.metrics import fbeta_score
def ordinal_f1_score(
    outputs: torch.Tensor,
    targets: torch.Tensor,
    eps: float = 1e-7,
    argmax_dim: int = -1,
    num_classes = None,
):
    """
    Fbeta_score with beta=1.

    Args:
        outputs: A list of predicted elements
        targets:  A list of elements that are to be predicted
        eps: epsilon to avoid zero division
        argmax_dim: int, that specifies dimension for argmax transformation
            in case of scores/probabilities in ``outputs``
        num_classes: int, that specifies number of classes if it known

    Returns:
        float: F_1 score
    """
    targets = targets.sum(axis=-1)
    ordinal_preds = []
    outputs = nn.Sigmoid()(outputs)
    n = 7
    for i in range(n):
        if i == 0:
            ordinal_preds.append(1 - outputs[:, i])
        elif i == n - 1:
            ordinal_preds.append(outputs[:, i-1])
        else:
            ordinal_preds.append(outputs[:, i-1] - outputs[:, i])
    outputs = torch.stack(ordinal_preds, -1)
    score = fbeta_score(
        outputs=outputs,
        targets=targets,
        beta=1,
        eps=eps,
        argmax_dim=argmax_dim,
        num_classes=num_classes,
    )

    return -1.0 * score.mean().item()

In [None]:
device = utils.get_device()
neptune_logger = NeptuneLogger(
    api_token=os.environ["NEPTUNE_API_TOKEN"],
    project_name="tezdhar/wind-speed",
    name="wind_speed",
    params=hparams,
    tags=[f"fold_0"],
    upload_source_files=["nn_model1.ipynb"],
)

trained_model, val_preds, test_preds = train_folds(train, test, cvlist, hparams, device, neptune_logger, cont_features, cat_features)

https://ui.neptune.ai/tezdhar/wind-speed/e/WIN-442
1/20 * Epoch (train): 100% 11/11 [00:34<00:00,  3.14s/it, loss=0.348]
1/20 * Epoch (valid): 100% 3/3 [00:11<00:00,  3.84s/it, loss=0.380]
[2021-02-07 16:34:20,738] 
1/20 * Epoch 1 (_base): lr=0.0005 | momentum=0.9000
1/20 * Epoch 1 (train): f1=-1.299e-01 | loss=0.5083
1/20 * Epoch 1 (valid): f1=-1.293e-01 | loss=0.3595
2/20 * Epoch (train): 100% 11/11 [00:33<00:00,  3.01s/it, loss=0.326]
2/20 * Epoch (valid): 100% 3/3 [00:10<00:00,  3.64s/it, loss=0.350]
[2021-02-07 16:35:15,509] 
2/20 * Epoch 2 (_base): lr=0.0005 | momentum=0.9000
2/20 * Epoch 2 (train): f1=-1.517e-01 | loss=0.3364
2/20 * Epoch 2 (valid): f1=-1.721e-01 | loss=0.3351
3/20 * Epoch (train):   0% 0/11 [00:00<?, ?it/s]

In [None]:
np.save("data/val_preds_nn.npy", val_preds)
np.save("data/test_preds_nn.npy", test_preds)

In [None]:
from sklearn.metrics import f1_score
def process_preds(y_pred):
    ordinal_preds = []
    n = 7
    for i in range(n):
        if i == 0:
            ordinal_preds.append(1 - y_pred[:, i])
        elif i == n - 1:
            ordinal_preds.append(y_pred[:, i-1])
        else:
            ordinal_preds.append(y_pred[:, i-1] - y_pred[:, i])
    ordinal_preds = np.argmax(np.vstack(ordinal_preds).T, 1)
    return ordinal_preds

def ordinal_f1_score(y_true, y_pred, thresh=[0.9]):
    y_true = np.sum(y_true, axis=1)
    ordinal_preds = process_preds(y_pred)
    print(sum(ordinal_preds == 7))
    y_true[y_true == 7] = 0
    ordinal_preds[ordinal_preds == 7] = 0
    print(y_true, ordinal_preds)
    return f1_score(y_true, ordinal_preds, average='macro')



In [None]:
# val_preds = predict(trained_model, vl_dl, cuda=False)

In [None]:
y_vl = np.vstack(vl.apply(ordinal_encode, axis=1))
ordinal_f1_score(y_vl, val_preds)