## Get Data

In [1]:
from IPython.display import clear_output
from google.colab import files
files.upload()

!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

!kaggle datasets download -d jackstapleton/petfinder-pf-cc-ua-dataset

!mkdir ~/.data
!unzip -q petfinder-pf-cc-ua-dataset.zip -d /.data

clear_output()

In [2]:
from google.colab import drive
drive.mount("/content/gdrive")

clear_output()

## Library Imports

In [1]:
import os
import re
import pickle
import random as r
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn, optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader as DL
from torch.nn.utils import weight_norm as WN
from torchvision import models, transforms

from time import time
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

## Constants and Utilities

In [2]:
SEED = 0
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PATH = "/.data"
verbose = False

sc_y = StandardScaler()

In [3]:
def breaker(num=50, char="*") -> None:
    print("\n" + num*char + "\n")


def get_targets() -> np.ndarray:
    df = pd.read_csv("/content/gdrive/My Drive/train.csv", engine="python")
    targets = df["Pawpularity"].copy().values
    return targets.reshape(-1, 1)


def show_graphs(L: list, title=None) -> None:
    TL, VL = [], []
    for i in range(len(L)):
        TL.append(L[i]["train"])
        VL.append(L[i]["valid"])
    x_Axis = np.arange(1, len(L) + 1)
    plt.figure()
    plt.plot(x_Axis, TL, "r", label="train")
    plt.plot(x_Axis, VL, "b", label="valid")
    plt.grid()
    plt.legend()
    if title:
        plt.title("{} Loss".format(title))
    else:
        plt.title("Loss")
    plt.show()

## Dataset Template and Build Dataloader

In [4]:
class DS(Dataset):
    def __init__(self, features=None, targets=None):
        self.features = features
        self.targets  = targets
        
    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self, idx):
        return torch.FloatTensor(self.features[idx]), torch.FloatTensor(self.targets[idx])
    

def build_dataloaders(tr_features: np.ndarray, va_features: np.ndarray,
                      tr_targets: np.ndarray, va_targets: np.ndarray,
                      batch_size: int, seed: int):

    if verbose:
        breaker()
        print("Building Train and Validation DataLoaders ...")
    
    tr_data_setup = DS(features=tr_features, targets=tr_targets)
    va_data_setup = DS(features=va_features, targets=va_targets)
    
    dataloaders = {
        "train" : DL(tr_data_setup, batch_size=batch_size, shuffle=True, generator=torch.manual_seed(seed)),
        "valid" : DL(va_data_setup, batch_size=batch_size, shuffle=False)
    }
    
    return dataloaders

## Build Model

In [5]:
def build_model(IL: int, seed: int):
    class ANN(nn.Module):
        def __init__(self, IL=None):
            super(ANN, self).__init__()

            self.predictor = nn.Sequential()
            self.predictor.add_module("BN", nn.BatchNorm1d(num_features=IL, eps=1e-5))
            self.predictor.add_module("FC", WN(nn.Linear(in_features=IL, out_features=1)))

        def get_optimizer(self, lr=1e-3, wd=0):
            params = [p for p in self.parameters() if p.requires_grad]
            return optim.Adam(params, lr=lr, weight_decay=wd)

        def get_plateau_scheduler(self, optimizer=None, patience=5, eps=1e-8):
            return optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=patience, eps=eps, verbose=True)

        def forward(self, x1, x2=None):
            if x2 is not None:
                return self.predictor(x1), self.predictor(x2)
            else:
                return self.predictor(x1)
    
    if verbose:
        breaker()
        print("Building Model ...")
        print("\n{} -> 1".format(IL))
    
    torch.manual_seed(seed)
    model = ANN(IL=IL)
    
    return model

## Fit and Predict Helpers

In [6]:
def fit(model=None, optimizer=None, scheduler=None, 
        epochs=None, early_stopping_patience=None,
        dataloaders=None, fold=None, seed=None, verbose=False) -> tuple:
    
    name = "./Seed_{}_Fold_{}_state.pt".format(seed, fold)
    
    if verbose:
        breaker()
        print("Training Fold {}...".format(fold))
        breaker()
    # else:
    #    print("Training Fold {}...".format(fold))

    Losses = []
    bestLoss = {"train" : np.inf, "valid" : np.inf}

    start_time = time()
    for e in range(epochs):
        e_st = time()
        epochLoss = {"train" : np.inf, "valid" : np.inf}

        for phase in ["train", "valid"]:
            if phase == "train":
                model.train()
            else:
                model.eval()
            
            lossPerPass = []

            for X, y in dataloaders[phase]:
                X, y = X.to(DEVICE), y.to(DEVICE)

                optimizer.zero_grad()
                with torch.set_grad_enabled(phase == "train"):
                    output = model(X)
                    loss = torch.nn.MSELoss()(output, y)
                    if phase == "train":
                        loss.backward()
                        optimizer.step()
                lossPerPass.append(loss.item())
            epochLoss[phase] = np.mean(np.array(lossPerPass))
        Losses.append(epochLoss)

        if early_stopping_patience:
            if epochLoss["valid"] < bestLoss["valid"]:
                bestLoss = epochLoss
                BLE = e + 1
                torch.save({"model_state_dict": model.state_dict(),
                            "optim_state_dict": optimizer.state_dict()},
                           name)
                early_stopping_step = 0
            else:
                early_stopping_step += 1
                if early_stopping_step > early_stopping_patience:
                    if verbose:
                        print("\nEarly Stopping at Epoch {}".format(e))
                    break
        
        if epochLoss["valid"] < bestLoss["valid"]:
            bestLoss = epochLoss
            BLE = e + 1
            torch.save({"model_state_dict": model.state_dict(),
                        "optim_state_dict": optimizer.state_dict()},
                       name)
        
        if scheduler:
            scheduler.step(epochLoss["valid"])
        
        if verbose:
            print("Epoch: {} | Train Loss: {:.5f} | Valid Loss: {:.5f} | Time: {:.2f} seconds".format(e+1, epochLoss["train"], epochLoss["valid"], time()-e_st))
    
    if verbose:
        breaker()
        print("Best Validation Loss at Epoch {}".format(BLE))
        breaker()
        print("Time Taken [{} Epochs] : {:.2f} minutes".format(len(Losses), (time()-start_time)/60))
        breaker()
        print("Training Completed")
        breaker()

    return Losses, BLE, name

#####################################################################################################

def predict_batch(model=None, dataloader=None, mode="test", path=None) -> np.ndarray:    
    model.load_state_dict(torch.load(path, map_location=DEVICE)["model_state_dict"])
    model.to(DEVICE)
    model.eval()

    y_pred = torch.zeros(1, 1).to(DEVICE)
    if re.match(r"valid", mode, re.IGNORECASE):
        for X, _ in dataloader:
            X = X.to(DEVICE)
            with torch.no_grad():
                output = model(X)
            y_pred = torch.cat((y_pred, output.view(-1, 1)), dim=0)
    elif re.match(r"test", mode, re.IGNORECASE):
        for X in dataloader:
            X = X.to(DEVICE)
            with torch.no_grad():
                output = model(X)
            y_pred = torch.cat((y_pred, output.view(-1, 1)), dim=0)
    
    return y_pred[1:].detach().cpu().numpy()

## Train

In [7]:
def train(features: np.ndarray, targets: np.ndarray,
          n_splits: int, batch_size: int, lr: float, wd: float, 
          epochs: int, early_stopping: int, seed: int,
          patience=None, eps=None) -> list:        
    
    metrics = []
        
    KFold_start_time = time()
    if verbose:
        breaker()
        print("\tSeed: {}".format(seed))
        breaker()
        print("Performing {} Fold CV ...".format(n_splits))
    fold = 1
    for tr_idx, va_idx in KFold(n_splits=n_splits, shuffle=True, random_state=seed).split(features):

        tr_features, va_features = features[tr_idx], features[va_idx]
        tr_targets, va_targets   = targets[tr_idx], targets[va_idx]

        tr_targets = sc_y.fit_transform(tr_targets)
        va_targets = sc_y.transform(va_targets)

        dataloaders = build_dataloaders(tr_features, va_features,
                                        tr_targets, va_targets, 
                                        batch_size, seed)
        model = build_model(IL=tr_features.shape[1], seed=seed).to(DEVICE)
        optimizer = model.get_optimizer(lr=lr, wd=wd)
        scheduler = None
        if isinstance(patience, int) and isinstance(eps, float):
            scheduler = model.get_plateau_scheduler(optimizer, patience, eps)

        L, _, name = fit(model=model, optimizer=optimizer, scheduler=scheduler, 
                         epochs=epochs, early_stopping_patience=early_stopping,
                         dataloaders=dataloaders, fold=fold, seed=seed, verbose=verbose)
        y_pred = predict_batch(model=model, dataloader=dataloaders["valid"], mode="valid", path=name)
        RMSE = np.sqrt(mean_squared_error(sc_y.inverse_transform(y_pred), sc_y.inverse_transform(va_targets)))
        if verbose:
            print("Validation RMSE [Fold {}]: {:.5f}".format(fold, RMSE))
            breaker()
            show_graphs(L)
        
        metrics_dict = {"Seed" : seed, "Fold" : fold, "RMSE" : RMSE}
        metrics.append(metrics_dict)
        
        fold += 1
    
    if verbose:
        breaker()
        print("Total Time to {} Fold CV : {:.2f} minutes".format(n_splits, (time() - KFold_start_time)/60))
    
    return metrics, (time() - KFold_start_time)/60

## Main

In [8]:
def main():
    DEBUG = False

    r.seed(SEED)
    seeds = [r.randint(0, 99) for _ in range(20)]
    
    ########### Params ###########
    
    if DEBUG:
        n_splits = 3
        patience, eps = 5, 1e-8
        epochs, early_stopping = 5, 5
        batch_size = 32
        lr = 5e-4
        wd = 1e-1
        seeds = seeds[:2]
    else:
        n_splits = 10
        patience, eps = 5, 1e-8
        epochs, early_stopping = 100, 5
        batch_size = 32
        lr = 5e-4
        wd = 1e-1
    
    ##############################

    complete_metrics = []

    if verbose:
        breaker()
        print("Loading Data ...")
    
    features = np.load(os.path.join(PATH, "densenet169_features.npy"))
    targets  = get_targets()

    breaker()
    for seed in seeds:
        # Without Scheduler
        metrics, time_taken = train(features, targets, n_splits, batch_size, lr, wd, epochs, early_stopping, seed, patience=None, eps=None)
        
        # # With Plateau Scheduler
        # metrics = train(features, targets, n_splits, batch_size, lr, wd, epochs, early_stopping, seed, patience=patience, eps=eps)

        complete_metrics.append(metrics)

        if not verbose:
            print("Seed: {} -> {:.2f} minutes".format(seed, time_taken))

    if verbose:
        breaker()
        for i in range(len(complete_metrics)):
            for j in range(len(complete_metrics[i])):
                print(complete_metrics[i][j])
        breaker()
    
    rmse = []
    for i in range(len(complete_metrics)):
        for j in range(len(complete_metrics[i])):
            rmse.append(complete_metrics[i][j]["RMSE"])

    best_index = rmse.index(min(rmse))
    best_index_1 = best_index // n_splits
    best_index_2 = best_index % n_splits

    breaker()
    print("Best RMSE ({:.5f}) using Seed: {}".format(complete_metrics[best_index_1][best_index_2]["RMSE"],    
                                                     complete_metrics[best_index_1][best_index_2]["Seed"]))
    breaker()

    with open("complete_metrics.pkl", "wb") as fp:
        pickle.dump(complete_metrics, fp)

In [9]:
main()


**************************************************

Seed: 49 -> 1.39 minutes
Seed: 97 -> 1.46 minutes
Seed: 53 -> 1.50 minutes
Seed: 5 -> 1.42 minutes
Seed: 33 -> 1.44 minutes
Seed: 65 -> 1.47 minutes
Seed: 62 -> 1.44 minutes
Seed: 51 -> 1.47 minutes
Seed: 38 -> 1.30 minutes
Seed: 61 -> 1.45 minutes
Seed: 45 -> 1.41 minutes
Seed: 74 -> 1.43 minutes
Seed: 27 -> 1.54 minutes
Seed: 64 -> 1.39 minutes
Seed: 17 -> 1.40 minutes
Seed: 36 -> 1.44 minutes
Seed: 17 -> 1.40 minutes
Seed: 96 -> 1.49 minutes
Seed: 12 -> 1.44 minutes
Seed: 79 -> 1.40 minutes

**************************************************

Best RMSE (16.88328) using Seed: 49

**************************************************



In [14]:
if not verbose:
    with open("complete_metrics.pkl", "rb") as fp:
            params = pickle.load(fp)

    rmse = []
    for i in range(len(params)):
        for j in range(len(params[i])):
            rmse.append(params[i][j]["RMSE"])
    

    best_index = rmse.index(min(rmse))
    best_index_1 = best_index // 10
    best_index_2 = best_index % 10

    breaker()
    print("Params : {}".format(params[best_index_1][best_index_2]))
    breaker()


**************************************************

Params : {'Seed': 49, 'Fold': 8, 'RMSE': 16.883282329889802}

**************************************************

