In [None]:
# |default_exp utils
# |default_cls_lvl 3

In [None]:
#|hide
%reload_ext autoreload
%autoreload 2

# utils

> Competition utilities

In [None]:
# |export
from __future__ import annotations

import abc, datetime, random, os
from pathlib import Path

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from fastai.losses import CrossEntropyLossFlat
from fastai.test_utils import show_install
from sklearn.metrics import mean_squared_error

from fastai.metrics import rmse, AccumMetric
from fastcore.test import *


In [None]:
# | hide
import pdb

from IPython.display import display

# from fastcore.test import *
from nbdev.showdoc import show_doc


In [None]:
# |hide
# |cuda
# Index of the CUDA device to use, read from the USE_GPU environment variable (defaults to 0)
gpu_num = int(os.getenv("USE_GPU", 0))

# Make that device the current CUDA device, then report its index and name
torch.cuda.set_device(gpu_num)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")


Using GPU #0: GeForce GTX 1660 Ti with Max-Q Design


## Defaults

Application-wide defaults go here.

In [None]:
# | export
# Random seed, read from the RANDOM_SEED environment variable (falls back to 42)
default_seed = int(os.getenv("RANDOM_SEED", 42))
# Kaggle competition slug, read from KAGGLE_COMP; used to build the Kaggle input path and to download the data
kaggle_comp = os.getenv("KAGGLE_COMP", "feedback-prize-english-language-learning")


## Development environment

Information about where your code is running and your compute capabilities

In [None]:
# | export
def detect_env():
    """Work out which environment the code is running in.

    Returns one of:
      - "kaggle":   the `KAGGLE_KERNEL_RUN_TYPE` environment variable is set to a non-empty value
      - "colab":    a `/content` directory exists
      - "local_nb": an `nbs` directory exists one or two levels above the working directory
      - "script":   none of the above matched
    """
    if os.environ.get("KAGGLE_KERNEL_RUN_TYPE", False):
        return "kaggle"

    if os.path.isdir("/content"):
        return "colab"

    # Checked relative to the current working directory
    notebook_dirs = ("../nbs", "../../nbs")
    if any(os.path.isdir(candidate) for candidate in notebook_dirs):
        return "local_nb"

    return "script"


# Resolved once at import time; the path helpers in this module read this global
run_env = detect_env()


In [None]:
# |eval: false
print(run_env)


local_nb


In [None]:
# | export
def print_dev_environment():
    """Provides details on your development environment including packages installed, cuda/cudnn availability, GPUs, etc."""
    report = show_install()
    # NOTE(review): fastai's `show_install` appears to print its report itself and return
    # nothing, so echoing its return value unconditionally appends a stray "None".
    # Only print the return value when there actually is one.
    if report is not None:
        print(report)


In [None]:
# |output: false
print_dev_environment()




```text
=== Software === 
python        : 3.9.13
fastai        : 2.7.9
fastcore      : 1.5.26
fastprogress  : 1.0.3
torch         : 1.12.1+cu102
nvidia driver : 455.32
torch cuda    : 10.2 / is available
torch cudnn   : 7605 / is enabled

=== Hardware === 
nvidia gpus   : 1
torch devices : 1
  - gpu0      : GeForce GTX 1660 Ti with Max-Q Design

=== Environment === 
platform      : Linux-5.4.0-42-generic-x86_64-with-glibc2.31
distro        : #46-Ubuntu SMP Fri Jul 10 00:24:02 UTC 2020
conda env     : kaggle_feedback_ell
python        : /home/mhenze/miniconda3/envs/kaggle_feedback_ell/bin/python
sys.path      : /home/mhenze/kaggle_feedback_ell/nbs
/home/mhenze/miniconda3/envs/kaggle_feedback_ell/lib/python39.zip
/home/mhenze/miniconda3/envs/kaggle_feedback_ell/lib/python3.9
/home/mhenze/miniconda3/envs/kaggle_feedback_ell/lib/python3.9/lib-dynload

/home/mhenze/miniconda3/envs/kaggle_feedback_ell/lib/python3.9/site-packages
```

Please make sure to include opening/closing ``` when you

## Competition Setup

**NOTE**: The first thing you should run when setting things up, after you've defined your `kaggle_comp`, is `setup_comp()`. This method will ensure all the necessary folders are created as well as download the competition data if necessary.

In [None]:
# | export
def get_paths(override_project_root=None):
    """Return the data, competition-data, clean-data, models, and logs paths for the current `run_env`.

    Args:
        override_project_root: Optional project root (a `Path` or a string) used instead of the
            default root in the "colab", "local_nb" and "script" environments. Ignored on Kaggle.

    Returns:
        A tuple `(data_path, comp_data_path, clean_data_path, models_path, logs_path)`.

    Raises:
        ValueError: if `run_env` is not one of "kaggle", "colab", "local_nb" or "script".

    Side effects:
        For "local_nb" and "script" the `comp` and `clean` data folders are created if missing.
        In every environment an attempt is made to create the models and logs folders; a failure
        there is reported on stdout and does not raise.
    """
    # Accept plain strings as well as `Path` objects so the `/` joins below always work
    root_override = Path(override_project_root) if override_project_root else None

    if run_env == "kaggle":
        data_path = Path(".")
        comp_data_path = clean_data_path = Path(f"../input/{kaggle_comp}")
        working_path = Path("/kaggle/working")
        models_path = working_path / "models"
        logs_path = working_path / "logs"

    elif run_env == "colab":
        # On Colab everything lives directly under the project root
        data_path = root_override or Path(".")
        comp_data_path = clean_data_path = data_path
        models_path = data_path / "models"
        logs_path = data_path / "logs"

    elif run_env in ("local_nb", "script"):
        # Notebooks run one level below the project root, scripts run from the root itself
        default_root = Path("..") if run_env == "local_nb" else Path(".")
        proj_root_path = root_override or default_root

        data_path = proj_root_path / "data"
        comp_data_path = data_path / "comp"
        clean_data_path = data_path / "clean"
        models_path = proj_root_path / "models"
        logs_path = proj_root_path / "logs"

        comp_data_path.mkdir(parents=True, exist_ok=True)
        clean_data_path.mkdir(parents=True, exist_ok=True)

    else:
        raise ValueError(f"Unknown run environment: {run_env!r}")

    try:
        models_path.mkdir(parents=True, exist_ok=True)
        logs_path.mkdir(parents=True, exist_ok=True)
    except OSError as err:
        # Best effort only (e.g. read-only file systems): report the problem and carry on
        print(f"Unable to create models and logs folders: {err}")

    return data_path, comp_data_path, clean_data_path, models_path, logs_path


In [None]:
# | export
def setup_comp(override_project_root=None, comp_data_path_override=None):
    """Ensures that the expected data, models, and logs folders exist and that the competition data exists in the 'comp_data_path'.

    Args:
        override_project_root: Optional project root forwarded to `get_paths` when no explicit
            competition data path is given.
        comp_data_path_override: Optional folder (a `Path` or a string) to use for the competition
            data instead of the one returned by `get_paths`.

    Returns:
        The path holding the competition data. On Kaggle this is always `../input/<kaggle_comp>`.

    Side effects:
        Outside Kaggle, when the competition data folder is missing or empty, the competition
        archive is downloaded through the Kaggle API into the current working directory,
        extracted into the data folder, and then deleted.
    """
    if comp_data_path_override is not None:
        # Accept plain strings as well as `Path` objects
        comp_data_path = Path(comp_data_path_override)
    else:
        _, comp_data_path, *_ = get_paths(override_project_root)

    if run_env == "kaggle":
        return Path(f"../input/{kaggle_comp}")

    has_data = comp_data_path.is_dir() and any(comp_data_path.iterdir())
    if not has_data:
        import zipfile

        # Imported only when a download is needed: the kaggle package authenticates on import,
        # so importing it eagerly would demand credentials even when the data is already present.
        from kaggle import api

        api.competition_download_cli(kaggle_comp)

        archive_path = Path(f"{kaggle_comp}.zip")
        # Context manager guarantees the archive handle is closed before the file is removed
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(comp_data_path)
        archive_path.unlink(missing_ok=True)

    return comp_data_path


In [None]:
# |eval: false
setup_comp()


Path('../data/comp')

## Competition Metrics

This competition is evaluated using "columnwise root mean squared error", i.e. the rmse per target column averaged over all targets. From the competition website:

> For each text_id in the test set, you must predict a value for each of the six analytic measures 

In [None]:
# | export
def comp_metric_score(preds, targs):
    """This competition is evaluated using "columnwise root mean squared error". Expects numpy arrays.

    The RMSE is computed separately for every target column and the per-column values are averaged.

    Args:
        preds: Predictions, shaped `(n_samples, n_targets)` or `(n_samples,)` for a single target.
            Columns beyond the number of target columns are ignored.
        targs: Ground-truth values, shaped `(n_samples, n_targets)` or `(n_samples,)`.

    Returns:
        The mean of the per-column RMSE values as a numpy float64 scalar.

    Raises:
        ValueError: if the inputs are empty, have more than two dimensions, or their shapes
            do not line up.
    """
    # Compute in float64 regardless of the input dtype (e.g. integer predictions)
    preds = np.asarray(preds, dtype=np.float64)
    targs = np.asarray(targs, dtype=np.float64)

    # Treat a 1-D input as a single target column
    if preds.ndim == 1:
        preds = preds.reshape(-1, 1)
    if targs.ndim == 1:
        targs = targs.reshape(-1, 1)

    if preds.ndim != 2 or targs.ndim != 2:
        raise ValueError(f"Expected 1-D or 2-D inputs, got shapes {preds.shape} and {targs.shape}")
    if targs.size == 0:
        raise ValueError("Cannot compute the metric on empty inputs")

    # Only the columns that have a matching target are scored
    preds = preds[:, : targs.shape[1]]
    if preds.shape != targs.shape:
        raise ValueError(f"Shape mismatch between predictions {preds.shape} and targets {targs.shape}")

    per_column_rmse = np.sqrt(np.mean((preds - targs) ** 2, axis=0))
    return per_column_rmse.mean()

In [None]:
#| export
def MCRMSE(dim_argmax=None, **kwargs):
    "Build an `AccumMetric` that scores predictions with `comp_metric_score` (columnwise RMSE)"

    def mcrmse(x, y):
        # Move both tensors to the CPU and hand their numpy views to the numpy-based scorer
        preds_np = x.cpu().numpy()
        targs_np = y.cpu().numpy()
        return comp_metric_score(preds_np, targs_np)

    return AccumMetric(
        mcrmse,
        invert_arg=False,
        flatten=False,
        dim_argmax=dim_argmax,
        **kwargs,
    )

Verify our comp_metric_score calculates the metric correctly

In [None]:
# Toy batch for checking the metric: 2 rows x 3 target columns (integer predictions, float targets)
preds = torch.tensor([[0, 1, 2], [4, 5, 6]]) #.transpose(0, 1)
targets = torch.tensor([[0.5, 2, 2.5], [3.5, 5.2, 5.8]]) #.transpose(0, 1)

print(preds)
print(targets)

tensor([[0, 1, 2],
        [4, 5, 6]])
tensor([[0.5000, 2.0000, 2.5000],
        [3.5000, 5.2000, 5.8000]])


In [None]:
# Reference value: fastai's `rmse` applied to each of the three columns, then averaged
check = ((rmse(preds[:, 0], targets[:, 0]) + rmse(preds[:, 1], targets[:, 1]) + rmse(preds[:, 2], targets[:, 2]))/3).item()
# Value under test, computed from the numpy versions of the same tensors
score = comp_metric_score(preds.numpy(), targets.numpy())

print(check, score)

0.5339663028717041 0.5339662779488101


In [None]:
test_close(score, check)

In [None]:
test_met = MCRMSE()
test_met(preds=preds, targs=targets)

0.5339662779488101

In [None]:
test_close(test_met(preds=preds, targs=targets), check)

### Loss Functions

In [None]:
#| export
class RMSELoss_0(nn.Module):
    """Root mean squared error loss: `sqrt(MSE(yhat, y) + eps)`.

    `eps` is added to the mean squared error before the square root is taken.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        mean_sq_err = self.mse(yhat, y)
        return (mean_sq_err + self.eps).sqrt()


class MCRMSELoss(nn.Module):
    """Mean columnwise RMSE loss over the first `num_scored` columns.

    Each of the columns `0 .. num_scored - 1` of `yhat` and `y` is scored with `RMSELoss_0`,
    and the per-column losses are averaged.
    """

    def __init__(self, num_scored=6):
        super().__init__()
        self.num_scored = num_scored
        self.rmse = RMSELoss_0()

    def forward(self, yhat, y):
        # Each column contributes its RMSE divided by the number of scored columns
        weighted_column_losses = (
            self.rmse(yhat[:, col], y[:, col]) / self.num_scored
            for col in range(self.num_scored)
        )
        return sum(weighted_column_losses)

## Augmentations

In [None]:
#| export
def rev_phrase(foo: str):
    """Return `foo` with its whitespace-separated words in reverse order, joined by single spaces."""
    words = foo.split()
    return " ".join(reversed(words))

In [None]:
rev_phrase("test phrase")

'phrase test'

## Competition Trainer

In [None]:
#| export
class CompTrainer(abc.ABC):
    """Abstract base class for competition trainers.

    Stores the training configuration, the model name, and the output folders for models and
    logs. Subclasses must implement `train` and `predict`.
    """

    def __init__(self, train_config, model_name, model_output_path="models", log_output_path="logs", **kwargs):
        # Extra keyword arguments are accepted but not used by the base class
        self.train_config, self.model_name = train_config, model_name
        self.model_output_path = Path(model_output_path)
        self.log_output_path = Path(log_output_path)

    @abc.abstractmethod
    def train(self, CFG, data, experiment_name=None, n_fold=5, run_id=-1, grid_id=-1, seed=None, verbose: bool = True):
        """Train a model; to be implemented by subclasses."""
        ...

    @abc.abstractmethod
    def predict(self, model_name, data):
        """Produce predictions; to be implemented by subclasses."""
        ...

    def get_value_for(self, attr, CFG, default):
        """Look up `attr` on `CFG`; if it is missing or `None`, fall back to `train_config`, then to `default`."""
        cfg_value = getattr(CFG, attr, None)
        if cfg_value is None:
            return self.train_config.get(attr, default)
        return cfg_value


## Other utilities

In [None]:
# | export
def get_run_id():
    """Return the current local date and time, to the minute, formatted as `YYYY_MM_DD_HH_MM`."""
    return datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")


In [None]:
print(get_run_id())


2022_11_10_22_03


## Export -

In [None]:
# | hide
import nbdev

nbdev.nbdev_export()