In [None]:
# |default_exp utils
# |default_cls_lvl 3

In [None]:
#|hide
%reload_ext autoreload
%autoreload 2

# utils

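> Shared helpers: application-wide defaults, run-environment detection, competition folder and data setup, the competition metric, and run-id generation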

In [None]:
# |export
from __future__ import annotations

import abc, datetime, random, os
from pathlib import Path

import numpy as np
import torch
from torch.nn import functional as F
from fastai.losses import CrossEntropyLossFlat
from fastai.test_utils import show_install

In [None]:
#| hide
import pdb

from IPython.display import display
from fastcore.test import *
from nbdev.showdoc import show_doc

In [None]:
# |hide
# |cuda
# GPU index to use; read from the USE_GPU environment variable (defaults to 0).
gpu_num = int(os.getenv("USE_GPU", 0))

# Make that device the current CUDA device, then report which one is active.
torch.cuda.set_device(gpu_num)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")

Using GPU #0: GeForce GTX 1080 Ti


## Defaults

Application-wide defaults go here.

In [None]:
#| export
# Default random seed; override it with the RANDOM_SEED environment variable.
default_seed = int(os.getenv("RANDOM_SEED", 42))
# Kaggle competition identifier, used to build the input path and to download the data;
# override it with the KAGGLE_COMP environment variable.
kaggle_comp = os.getenv("KAGGLE_COMP","feedback-prize-english-language-learning")

## Development environment

Information about where your code is running and your compute capabilities

In [None]:
#| export
def detect_env():
    """Work out where the code is running: "kaggle", "colab", "local_nb", or "script"."""
    # A non-empty KAGGLE_KERNEL_RUN_TYPE wins over any of the directory probes.
    if os.environ.get("KAGGLE_KERNEL_RUN_TYPE"):
        return "kaggle"

    # A `/content` directory is taken as the sign of a Colab runtime.
    if os.path.isdir("/content"):
        return "colab"

    # An `nbs` folder one or two levels up means we are inside the project's notebooks.
    if any(os.path.isdir(candidate) for candidate in ("../nbs", "../../nbs")):
        return "local_nb"

    return "script"


# Resolved once at import time; the path helpers below branch on this value.
run_env = detect_env()

# Outside Kaggle the competition data is fetched through the Kaggle API client,
# so the client is only imported when it will be needed.
if run_env != "kaggle":
    from kaggle import api

In [None]:
print(run_env)

local_nb


In [None]:
#| export
def print_dev_environment():
    """Print details on your development environment: installed packages, cuda/cudnn availability, GPUs, etc.

    `show_install` writes its report to stdout itself and returns nothing, so it is called
    directly; wrapping it in `print` would append a stray "None" line to the report.
    """
    show_install()

In [None]:
# print_dev_environment()

## Competition Setup

**NOTE**: The first thing you should run after you've defined your `kaggle_comp` is `setup_comp()`. This method ensures that all the necessary folders are created and downloads the competition data if necessary.

In [None]:
#| export
def get_paths(override_project_root=None):
    """Return the data, competition-data, clean-data, models, and logs folder paths for where the code is running.

    The layout is chosen from the module-level `run_env`:

    * "kaggle": competition and clean data both point at `../input/<kaggle_comp>`; models and logs go
      under `/kaggle/working`. `override_project_root` is ignored.
    * "colab": data, competition data, and clean data are all the project root itself (default `.`),
      with `models` and `logs` beneath it.
    * "local_nb" / "script": `data/comp`, `data/clean`, `models`, and `logs` under the project root
      (default `..` for notebooks, `.` for scripts). The two data folders are created if missing.

    `override_project_root` may be a `str` or a `Path`. Creating the models and logs folders is
    best-effort: a failure is reported on stdout and the paths are still returned.

    Returns a `(data_path, comp_data_path, clean_data_path, models_path, logs_path)` tuple of `Path` objects.
    Raises `ValueError` if `run_env` is not one of the four known environments.
    """
    if run_env == "kaggle":
        data_path = Path(".")
        comp_data_path = clean_data_path = Path(f"../input/{kaggle_comp}")
        working_path = Path("/kaggle/working")
        models_path = working_path / "models"
        logs_path = working_path / "logs"

    elif run_env == "colab":
        # `Path(...)` lets callers pass the root as a plain string.
        proj_root_path = Path(override_project_root or ".")

        data_path = proj_root_path
        comp_data_path = clean_data_path = data_path
        models_path = data_path / "models"
        logs_path = data_path / "logs"

    elif run_env in ("local_nb", "script"):
        # Notebooks live one level below the project root; scripts are run from the root itself.
        default_root = ".." if run_env == "local_nb" else "."
        proj_root_path = Path(override_project_root or default_root)

        data_path = proj_root_path / "data"
        comp_data_path = data_path / "comp"
        clean_data_path = data_path / "clean"
        models_path = proj_root_path / "models"
        logs_path = proj_root_path / "logs"

        comp_data_path.mkdir(parents=True, exist_ok=True)
        clean_data_path.mkdir(parents=True, exist_ok=True)

    else:
        # Without this, an unexpected value would surface later as a confusing UnboundLocalError.
        raise ValueError(f"Unknown run_env: {run_env!r}")

    try:
        models_path.mkdir(parents=True, exist_ok=True)
        logs_path.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        # Best-effort (e.g. a read-only filesystem): report why it failed, but still return the paths.
        print(f"Unable to create models and logs folders: {exc}")

    return data_path, comp_data_path, clean_data_path, models_path, logs_path

In [None]:
#| export
def setup_comp(override_project_root=None, comp_data_path_override=None):
    """Make sure the competition data is available and return the folder that holds it.

    The competition folder is `comp_data_path_override` (a `str` or `Path`) when given; otherwise it
    comes from `get_paths(override_project_root)`, which also creates the expected data, models, and
    logs folders.

    Outside Kaggle, if that folder is missing or empty, the competition archive is downloaded with
    the Kaggle API, extracted into the folder, and then deleted. On Kaggle nothing is downloaded and
    `../input/<kaggle_comp>` is returned regardless of any override.
    """
    if comp_data_path_override is not None:
        # Accept plain strings as well as `Path` objects.
        comp_data_path = Path(comp_data_path_override)
    else:
        _, comp_data_path, *_ = get_paths(override_project_root)

    if run_env == "kaggle":
        return Path(f"../input/{kaggle_comp}")

    if not comp_data_path.exists() or not any(comp_data_path.iterdir()):
        import zipfile

        api.competition_download_cli(kaggle_comp)

        # The archive is expected at `<kaggle_comp>.zip` relative to the current working directory.
        archive_path = Path(f"{kaggle_comp}.zip")
        # Close the archive before deleting it; an open handle blocks the unlink on some platforms.
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(comp_data_path)
        archive_path.unlink(missing_ok=True)

    return comp_data_path

In [None]:
setup_comp()

Path('../data/comp')

## Competition Metrics

This competition is evaluated using "multi-class logarithmic loss" (e.g., cross-entropy loss). From the competition website ...

> Each row in the dataset has been labeled with one true effectiveness label. For each row, you must submit the predicted probabilities that the product belongs to each quality label.

In [None]:
#| export
def comp_metric_score(preds, targs):
    """This competition is evaluated using "multi-class logarithmic loss" (e.g., cross-entropy loss). Expects numpy arrays.

    `preds` holds one row of raw, unnormalized scores (logits) per sample - the softmax is applied
    here - and `targs` holds the integer index of the true class for each row.
    Returns the mean negative log-likelihood of the true classes.
    """
    preds, targs = np.asarray(preds), np.asarray(targs)

    # Work in log space and shift every row by its max first: `np.exp` on large raw scores overflows
    # to inf (which turns the result into nan), and the shift leaves the softmax unchanged.
    shifted = preds - preds.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))

    # Pick, for every row, the log-probability assigned to its true class.
    nll = -log_probs[np.arange(len(preds)), targs]
    return nll.mean()

Verify that our `comp_metric_score` calculates cross-entropy loss correctly.

In [None]:
# Random scores for 3 samples over 5 classes, plus one target class index per sample.
preds = torch.randn(3, 5)
targets = torch.tensor([0, 3, 4])

print(preds)
print(targets)

tensor([[ 0.2828,  0.1889, -0.0737, -0.4460, -0.0887],
        [ 0.3788,  2.0066,  0.0591, -0.7007,  0.3658],
        [-0.8615,  0.4423, -0.3266,  0.2098, -0.1276]])
tensor([0, 3, 4])


In [None]:
# Compute the same cross-entropy loss three ways: PyTorch, fastai, and our NumPy implementation.
pytorch_loss = F.cross_entropy(preds, targets)
fastai_loss = CrossEntropyLossFlat()(preds, targets)
np_loss = comp_metric_score(preds.numpy(), targets.numpy())

# The PyTorch result is the reference; the other two must be close to it.
test_close(pytorch_loss.item(), fastai_loss.item())
test_close(pytorch_loss.item(), np_loss)

pytorch_loss, fastai_loss, np_loss

(tensor(2.0688), TensorBase(2.0688), 2.0688422)

## Other utilities

In [None]:
#| export
def get_run_id():
    """Build a run identifier from the current time, formatted as `YYYY_MM_DD_HH_MM` (minute resolution)."""
    timestamp = datetime.datetime.now()
    return timestamp.strftime("%Y_%m_%d_%H_%M")

In [None]:
print(get_run_id())

2022_09_13_20_38


## Export -

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()