In [1]:
import pickle
import json
import gc
import os
import shutil
import random
from typing import Any, Dict, List, Tuple, Optional, Union

import wandb
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from torch_ema import ExponentialMovingAverage
from tqdm import tqdm

from joblib import Parallel, delayed
from scipy.optimize import minimize
from sklearn.decomposition import PCA
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold, PredefinedSplit
from sklearn.metrics import matthews_corrcoef, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts



def save_file(path: str, filename=None, base_path=None) -> None:
    """Copy a file into the active wandb run directory and register it with wandb.

    Args:
        path: Path of the file to copy.
        filename: Name to give the copy inside ``wandb.run.dir``. When falsy,
            the basename of ``path`` is used.
        base_path: Passed through unchanged as the second positional argument
            of ``wandb.save``.
    """
    if not filename:
        filename = os.path.basename(path)

    target = os.path.join(wandb.run.dir, filename)
    shutil.copy(path, target)
    wandb.save(target, base_path)

    
def mcc_sweep(y_true, y_pred):
    """Return the best Matthews correlation coefficient over a threshold grid.

    ``y_pred`` is binarized as ``y_pred >= th`` for every threshold on an evenly
    spaced grid from 0.30 to 0.45 (both ends included, step 0.01) and scored
    with ``matthews_corrcoef`` against ``y_true``.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted scores; must support elementwise ``>=`` with a float.

    Returns:
        The highest score found. The running best starts at 0, so the result is
        never negative.
    """
    best_score = 0
    min_th = 0.3
    max_th = 0.45
    th_step = 0.01

    # Build the grid from an integer step count rather than accumulating
    # ``th += th_step``: repeated float addition drifts upward, which made the
    # accumulated value exceed ``max_th`` on the last step and silently skipped
    # the 0.45 threshold. ``linspace`` returns both endpoints exactly.
    n_steps = int(round((max_th - min_th) / th_step))
    for th in np.linspace(min_th, max_th, n_steps + 1):
        score = matthews_corrcoef(y_true, y_pred >= th)
        if score > best_score:
            best_score = score
    return best_score


def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Also exports ``PYTHONHASHSEED`` into ``os.environ`` and switches cuDNN to
    deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True



class AverageMeter:
    """Keeps the latest value together with a count-weighted running average."""

    def __init__(self):
        # All counters are initialised by ``reset``.
        self.reset()

    def reset(self):
        """Set the latest value, average, sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


class TabularDataset(Dataset):
    """Dataset over row-aligned numeric features, categorical codes and optional labels.

    Each item is ``(x_num[idx], LongTensor(x_cat[idx]))``, with ``y[idx]``
    appended as a third element when labels were supplied.
    """

    def __init__(self, x_num: np.ndarray, x_cat: np.ndarray, y: Optional[np.ndarray]):
        super().__init__()
        self.x_num, self.x_cat, self.y = x_num, x_cat, y

    def __len__(self):
        # The numeric block defines the number of rows.
        return len(self.x_num)

    def __getitem__(self, idx):
        sample = (self.x_num[idx], torch.LongTensor(self.x_cat[idx]))
        if self.y is not None:
            sample = sample + (self.y[idx],)
        return sample


class MLP(nn.Module):
    """Two-hidden-layer perceptron over numeric features plus embedded categoricals.

    Every categorical column gets its own ``nn.Embedding``; the embeddings are
    concatenated, passed through dropout and joined with the numeric features
    before the fully connected stack, which ends in a single logit per row.

    Args:
        src_num_dim: Number of numeric input features.
        n_categories: Cardinality of each categorical column, in column order.
        dropout: Dropout probability applied after each hidden linear layer.
        hidden: Width of both hidden layers.
        emb_dim: Embedding size used for every categorical column.
        dropout_cat: Dropout probability applied to the concatenated embeddings.
        bn: When True, a ``BatchNorm1d`` follows the dropout of each hidden layer.
    """

    def __init__(self,
                 src_num_dim: int,
                 n_categories: List[int],
                 dropout: float = 0.0,
                 hidden: int = 50,
                 emb_dim: int = 10,
                 dropout_cat: float = 0.2,
                 bn: bool = False):
        super().__init__()

        self.embs = nn.ModuleList([
            nn.Embedding(x, emb_dim) for x in n_categories])
        self.cat_dim = emb_dim * len(n_categories)
        self.dropout_cat = nn.Dropout(dropout_cat)

        def _hidden_block(in_dim: int) -> list:
            # Linear -> Dropout -> (BatchNorm) -> ReLU; the layer order (and so
            # the positions inside ``self.sequence``) is the same for bn on/off
            # as it was when the two variants were written out separately.
            layers = [nn.Linear(in_dim, hidden), nn.Dropout(dropout)]
            if bn:
                layers.append(nn.BatchNorm1d(hidden))
            layers.append(nn.ReLU())
            return layers

        self.sequence = nn.Sequential(
            *_hidden_block(src_num_dim + self.cat_dim),
            *_hidden_block(hidden),
            nn.Linear(hidden, 1)
        )

    def forward(self, x_num, x_cat):
        """Return one logit per row as a 1-D tensor of length ``batch``.

        ``x_cat`` is indexed as ``x_cat[:, i]`` for the i-th embedding, so it
        must be an integer tensor with one column per entry of ``n_categories``.
        """
        embs = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embs)]
        x_cat_emb = self.dropout_cat(torch.cat(embs, 1))
        x_all = torch.cat([x_num, x_cat_emb], 1)
        x = self.sequence(x_all)
        # Drop only the trailing singleton dimension. A bare ``torch.squeeze``
        # also removes the batch dimension when the batch holds a single row,
        # yielding a 0-d tensor that no longer matches the target's shape.
        return x.squeeze(-1)



class CNN(nn.Module):
    """1-D convolutional network for tabular inputs producing one logit per row.

    The input vector is expanded by a linear layer to ``hidden_size`` values,
    reshaped to ``(batch, channel_1, hidden_size / channel_1)`` and passed
    through one or two convolutional stages, a max-pool, and a final linear
    layer with a single output.

    Args:
        num_features: Number of numeric input features.
        hidden_size: Output width of the expansion layer. The reshape in
            ``forward`` requires it to be a multiple of ``channel_1``.
        n_categories: Cardinality of each categorical column (only used when
            ``no_cat`` is False).
        emb_dim: Embedding size per categorical column.
        dropout_cat: Dropout probability applied to the concatenated embeddings.
        channel_1: Channel count the expanded vector is reshaped into.
        channel_2: Output channels of the first convolutional stage.
        channel_3: Output channels of the second stage. Its output is multiplied
            elementwise with the first stage's output, so it has to be
            broadcast-compatible with ``channel_2``.
        dropout_top: Dropout probability in the expansion layer and first stage.
        dropout_mid: Dropout probability at the start of the second stage.
        dropout_bottom: Dropout probability later in the second stage and in
            the output head.
        weight_norm: Wrap the convolutions and the output linear layer in
            ``nn.utils.weight_norm``. The expansion linear layer is always
            weight-normalised, regardless of this flag.
        two_stage: Enable the second convolutional stage.
        celu: Use ``CELU(0.06)`` instead of ``ReLU`` after the expansion layer.
        kernel1: Kernel size of the first convolution (padding ``kernel1 // 2``).
        no_cat: When True, categorical inputs are ignored entirely.
        leaky_relu: Append a ``LeakyReLU`` after the output linear layer.
    """

    def __init__(self,
                 num_features: int,
                 hidden_size: int,
                 n_categories: List[int],
                 emb_dim: int = 10,
                 dropout_cat: float = 0.2,
                 channel_1: int = 256,
                 channel_2: int = 512,
                 channel_3: int = 512,
                 dropout_top: float = 0.1,
                 dropout_mid: float = 0.3,
                 dropout_bottom: float = 0.2,
                 weight_norm: bool = True,
                 two_stage: bool = True,
                 celu: bool = True,
                 kernel1: int = 5,
                 no_cat: bool = True,
                 leaky_relu: bool = False):
        super().__init__()

        num_targets = 1

        # Sequence lengths after the reshape, after the adaptive pooling, and
        # the flattened size fed to the output head.
        cha_1_reshape = int(hidden_size / channel_1)
        cha_po_1 = int(hidden_size / channel_1 / 2)
        cha_po_2 = int(hidden_size / channel_1 / 2 / 2) * channel_3

        print(f"cha_po: {cha_1_reshape}/{cha_po_1}/{cha_po_2}")

        assert cha_1_reshape > 0
        assert cha_po_1 > 0
        assert cha_po_2 > 0

        self.no_cat = no_cat
        self.cat_dim = 0 if no_cat else emb_dim * len(n_categories)
        self.cha_1 = channel_1
        self.cha_2 = channel_2
        self.cha_3 = channel_3
        self.cha_1_reshape = cha_1_reshape
        self.cha_po_1 = cha_po_1
        self.cha_po_2 = cha_po_2
        self.two_stage = two_stage

        self.expand = nn.Sequential(
            nn.BatchNorm1d(num_features + self.cat_dim),
            nn.Dropout(dropout_top),
            nn.utils.weight_norm(nn.Linear(num_features + self.cat_dim, hidden_size), dim=None),
            nn.CELU(0.06) if celu else nn.ReLU()
        )

        def _norm(layer, dim=None):
            # Apply weight normalisation only when requested.
            return nn.utils.weight_norm(layer, dim=dim) if weight_norm else layer

        self.conv1 = nn.Sequential(
            nn.BatchNorm1d(channel_1),
            nn.Dropout(dropout_top),
            _norm(nn.Conv1d(channel_1, channel_2, kernel_size=kernel1, stride=1, padding=kernel1 // 2, bias=False)),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(output_size=cha_po_1),
            nn.BatchNorm1d(channel_2),
            nn.Dropout(dropout_top),
            _norm(nn.Conv1d(channel_2, channel_2, kernel_size=3, stride=1, padding=1, bias=True)),
            nn.ReLU()
        )

        if self.two_stage:
            self.conv2 = nn.Sequential(
                nn.BatchNorm1d(channel_2),
                nn.Dropout(dropout_mid),
                _norm(nn.Conv1d(channel_2, channel_2, kernel_size=3, stride=1, padding=1, bias=True)),
                nn.ReLU(),
                nn.BatchNorm1d(channel_2),
                nn.Dropout(dropout_bottom),
                _norm(nn.Conv1d(channel_2, channel_3, kernel_size=5, stride=1, padding=2, bias=True)),
                nn.ReLU()
            )

        self.max_po_c2 = nn.MaxPool1d(kernel_size=4, stride=2, padding=1)

        self.flt = nn.Flatten()

        # Output head; the optional LeakyReLU is appended last so the layer
        # positions inside ``self.dense`` are the same with or without it.
        dense_layers = [
            nn.BatchNorm1d(cha_po_2),
            nn.Dropout(dropout_bottom),
            _norm(nn.Linear(cha_po_2, num_targets), dim=0),
        ]
        if leaky_relu:
            dense_layers.append(nn.LeakyReLU())
        self.dense = nn.Sequential(*dense_layers)

        if not no_cat:
            self.embs = nn.ModuleList([nn.Embedding(x, emb_dim) for x in n_categories])
            self.dropout_cat = nn.Dropout(dropout_cat)

    def forward(self, x_num, x_cat):
        """Return one logit per row as a 1-D tensor of length ``batch``.

        ``x_cat`` is only read when the model was built with ``no_cat=False``.
        """
        if self.no_cat:
            x = x_num
        else:
            embs = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embs)]
            x_cat_emb = self.dropout_cat(torch.cat(embs, 1))
            x = torch.cat([x_num, x_cat_emb], 1)

        x = self.expand(x)

        x = x.reshape(x.shape[0], self.cha_1, self.cha_1_reshape)

        x = self.conv1(x)

        if self.two_stage:
            # Second stage acts as a multiplicative gate on the first stage.
            x = self.conv2(x) * x

        x = self.max_po_c2(x)
        x = self.flt(x)
        x = self.dense(x)

        # Drop only the trailing singleton dimension. A bare ``torch.squeeze``
        # would also remove the batch dimension for a single-row batch and
        # return a 0-d tensor, which cannot be concatenated with other batches.
        return x.squeeze(-1)


def preprocess_nn(
        X: pd.DataFrame,
        non_feature_cols: List[str],
        cat_cols: List[str],
        null_check_cols: List[str],
        scaler: Optional[StandardScaler] = None):
    """Turn a feature frame into scaled numeric and integer categorical arrays.

    NOTE: ``X`` is modified in place (``<col>_isnull`` indicator columns are
    added), so pass a copy if the caller's frame must stay untouched.

    Args:
        X: Feature frame.
        non_feature_cols: Columns excluded from both outputs.
        cat_cols: Candidate categorical columns; only those present in ``X``
            and not listed in ``non_feature_cols`` are used.
        null_check_cols: Columns that get an integer ``<col>_isnull`` indicator
            (columns missing from ``X`` are skipped).
        scaler: Already-fitted scaler. When ``None`` a new ``StandardScaler``
            is fitted on the numeric columns.

    Returns:
        ``(X_num, X_cat, cat_cols, scaler)`` when the scaler was fitted here,
        otherwise ``(X_num, X_cat, cat_cols)``.
    """
    for c in null_check_cols:
        if c in X.columns:
            X[f"{c}_isnull"] = X[c].isnull().astype(int)

    print(f"null_check_cols: {null_check_cols}")

    # Sets give constant-time membership checks; column order still follows X.
    excluded = set(non_feature_cols)
    cat_candidates = set(cat_cols)
    cat_cols = [c for c in X.columns if c not in excluded and c in cat_candidates]
    cat_selected = set(cat_cols)
    num_cols = [c for c in X.columns if c not in cat_selected and c not in excluded]

    X_num = X[num_cols].values.astype(np.float32)
    # Replace NaN/inf *before* the integer cast. Casting NaN to int32 first
    # yields an undefined integer and leaves nothing for nan_to_num to repair.
    X_cat = np.nan_to_num(X[cat_cols].values, nan=0, posinf=0, neginf=0).astype(np.int32)

    fit_scaler = scaler is None
    if fit_scaler:
        scaler = StandardScaler()
        X_num = scaler.fit_transform(X_num)
    else:
        X_num = scaler.transform(X_num)  # TODO: is this safe when the input contains inf?

    # Missing values survive scaling as NaN; map them (and any inf) to 0.
    X_num = np.nan_to_num(X_num, posinf=0, neginf=0)

    if fit_scaler:
        return X_num, X_cat, cat_cols, scaler
    return X_num, X_cat, cat_cols


def train_epoch(data_loader: DataLoader,
                model: nn.Module,
                optimizer,
                scheduler,
                device,
                clip_grad: float = 1.5,
                debug: bool = False,
                ema = None):
    """Run one optimisation pass over ``data_loader`` and return the mean loss.

    Each batch is scored with ``BCEWithLogitsLoss``, gradients are clipped to a
    total norm of ``clip_grad`` and the optimizer is stepped. ``scheduler`` and
    ``ema``, when given, are advanced once per batch. The progress bar is only
    shown when ``debug`` is True.

    Returns:
        The batch-size-weighted average of the per-batch losses.
    """
    model.train()
    loss_fn = torch.nn.BCEWithLogitsLoss()
    meter = AverageMeter()

    progress = tqdm(data_loader, position=0, leave=True, desc='Training', disable=not debug)
    for x_num, x_cat, y in progress:
        n_rows = x_num.size(0)

        logits = model(x_num.to(device, dtype=torch.float), x_cat.to(device))
        loss = loss_fn(logits, y.to(device, dtype=torch.float))
        meter.update(loss.detach().cpu().numpy(), n_rows)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
        optimizer.step()
        optimizer.zero_grad()

        # Per-batch hooks: learning-rate schedule, then the weight average.
        if scheduler is not None:
            scheduler.step()
        if ema is not None:
            ema.update()

    return meter.avg


def evaluate(data_loader: DataLoader, model, device, ema=None):
    """Score ``model`` on a labelled loader and compute validation metrics.

    When ``ema`` is given, the averaged weights are swapped in for the duration
    of the forward passes and the original weights are restored afterwards,
    even if a batch raises.

    Args:
        data_loader: Iterable yielding ``(x_num, x_cat, y)`` batches.
        model: Module called as ``model(x_num, x_cat)`` and returning logits.
        device: Device the batches are moved to.
        ema: Optional object exposing ``store``, ``copy_to`` and ``restore``.

    Returns:
        ``(outputs, targets, loss, auc, mcc)`` where ``outputs`` are the
        sigmoid probabilities, ``loss`` is the batch-size-weighted mean
        ``BCELoss``, and ``auc`` / ``mcc`` are both ``None`` if either metric
        could not be computed.
    """
    model.eval()

    losses = AverageMeter()

    final_targets = []
    final_outputs = []

    criterion = torch.nn.BCELoss()

    if ema is not None:
        ema.store()
        ema.copy_to()

    try:
        with torch.no_grad():
            for x_num, x_cat, y in tqdm(data_loader, position=0, leave=True, desc='Evaluating'):
                batch_size = x_num.size(0)
                x_num = x_num.to(device, dtype=torch.float)
                x_cat = x_cat.to(device)
                y = y.to(device, dtype=torch.float)

                # reshape(-1) keeps a single-row batch 1-D even if the model
                # squeezed its output down to a 0-d tensor.
                output = torch.sigmoid(model(x_num, x_cat)).reshape(-1)

                loss = criterion(output, y)
                losses.update(loss.detach().cpu().numpy(), batch_size)

                final_targets.append(y.detach().cpu().numpy())
                final_outputs.append(output.detach().cpu().numpy())
    finally:
        # Always put the training weights back, otherwise a failure here would
        # leave the model running on the averaged weights.
        if ema is not None:
            ema.restore()

    final_targets = np.concatenate(final_targets)
    final_outputs = np.concatenate(final_outputs)

    try:
        auc = roc_auc_score(final_targets, final_outputs)
        mcc = mcc_sweep(final_targets, final_outputs)
    except Exception:
        # Best effort: metrics can be undefined (e.g. a single class in the
        # targets). ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        auc = None
        mcc = None

    return final_outputs, final_targets, losses.avg, auc, mcc


def predict_nn(X: pd.DataFrame,
               non_feature_cols: List[str], # from: artifacts/metadata.json
               cat_cols: List[str], # from: artifacts/metadata.json
               null_check_cols: List[str], # from: artifacts/metadata.json
               model: Union[List[CNN], CNN],
               scaler: StandardScaler, # from: artifacts/scaler
               device,
               ensemble_method='mean',
               batch_size: int = 512,
               num_workers: int = 1):
    """Predict probabilities for ``X`` with one model or an ensemble of models.

    ``X`` is copied, preprocessed with the already-fitted ``scaler`` and scored
    batch by batch; each model's sigmoid output is combined across models.

    Args:
        X: Feature frame (left unmodified; a copy is preprocessed).
        non_feature_cols: Columns to exclude from the model input.
        cat_cols: Categorical column names.
        null_check_cols: Columns that receive ``_isnull`` indicator features.
        model: A single model or a list of models to ensemble.
        scaler: Fitted scaler applied to the numeric features.
        device: Device the batches are moved to; the models are assumed to
            already live there.
        ensemble_method: ``'median'`` combines models with ``np.nanmedian``;
            any other value uses the plain mean (which propagates NaN).
        batch_size: Rows per inference batch.
        num_workers: Worker processes for the ``DataLoader``.

    Returns:
        1-D array with one probability per row of ``X``.
    """
    models = model if isinstance(model, list) else [model]

    for m in models:
        m.eval()
    X_num, X_cat, cat_cols = preprocess_nn(
        X.copy(),
        non_feature_cols=non_feature_cols,
        cat_cols=cat_cols,
        null_check_cols=null_check_cols,
        scaler=scaler
    )
    valid_dataset = TabularDataset(X_num, X_cat, None)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=num_workers)

    final_outputs = []

    with torch.no_grad():
        for x_num, x_cat in tqdm(valid_loader, position=0, leave=True, desc='Evaluating'):
            x_num = x_num.to(device, dtype=torch.float)
            x_cat = x_cat.to(device)

            # atleast_1d: a model that squeezes its output returns a 0-d array
            # for a single-row batch, which np.concatenate below would reject.
            outputs = [
                np.atleast_1d(torch.sigmoid(m(x_num, x_cat)).detach().cpu().numpy())
                for m in models
            ]
            stacked = np.array(outputs)

            if ensemble_method == 'median':
                pred = np.nanmedian(stacked, axis=0)
            else:
                pred = stacked.mean(axis=0)
            final_outputs.append(pred)

    return np.concatenate(final_outputs)


def train_nn(X: pd.DataFrame,
             y: pd.DataFrame,
             non_feature_cols: List[str],
             cat_cols: List[str],
             null_check_cols: List[str],
             folds: List[Tuple],
             device,
             emb_dim: int = 25,
             batch_size: int = 1024,
             cnn_hidden: int = 256,
             cnn_channel1: int = 32,
             cnn_channel2: int = 32,
             cnn_channel3: int = 32,
             cnn_kernel1: int = 5,
             cnn_celu: bool = False,
             cnn_weight_norm: bool = False,
             dropout_emb: float = 0.0,
             two_stage: bool = False,
             lr: float = 1e-3,
             weight_decay: float = 0.0,
             model_path: str = 'fold_{}.pth',
             output_dir: str = 'artifacts',
             scheduler_type: str = 'onecycle',
             optimizer_type: str = 'adam',
             max_lr: float = 0.01,
             epochs: int = 30,
             seed: int = 42,
             batch_double_freq: int = 50,
             cnn_dropout: float = 0.1,
             cnn_leaky_relu: bool = False,
             patience: int = 8,
             factor: float = 0.5,
             debug: bool = False,
             ema_decay: Optional[float] = None,
             use_wandb: bool = True):
    """Cross-validated training of the CNN, keeping the best epoch of each fold.

    The features are preprocessed once (a single scaler is fitted on all rows),
    then one model is trained per ``(train_idx, valid_idx)`` pair in ``folds``.
    After every epoch the validation MCC is compared with the fold's best so
    far; on improvement the model is saved to
    ``output_dir/model_path.format(fold)`` and the fold's out-of-fold
    predictions are overwritten.

    Args:
        X, y: Features and binary target (``y.values`` is cast to float32).
        non_feature_cols, cat_cols, null_check_cols: Forwarded to
            ``preprocess_nn``.
        folds: Sequence of ``(train_idx, valid_idx)`` index arrays.
        device: Device used for the model and the batches.
        emb_dim, cnn_*, dropout_emb, two_stage: Forwarded to ``CNN``.
        batch_size: Initial batch size; doubled every ``batch_double_freq``
            epochs.
        lr, weight_decay, optimizer_type: Optimizer settings (``'adam'`` or
            ``'adamw'``; anything else raises ``NotImplementedError``).
        scheduler_type: ``'onecycle'`` (stepped per batch, uses ``max_lr``) or
            ``'reduce'`` (``ReduceLROnPlateau`` on validation AUC, uses
            ``patience`` and ``factor``); any other value disables scheduling.
        epochs, seed: Epochs per fold and the global RNG seed.
        debug: Print per-epoch progress.
        ema_decay: When truthy, evaluate and checkpoint with an exponential
            moving average of the weights using this decay.
        use_wandb: Log metrics and upload fold checkpoints to the active run.

    Returns:
        ``(best_losses, best_predictions, oof, scaler)``. Despite the name,
        ``best_losses`` holds the best validation MCC per fold. A fold whose
        MCC never exceeds 0 contributes ``None`` to ``best_predictions`` and
        leaves its ``oof`` rows at 0.
    """
    seed_everything(seed)

    os.makedirs(output_dir, exist_ok=True)

    y = y.values.astype(np.float32)
    X_num, X_cat, cat_cols, scaler = preprocess_nn(
        X.copy(),
        non_feature_cols,
        cat_cols,
        null_check_cols)

    best_losses = []
    best_predictions = []

    oof = np.zeros(len(X_num))

    for cv_idx, (train_idx, valid_idx) in enumerate(folds):
        X_tr, X_va = X_num[train_idx], X_num[valid_idx]
        X_tr_cat, X_va_cat = X_cat[train_idx], X_cat[valid_idx]
        y_tr, y_va = y[train_idx], y[valid_idx]

        cur_batch = batch_size
        best_loss = 0  # best validation MCC seen so far in this fold
        best_prediction = None
        checkpoint_saved = False

        if debug:
            print(f"fold {cv_idx} train: {X_tr.shape}, valid: {X_va.shape}")

        train_dataset = TabularDataset(X_tr, X_tr_cat, y_tr)
        valid_dataset = TabularDataset(X_va, X_va_cat, y_va)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=cur_batch, shuffle=True,
                                                   num_workers=0)
        valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=cur_batch, shuffle=False,
                                                   num_workers=0)

        # NOTE: the CNN is built with its default ``no_cat=True``, so the
        # categorical inputs (and ``n_categories``) are not used by the model.
        model = CNN(X_tr.shape[1],
                    hidden_size=cnn_hidden,
                    n_categories=[128],
                    emb_dim=emb_dim,
                    dropout_cat=dropout_emb,
                    channel_1=cnn_channel1,
                    channel_2=cnn_channel2,
                    channel_3=cnn_channel3,
                    two_stage=two_stage,
                    kernel1=cnn_kernel1,
                    celu=cnn_celu,
                    dropout_top=cnn_dropout,
                    dropout_mid=cnn_dropout,
                    dropout_bottom=cnn_dropout,
                    weight_norm=cnn_weight_norm,
                    leaky_relu=cnn_leaky_relu)

        model = model.to(device)

        if ema_decay:
            ema = ExponentialMovingAverage(model.parameters(), decay=ema_decay)
        else:
            ema = None

        if optimizer_type == 'adamw':
            opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        elif optimizer_type == 'adam':
            opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        else:
            raise NotImplementedError()

        # ``scheduler`` is stepped per batch inside train_epoch,
        # ``epoch_scheduler`` once per epoch on the validation AUC.
        scheduler = epoch_scheduler = None
        if scheduler_type == 'onecycle':
            scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=opt, pct_start=0.1, div_factor=1e3,
                                                            max_lr=max_lr, epochs=epochs,
                                                            steps_per_epoch=len(train_loader))
        elif scheduler_type == 'reduce':
            epoch_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=opt,
                                                                         mode='max',
                                                                         min_lr=1e-7,
                                                                         patience=patience,
                                                                         verbose=True,
                                                                         factor=factor)

        best_model_path = os.path.join(output_dir, model_path.format(cv_idx))

        for epoch in range(epochs):
            if debug:
                print(f"epoch {epoch}")
            if epoch > 0 and epoch % batch_double_freq == 0:
                cur_batch = cur_batch * 2
                if debug:
                    print(f'batch: {cur_batch}')
                train_loader = torch.utils.data.DataLoader(train_dataset,
                                                           batch_size=cur_batch,
                                                           shuffle=True,
                                                           num_workers=4)
            train_loss = train_epoch(train_loader, model, opt, scheduler, device, debug=debug, ema=ema)
            predictions, _valid_targets, valid_loss, auc, mcc = evaluate(valid_loader, model, device=device, ema=ema)

            # evaluate() reports None for both metrics when they cannot be
            # computed; such an epoch must not be formatted, fed to the
            # scheduler, or compared against the best score.
            metrics_ok = auc is not None and mcc is not None

            if debug:
                if metrics_ok:
                    print(f"epoch {epoch}, train loss: {train_loss:.4f}, valid auc: {auc:.4f}, valid mcc: {mcc:.4f}")
                else:
                    print(f"epoch {epoch}, train loss: {train_loss:.4f}, valid auc/mcc unavailable")

            if use_wandb:
                wandb.log({
                    "epoch": epoch,
                    f"fold{cv_idx}_train_loss": train_loss,
                    f"fold{cv_idx}_valid_loss": valid_loss,
                    f"fold{cv_idx}_valid_auc": auc,
                    f"fold{cv_idx}_valid_mcc": mcc,
                })

            if not metrics_ok:
                continue

            if epoch_scheduler is not None:
                epoch_scheduler.step(auc)

            if mcc > best_loss:
                if debug:
                    print(f'new best:{mcc}')
                best_loss = mcc
                best_prediction = predictions

                # Checkpoint the averaged weights when EMA is in use, then put
                # the raw training weights back.
                if ema is not None:
                    ema.store()
                    ema.copy_to()
                torch.save(model, best_model_path)
                if ema is not None:
                    ema.restore()
                checkpoint_saved = True
                oof[valid_idx] = best_prediction

        if use_wandb:
            wandb.run.summary[f"fold{cv_idx}_valid_mcc_best"] = best_loss
            # Only upload a checkpoint this fold actually wrote; otherwise the
            # file is missing (or stale from an earlier run).
            if checkpoint_saved:
                save_file(best_model_path)

        best_predictions.append(best_prediction)
        best_losses.append(best_loss)
        del model, train_dataset, valid_dataset, train_loader, valid_loader, X_tr, X_va, X_tr_cat, X_va_cat, y_tr, y_va, opt
        del ema, epoch_scheduler
        if scheduler is not None:
            del scheduler
        gc.collect()

    return best_losses, best_predictions, oof, scaler


In [2]:

def search_best_threshold_pair(y_true, y_pred, is_ground):
    """Search for the pair of thresholds that maximises MCC.

    Runs Nelder-Mead from ``[0.3, 0.3]`` on the negated
    ``matthews_corrcoef`` of ``binarize_pred(y_pred, t1, t2, is_ground)``.

    Returns:
        ``(t1, t2)``: the threshold for rows where ``is_ground`` is False and
        the threshold for rows where it is True.
    """
    def negative_mcc(thresholds):
        labels = binarize_pred(y_pred, thresholds[0], thresholds[1], is_ground)
        return -matthews_corrcoef(y_true, labels)

    start = [0.3, 0.3]
    optimum = minimize(negative_mcc, start, method="nelder-mead")

    th_default, th_ground = optimum.x[0], optimum.x[1]
    return th_default, th_ground


def binarize_pred(y_pred, threshold, threshold2, threshold2_mask):
    """Binarize scores with a per-row choice between two thresholds.

    Rows where ``threshold2_mask`` is True are compared against ``threshold2``
    and all other rows against ``threshold`` (strict ``>`` in both cases).
    ``threshold2_mask`` is expected to be a boolean array, since it is
    inverted with ``~``.
    """
    above_default = y_pred > threshold
    above_masked = y_pred > threshold2
    return ~threshold2_mask * above_default + threshold2_mask * above_masked


def train_nn_from_df(
    train_df: pd.DataFrame,
    split_defs: pd.DataFrame,
    nn_params: Dict[str, Any],
    null_check_top_n: int = 30,
    debug: bool = True,
    output_dir: str = "artifacts",
    use_wandb: bool = True,
    wandb_entity: str = "nyanp",
    wandb_project: str = "nfl-nn"
):
    """Train the CNN on a feature frame and persist the resulting artifacts.

    Steps: build predefined CV folds from ``split_defs``, choose the columns
    that get null-indicator features, convert the frame to float32 (object
    columns are zero-filled and recorded as categorical), run ``train_nn``,
    tune two decision thresholds on the out-of-fold predictions, and write
    ``oof.npy``, ``scaler`` and ``metadata.json`` into ``output_dir``.

    Args:
        train_df: Feature frame; must contain ``game_play``, ``contact`` and
            ``nfl_player_id_2``.
        split_defs: Fold assignment merged onto a ``game`` key derived from the
            first five characters of ``game_play``; must provide a ``fold``
            column (presumably keyed by ``game`` -- verify against the CSV).
        nn_params: Keyword arguments forwarded to ``train_nn``; also logged as
            the wandb run config.
        null_check_top_n: Number of most frequent null-count values whose
            representative columns get ``_isnull`` indicators.
        debug: Forwarded to ``train_nn``.
        output_dir: Directory for models and artifacts.
        use_wandb: Create a wandb run, log to it and upload the artifacts.
        wandb_entity, wandb_project: Target of the wandb run.

    Returns:
        ``(oof, mcc, th1, th2)``: out-of-fold predictions, the MCC reached with
        the tuned thresholds, and the two thresholds (rows where
        ``nfl_player_id_2 != -1`` and rows where it equals ``-1``).

    Raises:
        AssertionError: If CUDA is not available.
    """
    if use_wandb:
        wandb.init(
            project=wandb_project,
            entity=wandb_entity,
            # Log the hyper-parameters that were passed in. This previously
            # read a module-level ``params`` variable, which fails (or logs
            # the wrong values) unless the caller happens to define one.
            config={
                **nn_params
            }
        )

    non_feature_cols = [
        # NOTE: "contacgt_id" looks like a misspelling of "contact_id" (listed
        # below); it is kept because this list is written to metadata.json.
        "contacgt_id",
        "game_play",
        "datetime",
        "step",
        "nfl_player_id_1",
        "nfl_player_id_2",
        "contact",
        "team_1",
        "team_2",
        "contact_id",
        #"position_1",
        #"position_2"
        #"direction_1",
        #"direction_2",
        "x_position_1",
        "x_position_2",
        "y_position_1",
        "y_position_2",
        "x_position_start_1",
        "x_position_start_2",
        "y_position_start_1",
        "y_position_start_2",

        "x_position_future5_1",
        "x_position_future5_2",
        "y_position_future5_1",
        "y_position_future5_2",
        "x_position_past5_1",
        "x_position_past5_2",
        "y_position_past5_1",
        "y_position_past5_2",
        "nfl_player_id_interceptor_1",
        "nfl_player_id_interceptor_2",

        #"orientation_past5_1",
        #"direction_past5_1",
        #"orientation_past5_2",
        #"direction_past5_2",
    ]


    split_df = train_df[["game_play"]].copy()
    split_df["game"] = split_df["game_play"].str[:5].astype(int)
    split_df = pd.merge(split_df, split_defs, how="left")
    split = list(PredefinedSplit(split_df["fold"]).split())

    feature_names = [c for c in train_df.columns if c not in non_feature_cols]

    # Pick representative columns by missing-value count: keep one column per
    # distinct null count, restricted to the ``null_check_top_n`` counts that
    # are shared by the most columns.
    null_counts = train_df[feature_names].isnull().sum()
    num_nulls = null_counts.drop_duplicates()
    n_top = null_counts.value_counts().index[:null_check_top_n]

    null_check_cols = num_nulls[num_nulls.isin(set(n_top))].index.tolist()

    # Columns that always get a null indicator, whatever their null count.
    extras = [
        "cnn_pred_Endzone_roll21",
        "cnn_pred_Endzone_roll11",
        "cnn_pred_Endzone_roll5",
        "cnn_pred_Sideline_roll21",
        "cnn_pred_Sideline_roll11",
        "cnn_pred_Sideline_roll5",
        "distance",
    ]

    for e in extras:
        if e not in null_check_cols:
            null_check_cols.append(e)

    print(null_check_cols)

    X_train = np.empty((len(train_df), len(feature_names)), dtype=np.float32)

    cat_cols = []
    for i, c in enumerate(feature_names):
        if train_df[c].dtype.name == "object":
            # Object columns cannot be stored as float; they are zero-filled
            # and only their names are recorded as categorical.
            X_train[:, i] = 0
            cat_cols.append(c)
        else:
            X_train[:, i] = train_df[c]

    X_train = pd.DataFrame(X_train, columns=feature_names)
    y = train_df["contact"]

    assert torch.cuda.is_available()
    device = torch.device("cuda")

    if use_wandb:
        wandb.config.update({
            "null_check_cols": null_check_cols,
            "cat_cols": cat_cols,
            "n_features": X_train.shape[1],
            "n_rows": len(X_train),
            "null_check_top_n": null_check_top_n
        })

    loss, preds, oof, scaler = train_nn(
            X=X_train,
            y=y,
            non_feature_cols=non_feature_cols,
            folds=split,
            device=device,
            debug=debug,
            output_dir=output_dir,
            null_check_cols=null_check_cols,
            cat_cols=cat_cols,
            **nn_params
        )

    # Rows with nfl_player_id_2 == -1 get their own decision threshold.
    is_ground = train_df["nfl_player_id_2"] == -1
    th1, th2 = search_best_threshold_pair(y, oof, is_ground)
    y_pred = binarize_pred(oof, th1, th2, is_ground)

    mcc = matthews_corrcoef(y, y_pred)

    np.save(os.path.join(output_dir, "oof.npy"), oof)

    with open(os.path.join(output_dir, "scaler"), "wb") as f:
        pickle.dump(scaler, f)

    metadata = {
        "threshold_p": th1,
        "threshold_g": th2,
        "null_check_cols": null_check_cols,
        "cat_cols": cat_cols,
        "mcc": mcc,
        "non_feature_cols": non_feature_cols
    }

    with open(os.path.join(output_dir, "metadata.json"), "w") as f:
        json.dump(metadata, f)

    if use_wandb:
        wandb.run.summary["mcc"] = mcc
        wandb.run.summary["auc"] = roc_auc_score(y, oof)
        wandb.run.summary["th1"] = th1
        wandb.run.summary["th2"] = th2

        save_file(os.path.join(output_dir, "oof.npy"))
        save_file(os.path.join(output_dir, "scaler"))
        save_file(os.path.join(output_dir, "metadata.json"))

        wandb.finish()

    return oof, mcc, th1, th2

In [3]:
# train_df holds the hard samples produced by make_features, saved to disk with to_feather.

In [4]:

# Load the pre-computed feature table and the fold assignment.
train_df = pd.read_feather("train_df.f")
print(train_df.shape)

split_defs = pd.read_csv("../input/nfl-game-fold/game_fold.csv")

# Hyper-parameters forwarded to train_nn.
params = dict(
    optimizer_type="adamw",
    scheduler_type="reduce",
    factor=0.4,
    cnn_kernel1=3,
    weight_decay=2e-5,
    cnn_weight_norm=False,
    cnn_channel1=96,
    cnn_channel2=96,
    cnn_channel3=96,
    cnn_hidden=1536,
    cnn_dropout=0.65,
    batch_size=512,
    max_lr=0.0026,
    lr=0.00075,
    epochs=40,
    ema_decay=0.995,
    patience=8,
)

oof, mcc, th1, th2 = train_nn_from_df(
    train_df=train_df,
    split_defs=split_defs,
    nn_params=params,
    null_check_top_n=30,
    use_wandb=True,
)


FileNotFoundError: [Errno 2] No such file or directory: 'train_df.f'