In [None]:
import json
import logging
import os
import sys
from dataclasses import asdict
from logging import getLogger

import numpy as np
import pandas as pd
import torch


sys.path.append('../')

from src.parser import *
from src.train import *
from src.nn import *
from src.config import NNConfig
from src.util import TimeSeriesSplit
from src.dataset_helper import load_subdata

# Run configuration shared by every cell below.
# USE_UPDATED is forwarded to load_subdata, selects which train/validation date
# window is used, and adds the '_updated' suffix to scaler/model file names.
USE_UPDATED = True
# Directory that receives the trained artifacts (passed to train_nn as
# output_dir and used for the out-of-fold .npy files).
OUTPUT_DIR = 'artifacts/nn'
# Directory holding the per-lag feature files (X_{lag}_2nd.f, read with
# pd.read_feather).
INPUT_DIR = '../features'
# Directory with the competition CSVs read here (seasons.csv, players.csv);
# also handed to load_subdata.
DATA_DIR = '../input/mlb-player-digital-engagement-forecasting'

In [None]:
# Create the artifact directory up front; exist_ok=True keeps this cell safe
# to re-run when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Season calendar; consumed later by get_mask_by_season_df to build a row mask.
seasons_csv = os.path.join(DATA_DIR, 'seasons.csv')
season_df = pd.read_csv(seasons_csv)

# Train on the GPU when one is visible to torch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# Names of the four target columns that are split off the feature frame.
targets = [f'target{idx}' for idx in range(1, 5)]

In [None]:
def attach_file_handler_once(target_logger, log_path, mode='a'):
    """Attach a FileHandler for ``log_path`` to ``target_logger`` only once.

    Re-running a notebook cell that calls ``addHandler`` unconditionally stacks
    a new handler on every run, so each record ends up in the file several
    times. This helper reuses an already attached handler for the same file.

    Args:
        target_logger: logger that should write to the file.
        log_path: path of the log file (relative paths resolve against the
            current working directory).
        mode: file open mode used when a new handler has to be created.

    Returns:
        The FileHandler that writes to ``log_path`` (existing or newly added).
    """
    # FileHandler stores the absolute path in baseFilename, so compare against that.
    resolved = os.path.abspath(log_path)
    for handler in target_logger.handlers:
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == resolved:
            return handler
    handler = logging.FileHandler(log_path, mode)
    target_logger.addHandler(handler)
    return handler


# Root logger: records emitted anywhere in the process are appended to log_nn.log.
logger = logging.getLogger()
attach_file_handler_once(logger, 'log_nn.log')
logger.setLevel(logging.DEBUG)

# Run bookkeeping that is dumped to JSON after training: for every lag it holds
# the feature column names plus, per model type, the MAE and artifact file
# names of each seed.
metadata = {
    'models': {}
}
metadata['use_updated'] = USE_UPDATED
#metadata['run_id'] = str(run.id)
postfix = '_updated' if USE_UPDATED else ''

#run = wandb.init(project='mlb-nn', entity='nyanp')

try:
    # The engagement base frame, the player list and both row masks depend on
    # neither the model type, the lag nor the seed, so they are built once
    # instead of being reloaded for every single training run.
    df_train_base = make_df_base_from_train_engagement(
        load_subdata(DATA_DIR, 'nextDayPlayerEngagement', USE_UPDATED))

    season_mask = get_mask_by_season_df(season_df, df_train_base)
    print(len(df_train_base))
    df_train_base = df_train_base[season_mask]
    print(len(df_train_base))

    players = pd.read_csv(os.path.join(DATA_DIR, 'players.csv'))
    player_ids = set(players[players.playerForTestSetAndFuturePreds==True].playerId)

    # Computed on the season-filtered frame, so it must be applied after
    # season_mask (same order as for the feature frame below).
    new_user_mask = df_train_base['playerId'].isin(player_ids).values
    df_train_base = df_train_base[new_user_mask]

    for model_type in ['mlp', 'cnn']:
        config = NNConfig(
            model_type=model_type,
            batch_size=2048,
            lr=0.0008 if model_type == 'mlp' else 4.4e-5,
            epochs=15 if model_type == 'mlp' else 30,
            batch_double_freq=50,
            scaler_type='standard',
            optimizer='madgrad' if model_type == 'mlp' else 'adam',
            weight_decay=3e-4 if model_type == 'mlp' else 0,
            scheduler_type='onecycle',
            max_lr=0.003 if model_type == 'mlp' else 0.00076,
            emb_dim=10,
            dropout_emb=0.2,
            mlp_bn=True,
            mlp_dropout=0.15,
            mlp_hidden=1280,
            seeds=[42, 2021, 1],
            cnn_hidden=2048,
            cnn_channel1=256,
            cnn_channel2=768,
            cnn_channel3=768,
            cnn_dropout_top=0.1*0.8,
            cnn_dropout_mid=0.3*0.8,
            cnn_dropout_bottom=0.2*0.8,
            cnn_weight_norm=False,
            cnn_two_stage=False,
            cnn_celu=True,
            cnn_kernel1=5
        )

        #wandb.config.update(asdict(config))

        for lag in [0, 3, 7, 14, 21, 28, 35, 45]:
            lag_meta = metadata['models'].setdefault(lag, {})
            lag_meta[f'{model_type}_mae'] = []
            # NOTE(review): 'pkl_path' is reset for every model type and the
            # scaler file name below does not contain model_type, so the 'cnn'
            # pass reuses the names from the 'mlp' pass — presumably the files
            # are overwritten by train_nn; confirm that this is intended.
            lag_meta['pkl_path'] = []
            lag_meta[f'{model_type}_model_path'] = []

            # The feature file only depends on the lag: read and filter it once
            # per lag instead of once per seed. Masks are applied in the same
            # order as for df_train_base.
            features = pd.read_feather(os.path.join(INPUT_DIR, f'X_{lag}_2nd.f'))
            features = features[season_mask]
            features = features[new_user_mask]

            for si, seed in enumerate(config.seeds):
                # Hand train_nn fresh objects for every run, as it received
                # when the data was reloaded per seed, so a run cannot be
                # affected by in-place changes made during an earlier one.
                df_train = df_train_base.copy()
                X = features.drop(targets, axis=1)
                Y = features[targets]

                lag_meta['columns'] = list(X.columns)

                if USE_UPDATED:
                    cv = TimeSeriesSplit('dailyDataDate', [
                        (('2018-01-01', '2021-06-01'), ('2021-06-01', '2021-08-01'))
                    ])
                else:
                    cv = TimeSeriesSplit('dailyDataDate', [
                        (('2018-01-01', '2021-04-01'), ('2021-04-01', '2021-05-01'))
                    ])

                pkl_path = f"scaler_{lag}_s{si}{postfix}.pkl"
                model_path = f"{model_type}_{lag}_s{si}{postfix}.pth"

                maes, predictions = train_nn(X, Y, cv, df_train,
                                             pkl_path=pkl_path,
                                             model_path=model_path,
                                             device=device,
                                             config=config,
                                             seed=seed,
                                             output_dir=OUTPUT_DIR)

                # NOTE(review): unlike the scaler/model names, this file name
                # carries no '_updated' postfix — confirm downstream readers
                # expect that before changing it.
                np.save(os.path.join(OUTPUT_DIR, f'{model_type}_lag{lag}_s{si}_oof.npy'), predictions[0])

                lag_meta[f'{model_type}_mae'].append(float(maes[-1]))
                lag_meta['pkl_path'].append(pkl_path)
                lag_meta[f'{model_type}_model_path'].append(model_path)

                #wandb.run.summary[f'lag{lag}_{model_type}_s{si}'] = float(maes[-1])
                logger.info(f'lag{lag}_{model_type}_s{si}: {float(maes[-1])}')
except Exception:
    # Keep the traceback in the log file, then re-raise: swallowing the error
    # would let the notebook carry on and write metadata for a partial run
    # without any visible sign of failure. KeyboardInterrupt is deliberately
    # not caught so a manual interrupt stops the run immediately.
    logger.exception('NN training failed')
    raise

In [None]:
# Write the run metadata next to the model artifacts. The path is derived from
# OUTPUT_DIR so it follows the configured artifact directory instead of a
# second hard-coded copy of it. Note that json.dump stores the integer lag
# keys of metadata['models'] as strings.
with open(os.path.join(OUTPUT_DIR, 'nn_meta.json'), 'w') as f:
    json.dump(metadata, f, indent=4)