In [1]:
import json
import os
import sys

import numpy as np
import pandas as pd
import wandb
from sklearn.metrics import mean_absolute_error

sys.path.append('../')

from src.parser import *
from src.train import *
from src.nn import *
from src.config import Config
from src.util import TimeSeriesSplit

# Notebook configuration.
# NOTE(review): USE_UPDATED is not read by any cell visible in this notebook;
# the "_updated" input file name is hard-coded where the data is loaded.
USE_UPDATED = True
# Directory holding the per-seed MLP / CNN out-of-fold (OOF) .npy files.
NN_DIR = 'artifacts/nn_new'
# Directories holding the OOF predictions and row indices of the two GBDT models.
GBDT_DIR = 'artifacts/gbdt_new'
GBDT2_DIR = 'artifacts/gbdt_new2'
# Directory containing the competition input files.
DATA_DIR = '../input/mlb-player-digital-engagement-forecasting'

# Load the engagement targets and reshape them with the project helper.
# All input files are resolved through DATA_DIR so the data location is
# configured in exactly one place.
base_df = pd.read_feather(os.path.join(DATA_DIR, 'train_nextDayPlayerEngagement_updated.f'))
train_df = make_df_base_from_train_engagement(base_df).reset_index()

# Row mask derived from the season calendar (semantics defined by
# get_mask_by_season_df in the project sources — presumably in-season dates; verify there).
season_df = pd.read_csv(os.path.join(DATA_DIR, 'seasons.csv'))
season_mask = get_mask_by_season_df(season_df, train_df)

# Ids of the players flagged `playerForTestSetAndFuturePreds` in players.csv.
players = pd.read_csv(os.path.join(DATA_DIR, 'players.csv'))
player_ids = set(players[players.playerForTestSetAndFuturePreds==True].playerId)

# Keep only the rows selected by the season mask and renumber them 0..n-1.
train_df = train_df[season_mask].reset_index(drop=True)


  import pandas.util.testing as tm


In [2]:
# Start with no row marked as having predictions; the evaluation cell fills these in.
for flag_col in ('has_gbdt_pred', 'has_nn_pred'):
    train_df[flag_col] = False

# Mark the rows whose player is in `player_ids`.
in_test_flags = train_df['playerId'].isin(player_ids)
train_df['in_test'] = in_test_flags

train_df

Unnamed: 0,date,dailyDataDate,playerId,target1,target2,target3,target4,has_gbdt_pred,has_nn_pred,in_test
0,20180329,2018-03-29,608365,2.040838,7.893376,0.052359,2.599964,False,False,False
1,20180329,2018-03-29,502624,1.068945,5.105543,2.722191,0.918550,False,False,True
2,20180329,2018-03-29,643338,0.385945,5.368812,0.074404,0.565661,False,False,True
3,20180329,2018-03-29,458681,0.013176,2.200179,0.915358,0.539713,False,False,True
4,20180329,2018-03-29,544925,0.000590,0.164543,0.000459,0.147902,False,False,False
...,...,...,...,...,...,...,...,...,...,...
1395292,20210717,2021-07-17,640451,0.000000,0.046667,0.001690,0.302612,False,False,True
1395293,20210717,2021-07-17,664208,0.000000,0.319489,0.001127,0.717058,False,False,True
1395294,20210717,2021-07-17,475054,0.000000,0.016154,0.003380,0.184198,False,False,True
1395295,20210717,2021-07-17,656492,0.000000,0.004487,0.000000,0.092099,False,False,False


In [14]:

# Number of seed runs per NN architecture; OOF files s0 .. s{N-1} are loaded for each.
MLP_SEEDS = 3
CNN_SEEDS = 3

# Accumulates one metrics dict per (lag, target) combination evaluated below.
recs = []

def get_mix_ratio(lag: int, target: str):
    """Return the blend weights ``[gbdt, cnn, mlp]`` for a lag / target pair.

    Lag 0 and unrecognised target names use GBDT only (``[1, 0, 0]``).
    Otherwise both NN models get weight 1 and the GBDT weight depends on the
    target (and, for ``target2``, on whether the lag is below 14).
    A fresh list is returned on every call.
    """
    if lag == 0:
        return [1, 0, 0]

    if target in ('target1', 'target3'):
        gbdt_weight = 1
    elif target == 'target2':
        gbdt_weight = 4 if lag < 14 else 2
    elif target == 'target4':
        gbdt_weight = 8
    else:
        # Unknown target name: fall back to GBDT only.
        return [1, 0, 0]

    return [gbdt_weight, 1, 1]

# --- Evaluation grid and blend settings --------------------------------------
LAGS = [0, 3, 7, 14, 21, 28, 35, 45]
TARGETS = [1, 2, 3, 4]
# Rows with `date` on/after this value (and in_test) are the ones carrying NN OOF predictions.
NN_OOF_START_DATE = 20210601
# GBDT weight w of the fixed "w:1:1" (gbdt:cnn:mlp) blends that are scored.
GBDT_MIX_WEIGHTS = [1, 2, 3, 4, 6, 8]

# Summary-table column -> prediction column scored for it.
# Dict order defines the column order of the resulting table.
SCORED_COLUMNS = {
    'gbdt': 'gbdt_pred',
    'gbdt2': 'gbdt2_pred',
    'gbdt_avg': 'gbdt_pred_avg',
    'gbdt_avg2': 'gbdt_pred_avg2',
    'cnn': 'cnn_pred_avg',
    'mlp': 'mlp_pred_avg',
    'cnn(single)': 'cnn_pred_0',
    'mlp(single)': 'mlp_pred_0',
    'cnn(half)': 'cnn_pred_avg_h',
    'mlp(half)': 'mlp_pred_avg_h',
}


def _load_seed_oofs(arch, lag, n_seeds):
    """Load the per-seed OOF arrays saved under NN_DIR for one architecture and lag."""
    return [
        np.load(os.path.join(NN_DIR, f'{arch}_lag{lag}_s{s}_oof.npy'))
        for s in range(n_seeds)
    ]


def _write_nn_preds(df, rows, arch, seed_oofs, target_idx):
    """Write one architecture's predictions for one target into ``df`` (in place).

    Creates/overwrites ``{arch}_pred_{i}`` for every seed, ``{arch}_pred_avg``
    (mean over all seeds) and ``{arch}_pred_avg_h`` (mean over the first half
    of the seeds) on the rows selected by the boolean mask ``rows``.
    """
    for i, oof in enumerate(seed_oofs):
        df.loc[rows, f'{arch}_pred_{i}'] = oof[:, target_idx]
    df.loc[rows, f'{arch}_pred_avg'] = np.mean(seed_oofs, axis=0)[:, target_idx]
    # Always keep at least one seed: a bare `len // 2` is 0 for a single seed,
    # which would average an empty list and fail.
    n_half = max(1, len(seed_oofs) // 2)
    df.loc[rows, f'{arch}_pred_avg_h'] = np.mean(seed_oofs[:n_half], axis=0)[:, target_idx]


# NN coverage does not depend on lag or target, so flag those rows once.
train_df['has_nn_pred'] = (train_df['date'] >= NN_OOF_START_DATE) & train_df['in_test']
nn_rows = train_df['has_nn_pred']

for LAG in LAGS:
    print(f"lag: {LAG}")

    # The NN OOF files are keyed by lag only, so load them once per lag.
    mlp_oof = _load_seed_oofs('mlp', LAG, MLP_SEEDS)
    cnn_oof = _load_seed_oofs('cnn', LAG, CNN_SEEDS)

    for TARGET in TARGETS:
        target_col = f'target{TARGET}'
        rec = {
            'lag': LAG,
            'target': TARGET
        }

        gbdt_oof = np.load(os.path.join(GBDT_DIR, f'{target_col}_lag{LAG}_oof.npy'))
        gbdt_mask = np.load(os.path.join(GBDT_DIR, f'{target_col}_lag{LAG}_indices.npy'))
        gbdt2_oof = np.load(os.path.join(GBDT2_DIR, f'{target_col}_lag{LAG}_oof.npy'))
        gbdt2_mask = np.load(os.path.join(GBDT2_DIR, f'{target_col}_lag{LAG}_indices.npy'))

        # Recompute the flags from scratch on every iteration. Without the
        # reset they accumulate the union of all earlier masks, so a later
        # (lag, target) would silently inherit rows from a previous one.
        train_df['has_gbdt_pred'] = False
        train_df.loc[gbdt_mask, 'has_gbdt_pred'] = True
        train_df['both_pred'] = train_df['has_nn_pred'] & train_df['has_gbdt_pred']

        _write_nn_preds(train_df, nn_rows, 'mlp', mlp_oof, TARGET - 1)
        _write_nn_preds(train_df, nn_rows, 'cnn', cnn_oof, TARGET - 1)

        gbdt_rows = train_df['has_gbdt_pred']
        gbdt_values = gbdt_oof[gbdt_mask]
        gbdt2_values = gbdt2_oof[gbdt2_mask]
        train_df.loc[gbdt_rows, 'gbdt_pred'] = gbdt_values
        train_df.loc[gbdt_rows, 'gbdt2_pred'] = gbdt2_values
        train_df.loc[gbdt_rows, 'gbdt_pred_avg'] = (gbdt_values + gbdt2_values) / 2
        train_df.loc[gbdt_rows, 'gbdt_pred_avg2'] = (2*gbdt_values + gbdt2_values) / 3

        pred_cols = (
            ['gbdt_pred', 'gbdt2_pred', 'gbdt_pred_avg', 'gbdt_pred_avg2', 'mlp_pred_avg', 'cnn_pred_avg']
            + [f'cnn_pred_{i}' for i in range(CNN_SEEDS)]
            + [f'mlp_pred_{i}' for i in range(MLP_SEEDS)]
        )

        # Score only the rows for which every model has a prediction.
        train_df_both_pred = train_df[train_df['both_pred']].reset_index(drop=True)

        gbdt_avg = train_df_both_pred['gbdt_pred_avg']
        cnn_avg = train_df_both_pred['cnn_pred_avg']
        mlp_avg = train_df_both_pred['mlp_pred_avg']

        # Fixed w:1:1 blends.
        mix_cols = []
        for w in GBDT_MIX_WEIGHTS:
            mix_col = f'mix_{w}_1_1'
            train_df_both_pred[mix_col] = (w*gbdt_avg + cnn_avg + mlp_avg) / (w + 2)
            mix_cols.append(mix_col)

        # Hand-picked blend for this lag/target.
        mx = get_mix_ratio(LAG, target_col)
        train_df_both_pred['mix_best'] = (mx[0]*gbdt_avg + mx[1]*cnn_avg + mx[2]*mlp_avg) / (mx[0]+mx[1]+mx[2])
        mix_cols.append('mix_best')

        # MAE of every individual model and every blend against the true target.
        y_true = train_df_both_pred[target_col]
        for metric_name, col in SCORED_COLUMNS.items():
            rec[metric_name] = mean_absolute_error(y_true, train_df_both_pred[col])
        for mix_col in mix_cols:
            rec[mix_col] = mean_absolute_error(y_true, train_df_both_pred[mix_col])

        recs.append(rec)

lag: 0
lag: 3
lag: 7
lag: 14
lag: 21
lag: 28
lag: 35
lag: 45


In [10]:
# Pairwise correlation between the prediction columns, shaded on a single global
# colour scale. `train_df_both_pred` holds whichever (LAG, TARGET) slice was built most recently.
pred_corr = train_df_both_pred[pred_cols].corr()
pred_corr.style.background_gradient(axis=None)

Unnamed: 0,gbdt_pred,gbdt2_pred,gbdt_pred_avg,gbdt_pred_avg2,mlp_pred_avg,cnn_pred_avg,cnn_pred_0,cnn_pred_1,cnn_pred_2,mlp_pred_0,mlp_pred_1,mlp_pred_2
gbdt_pred,1.0,0.959333,0.989292,0.995189,0.735622,0.713548,0.714988,0.698199,0.695441,0.738357,0.738446,0.725786
gbdt2_pred,0.959333,1.0,0.990259,0.982373,0.774123,0.758776,0.756542,0.744436,0.741229,0.776702,0.776247,0.764954
gbdt_pred_avg,0.989292,0.990259,1.0,0.998832,0.76312,0.744296,0.743851,0.72931,0.726292,0.765802,0.765611,0.753527
gbdt_pred_avg2,0.995189,0.982373,0.998832,1.0,0.755829,0.735884,0.736063,0.720743,0.717803,0.758535,0.758436,0.746133
mlp_pred_avg,0.735622,0.774123,0.76312,0.755829,1.0,0.970646,0.973247,0.946285,0.949325,0.998386,0.998296,0.997884
cnn_pred_avg,0.713548,0.758776,0.744296,0.735884,0.970646,1.0,0.981121,0.987403,0.986508,0.967962,0.967035,0.971766
cnn_pred_0,0.714988,0.756542,0.743851,0.736063,0.973247,0.981121,1.0,0.948991,0.950986,0.970907,0.970709,0.972891
cnn_pred_1,0.698199,0.744436,0.72931,0.720743,0.946285,0.987403,0.948991,1.0,0.966292,0.943902,0.942209,0.947703
cnn_pred_2,0.695441,0.741229,0.726292,0.717803,0.949325,0.986508,0.950986,0.966292,1.0,0.94606,0.945285,0.95161
mlp_pred_0,0.738357,0.776702,0.765802,0.758535,0.998386,0.967962,0.970907,0.943902,0.94606,1.0,0.995344,0.99428


In [15]:
# One row per (lag, target) record collected in `recs`; the remaining columns are
# the MAE of each model / blend. Rows are shaded independently (axis=1).
df_all = pd.DataFrame(recs)
mae_table = df_all.sort_values(by=['target', 'lag']).set_index(['lag', 'target'])
mae_table.style.background_gradient(cmap='hot',axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,gbdt,gbdt2,gbdt_avg,gbdt_avg2,cnn,mlp,cnn(single),mlp(single),cnn(half),mlp(half),mix_1_1_1,mix_2_1_1,mix_3_1_1,mix_4_1_1,mix_6_1_1,mix_8_1_1,mix_best
lag,target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1,0.848095,0.848441,0.843627,0.844104,0.900726,0.905486,0.905865,0.907418,0.905865,0.907418,0.868465,0.857854,0.852783,0.850093,0.847505,0.846245,0.843627
3,1,0.924368,0.928236,0.92198,0.921825,0.927646,0.923043,0.931907,0.922681,0.931907,0.922681,0.916501,0.915982,0.916246,0.916709,0.917526,0.918156,0.916501
7,1,0.925305,0.928364,0.922216,0.922171,0.923753,0.926289,0.926379,0.927677,0.926379,0.927677,0.915863,0.915303,0.915641,0.916121,0.917017,0.917767,0.915863
14,1,0.934797,0.936791,0.931456,0.931552,0.925604,0.928689,0.930897,0.930178,0.930897,0.930178,0.9195,0.92024,0.9214,0.92245,0.924154,0.925313,0.9195
21,1,0.929686,0.932971,0.927218,0.927062,0.931299,0.92976,0.939206,0.930729,0.939206,0.930729,0.920797,0.920113,0.920538,0.921076,0.921996,0.922741,0.920797
28,1,0.9303,0.933704,0.927234,0.927058,0.923,0.931207,0.931611,0.93277,0.931611,0.93277,0.916934,0.917279,0.918285,0.919207,0.920648,0.921723,0.916934
35,1,0.939675,0.933198,0.931809,0.933321,0.926691,0.931549,0.934878,0.931793,0.934878,0.931793,0.921232,0.921551,0.922478,0.92343,0.925117,0.926265,0.921232
45,1,0.938459,0.938485,0.933876,0.934355,0.930039,0.934203,0.949426,0.933628,0.949426,0.933628,0.923167,0.923397,0.924532,0.925517,0.927022,0.928114,0.923167
0,2,1.195693,1.197007,1.186159,1.18702,1.455945,1.44644,1.457888,1.448667,1.457888,1.448667,1.313338,1.263869,1.240124,1.226209,1.211395,1.204149,1.186159
3,2,1.436649,1.434425,1.426053,1.427436,1.496457,1.487531,1.501845,1.491192,1.501845,1.491192,1.430252,1.417173,1.413174,1.412084,1.412543,1.413786,1.412084


In [None]:
# Correlation matrix of the prediction columns for the current `train_df_both_pred`.
corr_matrix = train_df_both_pred[pred_cols].corr()
corr_matrix.style.background_gradient(axis=None)

In [None]:
# MAE of the GBDT_DIR model for the current TARGET on the rows in `train_df_both_pred`.
gbdt_mae = mean_absolute_error(
    train_df_both_pred[f'target{TARGET}'],
    train_df_both_pred['gbdt_pred'],
)
print(gbdt_mae)

In [None]:
# MAE of the MLP seed-1 predictions for the current TARGET on the rows in `train_df_both_pred`.
mlp_seed1_mae = mean_absolute_error(
    train_df_both_pred[f'target{TARGET}'],
    train_df_both_pred['mlp_pred_1'],
)
print(mlp_seed1_mae)

In [None]:

# Sanity checks: each flag column must select exactly as many rows as there are
# OOF predictions for the most recently loaded GBDT / MLP arrays.
n_gbdt_flagged = train_df['has_gbdt_pred'].sum()
n_nn_flagged = train_df['has_nn_pred'].sum()

assert n_gbdt_flagged == len(gbdt_oof[gbdt_mask])
assert n_nn_flagged == len(mlp_oof[0])

In [None]:
# Rebuild the evaluation frame: only the rows flagged in `both_pred`, renumbered from 0.
has_both = train_df['both_pred'] == True
train_df_both_pred = train_df.loc[has_both].reset_index(drop=True)


In [None]:
# Compare the GBDT model against the NN models for the current TARGET.
# No cell in this notebook creates an 'nn_pred' column (the NN outputs are stored
# as cnn_pred_* / mlp_pred_*), so looking it up raised KeyError. The NN prediction
# is derived here as the equal-weight average of the CNN and MLP seed averages —
# the same 1:1 NN weighting used by the mix_* blends.
y_true = train_df_both_pred[f'target{TARGET}']
nn_pred = (train_df_both_pred['cnn_pred_avg'] + train_df_both_pred['mlp_pred_avg']) / 2

print(mean_absolute_error(y_true, train_df_both_pred['gbdt_pred']))
print(mean_absolute_error(y_true, nn_pred))


In [None]:
# Sweep the GBDT share of a two-way GBDT / NN blend and print "<share>,<MAE>" per step.
# No cell in this notebook creates an 'nn_pred' column, so the original lookup
# raised KeyError. The NN prediction is derived locally as the equal-weight
# average of the CNN and MLP seed averages (the 1:1 NN weighting used by the
# mix_* blends).
y_true = train_df_both_pred[f'target{TARGET}']
gbdt_pred = train_df_both_pred['gbdt_pred']
nn_pred = (train_df_both_pred['cnn_pred_avg'] + train_df_both_pred['mlp_pred_avg']) / 2

for blend in [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]:
    blended = blend * gbdt_pred + (1-blend) * nn_pred
    print('{},{}'.format(blend, mean_absolute_error(y_true, blended)))