In [1]:
import json
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.pipeline import make_union, make_pipeline
from sklearn.metrics import mean_absolute_error as mae

from mllib.transformers import *
from src.pipelines.artifacts import *
from src.constants import (
    TARGETS,
    PLTWITTER,
    SCORES1,
    SCORES2,
    SCORES3,
    SCORES4,
    SCORES5,
    TEAM_SCORES1,
    TEAM_SCORES2,
    TEAM_SCORES3,
    TEAM_STANDINGS,
    AWARDS,
    ROSTERS,
    TRANSACTIONS,
    AWARDID_DICT
)
from src.constants import (
    playerid_mapping,
    teamid_mapping,
    targets_artifact,
    scores1_mean_artifact,
    scores1_first_artifact,
    scores1_last_artifact,
    scores2_mean_artifact,
    scores2_first_artifact,
    scores2_last_artifact,
    scores3_mean_artifact,
    scores3_first_artifact,
    scores3_last_artifact,
    scores4_mean_artifact,
    scores4_first_artifact,
    scores4_last_artifact,
    scores5_mean_artifact,
    scores5_first_artifact,
    scores5_last_artifact,
    team_scores1_mean_artifact,
    team_scores2_mean_artifact,
    team_scores3_mean_artifact,
    awards_artifact,
    rosters_artifact,
    player_twitter_artifact,
    transactions_artifact,
    team_standings_artifact,
    event_artifact
)

In [2]:
# Notebook-wide configuration: input files, artifact location, compute device,
# feature-cache switches and the random seed.

# --- Input files and artifact location ---
TRAIN_FILE, PLAYERS_FILE = "data/train_updated.csv", "data/players.csv"
artifacts_path = 'data/artifacts/v01'  # prefix of every feature-artifact path built below

# --- Validation split ---
# Integer yyyymmdd cut-off; the commented-out index-building cell puts rows with
# date >= this value into the validation index.
VAL_START_DATE = 20210601

# --- Compute device handed to the feature transformers ---
# Both spellings are referenced by the pipeline definitions, so they are bound together.
device = DEVICE = 'gpu'

# --- Feature cache switches ---
LOAD_FEATURES = True       # read the cached .npy feature matrices instead of running the pipelines
SAVE_FEATURES = False      # write the feature matrices to the .npy cache
TRAIN_SEASON_ONLY = True   # keep only rows whose last feature column is > 0

# --- Reproducibility ---
SEED = 786

In [3]:
# One-off index construction, kept commented out for reference.
# When enabled it splits TRAIN_FILE at VAL_START_DATE, parses the
# "nextDayPlayerEngagement" column with ParsePlayerData, keeps only players
# flagged playerForTestSetAndFuturePreds in PLAYERS_FILE, and writes the two
# *_index_smallv01.csv files that the next cell reads back.
# NOTE(review): target_enc is re-fitted on the validation split (fit_transform);
# presumably harmless for a pure parser -- confirm before re-enabling.
# raw_data = pd.read_csv(TRAIN_FILE)
# tr = raw_data.loc[raw_data.date < VAL_START_DATE]
# val = raw_data.loc[raw_data.date >= VAL_START_DATE]
# print(raw_data.shape, val.shape)

# roster_2021 = pd.read_csv(PLAYERS_FILE)
# roster_2021 = roster_2021.loc[roster_2021.playerForTestSetAndFuturePreds == True]
# target_enc = ParsePlayerData("nextDayPlayerEngagement", TARGETS)
# tr_index = target_enc.fit_transform(tr).reset_index(drop=False)
# tr_index = tr_index.loc[tr_index.playerId.isin(roster_2021.playerId.astype(str))]
# # tr_index['debutdate'] = tr_index.map()
# vl_index = target_enc.fit_transform(val).reset_index(drop=False)
# vl_index = vl_index.loc[vl_index.playerId.isin(roster_2021.playerId.astype(str))]
# tr_index.to_csv("data/tr_index_smallv01.csv", index=False)
# vl_index.to_csv("data/vl_index_smallv01.csv", index=False)

In [4]:
# Read the pre-built train / validation index frames from disk and show their
# shapes as the cell output.
tr_index, vl_index = (
    pd.read_csv(csv_path)
    for csv_path in ("data/tr_index_smallv01.csv", "data/vl_index_smallv01.csv")
)

tr_index.shape, vl_index.shape

((1480189, 6), (55789, 6))

In [5]:
# Eyeball the season calendar: raise the row display limit, load the formatted
# seasons table and show up to 100 rows whose integer `date` is after 20210701.
pd.set_option("display.max_rows", 100)
seasons = pd.read_csv("data/seasons_formatted.csv")
seasons[seasons["date"].gt(20210701)].head(100)

Unnamed: 0,date,seasonflag,season_start,season_end,all_star
1278,20210702,2,2021-02-28,2021-10-31,2021-07-13
1279,20210703,2,2021-02-28,2021-10-31,2021-07-13
1280,20210704,2,2021-02-28,2021-10-31,2021-07-13
1281,20210705,2,2021-02-28,2021-10-31,2021-07-13
1282,20210706,2,2021-02-28,2021-10-31,2021-07-13
1283,20210707,2,2021-02-28,2021-10-31,2021-07-13
1284,20210708,2,2021-02-28,2021-10-31,2021-07-13
1285,20210709,2,2021-02-28,2021-10-31,2021-07-13
1286,20210710,2,2021-02-28,2021-10-31,2021-07-13
1287,20210711,2,2021-02-28,2021-10-31,2021-07-13


In [None]:
# Column positions of the four targets. This cell shows no execution count and
# the name is not referenced by any other cell visible in this notebook.
targets_cols = list(range(4))

In [6]:
# Target-history features for the TRAINING rows: one ExpandingMean over target
# columns 0-3 per N in (7, 30, 60, 300), all with skip=10.
# NOTE(review): N / skip are passed straight to mllib's ExpandingMean; presumably
# N is the look-back length and skip the number of most recent days withheld -- confirm.
target_stats_train2 = make_union(
    *[
        ExpandingMean(
            'date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{targets_artifact}',
            N=window, skip=10, device=DEVICE, fill_value=0,
        )
        for window in (7, 30, 60, 300)
    ]
)

# Target-history features for the VALIDATION / test rows: the same four
# ExpandingMean windows as the training variant, but with skip=1 instead of 10.
target_stats_test2 = make_union(
    *[
        ExpandingMean(
            'date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{targets_artifact}',
            N=window, skip=1, device=DEVICE, fill_value=0,
        )
        for window in (7, 30, 60, 300)
    ]
)

# Per-player features built from the awards, transactions, rosters and player
# Twitter artifacts. The order of the entries fixes the column order of the
# resulting feature matrix, so it must not be changed.
other_features2 = make_union(
    # --- awards ---
    LagN(
        'date', 'playerId', [0], f'{artifacts_path}/train_awards.pkl',
        N=1, skip=0, fill_value=-1, device=device,
    ),
    ExpandingCount(
        'date', 'playerId', [0], f'{artifacts_path}/train_awards.pkl',
        N=365, skip=0, fill_value=0, device=device,
    ),
    # --- transactions ---
    LagN(
        'date', 'playerId', [0, 1, 2], f'{artifacts_path}/train_transactions.pkl',
        N=1, skip=0, fill_value=-1, device=device,
    ),
    # --- rosters: latest value plus counts for N=30 and N=300 ---
    LagN(
        'date', 'playerId', [0], f'{artifacts_path}/train_rosters.pkl',
        N=1, skip=0, fill_value=-1, device=device,
    ),
    *[
        ExpandingCount(
            'date', 'playerId', [0], f'{artifacts_path}/train_rosters.pkl',
            N=window, skip=0, fill_value=0, device=device,
        )
        for window in (30, 300)
    ],
    # --- player twitter: lagged value passed through log1p ---
    make_pipeline(
        LagN(
            'date', 'playerId', [0], f'{artifacts_path}/train_pltwitter.pkl',
            N=1, skip=0, fill_value=0, device=device,
        ),
        FunctionTransfomer(np.log1p),
    ),
)

# Features from the scores1 "mean" artifact:
#   * LagN with N=1 for columns 0-3 (fill -1) and for column 4 (fill 0),
#   * LagN with N=2..14 for columns 0 and 4,
#   * ExpandingCount with N=30 on column 0.
scores12 = make_union(
    LagN(
        'date', 'playerId', [0, 1, 2, 3], f'{artifacts_path}/{scores1_mean_artifact}',
        N=1, skip=0, fill_value=-1, device=device,
    ),
    LagN(
        'date', 'playerId', [4], f'{artifacts_path}/{scores1_mean_artifact}',
        N=1, skip=0, fill_value=0, device=device,
    ),
    *[
        LagN(
            'date', 'playerId', [0, 4], f'{artifacts_path}/{scores1_mean_artifact}',
            N=lag, skip=0, fill_value=-1, device=device,
        )
        for lag in range(2, 15)
    ],
    ExpandingCount(
        'date', 'playerId', [0], f'{artifacts_path}/{scores1_mean_artifact}',
        N=30, skip=0, fill_value=0, device=device,
    ),
)

# Features from the scores3 "mean" artifact. The same three column groups
# ([0-3], [4-7], [8-10]) are used for a LagN with N=1 and for ExpandingSum with
# N=10 and N=200. Emission order: all lags, then all N=10 sums, then all N=200 sums.
scores32 = make_union(
    *[
        LagN(
            'date', 'playerId', cols, f'{artifacts_path}/{scores3_mean_artifact}',
            N=1, skip=0, fill_value=0, device=device,
        )
        for cols in ([0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10])
    ],
    *[
        ExpandingSum(
            'date', 'playerId', cols, f'{artifacts_path}/{scores3_mean_artifact}',
            N=window, skip=0, fill_value=0, device=device,
        )
        for window in (10, 200)
        for cols in ([0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10])
    ],
)

# Features from the scores2 "mean" artifact, processed in six groups of four
# consecutive columns (0-3, 4-7, ..., 20-23):
#   * LagN with N=1, 2, 3 for each group (group-major order),
#   * ExpandingMean with N=30 for each group,
#   * ExpandingSum with N=30 for each group, then with N=150 for each group.
scores22 = make_union(
    *[
        LagN(
            'date', 'playerId', list(range(start, start + 4)), f'{artifacts_path}/{scores2_mean_artifact}',
            N=lag, skip=0, fill_value=0, device=device,
        )
        for start in range(0, 24, 4)
        for lag in (1, 2, 3)
    ],
    *[
        ExpandingMean(
            'date', 'playerId', list(range(start, start + 4)), f'{artifacts_path}/{scores2_mean_artifact}',
            N=30, skip=0, fill_value=0, device=device,
        )
        for start in range(0, 24, 4)
    ],
    *[
        ExpandingSum(
            'date', 'playerId', list(range(start, start + 4)), f'{artifacts_path}/{scores2_mean_artifact}',
            N=window, skip=0, fill_value=0, device=device,
        )
        for window in (30, 150)
        for start in range(0, 24, 4)
    ],
)

# Features from the scores4 artifacts. Columns are handled as three groups of
# four (0-3, 4-7, 8-11) plus the trailing group [12, 13, 14]:
#   * LagN with N=1, 2,
#   * ExpandingMean with N=30,
#   * ExpandingSum with N=30 and N=150.
# NOTE(review): the two ExpandingSum entries for [12, 13, 14] read
# scores4_last_artifact while every other entry reads scores4_mean_artifact.
# This is kept as-is because the cached feature matrices were built this way;
# confirm whether it is intentional.
scores42 = make_union(
    *[
        LagN(
            'date', 'playerId', list(range(start, start + 4)), f'{artifacts_path}/{scores4_mean_artifact}',
            N=lag, skip=0, fill_value=0, device=device,
        )
        for start in (0, 4, 8)
        for lag in (1, 2)
    ],
    *[
        LagN(
            'date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores4_mean_artifact}',
            N=lag, skip=0, fill_value=0, device=device,
        )
        for lag in (1, 2)
    ],
    *[
        ExpandingMean(
            'date', 'playerId', list(range(start, start + 4)), f'{artifacts_path}/{scores4_mean_artifact}',
            N=30, skip=0, fill_value=0, device=device,
        )
        for start in (0, 4, 8)
    ],
    ExpandingMean(
        'date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores4_mean_artifact}',
        N=30, skip=0, fill_value=0, device=device,
    ),
    *[
        ExpandingSum(
            'date', 'playerId', list(range(start, start + 4)), f'{artifacts_path}/{scores4_mean_artifact}',
            N=30, skip=0, fill_value=0, device=device,
        )
        for start in (0, 4, 8)
    ],
    ExpandingSum(
        'date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores4_last_artifact}',
        N=30, skip=0, fill_value=0, device=device,
    ),
    *[
        ExpandingSum(
            'date', 'playerId', list(range(start, start + 4)), f'{artifacts_path}/{scores4_mean_artifact}',
            N=150, skip=0, fill_value=0, device=device,
        )
        for start in (0, 4, 8)
    ],
    ExpandingSum(
        'date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores4_last_artifact}',
        N=150, skip=0, fill_value=0, device=device,
    ),
)


# Features from the scores5 "mean" artifact:
#   * LagN with N=1 for five groups of four columns (0-19) and for column 20,
#   * ExpandingSum with N=30 then N=150 for groups 0-3 / 4-7 / 8-11,
#   * ExpandingSum with N=30 then N=150 for [12, 13, 14],
#   * ExpandingMean with N=30 for groups 0-3 / 4-7 / 8-11 and for [12, 13, 14].
scores52 = make_union(
    *[
        LagN(
            'date', 'playerId', list(range(start, start + 4)), f'{artifacts_path}/{scores5_mean_artifact}',
            N=1, skip=0, fill_value=0, device=device,
        )
        for start in range(0, 20, 4)
    ],
    LagN(
        'date', 'playerId', [20], f'{artifacts_path}/{scores5_mean_artifact}',
        N=1, skip=0, fill_value=0, device=device,
    ),
    *[
        ExpandingSum(
            'date', 'playerId', list(range(start, start + 4)), f'{artifacts_path}/{scores5_mean_artifact}',
            N=window, skip=0, fill_value=0, device=device,
        )
        for window in (30, 150)
        for start in (0, 4, 8)
    ],
    *[
        ExpandingSum(
            'date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores5_mean_artifact}',
            N=window, skip=0, fill_value=0, device=device,
        )
        for window in (30, 150)
    ],
    *[
        ExpandingMean(
            'date', 'playerId', list(range(start, start + 4)), f'{artifacts_path}/{scores5_mean_artifact}',
            N=30, skip=0, fill_value=0, device=device,
        )
        for start in (0, 4, 8)
    ],
    ExpandingMean(
        'date', 'playerId', [12, 13, 14], f'{artifacts_path}/{scores5_mean_artifact}',
        N=30, skip=0, fill_value=0, device=device,
    ),
)

# Date-keyed features (DateLagN takes no player key) with N=1, built from the
# scores2 and scores4 "mean" artifacts in six groups of four columns each,
# followed by scores4 columns [12, 13, 14].
# NOTE(review): the scores4 groups reach columns 12-23 and the final entry
# repeats columns 12-14 that the fourth group already covers; kept unchanged
# so the feature layout matches the cached matrices -- confirm it is intended.
all_players2 = make_union(
    *[
        DateLagN(
            'date', list(range(start, start + 4)), f'{artifacts_path}/{scores2_mean_artifact}',
            N=1, skip=0, device=device,
        )
        for start in range(0, 24, 4)
    ],
    *[
        DateLagN(
            'date', list(range(start, start + 4)), f'{artifacts_path}/{scores4_mean_artifact}',
            N=1, skip=0, device=device,
        )
        for start in range(0, 24, 4)
    ],
    DateLagN(
        'date', [12, 13, 14], f'{artifacts_path}/{scores4_mean_artifact}',
        N=1, skip=0, device=device,
    ),
)

# Event features: LagN with N=1 and then N=2 over four groups of four columns
# (0-15) of the event artifact, followed by the `seasonflag` looked up by date.
# The MapToCol entry is deliberately last: the season filter further down reads
# the final feature column, which is presumably this flag -- keep it at the end.
events2 = make_union(
    *[
        LagN(
            'date', 'playerId', list(range(start, start + 4)), f'{artifacts_path}/{event_artifact}',
            N=lag, skip=0, fill_value=0, device=device,
        )
        for lag in (1, 2)
        for start in (0, 4, 8, 12)
    ],
    MapToCol(
        map_col='date',
        attr='seasonflag',
        mapper_input='seasons_formatted.csv',
        mapper_pipeline=DataLoader(artifacts_path),
    ),
)

In [7]:
# Full feature pipelines. The train and test variants differ only in their
# first member (target statistics with skip=10 vs skip=1); every other union
# is the same object shared by both. events2 stays last so its final column
# remains the last column of the feature matrix.
feature_pipeline_tr2, feature_pipeline_te2 = (
    make_union(
        target_stats,
        other_features2,
        scores12,
        scores22,
        scores32,
        scores42,
        scores52,
        all_players2,
        events2,
    )
    for target_stats in (target_stats_train2, target_stats_test2)
)

In [8]:
%%time
# Obtain the feature matrices: read the cached arrays when LOAD_FEATURES is
# set, otherwise run the train / test pipelines over the index frames.
if LOAD_FEATURES:
    X_tr = np.load("data/X_tr_v201_skip10.npy")
    X_vl = np.load("data/X_vl_v201_skip10.npy")
else:
    X_tr = feature_pipeline_tr2.transform(tr_index)
    X_vl = feature_pipeline_te2.transform(vl_index)

# Targets come from the index frames, one column per entry in TARGETS.
y_tr, y_vl = tr_index[TARGETS].values, vl_index[TARGETS].values
print(X_tr.shape, X_vl.shape)

(1480189, 459) (55789, 459)
CPU times: user 8.33 ms, sys: 2 s, total: 2.01 s
Wall time: 2.01 s


In [9]:
# Persist freshly computed feature matrices to the .npy cache.
# The save is skipped when the matrices were themselves loaded from that cache
# (LOAD_FEATURES): rewriting them would only duplicate a multi-GB write, and if
# this cell were re-run after the season filter it would overwrite the cache
# with the filtered rows.
if SAVE_FEATURES and not LOAD_FEATURES:
    np.save("data/X_tr_v201_skip10.npy", X_tr)
    np.save("data/X_vl_v201_skip10.npy", X_vl)
elif SAVE_FEATURES:
    print("SAVE_FEATURES ignored: X_tr / X_vl were loaded from the cache files")

In [10]:
# Optionally drop rows whose last feature column is not > 0 from both splits.
# (The last column is presumably the `seasonflag` appended by events2 -- confirm.)
# Features and targets are filtered with the same mask so they stay aligned.
if TRAIN_SEASON_ONLY:
    keep_tr = X_tr[:, -1] > 0
    X_tr, y_tr = X_tr[keep_tr], y_tr[keep_tr]

    keep_vl = X_vl[:, -1] > 0
    X_vl, y_vl = X_vl[keep_vl], y_vl[keep_vl]

    print(X_tr.shape, X_vl.shape, y_tr.shape, y_vl.shape)

(1003015, 459) (55789, 459) (1003015, 4) (55789, 4)


In [11]:
# One LightGBM dataset per target column; each validation dataset is tied to
# its training dataset through `reference`.
tr1, tr2, tr3, tr4 = (lgb.Dataset(X_tr, y_tr[:, target]) for target in range(4))
vl1, vl2, vl3, vl4 = (
    lgb.Dataset(X_vl, y_vl[:, target], reference=train_set)
    for target, train_set in enumerate((tr1, tr2, tr3, tr4))
)

# Hyper-parameters shared by all four per-target models.
params = {
    # boosting schedule (upper bound on rounds; early stopping ends training sooner)
    'n_estimators': 5000,
    'learning_rate': 0.02,
    # tree shape
    'num_leaves': 255,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    # column / row sampling
    'colsample_bytree': 0.6,
    'subsample': 0.95,
    'bagging_freq': 1,
    # regularisation
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'extra_trees': False,
    'max_bin': 127,
    # GPU training left disabled
    #'device': 'gpu',
    #'gpu_use_dp': False,
    #'gpu_device_id': 0,
    # objective and metric: mean absolute error
    'boost_from_average': True,
    'reg_sqrt': True,
    'objective': 'mae',
    'metric': 'mae',
    'verbose': -1,
    'seed': SEED,
    'min_data_per_group': 10,
    'cat_l2': 10,
    'cat_smooth': 10,
    'num_threads': 16
}

# Target 1: train with early stopping on the validation set, then score it.
bst1 = lgb.train(
    params,
    tr1,
    valid_sets=[vl1],
    early_stopping_rounds=200,
    verbose_eval=50,
)
pred21 = bst1.predict(X_vl)
print(mae(y_vl[:, 0], pred21))

# Experiment log for target 1 (validation MAE):
# 0.9509 (skip 3); 0.9478 (skip 10); 0.9481 (skip 15); 0.9473 (skip 7); 0.9475 (skip 12)
# sticking with skip10
# targetmean7 -> 10; 0.9477
# targetmean10, 30 -> 15, 60 ; ?
# targetmean7, 30, 60, 300 - 0.9470
# event features-lag2 - 0.9466
# dont filter outsideseason - 0.9482
# colsample - 0.6 -> 0.9474



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.07444
[100]	valid_0's l1: 1.00887
[150]	valid_0's l1: 0.983712
[200]	valid_0's l1: 0.973629
[250]	valid_0's l1: 0.968343
[300]	valid_0's l1: 0.964994
[350]	valid_0's l1: 0.962589
[400]	valid_0's l1: 0.961117
[450]	valid_0's l1: 0.960301
[500]	valid_0's l1: 0.959514
[550]	valid_0's l1: 0.959007
[600]	valid_0's l1: 0.958506
[650]	valid_0's l1: 0.9579
[700]	valid_0's l1: 0.957547
[750]	valid_0's l1: 0.95671
[800]	valid_0's l1: 0.95606
[850]	valid_0's l1: 0.955646
[900]	valid_0's l1: 0.955002
[950]	valid_0's l1: 0.954481
[1000]	valid_0's l1: 0.954096
[1050]	valid_0's l1: 0.953801
[1100]	valid_0's l1: 0.95337
[1150]	valid_0's l1: 0.952905
[1200]	valid_0's l1: 0.952589
[1250]	valid_0's l1: 0.952136
[1300]	valid_0's l1: 0.951924
[1350]	valid_0's l1: 0.951725
[1400]	valid_0's l1: 0.95157
[1450]	valid_0's l1: 0.95152
[1500]	valid_0's l1: 0.951378
[1550]	valid_0's l1: 0.951056
[1600]	valid_0's l1: 0.950994
[1650]	

In [12]:
def _fit_one_target(train_set, valid_set, target):
    """Train one booster with the shared `params` and score it on the validation set.

    Predicts on X_vl, prints the MAE against column `target` of y_vl and
    returns the pair (booster, validation predictions).
    """
    booster = lgb.train(
        params,
        train_set,
        valid_sets=[valid_set],
        early_stopping_rounds=200,
        verbose_eval=50,
    )
    valid_pred = booster.predict(X_vl)
    print(mae(y_vl[:, target], valid_pred))
    return booster, valid_pred


# Target 2
bst2, pred22 = _fit_one_target(tr2, vl2, 1)
# (skip 15) 1.5609, 1.5578 (skip 10), 1.5864 (skip 3), 1.5665 (skip 7); 1.5591 (skip 12)
# targetmean7 -> 10 ; 1.5473
# targetmean10, 30 -> 15, 60; 1.5417
# targetmean7, 30, 60, 300 - 1.5444
# event features-lag2 - 1.5492

# Target 3
bst3, pred23 = _fit_one_target(tr3, vl3, 2)
# (skip 15) 0.7469, 0.7459 (skip 10), 0.7488 (skip 3), 0.7463 (skip 7); 0.7442 (skip 12)
# targetmean7 -> 10 ; 0.7465
# targetmean10, 30 -> 15, 60; 0.7470
# targetmean7, 30, 60, 300 - 0.7458
# event features-lag2 - 0.7446

# Target 4
bst4, pred24 = _fit_one_target(tr4, vl4, 3)
# (skip 15) 1.1930 , 1.1812 (skip 10), 1.1863 (skip 3); 1.1818 (skip 7); 1.1834 (skip 12)
# targetmean7 -> 10 1.1974;
# targetmean10, 30 -> 15, 60; 1.1931
# targetmean7, 30, 60, 300 - 1.1909
# event features-lag2 - 1.1867

# Overall score: stack the four per-target predictions column-wise and compare
# against the full target matrix.
preds_2 = np.vstack([pred21, pred22, pred23, pred24]).T
print(mae(y_vl, preds_2))
# 1.1122 (skip 15), 1.1082 (skip 10), 1.1181 (skip 3); 1.1105 (skip 7); 1.1085 (skip 12)
# targetmean7 -> 10 1.1097;
# targetmean10, 30 -> 15, 60; 1.1076
# targetmean7, 30, 60, 300 - 1.1070
# event features-lag2 - 1.1068
# colsample 0.6 - 1.1062



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.69316
[100]	valid_0's l1: 1.59242
[150]	valid_0's l1: 1.57424
[200]	valid_0's l1: 1.57027
[250]	valid_0's l1: 1.56631
[300]	valid_0's l1: 1.56245
[350]	valid_0's l1: 1.55655
[400]	valid_0's l1: 1.55535
[450]	valid_0's l1: 1.55377
[500]	valid_0's l1: 1.55218
[550]	valid_0's l1: 1.55066
[600]	valid_0's l1: 1.55016
[650]	valid_0's l1: 1.54928
[700]	valid_0's l1: 1.54812
[750]	valid_0's l1: 1.54692
[800]	valid_0's l1: 1.54704
[850]	valid_0's l1: 1.54671
[900]	valid_0's l1: 1.54637
[950]	valid_0's l1: 1.54646
[1000]	valid_0's l1: 1.546
[1050]	valid_0's l1: 1.54618
[1100]	valid_0's l1: 1.54596
[1150]	valid_0's l1: 1.54567
[1200]	valid_0's l1: 1.54514
[1250]	valid_0's l1: 1.54425
[1300]	valid_0's l1: 1.54415
[1350]	valid_0's l1: 1.54453
[1400]	valid_0's l1: 1.54462
[1450]	valid_0's l1: 1.54471
[1500]	valid_0's l1: 1.54486
Early stopping, best iteration is:
[1303]	valid_0's l1: 1.5441
1.5440975551122953




Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.784458
[100]	valid_0's l1: 0.774495
[150]	valid_0's l1: 0.768536
[200]	valid_0's l1: 0.765054
[250]	valid_0's l1: 0.76345
[300]	valid_0's l1: 0.762171
[350]	valid_0's l1: 0.761037
[400]	valid_0's l1: 0.759674
[450]	valid_0's l1: 0.758554
[500]	valid_0's l1: 0.757864
[550]	valid_0's l1: 0.756976
[600]	valid_0's l1: 0.756197
[650]	valid_0's l1: 0.755564
[700]	valid_0's l1: 0.754954
[750]	valid_0's l1: 0.754593
[800]	valid_0's l1: 0.754312
[850]	valid_0's l1: 0.753369
[900]	valid_0's l1: 0.75291
[950]	valid_0's l1: 0.752671
[1000]	valid_0's l1: 0.752548
[1050]	valid_0's l1: 0.752312
[1100]	valid_0's l1: 0.752123
[1150]	valid_0's l1: 0.751798
[1200]	valid_0's l1: 0.751478
[1250]	valid_0's l1: 0.751005
[1300]	valid_0's l1: 0.7507
[1350]	valid_0's l1: 0.750566
[1400]	valid_0's l1: 0.750515
[1450]	valid_0's l1: 0.7501
[1500]	valid_0's l1: 0.749828
[1550]	valid_0's l1: 0.749814
[1600]	valid_0's l1: 0.749614
[165



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.31683
[100]	valid_0's l1: 1.24172
[150]	valid_0's l1: 1.2267
[200]	valid_0's l1: 1.22049
[250]	valid_0's l1: 1.21927
[300]	valid_0's l1: 1.21403
[350]	valid_0's l1: 1.2128
[400]	valid_0's l1: 1.20983
[450]	valid_0's l1: 1.20727
[500]	valid_0's l1: 1.20567
[550]	valid_0's l1: 1.20498
[600]	valid_0's l1: 1.20368
[650]	valid_0's l1: 1.20352
[700]	valid_0's l1: 1.20351
[750]	valid_0's l1: 1.20292
[800]	valid_0's l1: 1.20251
[850]	valid_0's l1: 1.20174
[900]	valid_0's l1: 1.20117
[950]	valid_0's l1: 1.20064
[1000]	valid_0's l1: 1.20002
[1050]	valid_0's l1: 1.19943
[1100]	valid_0's l1: 1.19815
[1150]	valid_0's l1: 1.1971
[1200]	valid_0's l1: 1.19559
[1250]	valid_0's l1: 1.19503
[1300]	valid_0's l1: 1.19435
[1350]	valid_0's l1: 1.19456
[1400]	valid_0's l1: 1.19438
[1450]	valid_0's l1: 1.19404
[1500]	valid_0's l1: 1.19325
[1550]	valid_0's l1: 1.19302
[1600]	valid_0's l1: 1.19297
[1650]	valid_0's l1: 1.19261
[170

In [13]:
# Write the four first-run boosters to disk with Booster.save_model.
for booster, model_path in (
    (bst1, "artifacts/bst1_train_v401_1.pkl"),
    (bst2, "artifacts/bst2_train_v401_1.pkl"),
    (bst3, "artifacts/bst3_train_v401_1.pkl"),
    (bst4, "artifacts/bst4_train_v401_1.pkl"),
):
    booster.save_model(model_path)

<lightgbm.basic.Booster at 0x7fe427094ee0>

In [15]:
# Store the first-run validation predictions, one array per target.
for npy_path, valid_pred in (
    ("data/lgb_t1_logv401_skip10_1.npy", pred21),
    ("data/lgb_t2_logv401_skip10_1.npy", pred22),
    ("data/lgb_t3_logv401_skip10_1.npy", pred23),
    ("data/lgb_t4_logv401_skip10_1.npy", pred24),
):
    np.save(npy_path, valid_pred)


In [14]:
# Seed for the repeat training run. `params` is rebuilt in the following cell,
# which is where its 'seed' entry picks up this new value.
SEED = 20201102

In [16]:
# Repeat of the four-target training run using the re-assigned SEED. This
# rebinds tr*/vl*, params, bst1..bst4, pred21..pred24 and preds_2.
tr1, tr2, tr3, tr4 = (lgb.Dataset(X_tr, y_tr[:, target]) for target in range(4))
vl1, vl2, vl3, vl4 = (
    lgb.Dataset(X_vl, y_vl[:, target], reference=train_set)
    for target, train_set in enumerate((tr1, tr2, tr3, tr4))
)

# Same hyper-parameters as the first run; rebuilt so 'seed' reads the current SEED.
params = {
    # boosting schedule (upper bound on rounds; early stopping ends training sooner)
    'n_estimators': 5000,
    'learning_rate': 0.02,
    # tree shape
    'num_leaves': 255,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    # column / row sampling
    'colsample_bytree': 0.6,
    'subsample': 0.95,
    'bagging_freq': 1,
    # regularisation
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'extra_trees': False,
    'max_bin': 127,
    # GPU training left disabled
    #'device': 'gpu',
    #'gpu_use_dp': False,
    #'gpu_device_id': 0,
    # objective and metric: mean absolute error
    'boost_from_average': True,
    'reg_sqrt': True,
    'objective': 'mae',
    'metric': 'mae',
    'verbose': -1,
    'seed': SEED,
    'min_data_per_group': 10,
    'cat_l2': 10,
    'cat_smooth': 10,
    'num_threads': 16
}

# Train the targets one after another, printing each validation MAE as soon as
# its model finishes.
second_run = []
for target, (train_set, valid_set) in enumerate(
    zip((tr1, tr2, tr3, tr4), (vl1, vl2, vl3, vl4))
):
    booster = lgb.train(
        params,
        train_set,
        valid_sets=[valid_set],
        early_stopping_rounds=200,
        verbose_eval=50,
    )
    valid_pred = booster.predict(X_vl)
    print(mae(y_vl[:, target], valid_pred))
    second_run.append((booster, valid_pred))

(bst1, pred21), (bst2, pred22), (bst3, pred23), (bst4, pred24) = second_run

# Overall score across the four targets.
preds_2 = np.vstack([pred21, pred22, pred23, pred24]).T
print(mae(y_vl, preds_2))

# Notes carried over verbatim from the first-run cells (skip experiments):
# target 3: (skip 15) 0.7469, 0.7459 (skip 10), 0.7488 (skip 3), 0.7463 (skip 7); 0.7442 (skip 12)
# target 4: (skip 15) 1.1930 , 1.1812 (skip 10), 1.1863 (skip 3); 1.1818 (skip 7); 1.1834 (skip 12)
# overall:  1.1122 (skip 15), 1.1082 (skip 10), 1.1181 (skip 3); 1.1105 (skip 7); 1.1085 (skip 12)




Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.07336
[100]	valid_0's l1: 1.00759
[150]	valid_0's l1: 0.983914
[200]	valid_0's l1: 0.973718
[250]	valid_0's l1: 0.968317
[300]	valid_0's l1: 0.965097
[350]	valid_0's l1: 0.962651
[400]	valid_0's l1: 0.960712
[450]	valid_0's l1: 0.959735
[500]	valid_0's l1: 0.959482
[550]	valid_0's l1: 0.95901
[600]	valid_0's l1: 0.958676
[650]	valid_0's l1: 0.958325
[700]	valid_0's l1: 0.957557
[750]	valid_0's l1: 0.957184
[800]	valid_0's l1: 0.956315
[850]	valid_0's l1: 0.955277
[900]	valid_0's l1: 0.954856
[950]	valid_0's l1: 0.954535
[1000]	valid_0's l1: 0.954305
[1050]	valid_0's l1: 0.954171
[1100]	valid_0's l1: 0.954042
[1150]	valid_0's l1: 0.953948
[1200]	valid_0's l1: 0.953851
[1250]	valid_0's l1: 0.953358
[1300]	valid_0's l1: 0.95303
[1350]	valid_0's l1: 0.952978
[1400]	valid_0's l1: 0.952624
[1450]	valid_0's l1: 0.952386
[1500]	valid_0's l1: 0.952117
[1550]	valid_0's l1: 0.952114
[1600]	valid_0's l1: 0.952077
[1



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.69435
[100]	valid_0's l1: 1.59682
[150]	valid_0's l1: 1.5821
[200]	valid_0's l1: 1.57239
[250]	valid_0's l1: 1.57063
[300]	valid_0's l1: 1.56919
[350]	valid_0's l1: 1.56664
[400]	valid_0's l1: 1.56377
[450]	valid_0's l1: 1.56134
[500]	valid_0's l1: 1.5601
[550]	valid_0's l1: 1.56002
[600]	valid_0's l1: 1.55787
[650]	valid_0's l1: 1.55592
[700]	valid_0's l1: 1.55601
[750]	valid_0's l1: 1.55572
[800]	valid_0's l1: 1.55513
[850]	valid_0's l1: 1.55487
[900]	valid_0's l1: 1.55503
[950]	valid_0's l1: 1.5557
[1000]	valid_0's l1: 1.5556
[1050]	valid_0's l1: 1.55543
Early stopping, best iteration is:
[866]	valid_0's l1: 1.55468
1.5546766122819182




Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.784612
[100]	valid_0's l1: 0.774231
[150]	valid_0's l1: 0.768378
[200]	valid_0's l1: 0.764638
[250]	valid_0's l1: 0.762716
[300]	valid_0's l1: 0.761197
[350]	valid_0's l1: 0.759885
[400]	valid_0's l1: 0.758842
[450]	valid_0's l1: 0.757302
[500]	valid_0's l1: 0.756094
[550]	valid_0's l1: 0.755156
[600]	valid_0's l1: 0.754532
[650]	valid_0's l1: 0.753902
[700]	valid_0's l1: 0.753616
[750]	valid_0's l1: 0.75318
[800]	valid_0's l1: 0.752581
[850]	valid_0's l1: 0.75194
[900]	valid_0's l1: 0.751655
[950]	valid_0's l1: 0.75132
[1000]	valid_0's l1: 0.750999
[1050]	valid_0's l1: 0.750832
[1100]	valid_0's l1: 0.750716
[1150]	valid_0's l1: 0.750573
[1200]	valid_0's l1: 0.75058
[1250]	valid_0's l1: 0.750383
[1300]	valid_0's l1: 0.750289
[1350]	valid_0's l1: 0.74998
[1400]	valid_0's l1: 0.749965
[1450]	valid_0's l1: 0.749932
[1500]	valid_0's l1: 0.749806
[1550]	valid_0's l1: 0.749638
[1600]	valid_0's l1: 0.749553
[16



Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 1.3166
[100]	valid_0's l1: 1.23945
[150]	valid_0's l1: 1.22259
[200]	valid_0's l1: 1.22003
[250]	valid_0's l1: 1.21799
[300]	valid_0's l1: 1.21529
[350]	valid_0's l1: 1.21157
[400]	valid_0's l1: 1.20921
[450]	valid_0's l1: 1.20784
[500]	valid_0's l1: 1.20732
[550]	valid_0's l1: 1.20661
[600]	valid_0's l1: 1.20636
[650]	valid_0's l1: 1.20627
[700]	valid_0's l1: 1.20585
[750]	valid_0's l1: 1.20527
[800]	valid_0's l1: 1.20468
[850]	valid_0's l1: 1.2038
[900]	valid_0's l1: 1.20307
[950]	valid_0's l1: 1.2027
[1000]	valid_0's l1: 1.20174
[1050]	valid_0's l1: 1.20145
[1100]	valid_0's l1: 1.20067
[1150]	valid_0's l1: 1.20019
[1200]	valid_0's l1: 1.19972
[1250]	valid_0's l1: 1.19935
[1300]	valid_0's l1: 1.19888
[1350]	valid_0's l1: 1.19864
[1400]	valid_0's l1: 1.19816
[1450]	valid_0's l1: 1.1979
[1500]	valid_0's l1: 1.19747
[1550]	valid_0's l1: 1.19722
[1600]	valid_0's l1: 1.19689
[1650]	valid_0's l1: 1.19664
[1700

In [17]:
# Persist the second-seed run: each booster with Booster.save_model and its
# validation predictions with np.save. All models are written first, then all
# prediction arrays.
second_run_outputs = (
    (bst1, "artifacts/bst1_train_v401_2.pkl", pred21, "data/lgb_t1_logv401_skip10_2.npy"),
    (bst2, "artifacts/bst2_train_v401_2.pkl", pred22, "data/lgb_t2_logv401_skip10_2.npy"),
    (bst3, "artifacts/bst3_train_v401_2.pkl", pred23, "data/lgb_t3_logv401_skip10_2.npy"),
    (bst4, "artifacts/bst4_train_v401_2.pkl", pred24, "data/lgb_t4_logv401_skip10_2.npy"),
)

for booster, model_path, _, _ in second_run_outputs:
    booster.save_model(model_path)

for _, _, valid_pred, npy_path in second_run_outputs:
    np.save(npy_path, valid_pred)

In [18]:
# Read the first-run validation predictions back from disk (the in-memory
# pred21..pred24 now hold the second-seed run).
pred211, pred221, pred231, pred241 = (
    np.load(npy_path)
    for npy_path in (
        "data/lgb_t1_logv401_skip10_1.npy",
        "data/lgb_t2_logv401_skip10_1.npy",
        "data/lgb_t3_logv401_skip10_1.npy",
        "data/lgb_t4_logv401_skip10_1.npy",
    )
)



In [19]:
# Overall validation MAE of the first run, using the predictions reloaded from disk.
preds_21 = np.vstack([pred211, pred221, pred231, pred241]).T
print(mae(y_vl, preds_21))
# Earlier skip experiments, for reference:
# 1.1122 (skip 15), 1.1082 (skip 10), 1.1181 (skip 3); 1.1105 (skip 7); 1.1085 (skip 12)


1.106175283898342


In [20]:
# Validation MAE of the equal-weight average of the two prediction matrices.
print(mae(y_vl, 0.5 * preds_21 + 0.5 * preds_2))

1.1069294432859755


In [1]:
import sys
# Push any text still buffered on standard output out to the stream.
sys.stdout.flush()