In [1]:
#時系列タスクで気を付ける点
#・説明変数として使ってよいデータは何か？
#・古いデータは学習に使うべきか？
#・学習用データセットから検証用データをどう作るか？

#ライブラリのインポート
import datetime as dt
import gc
import io
import os
import pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

#表示桁数の指定
pd.options.display.float_format = '{:10.4f}'.format

In [2]:
#ファイルの読み込み、データの確認
train = pd.read_csv("/kaggle/input/mlb-player-digital-engagement-forecasting/train_updated.csv")
print(train.shape)

#処理速度を上げるため、データを絞り込む
train = train.loc[train["date"]>=20200401,:].reset_index(drop=True)
print(train.shape)

(1308, 12)
(487, 12)


In [3]:
train.head()

Unnamed: 0,date,nextDayPlayerEngagement,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
0,20200401,"[{""engagementMetricsDate"":""2020-04-02"",""player...",,"[{""playerId"":430935,""gameDate"":""2020-04-01"",""t...",,,,,,,"[{""date"":""2020-04-01"",""playerId"":545361,""playe...","[{""date"":""2020-04-01"",""teamId"":147,""teamName"":..."
1,20200402,"[{""engagementMetricsDate"":""2020-04-03"",""player...",,"[{""playerId"":405395,""gameDate"":""2020-04-02"",""t...",,,,,,,,
2,20200403,"[{""engagementMetricsDate"":""2020-04-04"",""player...",,"[{""playerId"":425844,""gameDate"":""2020-04-03"",""t...",,,,,,,,
3,20200404,"[{""engagementMetricsDate"":""2020-04-05"",""player...",,"[{""playerId"":405395,""gameDate"":""2020-04-04"",""t...",,,,,,,,
4,20200405,"[{""engagementMetricsDate"":""2020-04-06"",""player...",,"[{""playerId"":408234,""gameDate"":""2020-04-05"",""t...",,,,,,,,


In [4]:
#json形式の列を表形式に直す関数
def unpack_json(json_str):
    """Parse one JSON-encoded cell into a DataFrame.

    Args:
        json_str: JSON text describing a table, or a missing value (NaN/None).

    Returns:
        The DataFrame built by ``pd.read_json``, or ``np.nan`` when the cell
        is missing.
    """
    if pd.isna(json_str):
        return np.nan
    # Wrap the text in a buffer: passing a literal JSON string to read_json is deprecated.
    return pd.read_json(io.StringIO(json_str))


def extract_data(input_df, col="events", show=False):
    """Unpack a column of JSON strings into one long DataFrame.

    Every non-missing cell of ``input_df[col]`` is parsed with ``unpack_json``
    and the resulting tables are stacked row-wise with a fresh 0..n-1 index.

    Args:
        input_df: DataFrame holding the JSON column.
        col: Name of the column to unpack.
        show: When True, print a progress counter, the result shape, and
            display the first rows.

    Returns:
        The stacked DataFrame; an empty DataFrame when no cell could be parsed.

    Raises:
        KeyError: If ``col`` is not a column of ``input_df``.
    """
    column = input_df[col]
    n_rows = len(input_df)
    frames = []
    for i in range(n_rows):
        if show:
            print("\r{}/{}".format(i + 1, n_rows), end="")
        cell = column.iloc[i]
        # Missing cells (NaN) carry no records; skip them explicitly.
        if not isinstance(cell, str):
            continue
        try:
            frames.append(unpack_json(cell))
        except ValueError:
            # Best effort: a cell with malformed JSON is skipped, not fatal.
            continue

    # Concatenate once at the end; growing a frame inside the loop is quadratic.
    if frames:
        output_df = pd.concat(frames, axis=0, ignore_index=True)
    else:
        output_df = pd.DataFrame()

    if show:
        print("")
        print(output_df.shape)
        display(output_df.head())
    return output_df

In [5]:
#engagementを取り出して表形式に変換
df_engagement = extract_data(train, col="nextDayPlayerEngagement",show=True)

487/487
(1003707, 6)


Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4
0,2020-04-02,425794,5.1249,9.434,0.1179,6.1947
1,2020-04-02,571704,0.0389,8.1761,0.0105,2.1304
2,2020-04-02,506702,0.0106,5.0314,0.0082,0.885
3,2020-04-02,607231,0.0247,2.8302,0.0222,0.59
4,2020-04-02,543193,0.0071,1.1006,0.0012,0.1967


In [6]:
#df_engagementの前処理
#結合キーの作成
df_engagement["date_playerId"] = df_engagement["engagementMetricsDate"].str.replace(
    "-","") + "_" + df_engagement["playerId"].astype(str)
df_engagement.head()

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,date_playerId
0,2020-04-02,425794,5.1249,9.434,0.1179,6.1947,20200402_425794
1,2020-04-02,571704,0.0389,8.1761,0.0105,2.1304,20200402_571704
2,2020-04-02,506702,0.0106,5.0314,0.0082,0.885,20200402_506702
3,2020-04-02,607231,0.0247,2.8302,0.0222,0.59,20200402_607231
4,2020-04-02,543193,0.0071,1.1006,0.0012,0.1967,20200402_543193


In [7]:
#Date-derived features
#"date" is the day the prediction is made, i.e. one day before the engagement (target) date
df_engagement["date"] = (
    pd.to_datetime(df_engagement["engagementMetricsDate"], format="%Y-%m-%d")
    - dt.timedelta(days=1)
)
#Day of week and "YYYY-MM" year-month of the prediction date
df_engagement["dayofweek"] = df_engagement["date"].dt.dayofweek
df_engagement["yearmonth"] = df_engagement["date"].astype(str).str[:7]
df_engagement.head()

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,date_playerId,date,dayofweek,yearmonth
0,2020-04-02,425794,5.1249,9.434,0.1179,6.1947,20200402_425794,2020-04-01,2,2020-04
1,2020-04-02,571704,0.0389,8.1761,0.0105,2.1304,20200402_571704,2020-04-01,2,2020-04
2,2020-04-02,506702,0.0106,5.0314,0.0082,0.885,20200402_506702,2020-04-01,2,2020-04
3,2020-04-02,607231,0.0247,2.8302,0.0222,0.59,20200402_607231,2020-04-01,2,2020-04
4,2020-04-02,543193,0.0071,1.1006,0.0012,0.1967,20200402_543193,2020-04-01,2,2020-04


In [8]:
#players.csvの読み込み
df_players = pd.read_csv("/kaggle/input/mlb-player-digital-engagement-forecasting/players.csv")
print(df_players.shape)
print(df_players["playerId"].agg("nunique"))
df_players.head()

(2061, 12)
2061


Unnamed: 0,playerId,playerName,DOB,mlbDebutDate,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
0,665482,Gilberto Celestino,1999-02-13,2021-06-02,Santo Domingo,,Dominican Republic,72,170,8,Outfielder,False
1,593590,Webster Rivas,1990-08-08,2021-05-28,Nagua,,Dominican Republic,73,219,3,First Base,True
2,661269,Vladimir Gutierrez,1995-09-18,2021-05-28,Havana,,Cuba,73,190,1,Pitcher,True
3,669212,Eli Morgan,1996-05-13,2021-05-28,Rancho Palos Verdes,CA,USA,70,190,1,Pitcher,True
4,666201,Alek Manoah,1998-01-09,2021-05-27,Homestead,FL,USA,78,260,1,Pitcher,True


In [9]:
#テストデータの評価対象者の確認
df_players["playerForTestSetAndFuturePreds"] = np.where(df_players["playerForTestSetAndFuturePreds"
                                                        ]==True,1,0)
print(df_players["playerForTestSetAndFuturePreds"].sum())
print(df_players["playerForTestSetAndFuturePreds"].mean())

1187
0.5759340126152354


In [10]:
#データセット作成

#テーブル結合
df_train = pd.merge(df_engagement,df_players,on=["playerId"],how="left")
print(df_train.shape)

(1003707, 21)


In [11]:
x_train = df_train[[
    "playerId","dayofweek","birthCity","birthStateProvince","birthCountry","heightInches",
    "weight","primaryPositionCode","primaryPositionName","playerForTestSetAndFuturePreds"]]
y_train = df_train[["target1","target2","target3","target4"]]
id_train = df_train[["engagementMetricsDate","playerId","date_playerId","date","yearmonth","playerForTestSetAndFuturePreds"]]
print(x_train.shape,y_train.shape,id_train.shape)
x_train.head()

(1003707, 10) (1003707, 4) (1003707, 6)


Unnamed: 0,playerId,dayofweek,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
0,425794,2,Brunswick,GA,USA,79,230,1,Pitcher,1
1,571704,2,Albuquerque,NM,USA,75,210,1,Pitcher,0
2,506702,2,Maracaibo,,Venezuela,70,235,2,Catcher,1
3,607231,2,Savannah,GA,USA,76,200,1,Pitcher,1
4,543193,2,Columbia,CA,USA,76,215,1,Pitcher,0


In [12]:
#category型に変換
for col in ["playerId","dayofweek","birthCity","birthStateProvince","birthCountry",
           "primaryPositionCode","primaryPositionName"]:
    x_train[col] = x_train[col].astype("category")

In [13]:
#バリデーション設計

#学習データと検証データの期間設定
#Each fold is [training months, validation months]: a rolling 12-month training
#window that ends right before the single month used for validation.
list_cv_month = [
    [
        pd.period_range(first_month, periods=12, freq="M").strftime("%Y-%m").tolist(),
        [valid_month],
    ]
    for first_month, valid_month in (
        ("2020-05", "2021-05"),
        ("2020-06", "2021-06"),
        ("2020-07", "2021-07"),
    )
]

In [14]:
#学習データ、検証データのindexリストを作成
cv = []
for month_tr,month_va in list_cv_month:
    cv.append([
        id_train.index[id_train["yearmonth"].isin(month_tr)],
        id_train.index[id_train["yearmonth"].isin(month_va) &
        (id_train["playerForTestSetAndFuturePreds"]==1)],
    ])
#fold0のindexリスト
cv[0]

[Index([ 61830,  61831,  61832,  61833,  61834,  61835,  61836,  61837,  61838,
         61839,
        ...
        814085, 814086, 814087, 814088, 814089, 814090, 814091, 814092, 814093,
        814094],
       dtype='int64', length=752265),
 Index([814095, 814096, 814100, 814101, 814102, 814104, 814105, 814106, 814107,
        814109,
        ...
        877931, 877934, 877950, 877951, 877957, 877958, 877969, 877972, 877974,
        877975],
       dtype='int64', length=36797)]

In [15]:
#Model training (walkthrough for a single target and a single fold)

#Settings for this run: target variable "target1", fold index 0 (the first fold)
target = "target1"
nfold = 0

#Look up the train / validation index lists of the chosen fold
idx_tr,idx_va = cv[nfold][0],cv[nfold][1]

#Split the features, the chosen target column and the id columns into train and validation parts
x_tr,y_tr,id_tr = x_train.loc[idx_tr,:],y_train.loc[idx_tr,target],id_train.loc[idx_tr,:]
x_va,y_va,id_va = x_train.loc[idx_va,:],y_train.loc[idx_va,target],id_train.loc[idx_va,:]
print(x_tr.shape,y_tr.shape,id_tr.shape)
print(x_va.shape,y_va.shape,id_va.shape)

(752265, 10) (752265,) (752265, 6)
(36797, 10) (36797,) (36797, 6)


In [16]:
#ハイパーパラメータの設定
params = {
    'boosting_type':'gbdt',
    'objective':'regression_l1',
    'metric':'mean_absolute_error',
    'learning_rate':0.05,
    'num_leaves':32,
    'subsample':0.7,
    'subsample_freq':1,
    'feature_fraction':0.8,
    'min_data_in_leaf':50,
    'min_sum_hessian_in_leaf':50,
    'n_estimators':1000,
    "random_state":123,
    "importance_type":"gain",
}

#モデルの学習
model = lgb.LGBMRegressor(**params)
verbose_eval = 100
model.fit(x_tr,
         y_tr,
         eval_set=[(x_tr,y_tr),(x_va,y_va)],
         callbacks=[
        early_stopping(stopping_rounds=50),  # 早期停止のコールバック
        log_evaluation(verbose_eval)]          # ログ表示のコールバック
         )

#モデルの保存
with open("model_lgb_target1_fold0.h5","wb") as f:#h5は深層学習用拡張子
    pickle.dump(model,f,protocol=4)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018969 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3300
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 10
[LightGBM] [Info] Start training from score 0.001289
Training until validation scores don't improve for 50 rounds
[100]	training's l1: 0.50831	valid_1's l1: 1.29786
[200]	training's l1: 0.508183	valid_1's l1: 1.29768
[300]	training's l1: 0.508143	valid_1's l1: 1.29767
Early stopping, best iteration is:
[258]	training's l1: 0.508161	valid_1's l1: 1.29766


In [17]:
#モデル評価
#検証データの推論値取得
y_va_pred = model.predict(x_va)

#全target/foldの推論値を格納する変数の作成
df_valid_pred = pd.DataFrame()

#推論値を格納
tmp_pred = pd.concat([
    id_va,
    pd.DataFrame({"target":target,"nfold":0,"true":y_va,"pred":y_va_pred}),
],axis=1)
df_valid_pred = pd.concat([df_valid_pred, tmp_pred], axis=0, ignore_index=True)

#全target/foldの評価値を入れる変数の作成
metrics = []

#評価値の算出
metrics_va = mean_absolute_error(y_va,y_va_pred)
#評価値を格納
metrics.append([target,nfold,metrics_va])
metrics



[['target1', 0, 1.2976578174338422]]

In [18]:
#説明変数の重要度取得
tmp_imp = pd.DataFrame({"col":x_tr.columns,"imp":model.feature_importances_,
                       "target":"target1","nfold":nfold})
#確認
display(tmp_imp.sort_values("imp",ascending=False))
#全target/foldの重要度を格納するデータフレームの作成
df_imp = pd.DataFrame()
#imp_foldをdf_impに結合
df_imp = pd.concat([df_imp,tmp_imp],axis=0,ignore_index=True)

Unnamed: 0,col,imp,target,nfold
0,playerId,13595482.8115,target1,0
9,playerForTestSetAndFuturePreds,2314285.0327,target1,0
2,birthCity,2249420.1773,target1,0
7,primaryPositionCode,523633.5634,target1,0
8,primaryPositionName,91211.0063,target1,0
1,dayofweek,89016.5762,target1,0
3,birthStateProvince,35673.0473,target1,0
6,weight,30337.572,target1,0
5,heightInches,20493.2084,target1,0
4,birthCountry,4882.033,target1,0


In [19]:
#モデルの評価
#リスト型をデータフレームに変換
df_metrics = pd.DataFrame(metrics,columns=["target","nfold","mae"])
display(df_metrics.head())

#評価値
print("MCMAE: {:.4f}".format(df_metrics["mae"].mean()))

display(pd.pivot_table(df_metrics,index="nfold",columns="target",values="mae",
                      aggfunc=np.mean,margins=True))

Unnamed: 0,target,nfold,mae
0,target1,0,1.2977


MCMAE: 1.2977


target,target1,All
nfold,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.2977,1.2977
All,1.2977,1.2977


In [20]:
#検証データの推論値の形式変換
df_valid_pred_all = pd.pivot_table(df_valid_pred,index=
                                   ["engagementMetricsDate","playerId","date_playerId",
                                    "date","yearmonth","playerForTestSetAndFuturePreds"],
                                   columns=["target","nfold"],values=["true","pred"],aggfunc=np.sum)
df_valid_pred_all.columns = ["{}_fold{}_{}".format(j,k,i)for i,j,k in df_valid_pred_all.columns]
df_valid_pred_all = df_valid_pred_all.reset_index(drop=False)
df_valid_pred_all.head()

Unnamed: 0,engagementMetricsDate,playerId,date_playerId,date,yearmonth,playerForTestSetAndFuturePreds,target1_fold0_pred,target1_fold0_true
0,2021-05-02,405395,20210502_405395,2021-05-01,2021-05,1,0.6049,0.1518
1,2021-05-02,408234,20210502_408234,2021-05-01,2021-05,1,0.3317,0.2365
2,2021-05-02,424144,20210502_424144,2021-05-01,2021-05,1,0.002,0.0016
3,2021-05-02,425772,20210502_425772,2021-05-01,2021-05,1,0.0065,0.0035
4,2021-05-02,425784,20210502_425784,2021-05-01,2021-05,1,0.0008,0.0001


In [21]:
#説明変数の重要度取得
df_imp.groupby(["col"])["imp"].agg(["mean","std"]).sort_values("mean",ascending=False)

Unnamed: 0_level_0,mean,std
col,Unnamed: 1_level_1,Unnamed: 2_level_1
playerId,13595482.8115,
playerForTestSetAndFuturePreds,2314285.0327,
birthCity,2249420.1773,
primaryPositionCode,523633.5634,
primaryPositionName,91211.0063,
dayofweek,89016.5762,
birthStateProvince,35673.0473,
weight,30337.572,
heightInches,20493.2084,
birthCountry,4882.033,


In [22]:
#学習用関数の作成
def train_lgb(input_x,
              input_y,
              input_id,
              params,
              list_nfold=[0,1,2],
              mode_train="train",
             ):
    """Train (or reload) one LightGBM regressor per fold and target, then evaluate it.

    The folds come from the notebook-level ``list_cv_month``: for each entry the
    rows whose ``yearmonth`` is in the training months form the training set, and
    the rows in the validation months with ``playerForTestSetAndFuturePreds == 1``
    form the validation set.

    Args:
        input_x: Feature DataFrame.
        input_y: DataFrame with the columns target1..target4.
        input_id: DataFrame of id columns sharing the index of ``input_x``; must
            contain "yearmonth" and "playerForTestSetAndFuturePreds".
        params: Keyword arguments passed to ``lgb.LGBMRegressor``.
        list_nfold: Indices into ``list_cv_month`` of the folds to run.
        mode_train: "train" fits a model and pickles it to
            "model_lgb_{target}_fold{nfold}.h5"; any other value loads the model
            from that file instead.

    Returns:
        Tuple ``(df_valid_pred_all, df_metrics, df_imp)``: validation predictions
        in wide form (one "<target>_fold<k>_pred"/"_true" column pair per model),
        the MAE per target/fold, and the feature importances per target/fold.
    """
    #Per-model results are collected in lists and combined once after the loop
    list_valid_pred = []
    metrics = []
    list_imp = []

    #validation: build the [train index, validation index] pair of every fold
    cv = []
    for month_tr,month_va in list_cv_month:
        is_tr = input_id["yearmonth"].isin(month_tr)
        is_va = input_id["yearmonth"].isin(month_va) & (
            input_id["playerForTestSetAndFuturePreds"]==1)
        cv.append([input_id.index[is_tr], input_id.index[is_va]])

    #Train one model per fold and target
    for nfold in list_nfold:
        idx_tr,idx_va = cv[nfold][0],cv[nfold][1]
        for target in ["target1","target2","target3","target4"]:
            print("-"*20,target,",fold:", nfold,"-"*20)
            #Split the *arguments* (not the notebook globals) into train and validation
            x_tr,y_tr,id_tr = input_x.loc[idx_tr,:],input_y.loc[idx_tr,target],input_id.loc[idx_tr,:]
            x_va,y_va,id_va = input_x.loc[idx_va,:],input_y.loc[idx_va,target],input_id.loc[idx_va,:]
            print(x_tr.shape,y_tr.shape,id_tr.shape)
            print(x_va.shape,y_va.shape,id_va.shape)

            #File the model is saved to / loaded from (a pickle, despite the .h5 extension)
            filepath = "model_lgb_{}_fold{}.h5".format(target,nfold)

            if mode_train == "train":
                print("training start.")
                model = lgb.LGBMRegressor(**params)
                verbose_eval = 100
                model.fit(x_tr,
                         y_tr,
                         eval_set=[(x_tr,y_tr),(x_va,y_va)],
                         callbacks=[
                        early_stopping(stopping_rounds=50),  # stop when validation stops improving
                        log_evaluation(verbose_eval)]          # log every verbose_eval rounds
                         )
                with open(filepath,"wb") as f:
                    pickle.dump(model,f,protocol=4)
            else:
                print("model load.")
                #NOTE(security): pickle.load executes arbitrary code; only load files written by this notebook
                with open(filepath,"rb") as f:
                    model = pickle.load(f)
                print("Done.")

            #Validation predictions, stored next to the id columns
            y_va_pred = model.predict(x_va)
            tmp_pred = pd.concat([
                id_va,
                pd.DataFrame({"target":target,"nfold":nfold,"true":y_va,"pred":y_va_pred}),
            ],axis=1)
            list_valid_pred.append(tmp_pred)

            #Validation MAE
            metrics_va = mean_absolute_error(y_va,y_va_pred)
            metrics.append([target,nfold,metrics_va])

            #Feature importances
            tmp_imp = pd.DataFrame({"col":x_tr.columns,"imp":model.feature_importances_,
                       "target":target,"nfold":nfold})
            list_imp.append(tmp_imp)

    df_valid_pred = (pd.concat(list_valid_pred, axis=0, ignore_index=True)
                     if list_valid_pred else pd.DataFrame())
    df_imp = (pd.concat(list_imp, axis=0, ignore_index=True)
              if list_imp else pd.DataFrame())

    print("-"*20,"result","-"*20)
    #Mean of the per-model MAEs
    df_metrics = pd.DataFrame(metrics,columns=["target","nfold","mae"])
    print("MCMAE: {:.4f}".format(df_metrics["mae"].mean()))

    #Reshape the validation predictions to one column per (target, fold, true/pred)
    df_valid_pred_all = pd.pivot_table(df_valid_pred,index=
                               ["engagementMetricsDate","playerId","date_playerId",
                                "date","yearmonth","playerForTestSetAndFuturePreds"],
                               columns=["target","nfold"],values=["true","pred"],aggfunc="sum")
    df_valid_pred_all.columns = ["{}_fold{}_{}".format(j,k,i)for i,j,k in df_valid_pred_all.columns]
    df_valid_pred_all = df_valid_pred_all.reset_index(drop=False)

    return df_valid_pred_all, df_metrics, df_imp

In [23]:
#モデルを学習
params = {
    'boosting_type':'gbdt',
    'objective':'regression_l1',
    'metric':'mean_absolute_error',
    'learning_rate':0.05,
    'num_leaves':32,
    'subsample':0.7,
    'subsample_freq':1,
    'feature_fraction':0.8,
    'min_data_in_leaf':50,
    'min_sum_hessian_in_leaf':50,
    'n_estimators':1000,
    "random_state":123,
    "importance_type":"gain",
}

df_valid_pred,df_metrics,df_imp = train_lgb(x_train,
                                           y_train,
                                           id_train,
                                           params,
                                           list_nfold=[0,1,2],
                                           mode_train="train")

-------------------- target1 ,fold: 0 --------------------
(752265, 10) (752265,) (752265, 6)
(36797, 10) (36797,) (36797, 6)
training start.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014133 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3300
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 10
[LightGBM] [Info] Start training from score 0.001289
Training until validation scores don't improve for 50 rounds
[100]	training's l1: 0.50831	valid_1's l1: 1.29786
[200]	training's l1: 0.508183	valid_1's l1: 1.29768
[300]	training's l1: 0.508143	valid_1's l1: 1.29767
Early stopping, best iteration is:
[258]	training's l1: 0.508161	valid_1's l1: 1.29766
-------------------- target2 ,fold: 0 --------------------
(752265, 10) (752265,) (752265, 6)
(36797, 10) (36797,) (36797, 6)
training start.
[Light

In [24]:
display(pd.pivot_table(df_metrics,index="nfold",columns="target",values="mae",
       aggfunc=np.mean,margins=True))

target,target1,target2,target3,target4,All
nfold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.2977,2.4447,0.878,1.2451,1.4664
1,1.1953,2.1539,0.8317,1.6406,1.4554
2,1.1133,1.7903,0.7606,0.8542,1.1296
All,1.2021,2.1297,0.8234,1.2466,1.3504


In [25]:
df_imp.groupby(["col"])["imp"].agg(["mean","std"]).sort_values("mean",ascending=False)

Unnamed: 0_level_0,mean,std
col,Unnamed: 1_level_1,Unnamed: 2_level_1
playerId,4976980.5395,6102371.1341
playerForTestSetAndFuturePreds,1115074.5464,1091298.8384
birthCity,741457.7171,1058010.6632
primaryPositionCode,110792.2785,167555.0234
dayofweek,78879.047,140714.7657
primaryPositionName,33697.431,38239.7619
weight,20633.3897,31551.4583
heightInches,19410.1825,34119.9997
birthStateProvince,7714.1863,13322.8164
birthCountry,2933.4399,5114.668


In [26]:
#特徴量エンジニアリング
#選手ごとのステータス
df_rosters = extract_data(train,col="rosters")#jsonからdfへ変換

In [27]:
df_rosters.head()

Unnamed: 0,playerId,gameDate,teamId,statusCode,status
0,430935,2020-04-01,144,A,Active
1,435062,2020-04-01,120,A,Active
2,444489,2020-04-01,158,A,Active
3,445276,2020-04-01,119,A,Active
4,446308,2020-04-01,138,A,Active


In [28]:
df_rosters = df_rosters.rename(columns={"gameDate":"date"})
df_rosters["date"] = pd.to_datetime(df_rosters["date"],format="%Y-%m-%d")

#追加するカラムリストの作成（dateとplayerIDは結合キー）
col_rosters = ["teamId","statusCode","status"]

df_rosters.head()

Unnamed: 0,playerId,date,teamId,statusCode,status
0,430935,2020-04-01,144,A,Active
1,435062,2020-04-01,120,A,Active
2,444489,2020-04-01,158,A,Active
3,445276,2020-04-01,119,A,Active
4,446308,2020-04-01,138,A,Active


In [29]:
#ラグ特徴量の作成（1か月前の統計量を特徴量として利用）
#データの前処理
df_agg_target = df_train.groupby(["yearmonth","playerId"])[["target1","target2",
                                                            "target3","target4"]].agg(["mean","median","std","min","max"])
df_agg_target.columns = ["{}_{}".format(i,j) for i,j in df_agg_target.columns]
df_agg_target = df_agg_target.reset_index(drop=False)
df_agg_target.head()

Unnamed: 0,yearmonth,playerId,target1_mean,target1_median,target1_std,target1_min,target1_max,target2_mean,target2_median,target2_std,...,target3_mean,target3_median,target3_std,target3_min,target3_max,target4_mean,target4_median,target4_std,target4_min,target4_max
0,2020-04,112526,0.8834,0.0647,2.9618,0.0224,15.978,10.811,10.4352,5.3041,...,0.2894,0.1752,0.3478,0.0216,1.6761,21.1961,20.7913,12.6768,0.6305,51.3299
1,2020-04,134181,2.9999,0.2175,10.9845,0.0645,58.4642,14.7861,11.9902,13.2362,...,10.6877,0.9546,24.8149,0.0348,100.0,12.0298,11.6739,6.2926,0.5478,24.3902
2,2020-04,279571,0.0003,0.0,0.0006,0.0,0.0016,0.397,0.3435,0.2787,...,0.0004,0.0,0.0013,0.0,0.006,0.2895,0.2481,0.1986,0.0097,0.7
3,2020-04,282332,0.1413,0.0748,0.1702,0.0223,0.7391,7.8652,7.7711,4.0453,...,0.3794,0.3382,0.2484,0.0501,0.9882,11.354,10.0147,6.1022,0.5633,23.4455
4,2020-04,400085,1.9515,0.6949,3.3399,0.0947,17.0843,30.0941,27.2808,16.4382,...,13.3777,1.8486,26.4342,0.2183,100.0,50.7711,47.0509,29.4601,2.5769,100.0


In [30]:
#Lag features: make each month's per-player target statistics available to the FOLLOWING month.
#The guard keeps a re-run of this cell from shifting the months / renaming the columns a second time.
if not df_agg_target.columns.str.contains("lag1month").any():
    #Sort chronologically so the preview below is in time order
    df_agg_target = df_agg_target.sort_values("yearmonth").reset_index(drop=True)

    #Re-key every aggregate to the next calendar month ("2020-04" -> "2020-05").
    #Computing the month arithmetically (instead of taking the player's next existing row)
    #stays correct when a player has gaps or stops appearing before the last month.
    df_agg_target["yearmonth"] = (
        pd.PeriodIndex(df_agg_target["yearmonth"], freq="M") + 1
    ).strftime("%Y-%m")

    #Suffix the statistic columns so they are recognisable as lag features
    df_agg_target.columns = [col + "_lag1month" if col not in ["playerId","yearmonth"]else
                            col for col in df_agg_target.columns]

#List of the added lag-feature columns
col_agg_target = list(df_agg_target.columns[df_agg_target.columns.str.contains("lag1month")])
df_agg_target.head()

Unnamed: 0,yearmonth,playerId,target1_mean_lag1month,target1_median_lag1month,target1_std_lag1month,target1_min_lag1month,target1_max_lag1month,target2_mean_lag1month,target2_median_lag1month,target2_std_lag1month,...,target3_mean_lag1month,target3_median_lag1month,target3_std_lag1month,target3_min_lag1month,target3_max_lag1month,target4_mean_lag1month,target4_median_lag1month,target4_std_lag1month,target4_min_lag1month,target4_max_lag1month
0,2020-05,112526,0.8834,0.0647,2.9618,0.0224,15.978,10.811,10.4352,5.3041,...,0.2894,0.1752,0.3478,0.0216,1.6761,21.1961,20.7913,12.6768,0.6305,51.3299
1,2020-05,628318,0.0003,0.0,0.0016,0.0,0.0088,0.3717,0.3519,0.2857,...,0.0,0.0,0.0,0.0,0.0,0.4519,0.4173,0.2852,0.0126,1.176
2,2020-05,628317,0.0747,0.0327,0.1005,0.0139,0.4201,10.7568,9.6495,4.7834,...,0.0816,0.0746,0.0462,0.0116,0.1811,3.2524,2.9701,1.861,0.1119,6.8816
3,2020-05,627894,0.0004,0.0,0.0008,0.0,0.0037,1.2347,1.1066,0.6663,...,0.002,0.0,0.0035,0.0,0.0157,0.3802,0.3303,0.2352,0.0165,0.9146
4,2020-05,627500,0.0004,0.0,0.0019,0.0,0.0104,0.294,0.1969,0.3396,...,0.0,0.0,0.0001,0.0,0.0005,0.2036,0.1609,0.1362,0.0117,0.5662


In [31]:
#学習用データセットの作成
df_train = pd.merge(df_engagement,df_players,on=["playerId"],how="left")
df_train = pd.merge(df_train,df_rosters,on=["date","playerId"],how="left")
df_train = pd.merge(df_train,df_agg_target,on=["playerId","yearmonth"],how="left")

#説明変数と目的変数の作成
x_train = df_train[[
    "playerId","dayofweek",
    "birthCity","birthStateProvince","birthCountry","heightInches","weight",
    "primaryPositionCode","primaryPositionName","playerForTestSetAndFuturePreds"
]+ col_rosters + col_agg_target]

y_train = df_train[["target1","target2","target3","target4"]]
id_train = df_train[["engagementMetricsDate","playerId","date_playerId","date",
                    "yearmonth","playerForTestSetAndFuturePreds"]]

#カテゴリ変数をcategory型に変換
for col in ["playerId","dayofweek","birthCity","birthStateProvince",
           "birthCountry","primaryPositionCode","primaryPositionName"] + col_rosters:
    x_train[col] = x_train[col].astype("category")

print(x_train.shape,y_train.shape,id_train.shape)

(1003707, 33) (1003707, 4) (1003707, 6)


In [32]:
#モデル学習
#モデルを学習
params = {
    'boosting_type':'gbdt',
    'objective':'regression_l1',
    'metric':'mean_absolute_error',
    'learning_rate':0.05,
    'num_leaves':32,
    'subsample':0.7,
    'subsample_freq':1,
    'feature_fraction':0.8,
    'min_data_in_leaf':50,
    'min_sum_hessian_in_leaf':50,
    'n_estimators':1000,
    "random_state":123,
    "importance_type":"gain",
}

df_valid_pred,df_metrics,df_imp = train_lgb(x_train,
                                           y_train,
                                           id_train,
                                           params,
                                           list_nfold=[0,1,2],
                                           mode_train="train")

-------------------- target1 ,fold: 0 --------------------
(752265, 33) (752265,) (752265, 6)
(36797, 33) (36797,) (36797, 6)
training start.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069228 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8454
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 33
[LightGBM] [Info] Start training from score 0.001289
Training until validation scores don't improve for 50 rounds
[100]	training's l1: 0.504414	valid_1's l1: 1.28763
[200]	training's l1: 0.504261	valid_1's l1: 1.28723
Early stopping, best iteration is:
[167]	training's l1: 0.504269	valid_1's l1: 1.28723
-------------------- target2 ,fold: 0 --------------------
(752265, 33) (752265,) (752265, 6)
(36797, 33) (36797,) (36797, 6)
training start.
[LightGBM] [Info] Auto-choosing row-wise multi-threading,

In [33]:
display(pd.pivot_table(df_metrics,index="nfold",columns="target",values="mae",
       aggfunc=np.mean,margins=True))

target,target1,target2,target3,target4,All
nfold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.2872,2.1865,0.8731,1.206,1.3882
1,1.1815,1.8953,0.825,1.5538,1.3639
2,1.0987,1.5925,0.7524,0.8818,1.0814
All,1.1891,1.8914,0.8168,1.2139,1.2778


In [34]:
#モデルチューニング

#目的変数間の相関係数を算出
df_engagement[["target1","target2","target3","target4"]].corr()

Unnamed: 0,target1,target2,target3,target4
target1,1.0,0.3529,0.3833,0.3252
target2,0.3529,1.0,0.366,0.4988
target3,0.3833,0.366,1.0,0.3229
target4,0.3252,0.4988,0.3229,1.0


In [35]:
#目的変数同士に相関があるため、マルチタスクによる精度向上が期待できる
#そこで、ニューラルネットワークを用いる
from sklearn.preprocessing import LabelEncoder

import tensorflow
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Input,Dense,Dropout,BatchNormalization,Activation,Concatenate
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint,ReduceLROnPlateau,LearningRateScheduler
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import Embedding,Flatten

In [36]:
#説明変数と目的変数の作成
x_train = df_train[[
    "playerId","dayofweek",
    "birthCity","birthStateProvince","birthCountry","heightInches","weight",
    "primaryPositionCode","primaryPositionName","playerForTestSetAndFuturePreds"
]+ col_rosters + col_agg_target]

y_train = df_train[["target1","target2","target3","target4"]]
id_train = df_train[["engagementMetricsDate","playerId","date_playerId","date",
                    "yearmonth","playerForTestSetAndFuturePreds"]]

In [37]:
#数値とカテゴリ変数のカラムリストを作成
col_num = ["heightInches","weight","playerForTestSetAndFuturePreds"] + col_agg_target
col_cat = ["playerId","dayofweek","birthCity","birthStateProvince","birthCountry",
          "primaryPositionCode","primaryPositionName"] + col_rosters
print(len(col_num),len(col_cat))

23 10


In [38]:
#Numeric features: impute missing values and min-max scale to the 0-1 range.
#Values are read from df_train (the unscaled source of x_train) so that re-running this cell
#recomputes the same statistics instead of rescaling columns that are already scaled.
#NOTE(review): min/max are taken over all rows, validation months included - confirm this is intended.
dict_num = {}
for col in col_num:
    print(col)
    #Imputation: missing values become 0
    value_fillna = 0
    values = df_train[col].fillna(value_fillna)

    #Min-max scaling; a constant column has zero range, so divide by 1 to avoid 0/0
    value_min = values.min()
    value_max = values.max()
    value_range = value_max - value_min
    if value_range == 0:
        value_range = 1
    x_train[col] = (values - value_min) / value_range

    #Keep the fitted statistics so the same transform can be applied to test data
    dict_num[col] = {}
    dict_num[col]["fillna"] = value_fillna
    dict_num[col]["min"] = value_min
    dict_num[col]["max"] = value_max

print("Done.")

heightInches
weight
playerForTestSetAndFuturePreds
target1_mean_lag1month
target1_median_lag1month
target1_std_lag1month
target1_min_lag1month
target1_max_lag1month
target2_mean_lag1month
target2_median_lag1month
target2_std_lag1month
target2_min_lag1month
target2_max_lag1month
target3_mean_lag1month
target3_median_lag1month
target3_std_lag1month
target3_min_lag1month
target3_max_lag1month
target4_mean_lag1month
target4_median_lag1month
target4_std_lag1month
target4_min_lag1month
target4_max_lag1month
Done.


In [39]:
#Categorical features: impute missing values and encode labels as integers
#(for the network model with embedding layers).
#Values are read from df_train (the unencoded source of x_train) so that re-running this cell
#rebuilds the same label maps instead of re-encoding the integer codes.
dict_cat = {}
for col in col_cat:
    print(col)
    #Imputation: missing values become the label "unknown", then everything is cast to str
    value_fillna = "unknown"
    labels = df_train[col].astype(object).fillna(value_fillna).astype(str)

    #Label encoding: integers starting at 0; "unknown" is always given a code
    le = LabelEncoder()
    le.fit(labels)
    list_label = sorted(list(set(le.classes_) | set(["unknown"])))
    map_label = {j:i for i,j in enumerate(list_label)}
    x_train[col] = labels.map(map_label)

    #Keep the fitted mapping so the same transform can be applied to test data
    dict_cat[col] = {}
    dict_cat[col]["fillna"] = value_fillna
    dict_cat[col]["map_label"] = map_label
    dict_cat[col]["num_label"] = len(list_label)

print("Done.")

playerId
dayofweek
birthCity
birthStateProvince
birthCountry
primaryPositionCode
primaryPositionName
teamId
statusCode
status
Done.


In [40]:
#欠損値補間・正規化/数値化を関数化
def transform_data(input_x, num_cols=None, cat_cols=None, num_stats=None, cat_stats=None):
    """Apply the fitted imputation, scaling and label encoding to a feature frame.

    Args:
        input_x: DataFrame containing the numeric and categorical feature columns.
            It is not modified; a transformed copy is returned.
        num_cols: Numeric column names. Defaults to the notebook-level ``col_num``.
        cat_cols: Categorical column names. Defaults to the notebook-level ``col_cat``.
        num_stats: ``{col: {"fillna", "min", "max"}}``. Defaults to ``dict_num``.
        cat_stats: ``{col: {"fillna", "map_label", ...}}``. Defaults to ``dict_cat``.

    Returns:
        A copy of ``input_x`` whose numeric columns are imputed and min-max scaled
        with the stored statistics, and whose categorical columns are replaced by
        their integer codes (labels missing from the map get the fill label's code).
    """
    #Fall back to the settings fitted in the notebook when none are passed in
    num_cols = col_num if num_cols is None else num_cols
    cat_cols = col_cat if cat_cols is None else cat_cols
    num_stats = dict_num if num_stats is None else num_stats
    cat_stats = dict_cat if cat_stats is None else cat_stats

    output_x = input_x.copy()

    #Numeric columns: impute with the stored fill value, then min-max scale
    for col in num_cols:
        stats = num_stats[col]
        value_min = stats["min"]
        value_max = stats["max"]
        #A constant training column has zero range; divide by 1 to avoid 0/0
        value_range = value_max - value_min
        if value_range == 0:
            value_range = 1
        output_x[col] = (output_x[col].fillna(stats["fillna"]) - value_min) / value_range

    #Categorical columns: impute with the stored fill label, then map to integer codes
    for col in cat_cols:
        stats = cat_stats[col]
        value_fillna = stats["fillna"]
        map_label = stats["map_label"]
        #Cast to object first so a categorical dtype cannot reject the fill label
        labels = output_x[col].astype(object).fillna(value_fillna).astype(str)
        #Labels missing from the map fall back to the fill label's code
        output_x[col] = labels.map(map_label).fillna(map_label[value_fillna])

    return output_x

In [41]:
#ニューラルネットワークのモデル定義
def create_model(col_num = ["heightInches","weight"],
                col_cat = ["playerId","teamId","dayofweek"],
                show=False,
                ):
    """Build and compile the multi-output network (numeric inputs + embedded categories).

    Each categorical column gets its own Embedding -> Dropout -> Flatten branch;
    the branches are concatenated with the numeric inputs and fed through one
    hidden Dense layer to a 4-unit linear output, trained with MAE loss.

    Args:
        col_num: Names of the numeric feature columns (only the count is used).
        col_cat: Names of the categorical feature columns; each must be a key of
            the notebook-level ``dict_cat``, whose "num_label" sets the vocabulary size.
        show: When True, print the model summary and return None.

    Returns:
        The compiled Keras ``Model`` when ``show`` is False, otherwise None.
    """
    input_num = Input(shape=(len(col_num),))
    input_cat = Input(shape=(len(col_cat),))

    #numeric: passed through unchanged
    x_num = input_num

    #category: one embedding branch per column
    for i,col in enumerate(col_cat):
        tmp_cat = input_cat[:,i]
        input_dim = dict_cat[col]["num_label"]
        #Half the vocabulary size, but never 0 (a single-label column would otherwise get width 0)
        output_dim = max(1, input_dim // 2)
        tmp_cat = Embedding(input_dim=input_dim, output_dim=output_dim)(tmp_cat)
        tmp_cat = Dropout(0.2)(tmp_cat)
        tmp_cat = Flatten()(tmp_cat)
        if i==0:
            x_cat = tmp_cat
        else:
            x_cat = Concatenate()([x_cat,tmp_cat])

    #concat numeric and categorical parts, then the shared head
    x = Concatenate()([x_num,x_cat])
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.1)(x)
    output = Dense(4, activation="linear")(x)

    model = Model(inputs=[input_num,input_cat],outputs=output)
    model.compile(optimizer="Adam",loss="mae",metrics=["mae"])

    if show:
        #summary() prints the table itself and returns None, so it is not wrapped in print()
        model.summary()
    else:
        return model

In [42]:
#Inspect the model architecture (prints the Keras summary).
#The trailing ";" keeps the cell from echoing any return value.
create_model(col_num=col_num, col_cat=col_cat, show=True);

None


In [43]:
import tensorflow as tf
#Seed helper for TensorFlow reproducibility
def seed_everything(seed):
    """Seed Python's ``random``, NumPy and TensorFlow with the same value.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: the seed value handed to every generator.
    """
    import random as py_random

    py_random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    #NumPy and TensorFlow seeders take the same single argument.
    for set_seed in (np.random.seed, tf.random.set_seed):
        set_seed(seed)

In [44]:
#Training function customized for the neural network
def train_tf(input_x,
             input_y,
             input_id,
             list_nfold=[0,1,2],
             mode_train="train",
             batch_size=1024,
             epochs=100,
             ):
    """Train (or reload) one Keras model per fold and collect validation results.

    Folds come from the module-level ``list_cv_month``: for each
    ``(month_tr, month_va)`` pair, training rows are those whose
    ``input_id["yearmonth"]`` is in ``month_tr``; validation rows are those in
    ``month_va`` that also have ``playerForTestSetAndFuturePreds == 1``.
    Feature columns are selected with the module-level ``col_num`` / ``col_cat``
    and the model is built by ``create_model``.

    Args:
        input_x: feature DataFrame containing the ``col_num`` and ``col_cat`` columns.
        input_y: target DataFrame with four columns, sharing ``input_x``'s index.
        input_id: ID DataFrame sharing the same index; must contain
            ``yearmonth``, ``playerForTestSetAndFuturePreds`` and the ID columns
            used as the pivot index below.
        list_nfold: positions in ``list_cv_month`` of the folds to run.
        mode_train: ``"train"`` fits a new model; any other value loads weights
            from ``model_tf_fold{nfold}.weights.h5`` instead.
        batch_size: batch size passed to ``model.fit``.
        epochs: maximum number of epochs passed to ``model.fit``.

    Returns:
        ``(df_valid_pred_all, df_metrics)``: the validation truths/predictions
        pivoted to one ``{target}_fold{n}_{true|pred}`` column per fold, and a
        DataFrame with columns ``target``, ``nfold``, ``mae``.
    """
    target_names = ["target1","target2","target3","target4"]
    #per-fold validation prediction frames (concatenated once after the loop)
    list_valid_pred = []
    #per-target / per-fold MAE rows
    metrics = []

    #validation
    cv = []
    for month_tr,month_va in list_cv_month:
        cv.append([
            input_id.index[input_id["yearmonth"].isin(month_tr)],
            input_id.index[input_id["yearmonth"].isin(month_va) &
                           (input_id["playerForTestSetAndFuturePreds"]==1)],
        ])

    #model training (one model per fold)
    for nfold in list_nfold:
        print("-"*20,"fold:", nfold,"-"*20)
        #split into train and valid
        idx_tr,idx_va = cv[nfold][0],cv[nfold][1]

        x_num_tr = input_x.loc[idx_tr,col_num].values
        x_cat_tr = input_x.loc[idx_tr,col_cat].values
        y_tr = input_y.loc[idx_tr,:].values
        x_num_va = input_x.loc[idx_va,col_num].values
        x_cat_va = input_x.loc[idx_va,col_cat].values
        y_va = input_y.loc[idx_va,:].values
        #ID rows of THIS fold's validation set. The index is reset so the rows
        #line up positionally with the freshly built true/pred frames below.
        #(Previously `id_va` was not defined here and silently came from a
        #notebook global.)
        id_va = input_id.loc[idx_va,:].reset_index(drop=True)
        print(x_num_tr.shape,x_cat_tr.shape,y_tr.shape)
        print(x_num_va.shape,x_cat_va.shape,y_va.shape)

        #file name of the saved weights
        filepath = "model_tf_fold{}.weights.h5".format(nfold)

        if mode_train == "train":
            print("training start.")
            seed_everything(seed=123)
            model = create_model(col_num=col_num,col_cat=col_cat,show=False)
            model.fit(x=[x_num_tr,x_cat_tr],
                      y=y_tr,
                      validation_data=([x_num_va,x_cat_va],y_va),
                      batch_size=batch_size,
                      epochs=epochs,
                      callbacks=[
                          #save_best_only keeps the file in sync with the best
                          #val_loss weights that EarlyStopping restores in
                          #memory; without it the file held the last epoch.
                          ModelCheckpoint(filepath=filepath,monitor="val_loss",
                                          mode="min",verbose=1,save_weights_only=True,
                                          save_best_only=True),
                          EarlyStopping(monitor="val_loss",mode="min",min_delta=0,
                                        patience=10,verbose=1,restore_best_weights=True),
                          ReduceLROnPlateau(monitor="val_loss",mode="min",factor=0.1,patience=5,verbose=1),
                      ],
                      verbose=1,
                      )
        else:
            print("model load.")
            model = create_model(col_num=col_num,col_cat=col_cat,show=False)
            model.load_weights(filepath)
            print("Done.")

        #validation predictions
        y_va_pred = model.predict([x_num_va,x_cat_va])
        tmp_pred = pd.concat([
            id_va,
            pd.DataFrame(y_va,columns=["target1_true","target2_true","target3_true","target4_true"]),
            pd.DataFrame(y_va_pred,columns=["target1_pred","target2_pred","target3_pred","target4_pred"]),
        ],axis=1)
        tmp_pred["nfold"] = nfold
        list_valid_pred.append(tmp_pred)

        #evaluation: MAE per target for this fold
        for i_target,target in enumerate(target_names):
            metrics.append([target,nfold,np.mean(np.abs(y_va[:,i_target] - y_va_pred[:,i_target]))])

    print("-"*10,"result","-"*10)
    #evaluation summary
    df_metrics = pd.DataFrame(metrics,columns=["target","nfold","mae"])
    print("MCMAE: {:.4f}".format(df_metrics["mae"].mean()))

    #validation predictions: one column per (target, fold, true/pred)
    df_valid_pred = pd.concat(list_valid_pred,axis=0,ignore_index=True)
    df_valid_pred_all = pd.pivot_table(df_valid_pred,
                                       index=["engagementMetricsDate","playerId","date_playerId",
                                              "date","yearmonth","playerForTestSetAndFuturePreds"],
                                       columns=["nfold"],
                                       values=list(df_valid_pred.columns[df_valid_pred.columns.str.contains("target")]),
                                       aggfunc="sum")
    #flatten ("target1_true", 0) -> "target1_fold0_true"
    df_valid_pred_all.columns = ["{}_fold{}_{}".format(i.split("_")[0],j,i.split("_")[1]) for i,j in df_valid_pred_all.columns]
    df_valid_pred_all = df_valid_pred_all.reset_index(drop=False)

    return df_valid_pred_all, df_metrics

In [45]:
#Run training on all three folds
df_valid_pred, df_metrics = train_tf(
    input_x=x_train,
    input_y=y_train,
    input_id=id_train,
    list_nfold=[0, 1, 2],
    mode_train="train",
    batch_size=1024,
    epochs=1000,
)

-------------------- fold: 0 --------------------
(752265, 23) (752265, 10) (752265, 4)
(36797, 23) (36797, 10) (36797, 4)
training start.
Epoch 1/1000
[1m734/735[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 62ms/step - loss: 0.9538 - mae: 0.9538
Epoch 1: saving model to model_tf_fold0.weights.h5
[1m735/735[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 63ms/step - loss: 0.9537 - mae: 0.9537 - val_loss: 1.3965 - val_mae: 1.3965 - learning_rate: 0.0010
Epoch 2/1000
[1m734/735[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 61ms/step - loss: 0.8807 - mae: 0.8807
Epoch 2: saving model to model_tf_fold0.weights.h5
[1m735/735[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 62ms/step - loss: 0.8807 - mae: 0.8807 - val_loss: 1.3952 - val_mae: 1.3952 - learning_rate: 0.0010
Epoch 3/1000
[1m735/735[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - loss: 0.8689 - mae: 0.8689
Epoch 3: saving model to model_tf_fold0.weights.h5
[1m735/735[0m [32

In [46]:
#Overall MCMAE, followed by the MAE table per fold (rows) and target (columns).
#margins=True adds the "All" row/column holding the means.
print("MCMAE:{:.4f}".format(df_metrics["mae"].mean()))
#aggfunc is given as the string "mean": passing the numpy callable np.mean is
#deprecated in recent pandas, and the string keeps the same aggregation.
display(pd.pivot_table(df_metrics,index="nfold",columns="target",values="mae",aggfunc="mean",margins=True))

MCMAE:1.2803


target,target1,target2,target3,target4,All
nfold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.2824,2.2135,0.8781,1.2061,1.395
1,1.1698,1.9171,0.8283,1.5133,1.3571
2,1.0887,1.6083,0.75,0.9076,1.0886
All,1.1803,1.9129,0.8188,1.209,1.2803
