In [20]:
#時系列タスクで気を付ける点
#・説明変数として使ってよいデータは何か？
#・古いデータは学習に使うべきか？
#・学習用データセットから検証用データをどう作るか？

#ライブラリのインポート
import numpy as np
import pandas as pd
import pickle
import gc
import os
import datetime as dt

import matplotlib.pyplot as plt

import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

# Show floats in pandas output with a width of 10 and 4 decimal places
pd.set_option("display.float_format", "{:10.4f}".format)

In [4]:
# Load the training file and check its size
train = pd.read_csv("/kaggle/input/mlb-player-digital-engagement-forecasting/train_updated.csv")
print(train.shape)

# Keep only the rows dated 2020-04-01 or later to speed up processing
is_recent = train["date"].ge(20200401)
train = train[is_recent].reset_index(drop=True)
print(train.shape)

(1308, 12)
(487, 12)


In [5]:
# Preview the first rows of the filtered training table
train.head()

Unnamed: 0,date,nextDayPlayerEngagement,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
0,20200401,"[{""engagementMetricsDate"":""2020-04-02"",""player...",,"[{""playerId"":430935,""gameDate"":""2020-04-01"",""t...",,,,,,,"[{""date"":""2020-04-01"",""playerId"":545361,""playe...","[{""date"":""2020-04-01"",""teamId"":147,""teamName"":..."
1,20200402,"[{""engagementMetricsDate"":""2020-04-03"",""player...",,"[{""playerId"":405395,""gameDate"":""2020-04-02"",""t...",,,,,,,,
2,20200403,"[{""engagementMetricsDate"":""2020-04-04"",""player...",,"[{""playerId"":425844,""gameDate"":""2020-04-03"",""t...",,,,,,,,
3,20200404,"[{""engagementMetricsDate"":""2020-04-05"",""player...",,"[{""playerId"":405395,""gameDate"":""2020-04-04"",""t...",,,,,,,,
4,20200405,"[{""engagementMetricsDate"":""2020-04-06"",""player...",,"[{""playerId"":408234,""gameDate"":""2020-04-05"",""t...",,,,,,,,


In [6]:
#json形式の列を表形式に直す関数
def unpack_json(json_str):
    """Convert one JSON-encoded cell into a DataFrame.

    Parameters
    ----------
    json_str : str or missing value
        JSON text to parse, or a missing value (``None`` / NaN).

    Returns
    -------
    pandas.DataFrame or float
        The table parsed by ``pd.read_json``, or ``np.nan`` when the cell
        is missing.
    """
    # Imported here so the function does not depend on the notebook's import cell
    from io import StringIO

    if pd.isna(json_str):
        return np.nan
    # Passing a literal JSON string to read_json is deprecated in recent pandas;
    # wrapping the text in a file-like buffer works on old and new versions.
    return pd.read_json(StringIO(json_str))

def extract_data(input_df, col="events", show=False):
    """Unpack a JSON column row by row and stack the results into one table.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame holding the JSON column.
    col : str, default "events"
        Name of the column whose cells are passed to ``unpack_json``.
    show : bool, default False
        When True, print a progress counter, the final shape, and display the
        first rows of the result.

    Returns
    -------
    pandas.DataFrame
        All unpacked cells concatenated with a fresh 0..n-1 index. Empty when
        no cell could be unpacked.
    """
    n_rows = len(input_df)
    frames = []
    for i in range(n_rows):
        if show:
            print("\r{}/{}".format(i + 1, n_rows), end="")
        try:
            unpacked = unpack_json(input_df[col].iloc[i])
        except Exception:
            # Best-effort: a cell that cannot be unpacked is skipped.
            # Catching Exception (not a bare except) lets Ctrl-C / kernel
            # interrupts stop the loop instead of being swallowed.
            continue
        # Missing cells come back as NaN rather than a table; leave them out
        if isinstance(unpacked, pd.DataFrame):
            frames.append(unpacked)

    # Concatenate once at the end instead of growing a frame inside the loop
    if frames:
        output_df = pd.concat(frames, axis=0, ignore_index=True)
    else:
        output_df = pd.DataFrame()

    if show:
        print("")
        print(output_df.shape)
        display(output_df.head())
    return output_df

In [7]:
# Unpack the nextDayPlayerEngagement column into a flat table
df_engagement = extract_data(input_df=train, col="nextDayPlayerEngagement", show=True)

487/487
(1003707, 6)


Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4
0,2020-04-02,425794,5.1249,9.434,0.1179,6.1947
1,2020-04-02,571704,0.0389,8.1761,0.0105,2.1304
2,2020-04-02,506702,0.0106,5.0314,0.0082,0.885
3,2020-04-02,607231,0.0247,2.8302,0.0222,0.59
4,2020-04-02,543193,0.0071,1.1006,0.0012,0.1967


In [8]:
# Pre-processing of df_engagement
# Build the join key "<yyyymmdd>_<playerId>"
compact_date = df_engagement["engagementMetricsDate"].str.replace("-", "")
player_id_text = df_engagement["playerId"].astype(str)
df_engagement["date_playerId"] = compact_date + "_" + player_id_text
df_engagement.head()

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,date_playerId
0,2020-04-02,425794,5.1249,9.434,0.1179,6.1947,20200402_425794
1,2020-04-02,571704,0.0389,8.1761,0.0105,2.1304,20200402_571704
2,2020-04-02,506702,0.0106,5.0314,0.0082,0.885,20200402_506702
3,2020-04-02,607231,0.0247,2.8302,0.0222,0.59,20200402_607231
4,2020-04-02,543193,0.0071,1.1006,0.0012,0.1967,20200402_543193


In [10]:
# Features derived from the date
# "date" is the day the prediction is made (the day before the predicted day)
metrics_day = pd.to_datetime(df_engagement["engagementMetricsDate"], format="%Y-%m-%d")
df_engagement["date"] = metrics_day - dt.timedelta(days=1)
# Day of week and year-month of the prediction day
df_engagement["dayofweek"] = df_engagement["date"].dt.dayofweek
df_engagement["yearmonth"] = df_engagement["date"].astype(str).str[:7]
df_engagement.head()

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,date_playerId,date,dayofweek,yearmonth
0,2020-04-02,425794,5.1249,9.434,0.1179,6.1947,20200402_425794,2020-04-01,2,2020-04
1,2020-04-02,571704,0.0389,8.1761,0.0105,2.1304,20200402_571704,2020-04-01,2,2020-04
2,2020-04-02,506702,0.0106,5.0314,0.0082,0.885,20200402_506702,2020-04-01,2,2020-04
3,2020-04-02,607231,0.0247,2.8302,0.0222,0.59,20200402_607231,2020-04-01,2,2020-04
4,2020-04-02,543193,0.0071,1.1006,0.0012,0.1967,20200402_543193,2020-04-01,2,2020-04


In [11]:
# Load players.csv
df_players = pd.read_csv("/kaggle/input/mlb-player-digital-engagement-forecasting/players.csv")
print(df_players.shape)
# Number of distinct player ids
print(df_players["playerId"].nunique())
df_players.head()

(2061, 12)
2061


Unnamed: 0,playerId,playerName,DOB,mlbDebutDate,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
0,665482,Gilberto Celestino,1999-02-13,2021-06-02,Santo Domingo,,Dominican Republic,72,170,8,Outfielder,False
1,593590,Webster Rivas,1990-08-08,2021-05-28,Nagua,,Dominican Republic,73,219,3,First Base,True
2,661269,Vladimir Gutierrez,1995-09-18,2021-05-28,Havana,,Cuba,73,190,1,Pitcher,True
3,669212,Eli Morgan,1996-05-13,2021-05-28,Rancho Palos Verdes,CA,USA,70,190,1,Pitcher,True
4,666201,Alek Manoah,1998-01-09,2021-05-27,Homestead,FL,USA,78,260,1,Pitcher,True


In [12]:
# Check which players are evaluated in the test data
# Encode the flag as 1 (evaluated) / 0 (not evaluated)
is_test_player = df_players["playerForTestSetAndFuturePreds"] == True
df_players["playerForTestSetAndFuturePreds"] = np.where(is_test_player, 1, 0)
# Count and share of evaluated players
test_flag = df_players["playerForTestSetAndFuturePreds"]
print(test_flag.sum())
print(test_flag.mean())

1187
0.5759340126152354


In [13]:
# Build the dataset

# Left-join the player attributes onto the engagement table
df_train = df_engagement.merge(df_players, on="playerId", how="left")
print(df_train.shape)

(1003707, 21)


In [14]:
# Split df_train into explanatory variables, targets and identifier columns
feature_cols = [
    "playerId", "dayofweek", "birthCity", "birthStateProvince", "birthCountry",
    "heightInches", "weight", "primaryPositionCode", "primaryPositionName",
    "playerForTestSetAndFuturePreds",
]
target_cols = ["target1", "target2", "target3", "target4"]
id_cols = [
    "engagementMetricsDate", "playerId", "date_playerId", "date", "yearmonth",
    "playerForTestSetAndFuturePreds",
]

x_train = df_train[feature_cols]
y_train = df_train[target_cols]
id_train = df_train[id_cols]
print(x_train.shape,y_train.shape,id_train.shape)
x_train.head()

(1003707, 10) (1003707, 4) (1003707, 6)


Unnamed: 0,playerId,dayofweek,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
0,425794,2,Brunswick,GA,USA,79,230,1,Pitcher,1
1,571704,2,Albuquerque,NM,USA,75,210,1,Pitcher,0
2,506702,2,Maracaibo,,Venezuela,70,235,2,Catcher,1
3,607231,2,Savannah,GA,USA,76,200,1,Pitcher,1
4,543193,2,Columbia,CA,USA,76,215,1,Pitcher,0


In [15]:
# Convert the categorical explanatory variables to the pandas "category" dtype.
# astype with a column->dtype mapping returns a new frame, so nothing is
# assigned into x_train while it is still a column subset of df_train
# (the per-column assignment raised SettingWithCopyWarning, which the global
# warning filter was hiding).
categorical_cols = ["playerId","dayofweek","birthCity","birthStateProvince","birthCountry",
                    "primaryPositionCode","primaryPositionName"]
x_train = x_train.astype({name: "category" for name in categorical_cols})

In [16]:
# Validation design

# Training / validation periods per fold: twelve consecutive training months
# followed by the single month used for validation
list_cv_month = [
    [
        pd.period_range(start=first_month, periods=12, freq="M").strftime("%Y-%m").tolist(),
        [(pd.Period(first_month, freq="M") + 12).strftime("%Y-%m")],
    ]
    for first_month in ("2020-05", "2020-06", "2020-07")
]

In [17]:
# Build the training / validation index lists for each fold.
# Validation rows are restricted to players flagged for the test set.
is_test_player = id_train["playerForTestSetAndFuturePreds"] == 1
cv = [
    [
        id_train.index[id_train["yearmonth"].isin(month_tr)],
        id_train.index[id_train["yearmonth"].isin(month_va) & is_test_player],
    ]
    for month_tr, month_va in list_cv_month
]
# Index lists of fold 0
cv[0]

[Index([ 61830,  61831,  61832,  61833,  61834,  61835,  61836,  61837,  61838,
         61839,
        ...
        814085, 814086, 814087, 814088, 814089, 814090, 814091, 814092, 814093,
        814094],
       dtype='int64', length=752265),
 Index([814095, 814096, 814100, 814101, 814102, 814104, 814105, 814106, 814107,
        814109,
        ...
        877931, 877934, 877950, 877951, 877957, 877958, 877969, 877972, 877974,
        877975],
       dtype='int64', length=36797)]

In [18]:
# Model training

# Case: target "target1", fold 0
target = "target1"
nfold = 0

# Index lists of the chosen fold
idx_tr, idx_va = cv[nfold]

# Split into training and validation data
x_tr, x_va = x_train.loc[idx_tr, :], x_train.loc[idx_va, :]
y_tr, y_va = y_train.loc[idx_tr, target], y_train.loc[idx_va, target]
id_tr, id_va = id_train.loc[idx_tr, :], id_train.loc[idx_va, :]
print(x_tr.shape,y_tr.shape,id_tr.shape)
print(x_va.shape,y_va.shape,id_va.shape)

(752265, 10) (752265,) (752265, 6)
(36797, 10) (36797,) (36797, 6)


In [22]:
# Hyperparameters
params = dict(
    boosting_type='gbdt',
    objective='regression_l1',
    metric='mean_absolute_error',
    learning_rate=0.05,
    num_leaves=32,
    subsample=0.7,
    subsample_freq=1,
    feature_fraction=0.8,
    min_data_in_leaf=50,
    min_sum_hessian_in_leaf=50,
    n_estimators=1000,
    random_state=123,
    importance_type="gain",
)

# Train the model, evaluating on both the training and the validation data
model = lgb.LGBMRegressor(**params)
verbose_eval = 100
eval_pairs = [(x_tr, y_tr), (x_va, y_va)]
fit_callbacks = [
    early_stopping(stopping_rounds=50),  # early-stopping callback
    log_evaluation(verbose_eval),        # logging callback
]
model.fit(x_tr, y_tr, eval_set=eval_pairs, callbacks=fit_callbacks)

# Save the model. The file is a pickle; ".h5" is normally a deep-learning extension.
with open("model_lgb_target1_fold0.h5", "wb") as f:
    pickle.dump(model, f, protocol=4)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017892 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3300
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 10
[LightGBM] [Info] Start training from score 0.001289
Training until validation scores don't improve for 50 rounds
[100]	training's l1: 0.50831	valid_1's l1: 1.29786
[200]	training's l1: 0.508183	valid_1's l1: 1.29768
[300]	training's l1: 0.508143	valid_1's l1: 1.29767
Early stopping, best iteration is:
[258]	training's l1: 0.508161	valid_1's l1: 1.29766


In [23]:
# Model evaluation
# Predictions for the validation data
y_va_pred = model.predict(x_va)

# Container for the predictions of every target / fold
df_valid_pred = pd.DataFrame()

# Store the predictions next to their identifier columns.
# The fold is taken from nfold (it was hard-coded as 0), so the record stays
# consistent with the metrics entry below if another fold is selected.
tmp_pred = pd.concat([
    id_va,
    pd.DataFrame({"target":target,"nfold":nfold,"true":y_va,"pred":y_va_pred}),
],axis=1)
df_valid_pred = pd.concat([df_valid_pred, tmp_pred], axis=0, ignore_index=True)

# Container for the evaluation values of every target / fold
metrics = []

# Mean absolute error on the validation data
metrics_va = mean_absolute_error(y_va,y_va_pred)
# Store the evaluation value
metrics.append([target,nfold,metrics_va])
metrics



[['target1', 0, 1.2976578174338422]]

In [25]:
# Importance of each explanatory variable.
# The target is taken from the `target` variable (it was hard-coded as
# "target1"), so the record follows whichever target was trained.
tmp_imp = pd.DataFrame({"col":x_tr.columns,"imp":model.feature_importances_,
                       "target":target,"nfold":nfold})
# Check the values, most important first
display(tmp_imp.sort_values("imp",ascending=False))
# Container for the importances of every target / fold
df_imp = pd.DataFrame()
# Append this fold's importances to df_imp
df_imp = pd.concat([df_imp,tmp_imp],axis=0,ignore_index=True)

Unnamed: 0,col,imp,target,nfold
0,playerId,13595482.8115,target1,0
9,playerForTestSetAndFuturePreds,2314285.0327,target1,0
2,birthCity,2249420.1773,target1,0
7,primaryPositionCode,523633.5634,target1,0
8,primaryPositionName,91211.0063,target1,0
1,dayofweek,89016.5762,target1,0
3,birthStateProvince,35673.0473,target1,0
6,weight,30337.572,target1,0
5,heightInches,20493.2084,target1,0
4,birthCountry,4882.033,target1,0


In [26]:
# Model evaluation
# Turn the list of [target, nfold, mae] records into a DataFrame
df_metrics = pd.DataFrame(metrics,columns=["target","nfold","mae"])
display(df_metrics.head())

# Overall score: mean of the per-target / per-fold MAE values
print("MCMAE: {:.4f}".format(df_metrics["mae"].mean()))

# MAE by fold and target. The aggregation is named by string because passing
# the NumPy callable (np.mean) is deprecated in recent pandas.
display(pd.pivot_table(df_metrics,index="nfold",columns="target",values="mae",
                      aggfunc="mean",margins=True))

Unnamed: 0,target,nfold,mae
0,target1,0,1.2977


MCMAE: 1.2977


target,target1,All
nfold,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.2977,1.2977
All,1.2977,1.2977


In [27]:
# Reshape the validation predictions to one row per identifier key, with one
# column per (value, target, fold) combination.
# The aggregation is named by string because passing the NumPy callable
# (np.sum) is deprecated in recent pandas.
df_valid_pred_all = pd.pivot_table(df_valid_pred,index=
                                   ["engagementMetricsDate","playerId","date_playerId",
                                    "date","yearmonth","playerForTestSetAndFuturePreds"],
                                   columns=["target","nfold"],values=["true","pred"],aggfunc="sum")
# Flatten the (value, target, fold) column levels into "<target>_fold<fold>_<value>"
df_valid_pred_all.columns = ["{}_fold{}_{}".format(j,k,i)for i,j,k in df_valid_pred_all.columns]
df_valid_pred_all = df_valid_pred_all.reset_index(drop=False)
df_valid_pred_all.head()

Unnamed: 0,engagementMetricsDate,playerId,date_playerId,date,yearmonth,playerForTestSetAndFuturePreds,target1_fold0_pred,target1_fold0_true
0,2021-05-02,405395,20210502_405395,2021-05-01,2021-05,1,0.6049,0.1518
1,2021-05-02,408234,20210502_408234,2021-05-01,2021-05,1,0.3317,0.2365
2,2021-05-02,424144,20210502_424144,2021-05-01,2021-05,1,0.002,0.0016
3,2021-05-02,425772,20210502_425772,2021-05-01,2021-05,1,0.0065,0.0035
4,2021-05-02,425784,20210502_425784,2021-05-01,2021-05,1,0.0008,0.0001


In [28]:
# Mean and standard deviation of each explanatory variable's importance,
# sorted by the mean in descending order
(
    df_imp.groupby("col")["imp"]
    .agg(["mean", "std"])
    .sort_values(by="mean", ascending=False)
)

Unnamed: 0_level_0,mean,std
col,Unnamed: 1_level_1,Unnamed: 2_level_1
playerId,13595482.8115,
playerForTestSetAndFuturePreds,2314285.0327,
birthCity,2249420.1773,
primaryPositionCode,523633.5634,
primaryPositionName,91211.0063,
dayofweek,89016.5762,
birthStateProvince,35673.0473,
weight,30337.572,
heightInches,20493.2084,
birthCountry,4882.033,


In [None]:
#学習用関数の作成
def train_lgb(input_x,
              input_y,
              input_id,
              params,
              list_nfold=[0,1,2],
              mode_train="train",
             ):
    """Train (or reload) one LightGBM regressor per target and fold, then evaluate it.

    The folds are built from the module-level ``list_cv_month``: each entry
    gives the training months and the validation months, matched against
    ``input_id["yearmonth"]``. Validation rows are further restricted to rows
    whose ``playerForTestSetAndFuturePreds`` equals 1.

    Parameters
    ----------
    input_x : pandas.DataFrame
        Explanatory variables.
    input_y : pandas.DataFrame
        Target columns "target1" .. "target4".
    input_id : pandas.DataFrame
        Identifier columns; must contain "yearmonth" and
        "playerForTestSetAndFuturePreds" plus the key columns used in the
        final pivot.
    params : dict
        Keyword arguments passed to ``lgb.LGBMRegressor``.
    list_nfold : list of int, default [0, 1, 2]
        Positions in ``list_cv_month`` of the folds to run. Only read, never
        modified.
    mode_train : str, default "train"
        "train" fits and pickles each model to
        "model_lgb_<target>_fold<nfold>.h5"; any other value loads the model
        from that file instead.

    Returns
    -------
    df_valid_pred_all : pandas.DataFrame
        Validation truths and predictions, one row per identifier key and one
        column per "<target>_fold<nfold>_<true|pred>".
    df_metrics : pandas.DataFrame
        Columns "target", "nfold", "mae".
    df_imp : pandas.DataFrame
        Columns "col", "imp", "target", "nfold".
    """
    target_cols = ["target1","target2","target3","target4"]

    # Per-iteration results are collected in lists and concatenated once at
    # the end, instead of growing DataFrames inside the loop
    valid_preds = []
    metrics = []
    importances = []

    #validation
    is_test_player = input_id["playerForTestSetAndFuturePreds"]==1
    cv = []
    for month_tr,month_va in list_cv_month:
        cv.append([
            input_id.index[input_id["yearmonth"].isin(month_tr)],
            input_id.index[input_id["yearmonth"].isin(month_va) & is_test_player],
        ])

    # Model training (one model per target and fold)
    for nfold in list_nfold:
        idx_tr,idx_va = cv[nfold][0],cv[nfold][1]
        # Use the frames passed in as arguments, not the notebook globals
        x_tr,id_tr = input_x.loc[idx_tr,:],input_id.loc[idx_tr,:]
        x_va,id_va = input_x.loc[idx_va,:],input_id.loc[idx_va,:]

        for target in target_cols:
            print("-"*20,target,",fold:", nfold,"-"*20)
            # Split the target into training and validation parts
            y_tr = input_y.loc[idx_tr,target]
            y_va = input_y.loc[idx_va,target]
            print(x_tr.shape,y_tr.shape,id_tr.shape)
            print(x_va.shape,y_va.shape,id_va.shape)

            # File name of the saved model (a pickle, despite the .h5 extension)
            filepath = "model_lgb_{}_fold{}.h5".format(target,nfold)

            if mode_train == "train":
                print("training start.")
                model = lgb.LGBMRegressor(**params)
                verbose_eval = 100
                model.fit(x_tr,
                         y_tr,
                         eval_set=[(x_tr,y_tr),(x_va,y_va)],
                         callbacks=[
                        early_stopping(stopping_rounds=50),  # early-stopping callback
                        log_evaluation(verbose_eval)]          # logging callback
                         )
                with open(filepath,"wb") as f:
                    pickle.dump(model,f,protocol=4)
            else:
                print("model load.")
                # NOTE(security): unpickling can execute arbitrary code; only
                # load files that were written by this function.
                with open(filepath,"rb") as f:
                    model = pickle.load(f)
                print("Done.")

            # Predictions on the validation data, stored next to their keys
            y_va_pred = model.predict(x_va)
            tmp_pred = pd.concat([
                id_va,
                pd.DataFrame({"target":target,"nfold":nfold,"true":y_va,"pred":y_va_pred}),
            ],axis=1)
            valid_preds.append(tmp_pred)

            # Evaluation value
            metrics_va = mean_absolute_error(y_va,y_va_pred)
            metrics.append([target,nfold,metrics_va])

            # Feature importance
            tmp_imp = pd.DataFrame({"col":x_tr.columns,"imp":model.feature_importances_,
                       "target":target,"nfold":nfold})
            importances.append(tmp_imp)

    if valid_preds:
        df_valid_pred = pd.concat(valid_preds,axis=0,ignore_index=True)
    else:
        df_valid_pred = pd.DataFrame()
    if importances:
        df_imp = pd.concat(importances,axis=0,ignore_index=True)
    else:
        df_imp = pd.DataFrame()

    print("-"*20,"result","-"*20)
    # Evaluation values: mean of the per-target / per-fold MAE
    df_metrics = pd.DataFrame(metrics,columns=["target","nfold","mae"])
    print("MCMAE: {:.4f}".format(df_metrics["mae"].mean()))

    # Validation predictions, one column per (value, target, fold).
    # aggfunc is given by name because passing np.sum is deprecated in recent pandas.
    df_valid_pred_all = pd.pivot_table(df_valid_pred,index=
                               ["engagementMetricsDate","playerId","date_playerId",
                                "date","yearmonth","playerForTestSetAndFuturePreds"],
                               columns=["target","nfold"],values=["true","pred"],aggfunc="sum")
    # Flatten the column levels into "<target>_fold<nfold>_<value>"
    df_valid_pred_all.columns = ["{}_fold{}_{}".format(j,k,i)for i,j,k in df_valid_pred_all.columns]
    df_valid_pred_all = df_valid_pred_all.reset_index(drop=False)

    return df_valid_pred_all, df_metrics, df_imp