In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

# One seed shared by the global RNGs of the standard library and NumPy so that
# anything drawing from them gives the same result on every fresh run.
SEED = 1234
np.random.seed(SEED)
random.seed(SEED)

# Allow up to 50 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

### 地図上に可視化

In [9]:
from folium import Map, Marker
from folium.plugins import MarkerCluster

# Plot every training sample on an interactive folium map; nearby points are
# grouped by a MarkerCluster. The result is written to a standalone HTML file.
data = pd.read_csv('data/train_data.csv', index_col=0)

# NOTE(review): `map` shadows the Python builtin of the same name. The name is
# kept as-is because other cells may refer to it.
map = Map(location=[25.5, 127], zoom_start=10)
marker_cluster = MarkerCluster()
for lat, lon in zip(data['lat'], data['lon']):
    marker_cluster.add_child(Marker(location=[lat, lon]))

map.add_child(marker_cluster)
map.save("map_ocean.html")

## ベースモデル

In [20]:
from sklearn.model_selection import train_test_split

# Columns removed from every feature matrix, plus the regression target.
# Keeping them in one place guarantees train/val/test drop the same set.
NON_FEATURE_COLS = ['YMD', 'Landsat_StartTime', 'PRODUCT_ID', 'mesh20']
TARGET_COL = 'cover'

# Hold out 20% of the labelled rows for validation. The split uses the
# notebook-wide SEED so it stays in sync with the other seeded steps.
train_data = pd.read_csv('data/train_data.csv', index_col=0)
train_data, val_data = train_test_split(
    train_data, test_size=0.2, shuffle=True, random_state=SEED
)
test_data = pd.read_csv('data/test_data.csv', index_col=0)

X_train = train_data.drop(columns=NON_FEATURE_COLS + [TARGET_COL])
y_train = train_data[TARGET_COL]

X_val = val_data.drop(columns=NON_FEATURE_COLS + [TARGET_COL])
y_val = val_data[TARGET_COL]

# The test file has no target column, so only the non-feature columns go.
X_test = test_data.drop(columns=NON_FEATURE_COLS)

In [19]:
import optuna.integration.lightgbm as lgb
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.model_selection import train_test_split
import lightgbm as lgbn
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Convert the training split into a LightGBM Dataset.
lgb_train = lgbn.Dataset(X_train, y_train)

# Fixed parameters handed to the tuner; the remaining hyperparameters are
# searched by LightGBMTunerCV.
params = {
    'objective': 'regression',
    'metric': 'l2',
    'random_seed': SEED,
    'n_jobs': -1,
    'force_row_wise': True,
    # Without this, the warning
    # `No further splits with positive gain, best gain: -inf` is displayed.
    'verbose': -1,
}

# Hyperparameter search scored by shuffled 5-fold cross-validation.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
tuner = lgb.LightGBMTunerCV(
    params,
    lgb_train,
    callbacks=[
        lgbn.early_stopping(stopping_rounds=10, verbose=False),
        lgbn.log_evaluation(period=0),  # period=0 disables per-iteration logs
    ],
    folds=kf,
    optuna_seed=SEED,  # seed the tuner's sampler so the search is repeatable
)
# Run the search and keep the best parameter set it found.
tuner.run()
best_params = tuner.best_params

# Retrain on the training split with the tuned parameters, using the held-out
# validation split for early stopping.
lgb_train = lgbn.Dataset(X_train, y_train)
lgb_val = lgbn.Dataset(X_val, y_val, reference=lgb_train)

model = lgbn.train(
    best_params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    callbacks=[
        lgbn.early_stopping(10),
        lgbn.log_evaluation(period=0),
    ],
)

# Predict on the test features with the iteration chosen by early stopping.
pred_y = model.predict(X_test, num_iteration=model.best_iteration)

[32m[I 2023-01-24 23:15:28,596][0m A new study created in memory with name: no-name-b4ba0368-c12f-4c67-a43e-518d35488ca1[0m
[32m[I 2023-01-24 23:22:21,888][0m Trial 0 finished with value: 0.012612398123661056 and parameters: {'feature_fraction': 1.0}. Best is trial 0 with value: 0.012612398123661056.[0m
[32m[I 2023-01-24 23:25:56,508][0m Trial 1 finished with value: 0.012666705976966739 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.012612398123661056.[0m
[32m[I 2023-01-24 23:32:55,108][0m Trial 2 finished with value: 0.012620464999066683 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.012612398123661056.[0m
[32m[I 2023-01-24 23:39:51,532][0m Trial 3 finished with value: 0.012504184966478923 and parameters: {'feature_fraction': 0.7}. Best is trial 3 with value: 0.012504184966478923.[0m
[32m[I 2023-01-24 23:43:23,121][0m Trial 4 finished with value: 0.012761208687734834 and parameters: {'feature_fraction'

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[113]	training's l2: 0.000622883	valid_1's l2: 0.0107592


In [23]:
# Keep three decimal places in the predictions.
pred_y = pred_y.round(3)

In [25]:
# Write the submission as headerless `<row id>,<prediction>` lines. Keying the
# predictions by X_test's index ties each value to the test row it was
# predicted for instead of relying on a positional 0..n-1 index; a length
# mismatch between predictions and test rows now raises instead of passing
# silently.
pd.Series(pred_y, index=X_test.index).to_csv('data/submit.csv', header=False)

## EDA

In [47]:
# Reload the full training CSV (first column used as the index) for the EDA section.
data = pd.read_csv('data/train_data.csv',index_col=0)