<a href="https://colab.research.google.com/github/nagasora/MITSUI-CO.-Commodity-Prediction-Metric/blob/main/basemodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install lightgbm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import json
f = open("/content/drive/MyDrive/kaggle notebook/kaggle.json")
json_data = json.load(f)
os.environ['KAGGLE_USERNAME'] = json_data['username']
os.environ['KAGGLE_KEY'] = json_data['key']


In [4]:
#APIコマンドを入力
!kaggle competitions download -c mitsui-commodity-prediction-challenge

mitsui-commodity-prediction-challenge.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
import os
import zipfile

# Specify the path to the zip file in Google Drive
zip_file_path = "/content/mitsui-commodity-prediction-challenge.zip"

# Specify the destination directory (same as the zip file directory)
destination_directory = os.path.dirname(zip_file_path)

# Create the destination directory if it doesn't exist
os.makedirs(destination_directory, exist_ok=True)

# Unzip the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(destination_directory)

print(f"File unzipped to: {destination_directory}")

File unzipped to: /content


In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv('/content/train.csv')
train_labels = pd.read_csv('/content/train_labels.csv')
target_pairs = pd.read_csv('/content/target_pairs.csv')

print('training data:', train.shape)
print('train label:', train_labels.shape)
print('target_pairs:', target_pairs.shape)

training data: (1917, 558)
train label: (1917, 425)
target_pairs: (424, 3)


In [7]:
"""
#データの結合
df_full = pd.merge(train, train_labels, on='date_id', how='left')

#欠損値の処理
feature_cols = [col for col in df_full.columns if col != 'date_id']
df_full[feature_cols] = df_full[feature_cols].fillna(method='ffill').fillna(method='bfill')

#特徴量エンジニアリング
def create_technical_features(df, price_cols, window_sizes=[5, 10, 20]):
    """"""Create technical indicators for price series""""""
    features = df.copy()

    for col in price_cols:
        if col in df.columns:
            # Moving averages
            for window in window_sizes:
                features[f'{col}_MA_{window}'] = df[col].rolling(window=window).mean()
                features[f'{col}_STD_{window}'] = df[col].rolling(window=window).std()

            # Price changes and returns
            features[f'{col}_Return_1d'] = df[col].pct_change(1)
            features[f'{col}_Return_5d'] = df[col].pct_change(5)
            features[f'{col}_LogReturn_1d'] = np.log(df[col] / df[col].shift(1))

            # Bollinger Bands
            ma_20 = df[col].rolling(window=20).mean()
            std_20 = df[col].rolling(window=20).std()
            features[f'{col}_BB_Upper'] = ma_20 + (2 * std_20)
            features[f'{col}_BB_Lower'] = ma_20 - (2 * std_20)
            features[f'{col}_BB_Position'] = (df[col] - features[f'{col}_BB_Lower']) / (features[f'{col}_BB_Upper'] - features[f'{col}_BB_Lower'])

            # RSI
            delta = df[col].diff()
            gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
            loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
            rs = gain / loss
            features[f'{col}_RSI'] = 100 - (100 / (1 + rs))

    return features

price_cols_for_feats = [col for col in df_full.columns if col != 'date_id']
df_featured = create_technical_features(df_full, price_cols_for_feats)
print("特徴量生成後のデータ形状:", df_featured.shape)

# 新しく作成されたカラムを確認
new_cols = [col for col in df_featured.columns if col not in df_full.columns]
print("生成された特徴量のサンプル:", new_cols[:5])

# 4. 無限大や追加のNaNを処理
df_featured.replace([np.inf, -np.inf], np.nan, inplace=True)
df_featured.fillna(method='ffill', inplace=True)
df_featured.fillna(method='bfill', inplace=True)
"""

'\n#データの結合\ndf_full = pd.merge(train, train_labels, on=\'date_id\', how=\'left\')\n\n#欠損値の処理\nfeature_cols = [col for col in df_full.columns if col != \'date_id\']\ndf_full[feature_cols] = df_full[feature_cols].fillna(method=\'ffill\').fillna(method=\'bfill\')\n\n#特徴量エンジニアリング\ndef create_technical_features(df, price_cols, window_sizes=[5, 10, 20]):\n    Create technical indicators for price series\n    features = df.copy()\n\n    for col in price_cols:\n        if col in df.columns:\n            # Moving averages\n            for window in window_sizes:\n                features[f\'{col}_MA_{window}\'] = df[col].rolling(window=window).mean()\n                features[f\'{col}_STD_{window}\'] = df[col].rolling(window=window).std()\n\n            # Price changes and returns\n            features[f\'{col}_Return_1d\'] = df[col].pct_change(1)\n            features[f\'{col}_Return_5d\'] = df[col].pct_change(5)\n            features[f\'{col}_LogReturn_1d\'] = np.log(df[col] / df[col].shift(1

In [8]:
"""
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

target_cols = [col for col in df_featured.columns if col.startswith('target_')]
featured_cols = [col for col in df_featured.columns if col not in target_cols and col != 'date_id']

target_to_train = 'target_0'

#学習データから、Nanを持つものを消去'
train_df = df_featured.dropna(subset=[target_to_train])
X = train_df[featured_cols]
y = train_df[target_to_train]

# --------------------------------------------------
# TimeSeriesSplit を使ったクロスバリデーション
# --------------------------------------------------
print(f"\n--- Training baseline model for {target_to_train} ---")

def cross_validation_strategy():
    cv_strategy = {
        'method': 'TimeSeriesSplit',
        'n_splits': 5,
        'gap': 5,  # Days between train and test
        'test_size': 90,  # Days in test set
        'expanding_window': True  # Use all previous data for training
    }
    return cv_strategy

cv_strategy = cross_validation_strategy()
tscv = TimeSeriesSplit(
    n_splits=cv_strategy['n_splits'],
    gap=cv_strategy['gap'],
    test_size=cv_strategy['test_size'],
)

oof_preds = np.zeros(len(X))
models = []
scores = []

for fold, (train_index, val_index) in enumerate(tscv.split(X)):
    print(f"--- Fold {fold+1} ---")
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # LightGBMモデルの定義
    lgb_params = {
        'objective': 'regression_l1', # MAE
        'metric': 'rmse', # RMSE
        'n_estimators': 1000,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'num_leaves': 31,
        'verbose': -1,
        'n_jobs': -1,
        'seed': 42
    }

    model = lgb.LGBMRegressor(**lgb_params)

    # 学習
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(100, verbose=False)])

    # 評価
    val_preds = model.predict(X_val)
    oof_preds[val_index] = val_preds
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    scores.append(rmse)
    print(f"Fold {fold+1} RMSE: {rmse:.4f}")
    models.append(model)

print(f"\nAverage CV RMSE: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")
"""

'\nimport lightgbm as lgb\nfrom sklearn.model_selection import TimeSeriesSplit\nfrom sklearn.metrics import mean_squared_error\n\ntarget_cols = [col for col in df_featured.columns if col.startswith(\'target_\')]\nfeatured_cols = [col for col in df_featured.columns if col not in target_cols and col != \'date_id\']\n\ntarget_to_train = \'target_0\'\n\n#学習データから、Nanを持つものを消去\'\ntrain_df = df_featured.dropna(subset=[target_to_train])\nX = train_df[featured_cols]\ny = train_df[target_to_train]\n\n# --------------------------------------------------\n# TimeSeriesSplit を使ったクロスバリデーション\n# --------------------------------------------------\nprint(f"\n--- Training baseline model for {target_to_train} ---")\n\ndef cross_validation_strategy():\n    cv_strategy = {\n        \'method\': \'TimeSeriesSplit\',\n        \'n_splits\': 5,\n        \'gap\': 5,  # Days between train and test\n        \'test_size\': 90,  # Days in test set\n        \'expanding_window\': True  # Use all previous data for trainin

In [9]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import gc

TRAIN_PATH = '/content/train.csv'
LABELS_PATH = '/content/train_labels.csv'
TARGET_PAIRS_PATH = '/content/target_pairs.csv'

# 特徴量エンジニアリングで利用する価格カラム
# LME, JPX, US Stock, FX の主要なカラムをリストアップ
PRICE_COLS = [
    'LME_AH_Close', 'LME_CA_Close', 'LME_NI_Close', 'LME_PB_Close', 'LME_SN_Close', 'LME_ZS_Close',
    'JPX_Gold_Standard_Futures_Close', 'JPX_Silver_Standard_Futures_Close', 'JPX_Platinum_Standard_Futures_Close',
    'JPX_Palladium_Standard_Futures_Close', 'JPX_Nikkei_225_Futures_Close',
    'US_Stock_GLD_adj_close', 'US_Stock_SLV_adj_close', 'US_Stock_USO_adj_close', 'US_Stock_VT_adj_close',
    'FX_EURUSD', 'FX_USDJPY', 'FX_GBPUSD', 'FX_AUDUSD'
]

# モデルのパラメーター
LGB_params = {
    'objective': 'regression_l1',
    'metrics': 'rmse',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 31,
    'verbose': -1,
    'n_jobs': -1,
    'seed':42,
    'boosting_type': 'gbdt',
    'device': 'gpu'
    }

# クロスバリデーションの設定
N_SPLITS = 5
GAP = 5
TEST_SIZE = 90

print("設定が完了しました。")

設定が完了しました。


In [10]:
# --- データ読み込みと前処理 ---

def load_data():
    """データを読み込み、結合する"""
    print("データの読み込みを開始します...")
    train_df = pd.read_csv(TRAIN_PATH)
    labels_df = pd.read_csv(LABELS_PATH)
    df = pd.merge(train_df, labels_df, on='date_id', how='left')
    print("データの読み込みと結合が完了しました。")
    return df

#特徴量エンジニアリング
def create_technical_features(df, price_cols, window_sizes=[5, 10, 20]):
    """テクニカル指標を生成する"""
    features = df.copy()
    for col in price_cols:
        if col in df.columns:
            for window in window_sizes:
                features[f'{col}_MA_{window}'] = features[col].rolling(window=window, min_periods=1).mean()
                features[f'{col}_STD_{window}'] = features[col].rolling(window=window, min_periods=1).std()
            features[f'{col}_Return_1d'] = features[col].pct_change(1)
            features[f'{col}_LogReturn_1d'] = np.log(features[col] / features[col].shift(1))
            ma_20 = features[col].rolling(window=20, min_periods=1).mean()
            std_20 = features[col].rolling(window=20, min_periods=1).std()
            features[f'{col}_BB_Upper'] = ma_20 + (2 * std_20)
            features[f'{col}_BB_Lower'] = ma_20 - (2 * std_20)
    return features

def create_cross_asset_features(df, asset_groups):
    """クロスアセット特徴量を生成する"""
    features = df.copy()
    for group_name, assets in asset_groups.items():
        available_assets = [asset for asset in assets if asset in df.columns]
        if len(available_assets) >= 2:
            group_returns = df[available_assets].pct_change().rolling(5).mean()
            features[f'{group_name}_Momentum_5d'] = group_returns.mean(axis=1)
            features[f'{group_name}_Volatility_5d'] = df[available_assets].pct_change().rolling(5).std().mean(axis=1)
    return features

def generate_features(df):
  print('特徴量生成を開始します。')

  df_features = df.ffill().bfill()
#テクニカル指標の作成
  df_features = create_technical_features(df_features, PRICE_COLS)
#クロスアセットの作成
  asset_groups = {
      'Metals': [col for col in PRICE_COLS if 'LME' in col],
      'Precious_Metals': [col for col in PRICE_COLS if 'Gold' in col or 'Silver' in col or 'Platinum' in col],
      'FX': [col for col in PRICE_COLS if 'FX' in col]
    }
  df_features = create_cross_asset_features(df_features, asset_groups)

  df_features.replace([np.inf, -np.inf], np.nan, inplace=True)
  df_features = df_features.ffill().bfill()

  print(f"特徴量作成完了. データ形状:{df_features.shape} ")
  return df_features

full_df = load_data()
featured_df = generate_features(full_df)

del full_df
gc.collect()

データの読み込みを開始します...
データの読み込みと結合が完了しました。
特徴量生成を開始します。
特徴量作成完了. データ形状:(1917, 1118) 


0

In [11]:
#モデル学習と評価
def train_and_evaluate(df):
  """全ターゲットに対してモデルを学習・評価し、結果を返す"""
  print("\nモデル学習と評価を開始します...")
  target_cols = [col for col in df.columns if col.startswith('target_')]
  featured_cols = [col for col in df.columns if col not in target_cols and col != 'date_id']

  oof_preds_df = pd.DataFrame(index=df.index)
  scores_list = []
  feature_importance_df = pd.DataFrame()

  # Define LGB_params inside the function to ensure correct parameters are used
  LGB_params = {
    'objective': 'regression_l1',
    'metrics': 'rmse',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 31,
    'verbose': -1,
    'n_jobs': -1,
    'seed':42,
    'boosting_type': 'gbdt',
    #'device': 'cuda'
  }


  # ↓↓↓ 全ターゲットで実行する場合はこの行をコメントアウト ↓↓↓
  # target_cols = target_cols[:5] # テスト用に最初の5つのターゲットに絞る
  # ↑↑↑ 全ターゲットで実行する場合はこの行をコメントアウト ↑↑↑

  for i, target in enumerate(target_cols):
    print(f"\n--- Processing Target {i+1}/{len(target_cols)}: {target} ---")

    #このターゲットのための学習データを準備
    temp_df = df.dropna(subset=[target])
    X = temp_df[featured_cols]
    y = temp_df[target]

    #クロスバリデーションの設定
    tscv = TimeSeriesSplit(n_splits=N_SPLITS, gap=GAP, test_size=TEST_SIZE)
    oof_preds = np.zeros(len(X))
    fold_models = []
    fold_scores = []
    fold_importances = pd.DataFrame(index=featured_cols)

    for fold, (train_index, val_index) in enumerate(tscv.split(X)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model = lgb.LGBMRegressor(**LGB_params)
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='rmse',
                  callbacks=[lgb.early_stopping(100, verbose=False)])

    val_preds = model.predict(X_val)
    oof_preds[val_index] = val_preds
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    fold_scores.append(rmse)

    fold_importances[f'fold_{fold+1}'] = model.feature_importances_
    fold_models.append(model)

    # 3. 結果の集計
    oof_preds_df[target] = pd.Series(oof_preds, index=X.index)
    avg_rmse = np.mean(fold_scores)
    std_rmse = np.std(fold_scores)
    scores_list.append({'target': target, 'avg_rmse': avg_rmse, 'std_rmse': std_rmse})
    print(f"Target {target} | Avg RMSE: {avg_rmse:.4f} (+/- {std_rmse:.4f})")

    # 特徴量重要度の集計
    feature_importance_df[target] = fold_importances.mean(axis=1)

  # 4. 最終結果をまとめる
  scores_df = pd.DataFrame(scores_list)
  return oof_preds_df, scores_df, feature_importance_df

# --- 実行 ---
oof_df, scores_df, importance_df = train_and_evaluate(featured_df)

print("\n--- 全ターゲットの学習が完了しました ---")
print("\n平均評価スコア:")
print(scores_df.head())
print(f"\n全ターゲットの平均RMSE: {scores_df['avg_rmse'].mean():.4f}")

print("\n特徴量重要度 (上位10件):")
# 全ターゲットでの重要度の平均を計算して表示
top_features = importance_df.mean(axis=1).sort_values(ascending=False).head(10)
print(top_features)


# --- 結果の保存 ---
# 必要に応じて、結果をファイルに保存します
oof_df.to_csv('/content/drive/MyDrive/oof_predictions_lgbm.csv')
scores_df.to_csv('/content/drive/MyDrive/scores_lgbm.csv', index=False)
importance_df.to_csv('/content/drive/MyDrive/feature_importance_lgbm.csv')
print("\n予測結果、スコア、特徴量重要度をCSVファイルとして保存しました。")


モデル学習と評価を開始します...

--- Processing Target 1/424: target_0 ---
Target target_0 | Avg RMSE: 0.0149 (+/- 0.0000)

--- Processing Target 2/424: target_1 ---
Target target_1 | Avg RMSE: 0.0175 (+/- 0.0000)

--- Processing Target 3/424: target_2 ---
Target target_2 | Avg RMSE: 0.0111 (+/- 0.0000)

--- Processing Target 4/424: target_3 ---
Target target_3 | Avg RMSE: 0.0096 (+/- 0.0000)

--- Processing Target 5/424: target_4 ---
Target target_4 | Avg RMSE: 0.0148 (+/- 0.0000)

--- Processing Target 6/424: target_5 ---
Target target_5 | Avg RMSE: 0.0146 (+/- 0.0000)

--- Processing Target 7/424: target_6 ---
Target target_6 | Avg RMSE: 0.0113 (+/- 0.0000)

--- Processing Target 8/424: target_7 ---
Target target_7 | Avg RMSE: 0.0172 (+/- 0.0000)

--- Processing Target 9/424: target_8 ---
Target target_8 | Avg RMSE: 0.0176 (+/- 0.0000)

--- Processing Target 10/424: target_9 ---
Target target_9 | Avg RMSE: 0.0138 (+/- 0.0000)

--- Processing Target 11/424: target_10 ---
Target target_10 | Avg RM