# exp006 - 「Less is More」による精密最適化

exp005の失敗（過学習）を受けて、シンプル化と精密最適化による堅実な改善を目指す

## 戦略
1. **特徴量削減**: 45個 → 15-18個
2. **単一モデル集中**: LightGBMのみ
3. **ハイパーパラメータ最適化**: Optuna活用
4. **汎化性能重視**: CVスコアとKaggleスコアの乖離を監視

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import optuna
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Select the IPAexGothic font family for all matplotlib text.
plt.rcParams.update({'font.family': 'IPAexGothic'})

# Start-up banner with the versions of the two key libraries.
print("🎯 exp006 - Less is More戦略開始")
for label, module in (("LightGBM", lgb), ("Optuna", optuna)):
    print(f"{label}: {module.__version__}")

# Load the train/test CSVs from the shared data directory.
DATA_DIR = '/Users/koki.ogai/Documents/ghq/github.com/oddgai/kaggle-projects/titanic/data'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

print(f"\nTrain shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

## Step 1: 準備・分析（exp004特徴量重要度分析）

In [None]:
# exp004と同じ特徴量エンジニアリング（比較のため）
def exp004_feature_engineering(df):
    """Return a copy of ``df`` extended with the exp004 feature columns.

    The input frame is not modified. Surname and ticket frequencies are
    counted over the module-level ``train_df`` and ``test_df`` combined, so
    both must be loaded before this function is called.

    NOTE(review): the Age/Fare fill medians, the percentile-rank features and
    the ``Fare_Group`` quantile edges are all derived from ``df`` itself, so
    they are computed independently for each frame passed in.
    """
    out = df.copy()
    names = out['Name']
    tickets = out['Ticket']
    cabins = out['Cabin']

    # --- Name-derived features ---
    # Raw titles are collapsed into a handful of groups; anything not listed
    # here ends up as 'Other'.
    title_groups = {
        'Mr': ['Mr'],
        'Mrs': ['Mrs', 'Ms', 'Mme'],
        'Miss': ['Miss', 'Mlle'],
        'Master': ['Master'],
        'Rare': ['Dr', 'Rev', 'Col', 'Major', 'Countess', 'Lady',
                 'Jonkheer', 'Don', 'Dona', 'Capt', 'Sir'],
    }
    title_mapping = {
        raw: group for group, raws in title_groups.items() for raw in raws
    }
    out['Title'] = names.str.extract(r' ([A-Za-z]+)\.')
    out['Title_Grouped'] = out['Title'].map(title_mapping).fillna('Other')
    out['Name_Length'] = names.str.len()

    # --- Surname and how often it occurs across train + test ---
    out['Surname'] = names.str.split(',').str[0]
    surname_counts = (
        pd.concat([train_df['Name'], test_df['Name']])
        .str.split(',')
        .str[0]
        .value_counts()
    )
    out['Surname_Count'] = out['Surname'].map(surname_counts)

    # --- Ticket features ---
    out['Ticket_Length'] = tickets.str.len()
    out['Ticket_IsNumeric'] = tickets.str.isnumeric().astype(int)
    out['Ticket_Prefix'] = tickets.str.extract(r'^([A-Za-z]+)').fillna('NUMERIC')

    ticket_counts = pd.concat([train_df['Ticket'], test_df['Ticket']]).value_counts()
    out['Ticket_Count'] = tickets.map(ticket_counts)

    # --- Cabin features ---
    out['HasCabin'] = cabins.notna().astype(int)
    out['Cabin_Deck'] = cabins.str.extract(r'^([A-Za-z])').fillna('Unknown')
    out['Cabin_Number'] = cabins.str.extract(r'(\d+)').astype(float)
    # Number of whitespace-separated cabin entries; forced to 0 when missing.
    out['Cabin_Count'] = cabins.fillna('').str.split().str.len()
    out.loc[cabins.isna(), 'Cabin_Count'] = 0

    # --- Basic preprocessing ---
    def fill_with_group_median(values):
        return values.fillna(values.median())

    out['Sex_Binary'] = out['Sex'].map({'female': 0, 'male': 1})
    out['Age'] = out.groupby(['Sex', 'Pclass'])['Age'].transform(fill_with_group_median)
    out['Fare'] = out.groupby('Pclass')['Fare'].transform(fill_with_group_median)
    out['Embarked'] = out['Embarked'].fillna('S')

    # --- Family composition ---
    family_size = out['SibSp'] + out['Parch'] + 1
    out['FamilySize'] = family_size
    out['IsAlone'] = (family_size == 1).astype(int)
    out['IsSmallFamily'] = family_size.between(2, 4).astype(int)
    out['IsLargeFamily'] = (family_size > 4).astype(int)

    # --- Age and fare buckets (computed on the imputed values) ---
    age_edges = [0, 12, 18, 25, 35, 50, 65, 100]
    age_labels = ['Child', 'Teen', 'Young', 'Adult', 'Middle', 'Senior', 'Elder']
    out['Age_Group'] = pd.cut(out['Age'], bins=age_edges, labels=age_labels)

    fare_labels = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8']
    out['Fare_Group'] = pd.qcut(out['Fare'], q=8, labels=fare_labels)

    # --- Simple interaction terms ---
    age = out['Age']
    out['Sex_Pclass'] = out['Sex_Binary'] * out['Pclass']
    out['Age_Fare_Interaction'] = age * out['Fare']
    out['Age_FamilySize'] = age * family_size

    # --- Within-group percentile ranks ---
    out['Age_Rank_SexPclass'] = out.groupby(['Sex_Binary', 'Pclass'])['Age'].rank(pct=True)
    out['Fare_Rank_Pclass'] = out.groupby('Pclass')['Fare'].rank(pct=True)

    return out

# Apply the exp004 feature pipeline to both splits to establish the baseline.
print("=== Step 1: exp004特徴量重要度分析 ===")
train_processed, test_processed = (
    exp004_feature_engineering(frame) for frame in (train_df, test_df)
)

# Columns added by the pipeline (this count includes helper columns that are
# excluded from modelling later on).
added_column_count = train_processed.shape[1] - train_df.shape[1]
print(f"exp004特徴量数: {added_column_count}個（追加分）")

In [None]:
# Label-encode categorical columns. The encoder is fitted on the string values
# of train and test together so both splits share one code table.
categorical_features = ['Embarked', 'Title_Grouped', 'Cabin_Deck', 'Ticket_Prefix', 'Age_Group', 'Fare_Group']

for feature in categorical_features:
    if feature not in train_processed.columns:
        continue
    train_labels = train_processed[feature].astype(str)
    test_labels = test_processed[feature].astype(str)

    le = LabelEncoder().fit(pd.concat([train_labels, test_labels]))
    train_processed[feature] = le.transform(train_labels)
    test_processed[feature] = le.transform(test_labels)

# Keep every numeric column that is neither an identifier, a raw text column,
# nor the target.
exclude_features = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived', 'Surname', 'Title', 'Sex']
numeric_dtypes = ['int64', 'float64', 'int32', 'float32']
exp004_features = [
    col
    for col in train_processed.columns
    if col not in exclude_features and train_processed[col].dtype in numeric_dtypes
]

X_full, X_test_full = train_processed[exp004_features], test_processed[exp004_features]
y = train_processed['Survived']

print(f"exp004相当特徴量数: {len(exp004_features)}")
print("\nexp004特徴量一覧:")
for i, feat in enumerate(exp004_features, start=1):
    print(f"{i:2d}. {feat}")

# Train a quick LightGBM model per CV fold purely to obtain feature importances.
print("\n特徴量重要度分析のため簡易LightGBM訓練...")
lgb_params_simple = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbose': -1,
    'random_state': 42
}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
feature_importance_scores = np.zeros(len(exp004_features))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_full, y)):
    X_train, X_val = X_full.iloc[train_idx], X_full.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        lgb_params_simple,
        train_data,
        valid_sets=[val_data],
        num_boost_round=500,
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
    )

    # Accumulate this fold's importances (LightGBM's default importance type).
    feature_importance_scores += model.feature_importance()

# Average over the number of folds actually configured on the splitter, so the
# divisor cannot drift out of sync with n_splits.
feature_importance_scores /= kf.get_n_splits()

# Importance table, most important feature first.
importance_df = pd.DataFrame({
    'feature': exp004_features,
    'importance': feature_importance_scores
}).sort_values('importance', ascending=False)

print("\n=== exp004特徴量重要度ランキング ===")
# Rows are already sorted, so the running position is the rank.
for rank, (_, row) in enumerate(importance_df.iterrows(), start=1):
    print(f"{rank:2d}. {row['feature']:20s}: {row['importance']:8.1f}")

# Horizontal bar chart of the 15 highest-ranked features, best at the top.
top_features = importance_df.head(15)
bar_positions = range(len(top_features))

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_features['importance'])
plt.yticks(bar_positions, top_features['feature'])
plt.xlabel('重要度')
plt.title('exp004特徴量重要度 Top 15')
plt.gca().invert_yaxis()  # first (most important) row at the top
plt.tight_layout()
plt.show()

print("\n重要度分析完了。次のステップで削減候補を決定します。")

## Step 2: 段階的特徴量削減実験

In [None]:
print("=== Step 2: 段階的特徴量削減実験 ===")


# Feature-reduction strategy
def create_feature_sets(importance_df, exp004_features):
    """Build nested top-N feature sets from an importance ranking.

    Args:
        importance_df: DataFrame with a ``feature`` column; its row order is
            taken as the ranking (first row = most important).
        exp004_features: Accepted for interface compatibility; not used.

    Returns:
        Dict mapping set name to ``{'features': [...], 'description': str}``
        for the top 20, 18, 15 and 12 features, in that order.

    A warning is printed for every set that lacks one of the core features
    (Sex_Binary, Pclass, Age, Fare); the set itself is left unchanged.
    """
    core_features = ['Sex_Binary', 'Pclass', 'Age', 'Fare']
    ranked = importance_df['feature'].tolist()

    # (set name, number of top-ranked features to keep, description)
    set_specs = [
        ('20features', 20, '重要度上位20特徴量'),
        ('18features', 18, '重要度上位18特徴量'),
        ('15features', 15, '重要度上位15特徴量'),
        ('12features', 12, '重要度上位12特徴量（攻めの削減）'),
    ]
    feature_sets = {
        name: {'features': ranked[:size], 'description': text}
        for name, size, text in set_specs
    }

    # Report (but do not repair) sets that dropped a core feature.
    for set_name, set_info in feature_sets.items():
        selected = set_info['features']
        missing_core = [f for f in core_features if f not in selected]
        if missing_core:
            print(f"⚠️ {set_name}にコア特徴量{missing_core}が不足")

    return feature_sets

# Build the candidate feature sets from the importance ranking.
feature_sets = create_feature_sets(importance_df, exp004_features)

# Index the importances by feature name once instead of filtering the table
# for every printed row.
importance_by_feature = importance_df.set_index('feature')['importance']

# List the members of each set together with their importance.
for set_name, set_info in feature_sets.items():
    print(f"\n{set_name} ({set_info['description']}):")
    for i, feat in enumerate(set_info['features'], start=1):
        importance = importance_by_feature.loc[feat]
        print(f"{i:2d}. {feat:20s} ({importance:6.1f})")

print("\n特徴量セット定義完了")

In [None]:
# 各特徴量セットでの性能評価
def evaluate_feature_set(X, y, features, set_name, n_folds=5):
    """Cross-validate a fixed-parameter LightGBM model on a feature subset.

    Args:
        X: Feature frame; only the ``features`` columns are used.
        y: Target series, positionally aligned with ``X``.
        features: Column names to evaluate.
        set_name: Label copied into the result.
        n_folds: Number of stratified folds.

    Returns:
        Dict with ``set_name``, ``n_features``, ``cv_mean``, ``cv_std``,
        ``early_stop_mean`` (mean best iteration) and ``cv_scores`` (per-fold
        accuracies at a 0.5 threshold).

    NOTE(review): early stopping monitors the same fold that is then scored,
    so the reported accuracy may be optimistic.
    """
    # Fixed LightGBM parameters (following exp004).
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'random_state': 42
    }

    X_subset = X[features]
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold_accuracies = []
    best_iterations = []

    for fit_idx, holdout_idx in splitter.split(X_subset, y):
        X_fit, y_fit = X_subset.iloc[fit_idx], y.iloc[fit_idx]
        X_holdout, y_holdout = X_subset.iloc[holdout_idx], y.iloc[holdout_idx]

        fit_set = lgb.Dataset(X_fit, label=y_fit)
        holdout_set = lgb.Dataset(X_holdout, label=y_holdout, reference=fit_set)

        booster = lgb.train(
            lgb_params,
            fit_set,
            valid_sets=[holdout_set],
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
        )

        # Threshold the predicted probabilities at 0.5 to get class labels.
        predicted_labels = (booster.predict(X_holdout) >= 0.5).astype(int)
        fold_accuracies.append(accuracy_score(y_holdout, predicted_labels))
        best_iterations.append(booster.best_iteration)

    return {
        'set_name': set_name,
        'n_features': len(features),
        'cv_mean': np.mean(fold_accuracies),
        'cv_std': np.std(fold_accuracies),
        'early_stop_mean': np.mean(best_iterations),
        'cv_scores': fold_accuracies
    }

# Run the CV evaluation for every candidate feature set.
print("各特徴量セットでの性能評価開始...")

evaluation_results = []
for set_name, set_info in feature_sets.items():
    print(f"\n📊 {set_name} 評価中...")
    result = evaluate_feature_set(X_full, y, set_info['features'], set_name)
    evaluation_results.append(result)

    print(f"CV: {result['cv_mean']:.4f} ± {result['cv_std']:.4f}")
    print(f"Early Stop平均: {result['early_stop_mean']:.1f}ラウンド")

# Empirical CV -> Kaggle conversion: scale by the ratio observed for exp004.
exp004_cv, exp004_kaggle = 0.8462, 0.77990
cv_kaggle_ratio = exp004_kaggle / exp004_cv

# Comparison table.
print("\n=== 特徴量削減結果比較 ===")
print(f"{'Set':15s} {'Features':8s} {'CV Mean':8s} {'CV Std':8s} {'Early Stop':10s} {'期待Kaggle':10s}")
print("-" * 70)

best_result, best_score = None, 0
for result in evaluation_results:
    expected_kaggle = result['cv_mean'] * cv_kaggle_ratio
    row_cells = (
        f"{result['set_name']:15s}",
        f"{result['n_features']:8d}",
        f"{result['cv_mean']:8.4f}",
        f"{result['cv_std']:8.4f}",
        f"{result['early_stop_mean']:10.1f}",
        f"{expected_kaggle:10.5f}",
    )
    print(" ".join(row_cells))

    # Keep the set with the highest expected Kaggle score; on a tie the
    # earlier set stays selected.
    if expected_kaggle > best_score:
        best_result, best_score = result, expected_kaggle

print(f"\n🏆 最高性能特徴量セット: {best_result['set_name']}")
print(f"📊 CV性能: {best_result['cv_mean']:.4f} ± {best_result['cv_std']:.4f}")
print(f"🎯 期待Kaggleスコア: {best_score:.5f}")

# The winning set's features are carried into the optimisation step.
optimal_features = feature_sets[best_result['set_name']]['features']
print(f"\n最適特徴量数: {len(optimal_features)}")

## Step 3: ハイパーパラメータ最適化（Optuna）

In [None]:
print("=== Step 3: ハイパーパラメータ最適化 ===")
print(f"最適特徴量セット: {best_result['set_name']} ({len(optimal_features)}特徴量)")

# Restrict both splits to the selected features.
X_optimal, X_test_optimal = X_full[optimal_features], X_test_full[optimal_features]

# List the selected features with their exp004 importance.
print("\n使用する特徴量:")
importance_lookup = importance_df.set_index('feature')['importance']
for i, feat in enumerate(optimal_features, start=1):
    importance = importance_lookup.loc[feat]
    print(f"{i:2d}. {feat:20s} ({importance:6.1f})")

# Optuna最適化関数定義
def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy for the sampled parameters.

    Reads the module-level ``X_optimal`` and ``y``. The returned score is
    multiplied by 0.99 when the mean best iteration across folds exceeds 800.
    """
    fixed_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'random_state': 42,
    }
    # Search space. The suggest calls run in this order on every trial.
    searched_params = {
        # Main tree / sampling parameters
        'num_leaves': trial.suggest_int('num_leaves', 15, 60),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),

        # Regularisation parameters (the focus of the search)
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 2.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 2.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 1.0),
    }
    params = {**fixed_params, **searched_params}

    # 5-fold stratified cross-validation.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_accuracies = []
    best_iterations = []

    for fit_idx, holdout_idx in splitter.split(X_optimal, y):
        X_fit, y_fit = X_optimal.iloc[fit_idx], y.iloc[fit_idx]
        X_holdout, y_holdout = X_optimal.iloc[holdout_idx], y.iloc[holdout_idx]

        fit_set = lgb.Dataset(X_fit, label=y_fit)
        holdout_set = lgb.Dataset(X_holdout, label=y_holdout, reference=fit_set)

        booster = lgb.train(
            params,
            fit_set,
            valid_sets=[holdout_set],
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
        )

        predicted_labels = (booster.predict(X_holdout) >= 0.5).astype(int)
        fold_accuracies.append(accuracy_score(y_holdout, predicted_labels))
        best_iterations.append(booster.best_iteration)

    mean_accuracy = np.mean(fold_accuracies)

    # Light penalty when training ran (almost) to the round limit, i.e. early
    # stopping barely triggered.
    if np.mean(best_iterations) > 800:
        mean_accuracy *= 0.99

    return mean_accuracy

# Run the Optuna search with a seeded TPE sampler.
print("\nOptuna最適化開始（100回試行）...")
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("\n=== 最適化結果 ===")
print(f"最高CV精度: {study.best_value:.4f}")
print(f"最適化試行数: {len(study.trials)}")

# Final training parameters: the tuned values plus the fixed settings that the
# objective used.
print("\n最適パラメータ:")
best_params = {
    **study.best_params,
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'random_state': 42,
}

for param, value in study.best_params.items():
    print(f"{param:20s}: {value}")

# Expected Kaggle score via the exp004 CV -> Kaggle ratio.
expected_kaggle = study.best_value * cv_kaggle_ratio
improvement_from_exp004 = expected_kaggle - exp004_kaggle
cv_gain = study.best_value - best_result['cv_mean']

print("\n📈 改善分析:")
print(f"最適化前CV: {best_result['cv_mean']:.4f}")
print(f"最適化後CV: {study.best_value:.4f} ({cv_gain:+.4f})")
print(f"期待Kaggle: {expected_kaggle:.5f}")
print(f"exp004から: {improvement_from_exp004:+.5f} ({improvement_from_exp004/exp004_kaggle*100:+.2f}%)")

verdict = "✅ exp004からの改善が期待される！" if improvement_from_exp004 > 0 else "⚠️ exp004からの改善は限定的"
print(verdict)

print("\nハイパーパラメータ最適化完了")

## Step 4: 最終モデル構築と評価

In [None]:
print("=== Step 4: 最終モデル構築と評価 ===")

# Train one model per CV fold with the tuned parameters, collecting
# out-of-fold predictions and fold-averaged test predictions.
print("最終LightGBMモデル訓練開始...")

# Single source of truth for the fold count: it drives the splitter, the
# progress message and both averages below.
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
final_cv_scores = []
oof_predictions = np.zeros(len(X_optimal))
test_predictions = np.zeros(len(X_test_optimal))
models = []
feature_importance_final = np.zeros(len(optimal_features))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_optimal, y), 1):
    print(f"Fold {fold}/{n_splits} 訓練中...")

    X_train, X_val = X_optimal.iloc[train_idx], X_optimal.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        best_params,
        train_data,
        valid_sets=[val_data],
        num_boost_round=1000,
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
    )

    # Predicted probabilities for the held-out fold and for the test set.
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test_optimal)

    val_pred_binary = (val_pred >= 0.5).astype(int)
    fold_score = accuracy_score(y_val, val_pred_binary)
    final_cv_scores.append(fold_score)

    # Store the out-of-fold probabilities and add this fold's share of the
    # averaged test prediction.
    oof_predictions[val_idx] = val_pred
    test_predictions += test_pred / n_splits

    # Keep the model and accumulate its feature importances.
    models.append(model)
    feature_importance_final += model.feature_importance()

    print(f"Fold {fold} Accuracy: {fold_score:.4f} (Rounds: {model.best_iteration})")

# Aggregate metrics over the folds.
final_cv_mean = np.mean(final_cv_scores)
final_cv_std = np.std(final_cv_scores)
oof_accuracy = accuracy_score(y, (oof_predictions >= 0.5).astype(int))
feature_importance_final /= n_splits

print("\n=== 最終モデル性能 ===")
print(f"CV Accuracy: {final_cv_mean:.4f} ± {final_cv_std:.4f}")
print(f"OOF Accuracy: {oof_accuracy:.4f}")

# Comparison against exp004 / exp005. The exp004 CV baseline is taken from
# exp004_cv (the constant cv_kaggle_ratio is built from) rather than being
# repeated as a literal.
print("\n=== 実験比較 ===")
print(f"exp004 CV: {exp004_cv:.4f} ± 0.034")
print("exp005 CV: 0.8507 ± 0.012 (過学習)")
print(f"exp006 CV: {final_cv_mean:.4f} ± {final_cv_std:.4f}")

cv_improvement_004 = final_cv_mean - exp004_cv
print(f"\nexp004からのCV改善: {cv_improvement_004:+.4f}")

# Expected Kaggle score via the exp004 CV -> Kaggle ratio.
final_expected_kaggle = final_cv_mean * cv_kaggle_ratio
kaggle_improvement_004 = final_expected_kaggle - exp004_kaggle

print("\n🎯 期待性能:")
print(f"期待Kaggle: {final_expected_kaggle:.5f}")
print(f"exp004から: {kaggle_improvement_004:+.5f} ({kaggle_improvement_004/exp004_kaggle*100:+.2f}%)")

if kaggle_improvement_004 > 0.005:
    print("🎉 大幅改善が期待される！")
elif kaggle_improvement_004 > 0:
    print("✅ 改善が期待される")
else:
    print("⚠️ 改善は限定的")

# Fold-averaged importances of the final models, most important first.
final_importance_df = pd.DataFrame({
    'feature': optimal_features,
    'importance': feature_importance_final
}).sort_values('importance', ascending=False)

print("\n=== 最終特徴量重要度 ===")
# Rows are already sorted, so the running position is the rank.
for rank, (_, row) in enumerate(final_importance_df.iterrows(), start=1):
    print(f"{rank:2d}. {row['feature']:20s}: {row['importance']:8.1f}")

# Horizontal bar chart of all selected features, best at the top.
bar_positions = range(len(final_importance_df))
plt.figure(figsize=(10, 6))
plt.barh(bar_positions, final_importance_df['importance'])
plt.yticks(bar_positions, final_importance_df['feature'])
plt.xlabel('重要度')
plt.title(f'exp006最終特徴量重要度 ({len(optimal_features)}特徴量)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# Threshold the fold-averaged test probabilities at 0.5 to get class labels.
test_predictions_binary = (test_predictions >= 0.5).astype(int)

# Submission table: one predicted label per test passenger.
submission = pd.DataFrame({
    'PassengerId': test_processed['PassengerId'],
    'Survived': test_predictions_binary
})

# Write the submission, creating the results directory if needed.
import os
output_dir = '/Users/koki.ogai/Documents/ghq/github.com/oddgai/kaggle-projects/titanic/results/exp006'
os.makedirs(output_dir, exist_ok=True)
submission.to_csv(f'{output_dir}/result.csv', index=False)

predicted_survivors = test_predictions_binary.sum()
predicted_rate = test_predictions_binary.mean()

print("\n=== 提出ファイル生成完了 ===")
print(f"生存予測数: {predicted_survivors}")
print(f"死亡予測数: {len(test_predictions_binary) - predicted_survivors}")
print(f"予測生存率: {predicted_rate:.3f}")
# The reference rate printed here is the survival rate of the training labels.
print(f"\n実際生存率: {y.mean():.3f}")
print(f"予測vs実際: {predicted_rate - y.mean():+.3f}")

print("\n💾 提出ファイル: results/exp006/result.csv")

# Side-by-side histograms of the predicted probabilities (out-of-fold on the
# left, test on the right), each with the 0.5 decision threshold marked.
plt.figure(figsize=(12, 4))

panels = [
    (oof_predictions, 'OOF Predictions', 'OOF予測分布'),
    (test_predictions, 'Test Predictions', 'テスト予測分布'),
]
for panel_no, (probabilities, legend_label, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.hist(probabilities, bins=50, alpha=0.7, label=legend_label)
    plt.axvline(0.5, color='red', linestyle='--', label='Threshold')
    plt.xlabel('予測確率')
    plt.ylabel('頻度')
    plt.title(panel_title)
    plt.legend()

plt.tight_layout()
plt.show()

## exp006 最終サマリー

In [None]:
# Final text summary of the experiment.
banner = "=" * 60
n_selected = len(optimal_features)

print(banner)
print("                🎯 EXP006 最終結果サマリー")
print(banner)

# Experiment settings.
# NOTE(review): the exp004 feature count 23 is hard-coded here and below; it
# may not equal len(exp004_features) in this notebook — confirm.
print("\n📊 実験設定:")
print("戦略: Less is More（シンプル化 + 精密最適化）")
print(f"特徴量数: {n_selected} (exp004: 23, exp005: 45)")
print("モデル: LightGBM単体")
print("最適化: Optuna 100回試行")

# Headline metrics.
print("\n📈 性能結果:")
print(f"CV Accuracy: {final_cv_mean:.4f} ± {final_cv_std:.4f}")
print(f"OOF Accuracy: {oof_accuracy:.4f}")
print(f"期待Kaggle: {final_expected_kaggle:.5f}")

# History as (name, Kaggle score, CV accuracy); the Kaggle score of this
# experiment is not known yet.
print("\n🎯 実験進捗比較:")
experiments = [
    ('exp001', 0.77272, 0.8496),
    ('exp004', 0.77990, 0.8462),
    ('exp005', 0.76315, 0.8507),
    ('exp006', '???', final_cv_mean)
]

for exp_name, kaggle, cv in experiments:
    current_marker = " ← 今回" if exp_name == 'exp006' else ""
    print(f"{exp_name}: Kaggle={kaggle}, CV={cv:.4f}{current_marker}")

print("\n📊 改善分析:")
print(f"exp004からCV改善: {cv_improvement_004:+.4f} ({cv_improvement_004/0.8462*100:+.2f}%)")
print(f"exp004から期待Kaggle改善: {kaggle_improvement_004:+.5f} ({kaggle_improvement_004/exp004_kaggle*100:+.2f}%)")

print("\n🔧 実装成果:")
print(f"✅ 特徴量削減: 23→{n_selected}特徴量（{23-n_selected}個削減）")
print("✅ ハイパーパラメータ最適化: Optuna 100回試行")
print(f"✅ 過学習制御: CV標準偏差{final_cv_std:.4f}（安定性確保）")
print(f"✅ 汎化性能重視: 期待乖離{abs(final_cv_mean - final_expected_kaggle):.4f}")

print("\n🎨 主要改善要因:")
for line in (
    "1. 戦略的特徴量削減（重要度ベース）",
    "2. LightGBM精密最適化（正則化重視）",
    "3. Less is More原理の実践",
    "4. 汎化性能を重視した設計",
):
    print(line)

# The table is sorted by importance, so the running position is the rank.
print("\n🏆 使用特徴量 Top 5:")
for rank, (_, row) in enumerate(final_importance_df.head(5).iterrows(), start=1):
    print(f"{rank}. {row['feature']:20s}: {row['importance']:8.1f}")

# Takeaways depend on whether an improvement over exp004 is expected.
print("\n💡 exp006の学び:")
if kaggle_improvement_004 > 0:
    lessons = (
        "🎉 シンプル化戦略が成功！",
        "📈 特徴量の質 > 量 の実証",
        "⚖️ 最適化と汎化のバランス",
    )
else:
    lessons = (
        "📚 改善が限定的だが重要な学習",
        "🔍 Titanicデータの特性理解深化",
        "⚙️ 最適化手法の実践習得",
    )
for line in lessons:
    print(line)

# Suggested next steps depend on whether the CV accuracy cleared 0.850.
print("\n🎯 次の実験への示唆:")
if final_cv_mean > 0.850:
    next_steps = (
        "・現在の手法で高い性能を達成",
        "・アンサンブル（異なる特徴量セット）で更なる向上",
    )
else:
    next_steps = (
        "・Neural Networkの導入検討",
        "・外部データ活用（歴史的事実）",
        "・より高度な特徴量エンジニアリング",
    )
for line in next_steps:
    print(line)

print(banner)
print("🚀 exp006完了！Kaggleでの結果をお楽しみに！")
print(banner)