# exp006 - 「Less is More」による精密最適化（高速版）

exp005の失敗を受けて、シンプル化と効率的な最適化による改善を目指す

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Use the IPAexGothic font family for all matplotlib text.
plt.rcParams.update({'font.family': 'IPAexGothic'})

print("🎯 exp006 - Less is More戦略（高速版）")

# Load the train and test CSVs (absolute paths on the author's machine).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/koki.ogai/Documents/ghq/github.com/oddgai/kaggle-projects/titanic/data/train.csv',
        '/Users/koki.ogai/Documents/ghq/github.com/oddgai/kaggle-projects/titanic/data/test.csv',
    )
)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

In [None]:
# exp004特徴量エンジニアリング（簡略版）
def create_features(df, *, reference_frames=None):
    """Return a copy of ``df`` with the engineered feature columns added.

    Args:
        df: Passenger DataFrame with the columns ``Name``, ``Sex``,
            ``Pclass``, ``Age``, ``SibSp``, ``Parch``, ``Ticket``, ``Fare``,
            ``Cabin`` and ``Embarked``. The input frame is not modified.
        reference_frames: Optional iterable of DataFrames (each with ``Name``
            and ``Ticket`` columns) whose rows are pooled to compute the
            surname and ticket frequencies. When omitted, the module-level
            ``train_df`` and ``test_df`` are used.

    Returns:
        A new DataFrame holding the original columns (with ``Age``, ``Fare``
        and ``Embarked`` imputed) plus ``Title``, ``Title_Grouped``,
        ``Surname``, ``Surname_Count``, ``Ticket_IsNumeric``,
        ``Ticket_Count``, ``HasCabin``, ``Sex_Binary``, ``FamilySize``,
        ``IsAlone``, ``Sex_Pclass``, ``Age_Fare_Interaction``,
        ``Age_Rank_SexPclass`` and ``Fare_Rank_Pclass``.

    Note:
        Medians and percentile ranks are computed from ``df`` itself, so the
        result for a row depends on which other rows are passed in the same
        call.
    """
    if reference_frames is None:
        # Default: count surnames/tickets over train + test combined.
        reference_frames = (train_df, test_df)
    else:
        # Materialise so a one-shot iterable can be traversed twice below.
        reference_frames = tuple(reference_frames)

    df = df.copy()

    # Title: the word directly before the first "." that follows a space.
    # expand=False yields a Series rather than a one-column DataFrame.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    title_mapping = {
        'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 'Master': 'Master',
        'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
        'Mlle': 'Miss', 'Countess': 'Rare', 'Ms': 'Mrs', 'Lady': 'Rare',
        'Jonkheer': 'Rare', 'Don': 'Rare', 'Dona': 'Rare', 'Mme': 'Mrs',
        'Capt': 'Rare', 'Sir': 'Rare'
    }
    # Titles missing from the mapping (or not extracted at all) become 'Other'.
    df['Title_Grouped'] = df['Title'].map(title_mapping).fillna('Other')

    # Surname (text before the first comma) and how often it occurs in the
    # pooled reference frames.
    df['Surname'] = df['Name'].str.split(',').str[0]
    all_surnames = pd.concat(
        [frame['Name'] for frame in reference_frames]
    ).str.split(',').str[0]
    surname_counts = all_surnames.value_counts()
    df['Surname_Count'] = df['Surname'].map(surname_counts)

    # Ticket features: purely numeric flag and frequency in the pooled frames.
    df['Ticket_IsNumeric'] = df['Ticket'].str.isnumeric().astype(int)
    all_tickets = pd.concat([frame['Ticket'] for frame in reference_frames])
    ticket_counts = all_tickets.value_counts()
    df['Ticket_Count'] = df['Ticket'].map(ticket_counts)

    # 1 when a cabin value is recorded, 0 when it is missing.
    df['HasCabin'] = (~df['Cabin'].isnull()).astype(int)

    # Basic preprocessing: binary sex, group-median imputation, default port.
    df['Sex_Binary'] = df['Sex'].map({'female': 0, 'male': 1})
    # A group whose values are all missing has a NaN median and stays NaN.
    df['Age'] = df.groupby(['Sex', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.median()))
    df['Fare'] = df.groupby('Pclass')['Fare'].transform(lambda x: x.fillna(x.median()))
    df['Embarked'] = df['Embarked'].fillna('S')

    # Family composition (the passenger counts as one member).
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Interaction terms.
    df['Sex_Pclass'] = df['Sex_Binary'] * df['Pclass']
    df['Age_Fare_Interaction'] = df['Age'] * df['Fare']

    # Percentile ranks within sex/class and within class.
    df['Age_Rank_SexPclass'] = df.groupby(['Sex_Binary', 'Pclass'])['Age'].rank(pct=True)
    df['Fare_Rank_Pclass'] = df.groupby('Pclass')['Fare'].rank(pct=True)

    return df

# 特徴量作成
train_processed, test_processed = create_features(train_df), create_features(test_df)

# Label-encode the categorical columns. Each encoder is fitted on the train
# and test values together so both frames share one integer code table.
for feature in ('Embarked', 'Title_Grouped'):
    train_labels = train_processed[feature].astype(str)
    test_labels = test_processed[feature].astype(str)

    le = LabelEncoder().fit(pd.concat([train_labels, test_labels]))

    train_processed[feature] = le.transform(train_labels)
    test_processed[feature] = le.transform(test_labels)

print("特徴量エンジニアリング完了")

In [None]:
# 重要度ベース特徴量選択
# Candidate features: every int/float column that is not in the exclusion list
# (identifier, raw text columns, helper columns and the target).
exclude_features = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived', 'Surname', 'Title', 'Sex']
numeric_dtypes = ['int64', 'float64', 'int32', 'float32']
candidate_features = []
for col in train_processed.columns:
    if col in exclude_features:
        continue
    if train_processed[col].dtype in numeric_dtypes:
        candidate_features.append(col)

X_full = train_processed[candidate_features]
y = train_processed['Survived']
X_test_full = test_processed[candidate_features]

print(f"候補特徴量: {len(candidate_features)}個")

# Quick importance estimate: one LightGBM fit on all training rows.
lgb_simple = lgb.LGBMClassifier(random_state=42, verbose=-1, n_estimators=100)
lgb_simple.fit(X_full, y)
importance_scores = lgb_simple.feature_importances_

importance_df = pd.DataFrame(
    {'feature': candidate_features, 'importance': importance_scores}
).sort_values('importance', ascending=False)

# Show and keep the 15 highest-ranked features (fewer if fewer candidates exist).
top_features = importance_df.head(15)

print("\n重要度ランキング:")
for rank, (_, row) in enumerate(top_features.iterrows(), start=1):
    print(f"{rank:2d}. {row['feature']:20s}: {row['importance']:8.4f}")

optimal_features = top_features['feature'].tolist()
print(f"\n選択特徴量: {len(optimal_features)}個")

X_optimal = X_full[optimal_features]
X_test_optimal = X_test_full[optimal_features]

In [None]:
# 軽量Optuna最適化
# Settings shared by every tuning trial and by the final parameter set, so
# the model trained afterwards is configured like the one Optuna scored.
FIXED_PARAMS = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'random_state': 42,
    'n_estimators': 200,  # kept small for speed
    # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
    # without this the tuned bagging_fraction has no effect.
    'bagging_freq': 1,
}


def objective(trial):
    """Optuna objective: mean 3-fold CV accuracy for one sampled parameter set.

    Args:
        trial: The Optuna trial used to sample ``num_leaves``,
            ``learning_rate``, ``feature_fraction``, ``bagging_fraction``,
            ``reg_alpha`` and ``reg_lambda``.

    Returns:
        Mean accuracy over the three stratified validation folds of
        ``X_optimal`` / ``y`` (module-level data).

    Note:
        Each fold's validation split is used both for early stopping
        (patience 50 on the configured metric) and for scoring.
    """
    params = {
        **FIXED_PARAMS,
        'num_leaves': trial.suggest_int('num_leaves', 15, 50),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.15),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
    }

    # 3-fold CV (fewer folds for speed).
    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, val_idx in kf.split(X_optimal, y):
        X_train, X_val = X_optimal.iloc[train_idx], X_optimal.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
        )

        val_pred = model.predict(X_val)
        cv_scores.append(accuracy_score(y_val, val_pred))

    return np.mean(cv_scores)


print("\nOptuna最適化開始（30回試行）...")
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=30, show_progress_bar=True)

print(f"\n最高CV精度: {study.best_value:.4f}")
print("最適パラメータ:")
for param, value in study.best_params.items():
    print(f"{param:20s}: {value}")

# Final parameter set = tuned values + the fixed settings used while tuning
# (including n_estimators and bagging_freq, which best_params alone omits).
best_params = {**FIXED_PARAMS, **study.best_params}

In [None]:
# 最終モデル構築
print("\n最終モデル構築中...")

# Single source of truth for the fold count: it drives the splitter and the
# averaging of test predictions and feature importances below.
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
final_cv_scores = []
oof_predictions = np.zeros(len(X_optimal))
test_predictions = np.zeros(len(X_test_optimal))
feature_importance_final = np.zeros(len(optimal_features))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_optimal, y), 1):
    X_train, X_val = X_optimal.iloc[train_idx], X_optimal.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = lgb.LGBMClassifier(**best_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)],
    )

    # Positive-class probabilities for the held-out fold and the test set,
    # plus hard labels for the fold accuracy.
    val_pred_proba = model.predict_proba(X_val)[:, 1]
    val_pred = model.predict(X_val)
    test_pred_proba = model.predict_proba(X_test_optimal)[:, 1]

    fold_score = accuracy_score(y_val, val_pred)
    final_cv_scores.append(fold_score)

    oof_predictions[val_idx] = val_pred_proba
    # Accumulate the per-fold mean of the test probabilities.
    test_predictions += test_pred_proba / n_splits
    feature_importance_final += model.feature_importances_

    print(f"Fold {fold}: {fold_score:.4f}")

final_cv_mean = np.mean(final_cv_scores)
final_cv_std = np.std(final_cv_scores)
# Out-of-fold accuracy at a 0.5 probability threshold.
oof_accuracy = accuracy_score(y, (oof_predictions >= 0.5).astype(int))
feature_importance_final /= n_splits

print(f"\n=== 最終結果 ===")
print(f"CV Accuracy: {final_cv_mean:.4f} ± {final_cv_std:.4f}")
print(f"OOF Accuracy: {oof_accuracy:.4f}")

# exp004との比較
# Reference values for exp004 (CV accuracy and Kaggle score, per the names).
exp004_cv, exp004_kaggle = 0.8462, 0.77990

cv_improvement = final_cv_mean - exp004_cv
# Project a Kaggle score by assuming exp004's Kaggle/CV ratio carries over.
expected_kaggle = final_cv_mean * (exp004_kaggle / exp004_cv)
kaggle_improvement = expected_kaggle - exp004_kaggle

print(f"\n=== exp004との比較 ===")
print(f"exp004 CV: {exp004_cv:.4f}")
print(f"exp006 CV: {final_cv_mean:.4f} ({cv_improvement:+.4f})")
print(f"\n期待Kaggle: {expected_kaggle:.5f}")
print(f"exp004から: {kaggle_improvement:+.5f} ({kaggle_improvement/exp004_kaggle*100:+.2f}%)")

# Pick the verdict line from the projected gain, then print it once.
if kaggle_improvement > 0.005:
    verdict = "🎉 大幅改善が期待される！"
elif kaggle_improvement > 0:
    verdict = "✅ 改善が期待される"
else:
    verdict = "⚠️ 改善は限定的"
print(verdict)

# 最終特徴量重要度
# Fold-averaged importances, highest first.
final_importance_df = pd.DataFrame(
    {'feature': optimal_features, 'importance': feature_importance_final}
).sort_values('importance', ascending=False)

print(f"\n=== 最終特徴量重要度 ===")
for rank, (_, row) in enumerate(final_importance_df.iterrows(), start=1):
    print(f"{rank:2d}. {row['feature']:20s}: {row['importance']:8.4f}")

# Horizontal bar chart; the inverted y-axis puts the first (largest) row on top.
bar_positions = range(len(final_importance_df))
plt.figure(figsize=(10, 6))
plt.barh(bar_positions, final_importance_df['importance'])
plt.yticks(bar_positions, final_importance_df['feature'])
plt.xlabel('重要度')
plt.title(f'exp006最終特徴量重要度 ({len(optimal_features)}特徴量)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# 提出ファイル生成
from pathlib import Path

# Threshold the fold-averaged test probabilities at 0.5 to get class labels.
test_predictions_binary = (test_predictions >= 0.5).astype(int)

submission = pd.DataFrame({
    'PassengerId': test_processed['PassengerId'],
    'Survived': test_predictions_binary
})

submission_path = Path('/Users/koki.ogai/Documents/ghq/github.com/oddgai/kaggle-projects/titanic/results/exp006/result.csv')
# to_csv does not create missing directories, so make sure this experiment's
# results folder exists before writing.
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"\n=== 提出ファイル生成完了 ===")
print(f"生存予測数: {test_predictions_binary.sum()}")
print(f"死亡予測数: {len(test_predictions_binary) - test_predictions_binary.sum()}")
print(f"予測生存率: {test_predictions_binary.mean():.3f}")
print(f"実際生存率: {y.mean():.3f}")
print(f"\n💾 提出ファイル保存: results/exp006/result.csv")

print("\n" + "="*60)
print("                🎯 EXP006 完了")
print("="*60)
print(f"戦略: Less is More（{len(optimal_features)}特徴量）")
print(f"CV性能: {final_cv_mean:.4f} ± {final_cv_std:.4f}")
print(f"期待改善: {kaggle_improvement:+.5f}")
print("Kaggleでの結果をお楽しみに！")
print("="*60)