In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# データ読み込み
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
test_df  = pd.read_csv('/kaggle/input/titanic/test.csv')

# Age, Fare の欠損値補完
train_df['Age']  = train_df['Age'].fillna(train_df['Age'].median())
test_df['Age']   = test_df['Age'].fillna(train_df['Age'].median())
train_df['Fare'] = train_df['Fare'].fillna(train_df['Fare'].median())
test_df['Fare']  = test_df['Fare'].fillna(train_df['Fare'].median())

# Title 抽出
train_df['Title'] = train_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test_df['Title']  = test_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Rare タイトルまとめ
rare_titles = ['Lady','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona']
train_df['Title'] = train_df['Title'].replace(rare_titles, 'Rare')
test_df['Title']  = test_df['Title'].replace(rare_titles, 'Rare')

# Sex, Embarked のカテゴリをコード化
for col in ['Sex','Embarked','Title']:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0)
    le.fit(combined.astype(str))
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col]  = le.transform(test_df[col].astype(str))

# FamilySize, IsAlone, Familyカテゴリ
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch']
test_df['FamilySize']  = test_df['SibSp'] + test_df['Parch']

train_df['IsAlone'] = (train_df['FamilySize']==0).astype(int)
test_df['IsAlone']  = (test_df['FamilySize']==0).astype(int)

train_df['SmallFamily'] = ((train_df['FamilySize']>=1) & (train_df['FamilySize']<=4)).astype(int)
test_df['SmallFamily']  = ((test_df['FamilySize']>=1) & (test_df['FamilySize']<=4)).astype(int)

train_df['LargeFamily'] = (train_df['FamilySize']>=5).astype(int)
test_df['LargeFamily']  = (test_df['FamilySize']>=5).astype(int)

# Age*Pclass 組み合わせ特徴
train_df['Age*Pclass'] = train_df['Age'] * train_df['Pclass']
test_df['Age*Pclass']  = test_df['Age'] * test_df['Pclass']

# Fare のログ変換
train_df['Fare_log'] = np.log1p(train_df['Fare'])
test_df['Fare_log']  = np.log1p(test_df['Fare'])

# 特徴量とターゲット
feature_columns = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked',
                   'Title','FamilySize','IsAlone','SmallFamily','LargeFamily',
                   'Age*Pclass','Fare_log']
X = train_df[feature_columns]
y = train_df['Survived']
X_test = test_df[feature_columns]

categorical_features = ['Pclass','Sex','Embarked','Title','IsAlone','SmallFamily','LargeFamily']

# Stratified K-Fold CV
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []
lgb_fold_preds = []

lgb_params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'learning_rate': 0.02,
    'num_leaves': 31,
    'max_depth': 7,
    'seed': 42
}

for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y)):
    print(f"===== Fold {fold+1} =====")
    X_train, X_valid = X.iloc[tr_idx], X.iloc[va_idx]
    y_train, y_valid = y.iloc[tr_idx], y.iloc[va_idx]

    lgb_train = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
    lgb_valid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features, reference=lgb_train)

    lgb_model = lgb.train(
        lgb_params,
        lgb_train,
        num_boost_round=2000,
        valid_sets=[lgb_valid],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
    )

    fold_score = lgb_model.best_score['valid_0']['binary_error']
    print(f"Fold {fold+1} binary_error: {fold_score}")
    fold_scores.append(fold_score)

    lgb_fold_preds.append(lgb_model.predict(X_test))

print(f"CV mean score: {1 - sum(fold_scores)/len(fold_scores)}")

# Fold予測平均
preds = np.mean(lgb_fold_preds, axis=0)

# 提出用 CSV 作成
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': (preds > 0.5).astype(int)
})
submission.to_csv('submission.csv', index=False)
print("提出用CSV作成完了")








===== Fold 1 =====
[LightGBM] [Info] Number of positive: 307, number of negative: 494
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003008 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 447
[LightGBM] [Info] Number of data points in the train set: 801, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383271 -> initscore=-0.475688
[LightGBM] [Info] Start training from score -0.475688
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[99]	valid_0's binary_error: 0.122222
Fold 1 binary_error: 0.12222222222222222
===== Fold 2 =====
[LightGBM] [Info] Number of positive: 308, number of negative: 494
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000142 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if

In [3]:
submission.head()  # 上から 5 行を確認


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
