In [23]:
# インポート
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the Titanic train / test CSV files into DataFrames.
train_data = pd.read_csv('/app/data/train.csv')
test_data = pd.read_csv('/app/data/test.csv')

# Preview the first rows of each frame: a label line followed by the head.
print("train:", train_data.head(), sep="\n")
print("test:", test_data.head(), sep="\n")

# Show the column index of each frame to confirm both share the same features.
print("train_c:", train_data.columns, sep="\n")
print("test_c:", test_data.columns, sep="\n")




train:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN       

In [24]:
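# Notes: meaning of each column
# PassengerId: passenger identifier
# Survived: survival flag, the target variable (0: did not survive, 1: survived)
# Pclass: ticket class
# Name: passenger name
# Sex: sex
# Age: age in years
# SibSp: number of siblings / spouses travelling with the passenger
# Parch: number of parents / children travelling with the passenger
# Ticket: ticket number
# Fare: fare paid
# Cabin: cabin number
# Embarked: port of embarkation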


In [25]:

# Encode the categorical 'Sex' column as integers. The encoder is fitted on
# the training frame only and then reused unchanged for the test frame.
le = LabelEncoder()
train_data['Sex'] = le.fit_transform(train_data['Sex'])
test_data['Sex'] = le.transform(test_data['Sex'])

# Impute missing values with medians taken from the training frame.
# Filling a column with its own median leaves that median unchanged, so
# computing each median once up front yields the same fill values for both
# the training and the test frame.
age_median = train_data['Age'].median()
fare_median = train_data['Fare'].median()
train_data['Age'] = train_data['Age'].fillna(age_median)
test_data['Age'] = test_data['Age'].fillna(age_median)
test_data['Fare'] = test_data['Fare'].fillna(fare_median)

# Feature columns fed to the model; 'Survived' is the target.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X, y = train_data[features], train_data['Survived']
X_test_data = test_data[features]

# Hold out 20% of the training rows for validation, with a fixed seed so the
# split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [26]:
# インポート
import xgboost as xgb

# Wrap each split in an xgboost DMatrix for the native training API.
# The test matrix is built without labels.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)
dtest = xgb.DMatrix(data=X_test_data)


In [32]:
# Booster hyperparameters.
params = {
    'objective': 'binary:logistic',  # binary classification
    'eval_metric': 'logloss',        # metric reported on the evaluation set
    'eta': 0.1,                      # learning rate
    'max_depth': 6,                  # maximum depth of each tree
}

# Train the booster on dtrain while monitoring dval.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=500,           # upper bound on boosting rounds
    evals=[(dval, 'validation')],  # evaluation set whose metric is logged each round
    early_stopping_rounds=10,      # stop once the validation metric has not improved for 10 rounds
)


[0]	validation-logloss:0.63423
[1]	validation-logloss:0.59685
[2]	validation-logloss:0.56736
[3]	validation-logloss:0.54274
[4]	validation-logloss:0.52385
[5]	validation-logloss:0.50642
[6]	validation-logloss:0.49283
[7]	validation-logloss:0.48363
[8]	validation-logloss:0.47341
[9]	validation-logloss:0.46570
[10]	validation-logloss:0.45879
[11]	validation-logloss:0.45217
[12]	validation-logloss:0.44798
[13]	validation-logloss:0.44581
[14]	validation-logloss:0.44095
[15]	validation-logloss:0.43782
[16]	validation-logloss:0.43666
[17]	validation-logloss:0.43422
[18]	validation-logloss:0.43192
[19]	validation-logloss:0.43271
[20]	validation-logloss:0.43096
[21]	validation-logloss:0.43137
[22]	validation-logloss:0.43079
[23]	validation-logloss:0.43013
[24]	validation-logloss:0.42985
[25]	validation-logloss:0.42972
[26]	validation-logloss:0.42948
[27]	validation-logloss:0.42866
[28]	validation-logloss:0.42924
[29]	validation-logloss:0.42984
[30]	validation-logloss:0.43033
[31]	validation-lo

In [28]:
# Score the validation matrix (dval = xgb.DMatrix(X_val, label=y_val)).
# Training used early stopping, and xgb.train returns the booster from the
# last round rather than the best one, so restrict prediction explicitly to
# the trees up to and including the best round.
y_pred_val = model.predict(dval, iteration_range=(0, model.best_iteration + 1))

# Convert the predicted probabilities (0..1) to class labels with a 0.5
# threshold; the result stays a plain list of ints.
y_pred_val_binary = (y_pred_val > 0.5).astype(int).tolist()


In [29]:
# Import evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Compute the validation metrics from the thresholded predictions.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_val_binary)
precision = precision_score(y_true=y_val, y_pred=y_pred_val_binary)
recall = recall_score(y_true=y_val, y_pred=y_pred_val_binary)
f1 = f1_score(y_true=y_val, y_pred=y_pred_val_binary)

# Report every metric with four decimal places.
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")
print(f"Validation F1 Score: {f1:.4f}")


Validation Accuracy: 0.8156
Validation Precision: 0.8475
Validation Recall: 0.6757
Validation F1 Score: 0.7519


In [30]:
# Predict on the test matrix (dtest = xgb.DMatrix(X_test_data)).
# Training used early stopping, and xgb.train returns the booster from the
# last round rather than the best one, so restrict prediction explicitly to
# the trees up to and including the best round.
y_pred_test = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

# Convert the predicted probabilities (0..1) to class labels with a 0.5
# threshold; the result stays a plain list of ints.
y_pred_test_binary = (y_pred_test > 0.5).astype(int).tolist()

# Build the submission frame containing only PassengerId and Survived.
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': y_pred_test_binary
})

# Write the submission CSV without the DataFrame index column.
submission.to_csv('titanic.csv', index=False)
