# Kaggle 泰坦尼克生存預測競賽

## 目錄
1. 數據載入與初步探索
2. 探索性數據分析 (EDA)
3. 特徵工程
4. 模型訓練與評估
5. 模型優化
6. 結果提交
7. 總結與改進建議

## 環境準備

In [None]:
# 導入必要的庫
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# 設置顯示選項
pd.set_option('display.max_columns', None)
plt.style.use('seaborn')
%matplotlib inline

## 1. 數據載入與初步探索

In [None]:
# Load the training and test sets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Basic information about the training data
print("訓練數據形狀:", train_df.shape)
print("\n訓練數據基本信息:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
train_df.info()

# Preview the first rows
print("\n訓練數據預覽:")
print(train_df.head())

# Summary statistics of the numeric columns
print("\n數值特徵統計描述:")
print(train_df.describe())

## 2. 探索性數據分析 (EDA)

In [None]:
def analyze_survival_rate(df, feature):
    """Plot survival rate and row count for each value of ``feature``.

    Draws two side-by-side bar charts: the mean of ``Survived`` per value of
    ``feature`` (left) and the number of rows holding each value (right).

    Args:
        df: DataFrame containing ``feature`` and a ``Survived`` column.
        feature: Name of the column to group by.
    """
    survival_rate = df.groupby(feature)['Survived'].mean()
    # value_counts() orders by frequency while groupby orders by category
    # label; sort by label so both charts list the categories in the same
    # order and can be compared bar-for-bar.
    counts = df[feature].value_counts().sort_index()

    fig, (rate_ax, count_ax) = plt.subplots(1, 2, figsize=(10, 5))

    survival_rate.plot(kind='bar', ax=rate_ax)
    rate_ax.set_title(f'Survival Rate by {feature}')
    rate_ax.set_ylabel('Survival Rate')

    counts.plot(kind='bar', ax=count_ax)
    count_ax.set_title(f'Count of {feature}')
    count_ax.set_ylabel('Count')

    fig.tight_layout()
    plt.show()

# Plot survival rate and row counts for each categorical feature of interest
features_to_analyze = ['Pclass', 'Sex', 'Embarked']
for feature in features_to_analyze:
    analyze_survival_rate(train_df, feature)

In [None]:
# Age distribution analysis: three panels on a single row
fig, (survival_ax, class_ax, sex_ax) = plt.subplots(1, 3, figsize=(15, 5))

# Histogram of age, split by survival outcome
sns.histplot(data=train_df, x='Age', hue='Survived', multiple="dodge", ax=survival_ax)
survival_ax.set_title('Age Distribution by Survival')

# Age spread within each passenger class
sns.boxplot(data=train_df, x='Pclass', y='Age', ax=class_ax)
class_ax.set_title('Age Distribution by Class')

# Age spread within each sex
sns.boxplot(data=train_df, x='Sex', y='Age', ax=sex_ax)
sex_ax.set_title('Age Distribution by Sex')

fig.tight_layout()
plt.show()

In [None]:
# Fare analysis: three panels on a single row
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Histogram of fare, split by survival outcome
sns.histplot(data=train_df, x='Fare', hue='Survived', multiple="dodge", ax=axes[0])
axes[0].set_title('Fare Distribution by Survival')

# Box plots of fare grouped by passenger class and by port of embarkation
for ax, group_column, title in (
    (axes[1], 'Pclass', 'Fare Distribution by Class'),
    (axes[2], 'Embarked', 'Fare Distribution by Embarked'),
):
    sns.boxplot(data=train_df, x=group_column, y='Fare', ax=ax)
    ax.set_title(title)

fig.tight_layout()
plt.show()

In [None]:
# Correlation analysis across the numeric columns (target included)
numeric_features = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
correlation = train_df.loc[:, numeric_features].corr()

# Annotated heatmap with the colour scale centred on zero correlation
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## 3. 特徵工程

In [None]:
def preprocess_data(df, is_train=True):
    """Build the model feature matrix from a raw Titanic DataFrame.

    Works on a copy of ``df``. Extracts a title from ``Name``, fills missing
    ``Age`` / ``Fare`` / ``Embarked`` values, derives ``FamilySize`` and
    ``IsAlone``, and encodes ``Sex``, ``Embarked`` and ``Title`` as integers.
    Medians used for imputation are computed from the frame passed in.

    Args:
        df: Raw passenger data with the columns ``Name``, ``Sex``, ``Age``,
            ``SibSp``, ``Parch``, ``Fare``, ``Embarked`` and ``Pclass``
            (plus ``Survived`` when ``is_train`` is true).
        is_train: When true, also return the ``Survived`` column.

    Returns:
        ``(features, target)`` when ``is_train`` is true, otherwise only the
        features DataFrame.

    Raises:
        ValueError: If ``Sex``, ``Embarked`` or ``Title`` holds a value that
            has no integer code.
    """
    # Work on a copy so the caller's frame is left untouched
    data = df.copy()

    # Title extraction: the word directly before the first period.
    # Raw string so that ``\.`` is a regex escape, not a string escape.
    data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Collapse titles into five groups
    title_mapping = {
        'Mr': 'Mr',
        'Mrs': 'Mrs',
        'Miss': 'Miss',
        'Master': 'Master',
        'Don': 'Rare',
        'Rev': 'Rare',
        'Dr': 'Rare',
        'Mme': 'Mrs',
        'Ms': 'Miss',
        'Major': 'Rare',
        'Lady': 'Rare',
        'Sir': 'Rare',
        'Mlle': 'Miss',
        'Col': 'Rare',
        'Capt': 'Rare',
        'Countess': 'Rare',
        'Jonkheer': 'Rare',
    }
    # Titles absent from the mapping (e.g. 'Dona') or names with no title
    # would otherwise become NaN and break the per-title lookups below.
    data['Title'] = data['Title'].map(title_mapping).fillna('Rare')

    # Fill missing ages with the median age of the passenger's title group;
    # if a whole group has no known age, fall back to the overall median.
    age_by_title = data.groupby('Title')['Age'].transform('median')
    data['Age'] = data['Age'].fillna(age_by_title)
    data['Age'] = data['Age'].fillna(data['Age'].median())

    # Family size counts the passenger plus siblings/spouses and parents/children
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

    # Fill missing fares with the median fare of the passenger's class,
    # again falling back to the overall median for an all-missing class.
    fare_by_class = data.groupby('Pclass')['Fare'].transform('median')
    data['Fare'] = data['Fare'].fillna(fare_by_class)
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())

    # NOTE: the former AgeBand / FareBand columns were never part of the
    # returned features, and qcut could raise on duplicate bin edges, so they
    # are no longer computed.

    # Fill missing port of embarkation
    data['Embarked'] = data['Embarked'].fillna('S')

    # Columns returned to the caller
    features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title',
                'FamilySize', 'IsAlone']

    # Fixed integer codes, in the same alphabetical order a freshly fitted
    # LabelEncoder would assign when every category is present. Using a fixed
    # table keeps train and test encodings identical even if one of the two
    # frames happens to lack a category.
    category_codes = {
        'Sex': {'female': 0, 'male': 1},
        'Embarked': {'C': 0, 'Q': 1, 'S': 2},
        'Title': {'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3, 'Rare': 4},
    }
    for feature, codes in category_codes.items():
        encoded = data[feature].map(codes)
        unmapped = encoded.isnull()
        if unmapped.any():
            unknown = sorted(set(data.loc[unmapped, feature].astype(str)))
            raise ValueError(f"Unexpected values in '{feature}': {unknown}")
        data[feature] = encoded.astype('int64')

    if is_train:
        return data[features], data['Survived']
    return data[features]

# Run the training and test sets through the same preprocessing
X_train, y_train = preprocess_data(train_df)
X_test = preprocess_data(test_df, is_train=False)

# Feature scaling: learn the statistics on the training set only, then apply
# the same transformation to both sets
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 4. 模型訓練與評估

In [None]:
# Evaluation helper
def evaluate_model(model, X, y, cv=5):
    """Cross-validate ``model`` and print the per-fold and mean scores.

    Args:
        model: Estimator passed through to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv)
    mean_score = fold_scores.mean()
    # Reported spread is two standard deviations of the fold scores
    spread = fold_scores.std() * 2
    print(f"交叉驗證分數: {fold_scores}")
    print(f"平均分數: {mean_score:.4f} (+/- {spread:.4f})")

# Baseline models: a random forest and an XGBoost classifier
rf = RandomForestClassifier(random_state=42, n_estimators=100)
xgb_model = xgb.XGBClassifier(random_state=42)

# Cross-validate each baseline on the scaled training data, printing a
# heading before each report
for heading, estimator in (
    ("隨機森林評估結果:", rf),
    ("\nXGBoost評估結果:", xgb_model),
):
    print(heading)
    evaluate_model(estimator, X_train_scaled, y_train)

In [None]:
# Feature importance analysis: fit the forest on the full training set first
rf.fit(X_train_scaled, y_train)

# Pair each feature name with its importance, most important first
feature_importance = pd.DataFrame(
    {'feature': X_train.columns, 'importance': rf.feature_importances_}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance - Random Forest')
plt.show()

## 5. 模型優化

In [None]:
from sklearn.model_selection import GridSearchCV

# Random forest hyper-parameter search space
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold grid search using all available cores
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train)

print("最佳參數:", grid_search.best_params_)
print("最佳分數:", grid_search.best_score_)

# Keep the estimator found with the best parameters
best_rf = grid_search.best_estimator_

In [None]:
# XGBoost hyper-parameter search space
xgb_param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
    n_estimators=[100, 200],
    subsample=[0.8, 0.9, 1.0],
)

# Exhaustive 5-fold grid search using all available cores
xgb_grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(random_state=42),
    param_grid=xgb_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
xgb_grid_search.fit(X_train_scaled, y_train)

print("XGBoost最佳參數:", xgb_grid_search.best_params_)
print("XGBoost最佳分數:", xgb_grid_search.best_score_)

# Keep the estimator found with the best parameters
best_xgb = xgb_grid_search.best_estimator_

In [None]:
# Ensemble prediction: take the second predict_proba column from each tuned model
rf_predictions = best_rf.predict_proba(X_test_scaled)[:, 1]
xgb_predictions = best_xgb.predict_proba(X_test_scaled)[:, 1]

# Weighted average (0.6 random forest, 0.4 XGBoost), thresholded at 0.5
blended_probability = 0.6 * rf_predictions + 0.4 * xgb_predictions
final_predictions = (blended_probability > 0.5).astype(int)

## 6. 結果提交

In [None]:
# Build the submission table: passenger id plus the predicted label
submission = test_df[['PassengerId']].assign(Survived=final_predictions)

# Write the result without the index column
submission.to_csv('submission.csv', index=False)
print("提交文件已生成！")

# Show the share of each predicted class
prediction_share = submission['Survived'].value_counts(normalize=True)
print("\n預測結果分布：")
print(prediction_share)

## 7. 總結與改進建議

### 7.1 主要發現
1. 性別、票價和艙位等級是最重要的預測特徵
2. 家庭規模和年齡也提供了有用的信息
3. 集成模型預期比單一模型表現更好（本筆記本僅對單一模型做了交叉驗證，集成結果仍需以驗證集或提交分數確認）

### 7.2 可能的改進方向
1. 特徵工程：
   - 探索更多特徵組合
   - 嘗試更複雜的特徵轉換
   - 處理離群值

2. 模型優化：
   - 嘗試其他算法（如LightGBM、CatBoost）
   - 更詳細的參數調優
   - 使用更複雜的集成策略

3. 驗證策略：
   - 使用不同的交叉驗證方法
   - 增加驗證指標
   - 分析預測錯誤的案例

### 7.3 經驗總結
1. 詳細的EDA對理解數據至關重要
2. 特徵工程是提升模型表現的關鍵
3. 集成不同模型可以提高穩定性
4. 交叉驗證有助於防止過擬合