# 資料載入

In [1]:
import pandas as pd

train_df = pd.read_csv('../train.csv', encoding='utf-8')
test_df = pd.read_csv('../test.csv', encoding='utf-8')

# 特徵工程

## 資料 - 取中間值

In [2]:
med = train_df.median().drop(['Pclass'])

train_df = train_df.fillna(med)
test_df = test_df.fillna(med)

## Embarked (登船港口) 處理 - 取最常用值

In [3]:
most = train_df['Embarked'].value_counts().idxmax()

train_df['Embarked'] = train_df['Embarked'].fillna(most)
test_df['Embarked'] = test_df['Embarked'].fillna(most)

## Name (姓名) 處理 - 取稱謂

In [4]:
def name_cut(s):
    reserved = ['Mr', 'Mrs', 'Miss', 'Master']
    s = s.split(',')[-1].split('.')[0]
    s = s.strip()
    if s in reserved:
        return s
    else:
        return None
    
train_df['Name'] = train_df['Name'].apply(name_cut)
test_df['Name'] = test_df['Name'].apply(name_cut)

train = pd.get_dummies(train_df, columns=['Name', 'Sex', 'Embarked'])
predict = pd.get_dummies(test_df, columns=['Name', 'Sex', 'Embarked'])

## 資料欄位捨去

In [5]:
x_train = train.drop(['PassengerId', 'Survived', 'Ticket', 'Cabin'], axis=1)
y_train = train['Survived']

x_predict = predict.drop(['PassengerId', 'Ticket', 'Cabin'], axis=1)
predict_id = predict['PassengerId']

# 模型建立

In [6]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
x_trans_norm   = scaler.fit_transform(x_train)
x_predict_norm = scaler.transform(x_predict)

# 模型超參篩選

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier()
params = {
    'n_estimators': range(20, 35),
    'max_depth': range(6, 10)
}
grid = GridSearchCV(clf, params, cv=10, n_jobs=4)
grid.fit(x_train, y_train)
print('最佳參數: ', grid.best_params_)
print('最佳分數 (10 次分數): ', grid.best_score_)

最佳參數:  {'max_depth': 8, 'n_estimators': 24}
最佳分數 (10 次分數):  0.8395630461922596


In [8]:
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(n_estimators=22, max_depth=7)
scores = cross_val_score(clf,
                         x_train,
                         y_train,
                         cv=10,
                         n_jobs=4)

print('十次分數: ', scores)
print('平均分數: ', sum(scores) / len(scores))

十次分數:  [0.77777778 0.85393258 0.75280899 0.91011236 0.85393258 0.80898876
 0.83146067 0.76404494 0.86516854 0.84269663]
平均分數:  0.8260923845193509


# 模型訓練、結果儲存

In [9]:
clf = RandomForestClassifier(n_estimators=22, max_depth=7)
clf.fit(x_train, y_train)

pre = clf.predict(x_predict)
result = pd.DataFrame({
    "PassengerId": predict_id,
    "Survived": pre
})

result.to_csv("titanic_rf.csv",
              encoding='utf-8',
              index=False)