# Kaggle 之泰坦尼克号罹难者预测

In [86]:
import pandas as pd

train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

In [87]:
PassengerId = test['PassengerId']

In [88]:
# 人工选择特征
select_features = [
    'Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare'
]

In [89]:
train = data_train[select_features]
test = data_test[select_features]

`Survived` 和 `Age` 这两个特征的数据有缺失。

In [90]:
y_train = data_train['Survived']

## 填补缺失特征

In [91]:
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [92]:
test['Embarked'].value_counts()

S    270
C    102
Q     46
Name: Embarked, dtype: int64

类别型的特征，使用众数来填充缺失值。

In [93]:
train['Embarked'].fillna('S', inplace=True)
test['Embarked'].fillna('S', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


数值型的特征，使用平均值或者中位数来填充缺失值。

In [94]:
train['Age'].fillna(train['Age'].mean(), inplace=True)
test['Age'].fillna(test['Age'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [95]:
test['Fare'].fillna(test['Fare'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [96]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         418 non-null float64
Embarked    418 non-null object
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


### 特征向量化（离散型变量使用独热编码）

In [97]:
train.head()

Unnamed: 0,Pclass,Sex,Age,Embarked,SibSp,Parch,Fare
0,3,male,22.0,S,1,0,7.25
1,1,female,38.0,C,1,0,71.2833
2,3,female,26.0,S,0,0,7.925
3,1,female,35.0,S,1,0,53.1
4,3,male,35.0,S,0,0,8.05


In [98]:
from sklearn.feature_extraction import DictVectorizer

dict_vec = DictVectorizer(sparse=False)
X_train = dict_vec.fit_transform(train.to_dict(orient='record'))

In [99]:
X_train.shape

(891, 10)

In [100]:
dict_vec.feature_names_

['Age',
 'Embarked=C',
 'Embarked=Q',
 'Embarked=S',
 'Fare',
 'Parch',
 'Pclass',
 'Sex=female',
 'Sex=male',
 'SibSp']

In [101]:
X_train.shape

(891, 10)

### 测试数据也要做同样的向量化，或者使用 Pandas 做独热编码

In [102]:
X_test = dict_vec.transform(test.to_dict(orient='record'))

In [103]:
X_test

array([[34.5       ,  0.        ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [47.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [62.        ,  0.        ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [38.5       ,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [30.27259036,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [30.27259036,  1.        ,  0.        , ...,  0.        ,
         1.        ,  1.        ]])

## 训练分类器

In [104]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

### 交叉验证计算得分

In [105]:
from sklearn.model_selection import cross_val_score

cross_val_score(rfc, X_train, y_train, cv=5).mean()



0.791335302985651

### 使用 xgboost 

In [106]:
from xgboost import XGBClassifier

xgbc = XGBClassifier()

In [107]:
cross_val_score(xgbc, X_train, y_train, cv=5).mean()

0.81824559798311

## 使用默认配置的 `RandomForestClassifier` 对测试数据进行预测输出

In [108]:
X_train

array([[22.        ,  0.        ,  0.        , ...,  0.        ,
         1.        ,  1.        ],
       [38.        ,  1.        ,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [26.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [29.69911765,  0.        ,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [26.        ,  1.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [32.        ,  0.        ,  1.        , ...,  0.        ,
         1.        ,  0.        ]])

In [110]:
rfc.fit(X_train, y_train)
rfc_y_predict = rfc.predict(X_test)
rfc_submission = pd.DataFrame({
    'PassengerId': PassengerId,
    'Survived': rfc_y_predict
})
rfc_submission.to_csv('./output/rfc_submission.csv', index=False)



## 使用默认配置的 `XGBClassifier` 对测试数据进行预测操作

In [113]:
xgbc.fit(X_train, y_train)
xgbc_y_predict = xgbc.predict(X_test)
xgbc_submission = pd.DataFrame({
    'PassengerId': PassengerId,
    'Survived': xgbc_y_predict
})
xgbc_submission.to_csv('./output/xgbc_submission.csv', index=False)

### 网格搜索找最佳超参数

In [115]:
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': list(range(2, 7)),
    'n_estimators': list(range(100, 1100, 200)),
    'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0]
}

### 找 xgboost 的最佳超参数

In [116]:
%%time
xgbc_best = XGBClassifier()

gs = GridSearchCV(xgbc_best, params, n_jobs=-1, cv=5, verbose=-1)
gs.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   15.2s


CPU times: user 2.53 s, sys: 210 ms, total: 2.74 s
Wall time: 32.6 s


[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed:   32.5s finished


In [117]:
gs.best_score_

0.835016835016835

In [118]:
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

In [119]:
gs.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [120]:
xgbc_best_y_predict = gs.best_estimator_.predict(X_test)
xgbc_best_y_predict

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [122]:
xgbc_best_submission = pd.DataFrame({
    'PassengerId': PassengerId,
    'Survived': xgbc_best_y_predict
})

xgbc_best_submission.to_csv('./output/xgbc_best_submission.csv', index=False)