参考资料：

https://www.kaggle.com/nlearn/tanic-rf

In [38]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

In [39]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

## 数据预处理

In [40]:
# 丢弃一些列
# 姓名明显与是否生还无关、船票的号码同样难以联系、舱位 Cabin 的缺失值过多，舍弃
# 舱位 Cabin 的缺失值过多，舍弃
train_data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
test_data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [41]:
train_data['Embarked'].isnull().sum(), test_data['Embarked'].isnull().sum()

(2, 0)

In [42]:
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)

In [43]:
train_data['Embarked'].isnull().sum(), test_data['Embarked'].isnull().sum()

(0, 0)

In [44]:
train_data['Embarked'] = train_data['Embarked'].map({'S': 0, 'Q': 1, 'C': 2})
test_data['Embarked'] = test_data['Embarked'].map({'S': 0, 'Q': 1, 'C': 2})

性别处理。

In [45]:
train_data['Sex'].isnull().sum(), test_data['Sex'].isnull().sum()

(0, 0)

In [46]:
train_data['Sex'] = train_data['Sex'].map({'male': 1, 'female': 0})
test_data['Sex'] = test_data['Sex'].map({'male': 1, 'female': 0})

In [47]:
train_data['Fare'].isnull().sum(), test_data['Fare'].isnull().sum()

(0, 1)

In [48]:
test_data['Fare'].fillna(test_data['Fare'].mode()[0], inplace=True)

In [49]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,0
1,2,1,1,0,38.0,1,0,71.2833,2
2,3,1,3,0,26.0,0,0,7.925,0
3,4,1,1,0,35.0,1,0,53.1,0
4,5,0,3,1,35.0,0,0,8.05,0


In [50]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,1
1,893,3,0,47.0,1,0,7.0,0
2,894,2,1,62.0,0,0,9.6875,1
3,895,3,1,27.0,0,0,8.6625,0
4,896,3,0,22.0,1,1,12.2875,0


In [51]:
# https://blog.csdn.net/Koala_Tree/article/details/78725881

# 直接将数据量化可能会存在问题：0 & 1之间存在大小关系，而 male 与 female 之间并没有这样的关系，因此需要将其变为哑节点以消除大小关系。使用 get_dummies 将其变为哑节点
# 最后由 get_dummies 将不同取值的列分别对应到不同列以避免数字大小关系。

embarked_train_dummy = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
sex_train_dummy = pd.get_dummies(train_data['Sex'], prefix='Sex')
train_data = pd.concat([train_data, embarked_train_dummy], axis=1)
train_data = pd.concat([train_data, sex_train_dummy], axis=1)
train_data.drop(["Embarked", "Sex"], axis=1, inplace=True)

embarked_test_dummy = pd.get_dummies(test_data['Embarked'], prefix='Embarked')
sex_test_dummy = pd.get_dummies(test_data['Sex'], prefix='Sex')
test_data = pd.concat([test_data, embarked_test_dummy], axis=1)
test_data = pd.concat([test_data, sex_test_dummy], axis=1)
test_data.drop(["Embarked", "Sex"], axis=1, inplace=True)

In [52]:
# pclass 阶级，不同阶级存在大小关系，因此对于阶级并将其变为哑节点，同时将其差距扩大。
train_data['Pclass'] *= 10
test_data['Pclass'] *= 10

In [53]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_0,Embarked_1,Embarked_2,Sex_0,Sex_1
0,1,0,30,22.0,1,0,7.25,1,0,0,0,1
1,2,1,10,38.0,1,0,71.2833,0,0,1,1,0
2,3,1,30,26.0,0,0,7.925,1,0,0,1,0
3,4,1,10,35.0,1,0,53.1,1,0,0,1,0
4,5,0,30,35.0,0,0,8.05,1,0,0,0,1


In [54]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Embarked_0,Embarked_1,Embarked_2,Sex_0,Sex_1
0,892,30,34.5,0,0,7.8292,0,1,0,0,1
1,893,30,47.0,1,0,7.0,1,0,0,1,0
2,894,20,62.0,0,0,9.6875,0,1,0,0,1
3,895,30,27.0,0,0,8.6625,1,0,0,0,1
4,896,30,22.0,1,1,12.2875,1,0,0,1,0


In [55]:
# 使用 GBDT 对 Age 进行预测
# 先将测试集与训练集归于同一类别
test_data['Survived'] = 0

In [57]:
combine_data = pd.concat([train_data, test_data], axis=0, sort=False)
combine_data.shape

(1309, 12)

In [58]:
# combine_data

In [59]:
# 二者合并
missing_age_df = combine_data[[
    'Age', 'Embarked_0', 'Embarked_1', 'Embarked_2', 'Sex_0', 'Sex_1', 'Fare',
    'Parch', 'SibSp'
]]

# Age 不是空作为训练集
missing_age_train = missing_age_df[missing_age_df['Age'].notnull()]
# Age 为空作为测试集
missing_age_test = missing_age_df[missing_age_df['Age'].isnull()]

missing_age_train_x = missing_age_train.drop("Age", axis=1)
missing_age_train_y = missing_age_train['Age']
missing_age_test_x = missing_age_test.drop('Age', axis=1)

## 使用 RandomForestRegressor 或者 GradientBoostingRegressor 预测缺失的年龄数据

In [60]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# 尝试使用 RF 对 Age进 行预测
rfr = RandomForestRegressor()
rfr.fit(missing_age_train_x, missing_age_train_y)
age_predict = rfr.predict(missing_age_test_x)



In [61]:
# 尝试使用梯度提升树对 Age 进行预测
gbr = GradientBoostingRegressor(
    n_estimators=55, learning_rate=0.15, subsample=0.975)
gbr.fit(missing_age_train_x, missing_age_train_y)
age_predict = gbr.predict(missing_age_test_x)

In [62]:
combine_data.loc[combine_data['Age'].isnull(),
                 'Age'] = age_predict  # 将预测结果结合到数据中

In [63]:
# 分离数据
train_data = combine_data[:train_data.shape[0]]
test_data = combine_data[train_data.shape[0]:]
test_data.drop("Survived", axis=1, inplace=True)

X = train_data.drop('Survived', axis=1)
y = train_data['Survived']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


## 模型训练

先用 XGBClassifier 做一下。

In [68]:
from xgboost import XGBClassifier

xgbc = XGBClassifier()

xgbc.fit(X, y)
y_pred = xgbc.predict(test_data)

# 0.75119
res = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y_pred})
res.to_csv('output-201903225.csv', index=None)

下面模型提交以后，得分 0.76555。

In [28]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [29]:
xgbc = XGBClassifier()
xgbc.fit(X, y)

gbc = GradientBoostingClassifier()
gbc.fit(X, y)

rfc = RandomForestClassifier(max_features=9, n_estimators=11, max_depth=105)
rfc.fit(X, y)

lg = LogisticRegression()
lg.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [85]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
from sklearn.ensemble import VotingClassifier

eclf1 = VotingClassifier(
    estimators=[('xgbc', xgbc), ('gbc', gbc), ('rfc', rfc), ('lg', lg)],
    voting='soft')

eclf1.fit(X, y)



VotingClassifier(estimators=[('xgbc', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', rando...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='soft',
         weights=[1, 1, 1, 1])

In [86]:
y_pred = eclf1.predict(test_data)

In [87]:
res = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y_pred})
res.to_csv('output-201903225.csv', index=None)