参考：https://www.kaggle.com/nlearn/tanic-rf

In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("../input/train.csv")
test_data = pd.read_csv("../input/test.csv")

In [2]:
# 删除不用的列，这里处理得比较粗略了
train_data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
test_data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [3]:
# Embarked- 登船港口
# 缺失值填充
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)

# 转换成数字
train_data['Embarked'] = train_data['Embarked'].map({'S': 0, 'Q': 1, 'C': 2})
test_data['Embarked'] = test_data['Embarked'].map({'S': 0, 'Q': 1, 'C': 2})

In [4]:
# Sex 性别
train_data['Sex'] = train_data['Sex'].map({'male': 1, 'female': 0})
test_data['Sex'] = test_data['Sex'].map({'male': 1, 'female': 0})

In [5]:
# Fare 票价

test_data['Fare'].fillna(test_data['Fare'].mode()[0], inplace=True)

In [6]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,0
1,2,1,1,0,38.0,1,0,71.2833,2
2,3,1,3,0,26.0,0,0,7.925,0
3,4,1,1,0,35.0,1,0,53.1,0
4,5,0,3,1,35.0,0,0,8.05,0


In [7]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,1
1,893,3,0,47.0,1,0,7.0,0
2,894,2,1,62.0,0,0,9.6875,1
3,895,3,1,27.0,0,0,8.6625,0
4,896,3,0,22.0,1,1,12.2875,0


In [8]:
train_data = pd.get_dummies(train_data, columns=['Embarked', 'Pclass'])
test_data = pd.get_dummies(test_data, columns=['Embarked', 'Pclass'])

In [9]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Embarked_0,Embarked_1,Embarked_2,Pclass_1,Pclass_2,Pclass_3
0,1,0,1,22.0,1,0,7.25,1,0,0,0,0,1
1,2,1,0,38.0,1,0,71.2833,0,0,1,1,0,0
2,3,1,0,26.0,0,0,7.925,1,0,0,0,0,1
3,4,1,0,35.0,1,0,53.1,1,0,0,1,0,0
4,5,0,1,35.0,0,0,8.05,1,0,0,0,0,1


In [10]:
test_data.head()

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Fare,Embarked_0,Embarked_1,Embarked_2,Pclass_1,Pclass_2,Pclass_3
0,892,1,34.5,0,0,7.8292,0,1,0,0,0,1
1,893,0,47.0,1,0,7.0,1,0,0,0,0,1
2,894,1,62.0,0,0,9.6875,0,1,0,0,1,0
3,895,1,27.0,0,0,8.6625,1,0,0,0,0,1
4,896,0,22.0,1,1,12.2875,1,0,0,0,0,1


In [11]:
# 使用 GBDT 对 Age 进行预测
# 先将测试集与训练集归于同一类别
test_data['Survived'] = 0

In [12]:
combine_data = pd.concat([train_data, test_data], axis=0, sort=False)
combine_data.shape

(1309, 13)

In [13]:
missing_age_cols = combine_data.columns.drop(['PassengerId', 'Survived'])
missing_age_cols

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_0', 'Embarked_1',
       'Embarked_2', 'Pclass_1', 'Pclass_2', 'Pclass_3'],
      dtype='object')

In [14]:
# Age 不是空作为训练集
missing_age_train = combine_data.loc[combine_data['Age'].notnull(), missing_age_cols]
# Age 为空作为测试集
missing_age_test = combine_data.loc[combine_data['Age'].isnull(), missing_age_cols]

In [15]:
missing_age_train_x = missing_age_train.drop("Age", axis=1)
missing_age_train_y = missing_age_train['Age']
missing_age_test_x = missing_age_test.drop('Age', axis=1)

## 使用 RandomForestRegressor 或者 GradientBoostingRegressor 预测缺失的年龄数据

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# 尝试使用 RF 对 Age进 行预测
rfr = RandomForestRegressor(n_estimators=100)
rfr.fit(missing_age_train_x, missing_age_train_y)
age_predict = rfr.predict(missing_age_test_x)

In [17]:
# 尝试使用梯度提升树对 Age 进行预测
gbr = GradientBoostingRegressor(
    n_estimators=55, learning_rate=0.15, subsample=0.975)
gbr.fit(missing_age_train_x, missing_age_train_y)
age_predict = gbr.predict(missing_age_test_x)

In [18]:
combine_data.loc[combine_data['Age'].isnull(),
                 'Age'] = age_predict  # 将预测结果结合到数据中

In [19]:
# 分离数据
train_data = combine_data[:train_data.shape[0]]
test_data = combine_data[train_data.shape[0]:]
test_data.drop("Survived", axis=1, inplace=True)

X = train_data.drop('Survived', axis=1)
y = train_data['Survived']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


## 模型训练

先用 XGBClassifier 做一下。

In [20]:
from xgboost import XGBClassifier

xgbc = XGBClassifier()

xgbc.fit(X, y)
y_pred = xgbc.predict(test_data)

# 0.75119
res = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y_pred})
res.to_csv('../output/output-201903225.csv', index=None)

下面模型提交以后，得分 0.76555。

In [21]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [22]:
xgbc = XGBClassifier()
xgbc.fit(X, y)

gbc = GradientBoostingClassifier()
gbc.fit(X, y)

rfc = RandomForestClassifier(max_features=9, n_estimators=11, max_depth=105)
rfc.fit(X, y)

lg = LogisticRegression(solver='newton-cg')
lg.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [23]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
from sklearn.ensemble import VotingClassifier

eclf1 = VotingClassifier(
    estimators=[('xgbc', xgbc), ('gbc', gbc), ('rfc', rfc), ('lg', lg)],
    voting='soft')

eclf1.fit(X, y)

VotingClassifier(estimators=[('xgbc', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', rando...ty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None)

In [24]:
y_pred = eclf1.predict(test_data)

In [25]:
res = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y_pred})
res.to_csv('../output/output-201903225.csv', index=None)