```
PassengerId => 乘客ID
Pclass => 客舱等级(1/2/3等舱位)
Name => 乘客姓名
Sex => 性别
Age => 年龄
SibSp => 兄弟姐妹数/配偶数
Parch => 父母数/子女数
Ticket => 船票编号
Fare => 船票价格
Cabin => 客舱号
Embarked => 登船港口
```

In [1]:
import pandas as pd

train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

In [2]:
# info 查看缺失值的统计
selected_features = [
    'Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare'
]

In [3]:
X_train = train[selected_features]
X_test = test[selected_features]
y_train = train['Survived']

In [4]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Embarked,SibSp,Parch,Fare
0,3,male,22.0,S,1,0,7.25
1,1,female,38.0,C,1,0,71.2833
2,3,female,26.0,S,0,0,7.925
3,1,female,35.0,S,1,0,53.1
4,3,male,35.0,S,0,0,8.05


### 填充缺失值

In [5]:
print(X_train['Embarked'].value_counts())
print(X_test['Embarked'].value_counts())

S    644
C    168
Q     77
Name: Embarked, dtype: int64
S    270
C    102
Q     46
Name: Embarked, dtype: int64


In [6]:
# S 是众数，用它填充缺失值
X_train['Embarked'].fillna('S', inplace=True)
X_test['Embarked'].fillna('S', inplace=True)
# X_train.loc[pd.isnull(train['Embarked']), ['Embarked']] = 'S'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [7]:
# Embarked => 登船港口
print(X_train['Embarked'].value_counts())
print(X_test['Embarked'].value_counts())

S    646
C    168
Q     77
Name: Embarked, dtype: int64
S    270
C    102
Q     46
Name: Embarked, dtype: int64


In [8]:
X_train['Age'].fillna(X_train['Age'].mean(), inplace=True)
X_test['Age'].fillna(X_test['Age'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [9]:
# Fare => 船票价格
# 训练数据集没有缺，测试数据集缺少 np.sum(X_test['Fare'].isnull())
X_test['Fare'].fillna(X_test['Fare'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [10]:
# 一起检查一下数据是不是完整的
X_train.info()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
Embarked    891 non-null object
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 48.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         418 non-null float64
Embarked    418 non-null object
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


## 把离散型特征转换为独热编码

In [11]:
from sklearn.feature_extraction import DictVectorizer

dict_vec = DictVectorizer(sparse=False)

In [12]:
X_train_vec = dict_vec.fit_transform(X_train.to_dict(orient='record'))
dict_vec.feature_names_

['Age',
 'Embarked=C',
 'Embarked=Q',
 'Embarked=S',
 'Fare',
 'Parch',
 'Pclass',
 'Sex=female',
 'Sex=male',
 'SibSp']

In [13]:
type(X_train_vec)

numpy.ndarray

In [14]:
X_train_vec.tolist()

[[22.0, 0.0, 0.0, 1.0, 7.25, 0.0, 3.0, 0.0, 1.0, 1.0],
 [38.0, 1.0, 0.0, 0.0, 71.2833, 0.0, 1.0, 1.0, 0.0, 1.0],
 [26.0, 0.0, 0.0, 1.0, 7.925, 0.0, 3.0, 1.0, 0.0, 0.0],
 [35.0, 0.0, 0.0, 1.0, 53.1, 0.0, 1.0, 1.0, 0.0, 1.0],
 [35.0, 0.0, 0.0, 1.0, 8.05, 0.0, 3.0, 0.0, 1.0, 0.0],
 [29.69911764705882, 0.0, 1.0, 0.0, 8.4583, 0.0, 3.0, 0.0, 1.0, 0.0],
 [54.0, 0.0, 0.0, 1.0, 51.8625, 0.0, 1.0, 0.0, 1.0, 0.0],
 [2.0, 0.0, 0.0, 1.0, 21.075, 1.0, 3.0, 0.0, 1.0, 3.0],
 [27.0, 0.0, 0.0, 1.0, 11.1333, 2.0, 3.0, 1.0, 0.0, 0.0],
 [14.0, 1.0, 0.0, 0.0, 30.0708, 0.0, 2.0, 1.0, 0.0, 1.0],
 [4.0, 0.0, 0.0, 1.0, 16.7, 1.0, 3.0, 1.0, 0.0, 1.0],
 [58.0, 0.0, 0.0, 1.0, 26.55, 0.0, 1.0, 1.0, 0.0, 0.0],
 [20.0, 0.0, 0.0, 1.0, 8.05, 0.0, 3.0, 0.0, 1.0, 0.0],
 [39.0, 0.0, 0.0, 1.0, 31.275, 5.0, 3.0, 0.0, 1.0, 1.0],
 [14.0, 0.0, 0.0, 1.0, 7.8542, 0.0, 3.0, 1.0, 0.0, 0.0],
 [55.0, 0.0, 0.0, 1.0, 16.0, 0.0, 2.0, 1.0, 0.0, 0.0],
 [2.0, 0.0, 1.0, 0.0, 29.125, 1.0, 3.0, 0.0, 1.0, 4.0],
 [29.69911764705882, 0.0, 0.0, 

In [15]:
X_test_vec = dict_vec.transform(X_test.to_dict(orient='record'))
X_test_vec

array([[34.5       ,  0.        ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [47.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [62.        ,  0.        ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [38.5       ,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [30.27259036,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [30.27259036,  1.        ,  0.        , ...,  0.        ,
         1.        ,  1.        ]])

## 模型训练

### 随机森林

In [16]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

In [17]:
from sklearn.model_selection import cross_val_score

rfc_scores = cross_val_score(rfc, X_train_vec, y_train, cv=5)
rfc_scores



array([0.78212291, 0.7877095 , 0.8258427 , 0.7752809 , 0.84745763])

### xgboost

In [18]:
from xgboost import XGBClassifier

xgbc = XGBClassifier()
xgbc_scores = cross_val_score(xgbc, X_train_vec, y_train, cv=5)
xgbc_scores

array([0.80446927, 0.81005587, 0.8258427 , 0.79775281, 0.85310734])

In [19]:
rfc_scores.mean(), xgbc_scores.mean()

(0.8036827249717797, 0.81824559798311)

### 将随机森林算法训练的结果输出

In [20]:
rfc.fit(X_train_vec, y_train)
rfc_y_predict = rfc.predict(X_test_vec)

rfc_submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': rfc_y_predict
})
rfc_submission.to_csv('./output/rfc_submission.csv', index=False)



### 将 xgbc 算法训练的结果输出

In [21]:
xgbc.fit(X_train_vec, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [22]:
xgbc_y_predict = xgbc.predict(X_test_vec)
xgbc_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': xgbc_y_predict})
xgbc_submission.to_csv('./output/xgbc_submission.csv', index=False)

## 使用网格搜索找到 xgbc 模型的最佳超参数

In [23]:
%%time
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': list(range(2, 7)),
    'n_estimators': list(range(100, 1100, 200)),
    'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0]
}

xgbc_best = XGBClassifier()

gs = GridSearchCV(xgbc_best, params, n_jobs=-1, cv=5, verbose=1)

gs.fit(X_train_vec, y_train)

Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   22.2s


CPU times: user 2.49 s, sys: 207 ms, total: 2.69 s
Wall time: 32.6 s


[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed:   32.5s finished


In [24]:

print(gs.best_score_)
print(gs.best_params_)

xgbc_best_y_predict = gs.predict(X_test_vec)

0.835016835016835
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


In [25]:
xgbc_best_y_predict = gs.best_estimator_.predict(X_test_vec)

In [26]:
xgbc_best_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': xgbc_best_y_predict})


In [28]:
xgbc_best_submission.to_csv('./xgbc_best_submission_1.csv', index=False)
