In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# 数据集介绍

此项目的数据集分为两份：train.csv 和 test.csv

train.csv: 训练集，共计891条数据

test.csv: 测试集，共计418条数据


字段|字段说明
-|-
PassengerId| 乘客编号
Survived   | 存活情况（存活：1 ; 死亡：0）
Pclass     | 客舱等级
Name       | 乘客姓名
Sex        | 性别
Age        | 年龄
SibSp      | 同乘的兄弟姐妹/配偶数
Parch      | 同乘的父母/小孩数
Ticket     | 船票编号
Fare       | 船票价格
Cabin      | 客舱号
Embarked   | 登船港口

PassengerId 是每条数据的唯一序号；Survived 是存活情况，即待预测的标签（目标变量）；剩下的10个字段是原始特征数据。

# 数据初探
查看一下训练集和测试集前10条数据：

In [2]:
# Load the labelled training split and the unlabelled test split from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Preview the first ten rows of the training set.
train.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
# Preview the first ten rows of the test set (it has no Survived column).
test.head(n=10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


查看训练集是否有缺失数据，以及数据的类型：

In [5]:
# Print each column's name, non-null count and dtype to spot missing data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Count the missing values in every column.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

- 从上述分析可见，**属性 Age、Cabin（大量缺失）、Embarked（极少缺失） 带有缺失值，缺失值数量分别为177、687、2**

查看训练集的一些基本统计信息：

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


- 从以上分析可知，获救人数约占总人数的38.4%（0.383838），乘客平均年龄约为29.7岁（29.699118，仅统计714条年龄非缺失的记录）
- **疑问**：上述统计可以发现有乘客的船票价格（Fare）是0，这个难道真是如电影中所说中了彩票？

# 数据清洗

## 缺失值处理
1. 客舱号Cabin列存在大量空值，如果直接对空值进行填充，带来的误差影响会比较大，因此先不选用Cabin列做特征
2. Age列比较重要，缺失数量还可接受，因此这里使用中位数进行填充（好处：中位数不受极端年龄值的影响，比均值更稳健；注意中位数并不保证是整数，本数据集的中位数恰好为28.0）

In [8]:
# Impute the missing ages with the median of the observed ages, then re-check the summary.
age_median = train['Age'].median()
train['Age'] = train['Age'].fillna(age_median)
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## 特征选择
1. PassengerId只是一个连续的序号，与乘客能否存活无关，因此不选用PassengerId作为特征

In [9]:
# Names of the numeric columns used as model inputs in this first experiment.
feature_name = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

X = train.loc[:, feature_name]  # feature matrix
y = train['Survived']           # target label: 1 = survived, 0 = died

## 数据划分
数据集 test 是不带标签的，对模型的训练和评估都派不上用场。这里选取数据集 train 作为交叉验证集。

# XGBoost模型

In [10]:
# Base classifier handed to the grid search: gradient-boosted trees with a logistic
# objective for the binary Survived label. max_depth and n_estimators are only starting
# values; the grid search overrides both for every candidate.
#
# n_jobs=1 keeps each individual fit single-threaded. The grid search in this notebook
# already runs with n_jobs=-1 (one worker per core), so letting every worker also spawn
# one XGBoost thread per core would oversubscribe the CPU without changing any result.
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    booster='gbtree',
    max_depth=3,
    n_estimators=7,
    n_jobs=1,
)

## 基于交叉验证的网格搜索

In [11]:
# Search space: every tree depth from 2 to 19 crossed with the odd tree counts from 3 to 19.
param_grid = dict(
    max_depth=range(2, 20),
    n_estimators=list(range(3, 20, 2)),
)

# 5-fold cross-validated grid search, parallelised with n_jobs=-1. refit=True retrains
# the best configuration once the search has finished.
gsCv = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)
gsCv.fit(X, y)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed:    4.4s finished


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster='gbtree',
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=3, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=7, n_jobs=-1,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,
  

In [12]:
# Report the winning candidate: its mean cross-validated score (accuracy, the default
# score for a classifier) on the first line and its parameters on the second.
print(gsCv.best_score_, gsCv.best_params_, sep='\n')
best_model = gsCv.best_estimator_  # estimator refitted with the best parameters

0.7206327286422698
{'max_depth': 4, 'n_estimators': 7}


## 增加特征Sex和Embarked列，查看对预测的影响
1. 将性别 Sex 的值映射为0或1
2. 将登船港口 Embarked 的值映射为0、1或2

In [13]:
# Inspect the raw Sex column before encoding it.
train["Sex"]

0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object

In [20]:
# Encode the Sex column as integers: male => 0, female => 1.
for sex_label, sex_code in {"male": 0, "female": 1}.items():
    train.loc[train["Sex"] == sex_label, "Sex"] = sex_code
# The column is still object-typed after the replacements; cast it to a numeric dtype.
train["Sex"] = train["Sex"].astype('int')

In [15]:
# Frequency of each embarkation port (missing values are not counted).
train["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [16]:
# Fill the missing embarkation ports with 'S', the most frequent value.
train["Embarked"] = train["Embarked"].fillna('S')

# Encode the ports as integers: S => 0, C => 1, Q => 2.
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    train.loc[train["Embarked"] == port_label, "Embarked"] = port_code
# The column is still object-typed after the replacements; cast it to a numeric dtype.
train["Embarked"] = train["Embarked"].astype('int')

0      0
1      1
2      0
3      0
4      0
      ..
886    0
887    0
888    0
889    1
890    2
Name: Embarked, Length: 891, dtype: int32

In [17]:
# Redo the feature selection, this time including the encoded Sex and Embarked columns.
feature_name = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
X = train.loc[:, feature_name]  # feature matrix
y = train["Survived"]           # target label

In [18]:
# Repeat the experiment on the enlarged feature set (X, y as selected in the cell above).
#
# Base classifier: gradient-boosted trees with a logistic objective. max_depth and
# n_estimators are only starting values; the grid search overrides both.
# n_jobs=1 keeps each fit single-threaded because the grid search below already uses
# every core (n_jobs=-1); nesting two all-core pools would oversubscribe the CPU
# without changing any result.
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    booster='gbtree',
    max_depth=3,
    n_estimators=7,
    n_jobs=1,
)

# Search space: every tree depth from 2 to 19 crossed with the odd tree counts from 3 to 19.
param_grid = {
    'max_depth': range(2, 20),
    'n_estimators': list(range(3, 20, 2)),
}

# 5-fold cross-validated grid search, parallelised across candidates and folds.
# refit=True retrains the best configuration once the search has finished.
gsCv = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    refit=True,
    verbose=1,
    n_jobs=-1,
    cv=5,
)
gsCv.fit(X, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 162 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed:    1.2s finished


ValueError: DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields Sex, Embarked

In [None]:
# Report the winning candidate: its mean cross-validated score (accuracy, the default
# score for a classifier) on the first line and its parameters on the second.
print(gsCv.best_score_, gsCv.best_params_, sep='\n')
best_model = gsCv.best_estimator_  # estimator refitted with the best parameters