In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import warnings
warnings.filterwarnings("ignore")#忽略警告
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [2]:
# Load the Kaggle Titanic training and test tables from the competition input directory.
df1_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df1_test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [3]:
# Module 1: data exploration - inspect the basic structure of both tables.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
df1_train.info()
print('-'*30)
df1_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  4

In [4]:
# For the numeric columns, show summary statistics (count, mean, std, min/max, quartiles).
print(df1_train.describe())
print('-'*30)

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  
------------------------------


In [5]:
# For the string (object) columns, show the count, number of distinct values,
# the most frequent value and how often it occurs.
print(df1_train.describe(include=['O']))
print('-'*30)

                           Name   Sex  Ticket    Cabin Embarked
count                       891   891     891      204      889
unique                      891     2     681      147        3
top     Braund, Mr. Owen Harris  male  347082  B96 B98        S
freq                          1   577       7        4      644
------------------------------


In [6]:
# Module 2: data cleaning.
# The data set is small and fairly regular, so cleaning is limited to filling
# in missing values. Numeric columns are filled with a training-set mean.
train_age_fill = {'Age': df1_train['Age'].mean()}
df1_train.fillna(train_age_fill, inplace=True)

# The test table is filled with training-set statistics as well; they are
# evaluated here, after the training Age column has been filled above.
test_numeric_fill = {
    'Age': df1_train['Age'].mean(),
    'Fare': df1_train['Fare'].mean(),
}
df1_test.fillna(test_numeric_fill, inplace=True)

In [7]:
# Check which string columns still contain missing values after the numeric fill.
# info() prints its own report and returns None, so it is not wrapped in print()
# (doing so printed an extra "None" after each report).
df1_train.info()
df1_test.info()
print('-'*30)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pcl

In [8]:
# From the overview above: in the training table Embarked and Cabin still have
# missing values; in the test table only Cabin does.
# Cabin has many missing values in both tables and there is no reasonable way
# to impute that many, so it is left out of the model features.
# Embarked (training table) is filled with its most frequent value.
embarked_counts = df1_train['Embarked'].value_counts()
print(embarked_counts)
# Take the fill value from the counts instead of hard-coding 'S', so the cell
# stays correct if the input data changes. value_counts() ignores NaN by default.
most_frequent_embarked = embarked_counts.idxmax()
df1_train.fillna({'Embarked': most_frequent_embarked}, inplace=True)

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64


In [9]:
# Module 3: feature selection.
# Features used: Pclass, Sex, Age, SibSp, Parch, Fare and Embarked
# (Ticket and Cabin are not in the list below).
# Survived is the label.
features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
train_features = df1_train[features]
train_labels = df1_train['Survived']
test_features = df1_test[features]

In [10]:
# Convert every string column to numeric values.
# DictVectorizer one-hot encodes string fields and, with sparse=False, returns a
# dense feature matrix:
#   Embarked -> three 0/1 columns ('Embarked=S', 'Embarked=C', 'Embarked=Q')
#   Sex      -> two 0/1 columns ('Sex=female', 'Sex=male')
# so the 7 selected columns expand to 10 columns (891 rows for the training set).
#
# The records are read from df1_train[features] / df1_test[features] rather than
# from train_features / test_features: those names are rebound to arrays below,
# so reading from them made the cell fail when it was run a second time.
dvec = DictVectorizer(sparse=False)
train_features = dvec.fit_transform(df1_train[features].to_dict(orient='records'))
test_features = dvec.transform(df1_test[features].to_dict(orient='records'))

# Z-score standardisation: rescale each feature to zero mean and unit variance.
# The scaler is fitted on the training matrix only and then applied to the test matrix.
ss = StandardScaler()
train_features = ss.fit_transform(train_features)
test_features = ss.transform(test_features)

In [11]:
# Module 4: build each classifier and look for suitable parameters.
# I. The naive Bayes classifiers are used as they are, without tuning.
# 1. Gaussian naive Bayes.
# Evaluation: mean accuracy over 5-fold cross-validation on the training data.
Gnb = GaussianNB()
gnb_cv_scores = cross_val_score(Gnb, train_features, train_labels, cv=5)
# Fit on the full training set; this fitted model is used for the test predictions.
Gnb.fit(train_features, train_labels)
print('高斯朴素贝叶斯分类器中 5折交叉验证准确率为 %.16f' % gnb_cv_scores.mean())

高斯朴素贝叶斯分类器中 5折交叉验证准确率为 0.7890276818780995


In [12]:
# Predict on the test set with Gaussian naive Bayes; this submission scored 0.75358 on Kaggle.
test_pre_Gnb = Gnb.predict(test_features)
df1_test['predict_model_Gnb'] = test_pre_Gnb

In [13]:
# 2. Bernoulli naive Bayes.
# Evaluation: mean accuracy over 5-fold cross-validation on the training data.
Blb = BernoulliNB()
blb_cv_scores = cross_val_score(Blb, train_features, train_labels, cv=5)
# Fit on the full training set; this fitted model is used for the test predictions.
Blb.fit(train_features, train_labels)
print('伯努利贝叶斯分类器中 5折交叉验证准确率为 %.16f' % blb_cv_scores.mean())

伯努利贝叶斯分类器中 5折交叉验证准确率为 0.7800326407632917


In [14]:
# Predict on the test set with Bernoulli naive Bayes; this submission scored 0.75837 on Kaggle.
test_pre_Blb = Blb.predict(test_features)
df1_test['predict_model_Blb'] = test_pre_Blb

In [15]:
# II. Models that do not need elaborate tuning are tuned together with GridSearchCV,
# each wrapped in a Pipeline: decision trees (criterion = 'entropy' / 'gini'),
# an SVM classifier and KNN.

# 1) Build the classifiers. The order must match the name and parameter-grid
#    lists defined in the following cells, because the three lists are zipped together.
list_classifiers = [
    DecisionTreeClassifier(random_state=10,criterion='entropy'),
    DecisionTreeClassifier(random_state=10,criterion='gini'),
    SVC(random_state=10),
    KNeighborsClassifier(),
]

In [16]:
# 2) Name each classifier. The name is used as the Pipeline step name, as the
#    prefix of the parameter-grid keys, and as the suffix of the prediction column.
list_classifier_names = [
    'ID3',
    'CART',
    'svc',
    'knn',
]

In [17]:
# 3) Parameter grid to search for each classifier. Keys follow the Pipeline
#    convention '<step name>__<parameter>', so each prefix must equal the
#    corresponding entry of list_classifier_names.
list_classifier_param_grid = [
    {'ID3__max_depth':range(3,11)},
    {'CART__max_depth': range(3,11)},
    {'svc__gamma':[0.001,0.01,0.02,0.03,0.05,0.1,0.15,0.2,0.3,1,10,100]},
    {'knn__n_neighbors':[4,6,8,10,15,20,25,30]},
]

In [18]:
# 4) Tune each classifier with a GridSearchCV grid search.
def GridSearchCV_Process(ppl,train_features,train_labels,test_features,param_grid,classifier_name,df1_test):
    """Grid-search a pipeline, report the best result and store its test predictions.

    Parameters
    ----------
    ppl : estimator
        The estimator (here a single-step Pipeline) handed to GridSearchCV.
    train_features, train_labels :
        Training matrix and labels used to fit the search.
    test_features :
        Matrix to predict on once the search has been fitted.
    param_grid : dict
        Parameter grid passed unchanged to GridSearchCV.
    classifier_name : str
        Label printed in the report and appended to 'predict_model_' to form
        the name of the prediction column.
    df1_test : pandas.DataFrame
        Modified in place: receives the new 'predict_model_<classifier_name>' column.

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can inspect best_estimator_,
        cv_results_ and so on. Earlier versions returned None; callers that
        ignore the return value are unaffected.
    """
    # scoring is not specified, so the estimator's own score method is used,
    # which is accuracy for these classifiers. cv=5 gives 5-fold cross-validation.
    grid_search = GridSearchCV(estimator=ppl, param_grid=param_grid, cv=5)
    grid_search.fit(train_features, train_labels)

    print('%s 模型如下：' % classifier_name)
    print('model1 最优参数：', grid_search.best_params_)
    print('model1 最优得分_5折交叉准确率为：', grid_search.best_score_)

    # predict() on the fitted search delegates to the estimator refitted with
    # the best parameters found.
    prediction_column = 'predict_model_' + classifier_name
    df1_test[prediction_column] = grid_search.predict(test_features)
    return grid_search

In [19]:
# 5) Run the grid search for every classifier and record its test-set predictions.
# The three lists are walked in parallel; each classifier becomes the only step
# of a Pipeline whose step name is the classifier's label.
for classifier, classifier_name, classifier_param_grid in zip(
    list_classifiers, list_classifier_names, list_classifier_param_grid
):
    ppl = Pipeline(steps=[(classifier_name, classifier)])
    GridSearchCV_Process(
        ppl=ppl,
        train_features=train_features,
        train_labels=train_labels,
        test_features=test_features,
        param_grid=classifier_param_grid,
        classifier_name=classifier_name,
        df1_test=df1_test,
    )

ID3 模型如下：
model1 最优参数： {'ID3__max_depth': 8}
model1 最优得分_5折交叉准确率为： 0.8148389931579938
CART 模型如下：
model1 最优参数： {'CART__max_depth': 6}
model1 最优得分_5折交叉准确率为： 0.813709120582512
svc 模型如下：
model1 最优参数： {'svc__gamma': 0.1}
model1 最优得分_5折交叉准确率为： 0.8249074132195091
knn 模型如下：
model1 最优参数： {'knn__n_neighbors': 20}
model1 最优得分_5折交叉准确率为： 0.8159249262444291


In [20]:
# 6）各模型结果显示如下
# ID3模型
# kaggle上得分，0.76076

In [21]:
# CART模型
# kaggle上得分，0.76794

In [22]:
# svc模型
# kaggle上得分，0.77990

In [23]:
# knn模型
# kaggle上得分，0.77272

In [24]:
# 三、需特别调参的分类器
# 1、Adaboost分类器
# 用网格搜索调参，构建模型；
# scoring = 默认，也即，scoring = 'accuracy'
# param1 = {'n_estimators':range(500,1001,100)}
# model1 = GridSearchCV(estimator=AdaBoostClassifier(random_state=10),\
#                       param_grid=param1,cv=5) # 5折交叉验证
# model1.fit(train_features,train_labels)
# print(model1.best_score_) # 得分为：0.8126169104262131
# print(model1.best_params_) # {'n_estimators':600}

In [25]:
# param2 = {'learning_rate':[0.00001,0.00005,0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.5,1,3,5,10]}
# model2 = GridSearchCV(estimator=AdaBoostClassifier(random_state=10,n_estimators=600),\
#                       param_grid=param2,cv=5) # 5折交叉验证
# model2.fit(train_features,train_labels)
# print(model2.best_score_) # 得分为：0.8126169104262131
# print(model2.best_params_) # {'learning_rate':1}

In [26]:
# Final AdaBoost model using the n_estimators / learning_rate values recorded in
# the commented-out grid searches above; this submission scored 0.75837 on Kaggle.
model3 = AdaBoostClassifier(n_estimators=600,random_state=10,learning_rate=1)
model3.fit(train_features,train_labels)
test_pre_AdaBoost = model3.predict(test_features)
df1_test['predict_model_AdaBoost'] = test_pre_AdaBoost

In [27]:
# 2、GradientBoostingClassifier（梯度提升分类器）
# 用网格搜索调参，构建模型；
# param1 = {'n_estimators':range(500,1001,100)}
# model1 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1,random_state=10),param_grid=param1,cv=5)
# model1.fit(train_features,train_labels)
# print(model1.best_score_) # 得分为：0.8271797125102003
# print(model1.best_params_) # {'n_estimators':500}

In [28]:
# param2 = {'max_depth':range(3,11),'min_samples_split':range(10,21,2)}
# model2 = GridSearchCV(estimator=GradientBoostingClassifier(random_state=10,learning_rate=0.1,n_estimators=500),param_grid=param2)
# model2.fit(train_features,train_labels)
# print(model2.best_score_) # 得分为：0.8305379448873266
# print(model2.best_params_) # {'max_depth':3, 'min_samples_split':14}

In [29]:
# Final gradient-boosting model using the values recorded in the commented-out
# grid searches above; this submission scored 0.75837 on Kaggle.
# NOTE: the name model3 is reused by several later cells and is overwritten each time.
model3 = GradientBoostingClassifier(n_estimators=500,random_state=10,learning_rate=0.1,max_depth=3,min_samples_split=14)
model3.fit(train_features,train_labels)
test_pre_GBDT = model3.predict(test_features)
df1_test['predict_model_GBDT'] = test_pre_GBDT

In [30]:
# 3、RandomForestClassifier_gini（随机森林分类器）
# 用网格搜索调参，构建模型；
# 指定划分子树的评估标准，criterion = 'gini'，默认值，计算基尼系数；
# param1 = {'n_estimators':range(500,1001,50)}
# model1 = GridSearchCV(estimator=RandomForestClassifier(random_state=10,oob_score=True),\
#                       param_grid=param1,cv=5)
# model1.fit(train_features,train_labels)
# print(model1.best_score_) # 得分为：0.8137153976523759
# print(model1.best_params_) # {'n_estimators':500}

In [31]:
# param2 = {'max_depth':range(3,11),'min_samples_split':range(10,21,2)}
# model2 = GridSearchCV(estimator=RandomForestClassifier(random_state=10,n_estimators=500,oob_score=True),param_grid=param2,cv=5)
# model2.fit(train_features,train_labels)
# print(model2.best_score_) # 得分为：0.8305504990270542
# print(model2.best_params_) # {'max_depth':10, 'min_samples_split':10}

In [32]:
# Final random forest with the default 'gini' criterion, using the values recorded
# in the commented-out grid searches above; this submission scored 0.78468 on Kaggle.
# oob_score=True is set, but the out-of-bag score is not read in this cell.
model3 = RandomForestClassifier(random_state=10,n_estimators=500,oob_score=True,min_samples_split=10,max_depth=10)
model3.fit(train_features,train_labels)
test_pre_RF_gini = model3.predict(test_features)
df1_test['predict_model_RF_gini'] = test_pre_RF_gini

In [33]:
# 4、RandomForestClassifier_entropy（随机森林分类器）
# 用网格搜索调参，构建模型；
# 指定划分子树的评估标准，criterion = 'entropy'，使用基于信息熵的方法；
# param1 = {'n_estimators':range(500,1001,100)}
# model1 = GridSearchCV(estimator=RandomForestClassifier(random_state=10,criterion='entropy'),param_grid=param1,cv=5)
# model1.fit(train_features,train_labels)
# print(model1.best_score_) # 得分为：0.815956311593748
# print(model1.best_params_) # {'n_estimators':700}

In [34]:
# param2 = {'max_depth':range(3,11),'min_samples_split':range(10,21,2)}
# model2 = GridSearchCV(estimator=RandomForestClassifier(random_state=10,criterion='entropy',n_estimators=700,oob_score=True),param_grid=param2,cv=5)
# model2.fit(train_features,train_labels)
# print(model2.best_score_) # 得分为：0.8271922666499278
# print(model2.best_params_) # {'max_depth': 9, 'min_samples_split': 12}

In [35]:
# Final random forest with criterion='entropy', using the values recorded in the
# commented-out grid searches above; this submission scored 0.77511 on Kaggle.
model3 = RandomForestClassifier(n_estimators=700,random_state=10,criterion='entropy',max_depth=9, min_samples_split=12)
model3.fit(train_features,train_labels)
test_pre_RF_entropy = model3.predict(test_features)
df1_test['predict_model_RF_entropy'] = test_pre_RF_entropy

In [36]:
# 4、BaggingClassifier
# 用网格搜索调参，构建模型；
# scoring = 默认，也即，scoring = 'accuracy'
# param1 = {'n_estimators':range(10,501,10)}
# model1 = GridSearchCV(estimator=BaggingClassifier(random_state=10),\
#                       param_grid=param1,cv=5)
# model1.fit(train_features,train_labels)
# print(model1.best_score_) # 0.8182097796748478
# print(model1.best_params_) # {'n_estimators': 320}

In [37]:
# param2 = {'max_samples':[0.5,0.8,1],'max_features':[0.5,1]}
# model2 = GridSearchCV(estimator=BaggingClassifier(random_state=10,n_estimators=320),\
#                       param_grid=param2,cv=5)
# model2.fit(train_features,train_labels)
# print(model2.best_score_) # 0.8249576297784194
# print(model2.best_params_) # {'max_features': 0.5, 'max_samples': 0.8}

In [38]:
# Final bagging model using the values recorded in the commented-out grid
# searches above; this submission scored 0.78229 on Kaggle.
model3 = BaggingClassifier(n_estimators=320,random_state=10,max_features=0.5, max_samples=0.8)
model3.fit(train_features,train_labels)
test_pre_Bagging = model3.predict(test_features)
df1_test['predict_model_Bagging'] = test_pre_Bagging

In [39]:
# 5、XGBoost分类器
# 用网格搜索调参，构建模型；
# scoring = 默认，也即，scoring = 'accuracy'
# param1 = {'scale_pos_weight':[1,2,3],'min_child_weight':[1,2,3],'max_delta_step':[1,2,3]}
# model1 = GridSearchCV(estimator=XGBClassifier(random_state=10),\
#                       param_grid=param1,cv=5)
# model1.fit(train_features,train_labels)
# print(model1.best_score_) # 得分为：0.8260561170045821
# print(model1.best_params_) # {'max_delta_step': 1, 'min_child_weight': 3, 'scale_pos_weight': 1}

In [40]:
# param2 = {'learning_rate':[0.00001,0.00005,0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.5,1,3,5,10]}
# model2 = GridSearchCV(estimator=XGBClassifier(random_state=10,scale_pos_weight=1,min_child_weight=3,max_delta_step=1),\
#                       param_grid=param2,cv=5)
# model2.fit(train_features,train_labels)
# print(model2.best_score_) # 得分为：0.8372857949908983
# print(model2.best_params_) # {'learning_rate':0.1}

In [41]:
# param3 = {'n_estimators':range(1,1002,50)}
# model3 = GridSearchCV(estimator=XGBClassifier(random_state=10,scale_pos_weight=1,min_child_weight=3,max_delta_step=1,learning_rate=0.1),param_grid=param3,cv=5)
# model3.fit(train_features,train_labels)
# print(model3.best_score_) # 得分为：0.8372857949908983
# print(model3.best_params_) # {'n_estimators':101}

In [42]:
# param4 = {'max_depth':range(3,11)}
# model4 = GridSearchCV(estimator=XGBClassifier(random_state=10,scale_pos_weight=1,min_child_weight=3,max_delta_step=1,learning_rate=0.1,n_estimators=101),param_grid=param4,cv=5)
# model4.fit(train_features,train_labels)
# print(model4.best_score_) # 得分为：0.8395267089322704
# print(model4.best_params_) # {'max_depth':7}

In [43]:
# Final XGBoost model using the values recorded in the commented-out grid
# searches above; this submission scored 0.76315 on Kaggle.
xgb_final_params = dict(
    random_state=10,
    scale_pos_weight=1,
    min_child_weight=3,
    max_delta_step=1,
    learning_rate=0.1,
    n_estimators=101,
    max_depth=7,
)
model5 = XGBClassifier(**xgb_final_params)
model5.fit(train_features, train_labels)
test_pre_xgb = model5.predict(test_features)
df1_test['predict_model_XGBClassifier'] = test_pre_xgb

In [44]:
# Preview the test table, which now also holds one prediction column per model.
print(df1_test.head(3))

   PassengerId  Pclass                              Name     Sex   Age  SibSp  \
0          892       3                  Kelly, Mr. James    male  34.5      0   
1          893       3  Wilkes, Mrs. James (Ellen Needs)  female  47.0      1   
2          894       2         Myles, Mr. Thomas Francis    male  62.0      0   

   Parch  Ticket    Fare Cabin  ... predict_model_ID3  predict_model_CART  \
0      0  330911  7.8292   NaN  ...                 0                   0   
1      0  363272  7.0000   NaN  ...                 0                   0   
2      0  240276  9.6875   NaN  ...                 0                   0   

   predict_model_svc  predict_model_knn  predict_model_AdaBoost  \
0                  0                  0                       0   
1                  0                  1                       0   
2                  0                  0                       0   

   predict_model_GBDT  predict_model_RF_gini  predict_model_RF_entropy  \
0                   0  

In [45]:
# IV. Collect the predictions and prepare the output.
test_id = df1_test['PassengerId']
# Select the prediction columns by their 'predict_model_' name prefix instead of
# by position (iloc[:, 11:]), so the selection does not depend on how many
# columns the raw test table has or on where the predictions were appended.
df_test2 = df1_test.filter(regex='^predict_model_')
# info() prints its own report and returns None, so it is not wrapped in print().
df_test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   predict_model_Gnb            418 non-null    int64
 1   predict_model_Blb            418 non-null    int64
 2   predict_model_ID3            418 non-null    int64
 3   predict_model_CART           418 non-null    int64
 4   predict_model_svc            418 non-null    int64
 5   predict_model_knn            418 non-null    int64
 6   predict_model_AdaBoost       418 non-null    int64
 7   predict_model_GBDT           418 non-null    int64
 8   predict_model_RF_gini        418 non-null    int64
 9   predict_model_RF_entropy     418 non-null    int64
 10  predict_model_Bagging        418 non-null    int64
 11  predict_model_XGBClassifier  418 non-null    int64
dtypes: int64(12)
memory usage: 39.3 KB
None


In [46]:
# Write one Kaggle submission file per model: PassengerId plus that model's
# predictions as the Survived column. The file name embeds the prediction
# column's name.
df_columns = df_test2.columns
for prediction_column, predictions in df_test2.items():
    df_test3 = pd.DataFrame({'PassengerId': test_id, 'Survived': predictions})
    output_path = 'kaggel_titanic_predict_%s.csv' % prediction_column
    df_test3.to_csv(output_path, index=False)

In [47]:
# 综上，小结
# 由于本次项目中，kaggle上评估标准为 accuracy，Accuracy =(TP+TN)/(TP+TN+FP+FN)；
# 即，得分越高，准确率越高；
# 得分为1.0，准确率100%，说明该模型预测结果与实际结果全部相符；
# 即，本项目最佳模型为，RandomForest，criterion='gini'时；
# Kaggle上最终得分，0.78468；
# 即，约 78%的结果，均被预测正确，模型较适合；