In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV #网格搜索超参数
import sklearn.ensemble as ensemble

In [2]:
train_data = pd.read_csv("E:\\data\\titanic\\train.csv")
test_data = pd.read_csv("E:\\data\\titanic\\test.csv")

In [3]:
print(train_data.shape)
print(test_data.shape)

(891, 12)
(418, 11)


In [4]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
all_features = pd.concat((train_data.iloc[:, 2:], test_data.iloc[:, 1:]))
print(all_features.shape)

(1309, 10)


In [7]:
all_features.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
#删除缺失值过多的列和观察判断和预测的y没有关系的列
all_features = all_features.drop(['Cabin','Name',"Ticket"], axis = 1) 

In [9]:
all_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Sex       1309 non-null   object 
 2   Age       1046 non-null   float64
 3   SibSp     1309 non-null   int64  
 4   Parch     1309 non-null   int64  
 5   Fare      1308 non-null   float64
 6   Embarked  1307 non-null   object 
dtypes: float64(2), int64(3), object(2)
memory usage: 81.8+ KB


In [10]:
all_features['Sex'] = (all_features['Sex'].map({'male':1,'female':0}))

In [11]:
all_features.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,S
1,1,0,38.0,1,0,71.2833,C
2,3,0,26.0,0,0,7.925,S
3,1,0,35.0,1,0,53.1,S
4,3,1,35.0,0,0,8.05,S


In [12]:
all_features['Age'] = all_features['Age'].fillna(all_features['Age'].mean())

In [13]:
all_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Sex       1309 non-null   int64  
 2   Age       1309 non-null   float64
 3   SibSp     1309 non-null   int64  
 4   Parch     1309 non-null   int64  
 5   Fare      1308 non-null   float64
 6   Embarked  1307 non-null   object 
dtypes: float64(2), int64(4), object(1)
memory usage: 81.8+ KB


In [14]:
all_features['Embarked'] = (all_features['Embarked'].map({'S':0,'C':1,'Q':2}))

In [15]:
all_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Sex       1309 non-null   int64  
 2   Age       1309 non-null   float64
 3   SibSp     1309 non-null   int64  
 4   Parch     1309 non-null   int64  
 5   Fare      1308 non-null   float64
 6   Embarked  1307 non-null   float64
dtypes: float64(3), int64(4)
memory usage: 81.8 KB


In [16]:
all_features['Embarked'] = all_features['Embarked'].fillna(all_features['Embarked'].mode()[0])

In [17]:
all_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Sex       1309 non-null   int64  
 2   Age       1309 non-null   float64
 3   SibSp     1309 non-null   int64  
 4   Parch     1309 non-null   int64  
 5   Fare      1308 non-null   float64
 6   Embarked  1309 non-null   float64
dtypes: float64(3), int64(4)
memory usage: 81.8 KB


In [18]:
all_features['Embarked'] = all_features['Embarked'].astype('int')

In [19]:
all_features.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,0
1,1,0,38.0,1,0,71.2833,1
2,3,0,26.0,0,0,7.925,0
3,1,0,35.0,1,0,53.1,0
4,3,1,35.0,0,0,8.05,0


In [20]:
all_features['Fare'] = all_features['Fare'].fillna(all_features['Fare'].mean())

In [21]:
all_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Sex       1309 non-null   int64  
 2   Age       1309 non-null   float64
 3   SibSp     1309 non-null   int64  
 4   Parch     1309 non-null   int64  
 5   Fare      1309 non-null   float64
 6   Embarked  1309 non-null   int32  
dtypes: float64(2), int32(1), int64(4)
memory usage: 76.7 KB


In [22]:
n_train = train_data.shape[0]
print(n_train)
train_features = all_features[:n_train]
test_features = all_features[n_train:]

891


In [23]:
train_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    int64  
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    int32  
dtypes: float64(2), int32(1), int64(4)
memory usage: 52.2 KB


In [24]:
test_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    int64  
 2   Age       418 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      418 non-null    float64
 6   Embarked  418 non-null    int32  
dtypes: float64(2), int32(1), int64(4)
memory usage: 24.5 KB


In [25]:
train_labels = train_data.Survived
train_labels

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [47]:
X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size= 0.2, random_state = 10)

In [48]:
# 使用交叉网格搜索，边训练边优化
param_grid = {'criterion': ['entropy', 'gini'],
              'n_estimators' : [6, 9, 12,15, 18],  #决策树个数
             'max_depth': [3, 6, 9, 12],
              'max_features': ['log2', 'sqrt', 'auto'],
             'min_samples_split': [2, 4, 6, 8],
              'min_samples_leaf': [1, 3, 5 ,7]
             
             }

In [49]:
rfc = ensemble.RandomForestClassifier() #定义一颗树对象
rfc_cv = GridSearchCV(estimator = rfc, param_grid = param_grid, scoring = 'roc_auc', cv =5)

### 训练

In [50]:
rfc_cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [3, 6, 9, 12],
                         'max_features': ['log2', 'sqrt', 'auto'],
                         'min_samples_leaf': [1, 3, 5, 7],
                         'min_samples_split': [2, 4, 6, 8],
                         'n_estimators': [6, 9, 12, 15, 18]},
             scoring='roc_auc')

In [51]:
test_result = rfc_cv.predict(X_test)

In [52]:
print('随机森林准确度：')
print(metrics.classification_report(y_test, test_result))

随机森林准确度：
              precision    recall  f1-score   support

           0       0.86      0.92      0.89       117
           1       0.83      0.71      0.77        62

    accuracy                           0.85       179
   macro avg       0.84      0.82      0.83       179
weighted avg       0.85      0.85      0.85       179



In [53]:
fpr_test, tpr_test, th_test = metrics.roc_curve(test_result, y_test)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))

AUC = 0.8437


In [54]:
rfc_cv.best_params_

{'criterion': 'entropy',
 'max_depth': 6,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 15}

### 预测

In [55]:
clf1 = ensemble.RandomForestClassifier(criterion = 'entropy', n_estimators = 15, 
           max_depth=6, max_features='log2', min_samples_split=4, min_samples_leaf=1)
clf1.fit(X_train, y_train)
y_predict = clf1.predict(test_features)

### 生成文件

In [56]:
# test_data['Survived'] = pd.Series(y_predict.reshape(1, -1)[0])
# submit = pd.concat([pd.Series(range(892,1310)),pd.Series(y_predict)], axis=1)
# submission = pd.concat([test_data['PassengerId'], test_data['Survived']], axis=1)
submission = pd.DataFrame({"PassengerID":test_data.PassengerId, "Survived":y_predict})

In [57]:
# submit.columns = ['PassengerId', 'Survived']
# submit = submit.set_index('PassengerId')
submission.to_csv('E:\\data\\titanic\\随机森林.csv', index = False)

In [58]:
submission

Unnamed: 0,PassengerID,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
