In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
from sklearn import tree
import graphviz

In [5]:
data = pd.read_csv("E:\\data\\titanic\\train.csv")

In [4]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
data.shape

(891, 12)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
data = data.set_index("PassengerId")

In [8]:
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### 数据预处理

In [9]:
#删除缺失值过多的列和观察判断和预测的y没有关系的列
data = data.drop(['Cabin','Name',"Ticket"], axis = 1) 

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 62.6+ KB


In [11]:
#将二分类变量转换为数值型变量
data['Sex'] = (data['Sex'] == 'male').astype(int)

In [12]:
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,1,22.0,1,0,7.25,S
2,1,1,0,38.0,1,0,71.2833,C
3,1,3,0,26.0,0,0,7.925,S
4,1,1,0,35.0,1,0,53.1,S
5,0,3,1,35.0,0,0,8.05,S


In [13]:
data['Embarked'].value_counts().sum()

889

In [14]:
data['Embarked'].isnull().sum()

2

In [11]:
data = data[data['Embarked'].notnull()] #删除缺失值

In [12]:
data['Embarked'].isnull().sum()

0

In [13]:
labels = data['Embarked'].unique().tolist()

In [14]:
labels

['S', 'C', 'Q']

In [19]:
#转为数字 0 1 2
data['Embarked'] = data['Embarked'].apply(lambda x: labels.index(x))

In [20]:
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,1,22.0,1,0,7.25,0
2,1,1,0,38.0,1,0,71.2833,1
3,1,3,0,26.0,0,0,7.925,0
4,1,1,0,35.0,1,0,53.1,0
5,0,3,1,35.0,0,0,8.05,0


#### 切分训练集和测试集

In [21]:
X = data.iloc[:, 1:] #数据
y = data.iloc[:, 0]  #标签

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state = 420)

In [23]:
X_train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
699,1,1,49.0,1,1,110.8833,1
67,2,0,29.0,0,0,10.5,0
771,3,1,24.0,0,0,9.5,0
159,3,1,,0,0,8.6625,0
139,3,1,16.0,0,0,9.2167,0


In [24]:
# 修正索引
for i in [X_train, X_test, y_train, y_test]:
    i.index = range(i.shape[0])

In [25]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,1,49.0,1,1,110.8833,1
1,2,0,29.0,0,0,10.5000,0
2,3,1,24.0,0,0,9.5000,0
3,3,1,,0,0,8.6625,0
4,3,1,16.0,0,0,9.2167,0
...,...,...,...,...,...,...,...
617,3,1,26.0,0,0,7.8958,0
618,2,1,34.0,0,0,13.0000,0
619,2,0,34.0,0,0,13.0000,0
620,3,1,21.0,0,0,7.7958,0


In [26]:
#缺失值检查
X_train.isnull().sum()

Pclass        0
Sex           0
Age         120
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [27]:
X_test.isnull().sum()

Pclass       0
Sex          0
Age         57
SibSp        0
Parch        0
Fare         0
Embarked     0
dtype: int64

In [28]:
#处理缺失值 对含有缺失值的列进行填补
X_train.loc[:, 'Age'] = X_train['Age'].fillna(X_train['Age'].mean())
X_test.loc[:, 'Age'] = X_test['Age'].fillna(X_train['Age'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [29]:
X_train.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [103]:
X_test.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [34]:
clf = DTC(random_state = 1)
clf = clf.fit(X_train, y_train)
score = clf.score(X_test,y_test)

In [35]:
score

0.7565543071161048

### 调参

In [36]:
# 使用交叉网格搜索，边训练边优化
param_grid = {'criterion':['entropy', 'gini'],
             'max_depth': [2, 3, 4, 5, 6, 7, 8],
             'min_samples_split':[4, 8, 12, 16, 20, 24, 28]}

In [37]:
clf1 = tree.DecisionTreeClassifier() #定义一颗树对象
clfcv = 




In [38]:
clfcv.fit(X_train, y_train)

GridSearchCV(cv=4, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8],
                         'min_samples_split': [4, 8, 12, 16, 20, 24, 28]},
             scoring='roc_auc')

In [55]:
test_result = clfcv.predict(X_test)

In [56]:

print('决策树准确度：')
print(metrics.classification_report(y_test,test_result))

决策树准确度：
              precision    recall  f1-score   support

           0       0.81      0.87      0.84       165
           1       0.77      0.68      0.72       102

    accuracy                           0.80       267
   macro avg       0.79      0.77      0.78       267
weighted avg       0.80      0.80      0.79       267



In [111]:
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, test_result)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))

AUC = 0.7746


In [112]:
clfcv.best_params_

{'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 12}

#### 把最优参数带入 画图

In [113]:
clf2 = tree.DecisionTreeClassifier(criterion='gini', max_depth=4, min_samples_split=12)
clf2.fit(X_train, y_train)
test_result2 = clf2.predict(X_test)

#### 读取测试集

In [114]:
test_data = pd.read_csv("E:\\data\\titanic\\test.csv")

In [115]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [116]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [117]:
test_data = test_data.set_index('PassengerId')

In [118]:
test_data = test_data.drop(['Cabin','Name',"Ticket"], axis = 1) 

In [119]:
test_data

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,male,34.5,0,0,7.8292,Q
893,3,female,47.0,1,0,7.0000,S
894,2,male,62.0,0,0,9.6875,Q
895,3,male,27.0,0,0,8.6625,S
896,3,female,22.0,1,1,12.2875,S
...,...,...,...,...,...,...,...
1305,3,male,,0,0,8.0500,S
1306,1,female,39.0,0,0,108.9000,C
1307,3,male,38.5,0,0,7.2500,S
1308,3,male,,0,0,8.0500,S


In [120]:
#one-hot处理
test_data['Sex'] = test_data['Sex'].map({'male': 1, 'female': 0})

In [121]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    int64  
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(4), object(1)
memory usage: 26.1+ KB


In [122]:
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())

In [123]:
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())

In [125]:
#转为数字 0 1 2
test_data['Embarked'] = test_data['Embarked'].apply(lambda x: labels.index(x))

In [126]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    int64  
 2   Age       418 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      418 non-null    float64
 6   Embarked  418 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 26.1 KB


In [127]:
y_predict = clf2.predict(test_data)

#### 根据模板生成提交文件

In [134]:
submit = pd.concat([pd.Series(range(892,1310)),pd.Series(y_predict)], axis=1)


In [None]:
submit.columns = ['PassengerId', 'Survived']
submit = submit.set_index('PassengerId')


In [130]:
submit.to_csv('E:\\data\\titanic\\my_submition.csv', index=False)

In [131]:
#删除缺失值过多的列和观察判断和预测的y没有关系的列
