# Titanic Decision Tree Classifier

## Import

In [2]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier

## Load Data

In [4]:
# Read the Titanic training and test splits from the local data directory.
trainDataSet, testDataSet = (
    pd.read_csv(csvPath)
    for csvPath in ('data/titanic/train.csv', 'data/titanic/test.csv')
)

## Data Exploration

In [6]:
# Quick look at the training data: schema, summary statistics and sample rows.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
trainDataSet.info()
print('-'*30)
print(trainDataSet.describe())  # summary statistics of the numeric columns
print('-'*30)
print(trainDataSet.describe(include=['O']))  # summary of the object (string) columns
print('-'*30)
print(trainDataSet.head())
print('-'*30)
print(trainDataSet.tail())
print('-'*30)
print(trainDataSet['Embarked'].value_counts())  # distinct 'Embarked' values and how often each occurs

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
------------------------------
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008  

## Data Cleaning

In [8]:
# Impute missing values in both data sets.
# The filled column is assigned back explicitly instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary column
# selection (chained assignment), which warns on recent pandas versions and
# has no effect on the DataFrame once copy-on-write is enabled.
for dataSet in (trainDataSet, testDataSet):
    # Fill missing ages with the mean age of the same data set.
    dataSet['Age'] = dataSet['Age'].fillna(dataSet['Age'].mean())

    # Fill missing fares with the mean fare of the same data set.
    dataSet['Fare'] = dataSet['Fare'].fillna(dataSet['Fare'].mean())

    # Fill missing embarkation ports with 'S', the most frequent port.
    dataSet['Embarked'] = dataSet['Embarked'].fillna('S')

## Feature Selection

In [9]:
# Columns used as model inputs; 'Survived' is the prediction target.
featureList = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X_train = trainDataSet[featureList]
y_train = trainDataSet['Survived']
X_test = testDataSet[featureList]

# DictVectorizer turns each row dict into a numeric vector, one-hot encoding
# the string-valued features. It is fitted on the training rows only and then
# reused for the test rows so both matrices share the same column layout.
vectorTransformer = DictVectorizer(sparse=False)
# orient='records' (one dict per row) must be spelled out in full: the short
# form 'record' was deprecated and is rejected by pandas 2.0 and later.
X_train = vectorTransformer.fit_transform(X_train.to_dict(orient='records'))
X_test = vectorTransformer.transform(X_test.to_dict(orient='records'))



## Tree Construction and Train

In [10]:
# Build a decision tree that splits on information gain (criterion='entropy').
# Note: scikit-learn's trees are CART-based; the entropy criterion gives
# ID3-like split selection rather than a literal ID3 implementation.
# random_state is fixed so that tie-breaking between equally good splits, and
# therefore the fitted tree and its predictions, are reproducible across runs.
id3DtClf = DecisionTreeClassifier(criterion='entropy', random_state=0)
id3DtClf.fit(X_train, y_train)  # train the tree on the vectorized training data

DecisionTreeClassifier(criterion='entropy')

## Performance Evaluation and Prediction

In [19]:
# Mean accuracy of the tree on (X_train, y_train), i.e. the same data it was
# fitted on, rounded to six decimals. This is a training score, not a
# held-out estimate.
train_accuracy = id3DtClf.score(X_train, y_train)
acc_decision_tree = round(train_accuracy, 6)
print('score accuracy rate is: ', acc_decision_tree)

# Predict the survival label for every row of the test set.
y_test = id3DtClf.predict(X_test)
print('test label set is: ', y_test)

score accuracy rate is:  0.982043
test label set is:  [0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 1 1 1 1 1 0 1 1 1 0 0 0 1 0 1 0 0
 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 1 0 0 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 1 0 0
 0 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0
 1 1 1 0 0 0 0 0 0 1 1 0 1 0 1 1 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 1 1 0 0 1 1 1
 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 1 0 1 1 1 0 0 1 0 1 0 0 1 0 1 0 0 1 0
 0 0 1 0 1 0 0 1 0 0 0]
