In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

train = train.drop(['Name','SibSp','Parch','Ticket','Fare','Cabin','Embarked'],axis=1)
test = test.drop(['Name','SibSp','Parch','Ticket','Fare','Cabin','Embarked'],axis=1)

In [3]:
# Look the first three row of the data training
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
0,1,0,3,male,22.0
1,2,1,1,female,38.0
2,3,1,3,female,26.0


In [4]:
# Convet ['male','female'] to [0, 1]
for df in [train,test]:
    df['Sex_bin'] = df['Sex'].map({'male':0, 'female':1})
    
# Fill missing age values with 0
train['Age'] = train['Age'].fillna(0)
test['Age'] = test['Age'].fillna(0)

# Select feature column names and target variable using for training
features = ['Pclass','Age','Sex_bin']
target = 'Survived'

train[features].head(3)

Unnamed: 0,Pclass,Age,Sex_bin
0,3,22.0,0
1,1,38.0,1
2,3,26.0,1


In [5]:
train[target].head(3).values

array([0, 1, 1], dtype=int64)

In [6]:
# Create the classifier object with the default hyperparamters and fit our clf_obj using the training features and training target values
clf_obj = DecisionTreeClassifier()
clf_obj.fit(train[features], train[target])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [7]:
# Create decision tree visualization
export_graphviz(clf_obj, out_file='titanic_decisionTree.dot', feature_names=features, rounded=True, filled=True,
               class_names=['Survived', 'Did not survived'])

In [8]:
# Make predictions using the features
predictions = clf_obj.predict(test[features])
predictions

array([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [9]:

# Create a dataframe with the passengers id and the prediction if they survived or not
result = pd.DataFrame({'PassengerId':test['PassengerId'], 'Survived':predictions})
result.head(3)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1


In [10]:
# Convert the dataframe to a .csv file to submit to kaggle
result.to_csv('titanic-decision-tree-prediction.csv', index=False)