In [1]:
import numpy as np 
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Preprocessing

<h3>Train.csv</h3>

In [2]:
# Load the training file; the comma separator is spelled out explicitly.
my_data = pd.read_csv("database/train.csv", sep=",")
# Preview the first five rows.
my_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Row counts for each Embarked value, reported in the order (S, C, Q).
tuple(int((my_data['Embarked'] == port).sum()) for port in ('S', 'C', 'Q'))

(644, 168, 77)

In [4]:
# Row counts for each Sex value, reported in the order (female, male).
tuple(int(my_data['Sex'].eq(label).sum()) for label in ('female', 'male'))

(314, 577)

In [5]:
# Row counts for each Pclass value, reported in the order (1, 2, 3).
tuple(int((my_data['Pclass'] == cls).sum()) for cls in (1, 2, 3))

(216, 184, 491)

In [6]:
# Fill the gaps in one pass: a missing Embarked becomes 'S' (the most frequent
# value in the counts above) and a missing Age becomes the column median.
my_data = my_data.fillna({'Embarked': 'S', 'Age': my_data['Age'].median()})

In [7]:
# Select the columns that may have played a role in survival.
feature_columns = ['Pclass', 'Sex', 'Age', 'Embarked']
X = my_data.loc[:, feature_columns].values
# Preview the first five feature rows.
X[:5]

array([[3, 'male', 22.0, 'S'],
       [1, 'female', 38.0, 'C'],
       [3, 'female', 26.0, 'S'],
       [1, 'female', 35.0, 'S'],
       [3, 'male', 35.0, 'S']], dtype=object)

In [8]:
# Encode the Sex column (index 1 of X) as integers. The classes are listed
# explicitly, which gives the mapping female -> 0, male -> 1.
le_sex = preprocessing.LabelEncoder().fit(["female", "male"])
X[:, 1] = le_sex.transform(X[:, 1])

In [9]:
# Encode the Embarked column (index 3 of X) as integers. The classes are
# listed explicitly, which gives the mapping C -> 0, Q -> 1, S -> 2.
le_emb = preprocessing.LabelEncoder().fit(['C', 'Q', 'S'])
X[:, 3] = le_emb.transform(X[:, 3])

In [10]:
# First five rows of X after both categorical columns were encoded.
X[:5]

array([[3, 1, 22.0, 2],
       [1, 0, 38.0, 0],
       [3, 0, 26.0, 2],
       [1, 0, 35.0, 2],
       [3, 1, 35.0, 2]], dtype=object)

In [11]:
# Target vector: the Survived column of the training frame.
y = my_data['Survived']
# Preview the first five labels.
y.head(5)

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [12]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [13]:
# Shapes of the training features and the training labels, in that order.
tuple(part.shape for part in (X_train, y_train))

((712, 4), (712,))

<h3>Test.csv</h3>

In [14]:
# Load the file to be scored; the comma separator is spelled out explicitly.
test_data = pd.read_csv("database/test.csv", sep=",")
# Preview the first five rows.
test_data.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [15]:
# Impute test.csv with the same values that were used for the training frame,
# so the model sees data prepared the way it was trained.
# A missing Embarked becomes 'S', exactly as in the training frame.
test_data['Embarked'] = test_data['Embarked'].fillna('S')
# A missing Age gets the *training* median rather than one recomputed from
# test.csv. my_data['Age'] was already filled with its own median, which
# leaves that median unchanged, so this is the same statistic used in training.
test_data['Age'] = test_data['Age'].fillna(my_data['Age'].median())

In [16]:
# Build the feature matrix for test.csv from the same four columns, in the
# same order, as the training matrix.
test_feature_columns = ['Pclass', 'Sex', 'Age', 'Embarked']
test_X = test_data.loc[:, test_feature_columns].values
# Preview the first five feature rows.
test_X[:5]

array([[3, 'male', 34.5, 'Q'],
       [3, 'female', 47.0, 'S'],
       [2, 'male', 62.0, 'Q'],
       [3, 'male', 27.0, 'S'],
       [3, 'female', 22.0, 'S']], dtype=object)

In [17]:
# Encode test.csv with the encoders already fitted for the training matrix
# (le_sex, le_emb) instead of fitting a second pair. Both matrices then share
# a single category -> integer mapping.
test_X[:, 1] = le_sex.transform(test_X[:, 1])  # Sex column
test_X[:, 3] = le_emb.transform(test_X[:, 3])  # Embarked column

# Aliases for the names this cell used to define, kept for later cells.
test_sex = le_sex
test_emb = le_emb

<h3>Decision Tree</h3>

In [18]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
# Fit an entropy-based decision tree limited to depth 4 on the training split.
# random_state is pinned (same seed as the train/test split) because the tree
# permutes features when searching for splits, so equally good candidate
# splits could otherwise be resolved differently from one run to the next.
trainedTree = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=4,
    random_state=3,
)
trainedTree.fit(X_train, y_train)
# Predictions for the held-out validation rows.
predTree = trainedTree.predict(X_test)

In [20]:
# Show the first 20 predictions next to the first 20 true labels.
print(predTree[:20])
print(y_test.head(20))

[0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1]
395    0
85     1
201    0
542    0
702    0
51     0
237    1
548    0
527    0
157    0
144    0
48     0
887    1
239    0
270    0
575    0
627    1
164    0
525    0
498    0
Name: Survived, dtype: int64


In [21]:
from sklearn import metrics
import matplotlib.pyplot as plt
# Fraction of validation rows whose predicted label equals the true label.
tree_accuracy = metrics.accuracy_score(y_test, predTree)
print("DecisionTrees's Accuracy: ", tree_accuracy)

DecisionTrees's Accuracy:  0.8212290502793296


In [22]:
# Predict a label for every row of the encoded test.csv feature matrix.
testTree = trainedTree.predict(test_X)
# Echo the full prediction array as the cell output.
testTree

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,