In [153]:
# Imports
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [154]:
# Read files
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

In [155]:
# Drop insignificant columns/datas
train = train.drop(['Name', 'Cabin', 'Ticket'], axis=1)
test = test.drop(['Name', 'Cabin', 'Ticket'], axis=1)

In [156]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [157]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [158]:
# Turn the string data into numbers
new_dtrain = pd.get_dummies(train)
new_dtrain.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,0,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.0,0,0,7.925,1,0,0,0,1
3,4,1,1,35.0,1,0,53.1,1,0,0,0,1
4,5,0,3,35.0,0,0,8.05,0,1,0,0,1


In [159]:
new_dtest = pd.get_dummies(test)
new_dtest.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,7.8292,0,1,0,1,0
1,893,3,47.0,1,0,7.0,1,0,0,0,1
2,894,2,62.0,0,0,9.6875,0,1,0,1,0
3,895,3,27.0,0,0,8.6625,0,1,0,0,1
4,896,3,22.0,1,1,12.2875,1,0,0,0,1


In [160]:
# Correlation Matrix
new_dtrain.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658,-0.042939,0.042939,-0.001205,-0.033606,0.022148
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307,0.543351,-0.543351,0.16824,0.00365,-0.15566
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495,-0.1319,0.1319,-0.243292,0.221009,0.08172
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067,-0.093254,0.093254,0.036261,-0.022405,-0.032523
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651,0.114631,-0.114631,-0.059528,-0.026354,0.070941
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225,0.245489,-0.245489,-0.011069,-0.081228,0.063036
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0,0.182333,-0.182333,0.269335,-0.117216,-0.166603
Sex_female,-0.042939,0.543351,-0.1319,-0.093254,0.114631,0.245489,0.182333,1.0,-1.0,0.082853,0.074115,-0.125722
Sex_male,0.042939,-0.543351,0.1319,0.093254,-0.114631,-0.245489,-0.182333,-1.0,1.0,-0.082853,-0.074115,0.125722
Embarked_C,-0.001205,0.16824,-0.243292,0.036261,-0.059528,-0.011069,0.269335,0.082853,-0.082853,1.0,-0.148258,-0.778359


In [161]:
# Number of train missing data from training data by media
new_dtrain.isnull().sum().sort_values(ascending=False).head(11)

Age           177
Embarked_S      0
Embarked_Q      0
Embarked_C      0
Sex_male        0
Sex_female      0
Fare            0
Parch           0
SibSp           0
Pclass          0
Survived        0
dtype: int64

In [162]:
# Replacement of missing data from training data by media
new_dtrain['Age'].fillna(new_dtrain['Age'].mean(), inplace=True)
new_dtest['Age'].fillna(new_dtest['Age'].mean(), inplace=True)

In [163]:
# Verification of missing data
new_dtrain.isnull().sum().sort_values(ascending=False).head(11)

Embarked_S    0
Embarked_Q    0
Embarked_C    0
Sex_male      0
Sex_female    0
Fare          0
Parch         0
SibSp         0
Age           0
Pclass        0
Survived      0
dtype: int64

In [164]:
# Number of train missing data from testing data by media
new_dtest.isnull().sum().sort_values(ascending=False).head(11)

Fare           1
Embarked_S     0
Embarked_Q     0
Embarked_C     0
Sex_male       0
Sex_female     0
Parch          0
SibSp          0
Age            0
Pclass         0
PassengerId    0
dtype: int64

In [165]:
# Replacement of missing data from testing data by media
new_dtest['Fare'].fillna(new_dtest['Fare'].mean(), inplace=True)

In [166]:
# Verification of missing data
new_dtest.isnull().sum().sort_values(ascending=False).head(11)

Embarked_S     0
Embarked_Q     0
Embarked_C     0
Sex_male       0
Sex_female     0
Fare           0
Parch          0
SibSp          0
Age            0
Pclass         0
PassengerId    0
dtype: int64

In [167]:
# Number of people who have not paid a fare.
# Substitutes for the average, despite the possibility of being correct (the person does not really pay the fare)
(new_dtrain.Fare == 0).sum(), (new_dtest.Fare == 0).sum()

(15, 2)

In [168]:
# Replacing the fare by the average
new_dtrain.Fare = new_dtrain.Fare.replace(0, np.NaN)
new_dtrain.Fare.fillna(new_dtrain.Fare.mean(),inplace=True)
new_dtest.Fare = new_dtrain.Fare.replace(0, np.NaN)
new_dtest.Fare.fillna(new_dtest.Fare.mean(),inplace=True)

In [169]:
# Checking the fare
(new_dtrain.Fare == 0).sum(), (new_dtest.Fare == 0).sum()

(0, 0)

In [170]:
x = new_dtrain.drop(['Survived'], axis=1) # Features
y = new_dtrain['Survived'] # Target
tree = DecisionTreeClassifier(max_depth=3, random_state=0) # ML Algorithm - Decision Tree
tree.fit(x, y) # Training the ML model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [171]:
tree.score(x, y) # Accuracy

0.8271604938271605

In [172]:
# Submission Data
submission = pd.DataFrame()
submission['PassengerId'] = new_dtest['PassengerId']
submission['Survived'] = tree.predict(new_dtest)

In [173]:
# Accuracy of Submission
g_sub = pd.read_csv('Data/gender_submission.csv')
g_sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [174]:
# Total = 418 rows
submission.PassengerId.count(), g_sub.PassengerId.count()

(418, 418)

In [175]:
# Total = 418 rows
counter = 0 # Correct Predictions
for index, row in submission.iterrows() :
    if (submission.iloc[index]['Survived'] == g_sub.iloc[index]['Survived']):
        counter += 1
print(counter/submission.shape[0]) # Percentage of correct answers

0.937799043062201


In [176]:
submission.to_csv('submission.csv', index=False)