In [456]:
# import dataset

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

data = pd.concat([train_data, test_data], ignore_index = True, sort = False)

In [457]:
# check missing values

data.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [458]:
# data imputation

# Embarked
data.Embarked.fillna(data.Embarked.mode()[0], inplace = True)
train_data.Embarked.fillna(data.Embarked.mode()[0], inplace = True)

# Cabin
data.Cabin = data.Cabin.fillna('NA')
train_data.Cabin = data.Cabin.fillna('NA')

# Age
import numpy as np
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values = np.nan, strategy = 'mean')
data['Age'] = imp.fit_transform(data[['Age']])
train_data['Age'] = imp.fit_transform(train_data[['Age']])
test_data['Age'] = imp.fit_transform(test_data[['Age']])

# Fare
test_data['Fare'] = imp.fit_transform(test_data[['Fare']])

In [459]:
# check number of unique values each column

data.nunique()

PassengerId    1309
Survived          2
Pclass            3
Name           1307
Sex               2
Age              99
SibSp             7
Parch             8
Ticket          929
Fare            281
Cabin           187
Embarked          3
dtype: int64

In [460]:
# check number of survived and unsurvived passengers

print("Count of survived Passengers: ", data['Survived'][data['Survived'] == 1].count())
print("Count of unsurvived Passengers: ", data['Survived'][data['Survived'] == 0].count())

Count of survived Passengers:  342
Count of unsurvived Passengers:  549


In [461]:
#check percentage of survived Passenger of each Pclass

print("Class 1:")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Pclass'] == 1)].count() / data['Pclass'][data['Pclass'] == 1].count())
print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Pclass'] == 1)].count() / data['Pclass'][data['Pclass'] == 1].count())

print("\nClass 2:")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Pclass'] == 2)].count() / data['Pclass'][data['Pclass'] == 2].count())
print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Pclass'] == 2)].count() / data['Pclass'][data['Pclass'] == 2].count())

print("\nClass 3:")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Pclass'] == 3)].count() / data['Pclass'][data['Pclass'] == 3].count())
print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Pclass'] == 3)].count() / data['Pclass'][data['Pclass'] == 3].count())

Class 1:
Percentage Survived:  0.42105263157894735
Percentage Unsurvived:  0.2476780185758514

Class 2:
Percentage Survived:  0.3140794223826715
Percentage Unsurvived:  0.35018050541516244

Class 3:
Percentage Survived:  0.16784203102961917
Percentage Unsurvived:  0.5246826516220028


In [462]:
# check percentage of survived Passenger of gender

print("Male:")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Sex'] == 'male')].count() / data['Sex'][data['Sex'] == 'male'].count())
print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Sex'] == 'male')].count() / data['Sex'][data['Sex'] == 'male'].count())

print("\nFemale:")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Sex'] == 'female')].count() / data['Sex'][data['Sex'] == 'female'].count())
print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Sex'] == 'female')].count() / data['Sex'][data['Sex'] == 'female'].count())

Male:
Percentage Survived:  0.12930011862396204
Percentage Unsurvived:  0.5551601423487544

Female:
Percentage Survived:  0.5
Percentage Unsurvived:  0.17381974248927037


In [463]:
# check percentage of survived Passenger of age

print ("(0, 10]:")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Age'] <= 10)].count() / data['Age'][data['Age'] <= 10].count())
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 0) & (data['Age'] <= 10)].count() / data['Age'][data['Age'] <= 10].count())

for x in range(7):
    age = 10 + 10 * x
    print("\n(", age, ", ", 10 + age, "]:")
    print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Age'] > age) & (data['Age'] < 10 + age)].count() / data['Age'][(data['Age'] > age) & (data['Age'] < 10 + age)].count())
    print("Percentage Survived: ", data['Survived'][(data['Survived'] == 0) & (data['Age'] > age) & (data['Age'] < 10 + age)].count() / data['Age'][(data['Age'] > age) & (data['Age'] < 10 + age)].count())

(0, 10]:
Percentage Survived:  0.4418604651162791
Percentage Survived:  0.3023255813953488

( 10 ,  20 ]:
Percentage Survived:  0.2949640287769784
Percentage Survived:  0.4244604316546763

( 20 ,  30 ]:
Percentage Survived:  0.21575342465753425
Percentage Survived:  0.4383561643835616

( 30 ,  40 ]:
Percentage Survived:  0.328125
Percentage Survived:  0.4114583333333333

( 40 ,  50 ]:
Percentage Survived:  0.23931623931623933
Percentage Survived:  0.41025641025641024

( 50 ,  60 ]:
Percentage Survived:  0.2727272727272727
Percentage Survived:  0.41818181818181815

( 60 ,  70 ]:
Percentage Survived:  0.16
Percentage Survived:  0.44

( 70 ,  80 ]:
Percentage Survived:  0.0
Percentage Survived:  0.8


In [464]:
# check percentage of survived Passeger of the total number of family members

data['Family'] = data.Parch + data.SibSp
data['Is_Alone'] = data.Family == 0

train_data['Family'] = train_data.Parch + train_data.SibSp
train_data['Is_Alone'] = train_data.Family == 0

test_data['Family'] = test_data.Parch + test_data.SibSp
test_data['Is_Alone'] = test_data.Family == 0

print ("(0, 1]:")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Is_Alone'] == 1)].count() / data['Is_Alone'][data['Is_Alone'] == 1].count())
print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Is_Alone'] == 1)].count() / data['Is_Alone'][data['Is_Alone'] == 1].count())

for x in range(data['Family'].max()):
    print("\n(", x, ", ", 1 + x, "]:")
    print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Family'] == x)].count() / data['Family'][data['Family'] == x].count())
    print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Family'] == x)].count() / data['Family'][data['Family'] == x].count())

(0, 1]:
Percentage Survived:  0.20632911392405062
Percentage Unsurvived:  0.47341772151898737

( 0 ,  1 ]:
Percentage Survived:  0.20632911392405062
Percentage Unsurvived:  0.47341772151898737

( 1 ,  2 ]:
Percentage Survived:  0.37872340425531914
Percentage Unsurvived:  0.30638297872340425

( 2 ,  3 ]:
Percentage Survived:  0.3710691823899371
Percentage Unsurvived:  0.27044025157232704

( 3 ,  4 ]:
Percentage Survived:  0.4883720930232558
Percentage Unsurvived:  0.18604651162790697

( 4 ,  5 ]:
Percentage Survived:  0.13636363636363635
Percentage Unsurvived:  0.5454545454545454

( 5 ,  6 ]:
Percentage Survived:  0.12
Percentage Unsurvived:  0.76

( 6 ,  7 ]:
Percentage Survived:  0.25
Percentage Unsurvived:  0.5

( 7 ,  8 ]:
Percentage Survived:  0.0
Percentage Unsurvived:  0.75

( 8 ,  9 ]:
Percentage Survived:  nan
Percentage Unsurvived:  nan

( 9 ,  10 ]:
Percentage Survived:  nan
Percentage Unsurvived:  nan


  print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Family'] == x)].count() / data['Family'][data['Family'] == x].count())
  print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Family'] == x)].count() / data['Family'][data['Family'] == x].count())


In [465]:
# split the fare amount into categories

maxFare = data['Fare'].max()
minFare = data['Fare'].min()
data['Fare_Category'] = pd.cut(data['Fare'], bins = [minFare, 3 * minFare / 4 + maxFare / 4, (maxFare + minFare) / 2, minFare / 4 + 3 * maxFare / 4, maxFare], labels = ['Low', 'Mid', 'High_Mid', 'High'])
data['Fare_Category'][data['Fare'] == minFare] = 'Low'
data['Fare_Category'][data['Fare'] == 3 * minFare / 4 + maxFare / 4] = 'Mid'
data['Fare_Category'][data['Fare'] == (maxFare + minFare) / 2] = 'High_Mid'
data['Fare_Category'][data['Fare'] == minFare / 4 + 3 * maxFare / 4] = 'High'
data['Fare_Category'][data['Fare'] == maxFare] = 'High'

train_data['Fare_Category'] = pd.cut(train_data['Fare'], bins = [minFare, 3 * minFare / 4 + maxFare / 4, (maxFare + minFare) / 2, minFare / 4 + 3 * maxFare / 4, maxFare], labels = ['Low', 'Mid', 'High_Mid', 'High'])
train_data['Fare_Category'][train_data['Fare'] == minFare] = 'Low'
train_data['Fare_Category'][train_data['Fare'] == 3 * minFare / 4 + maxFare / 4] = 'Mid'
train_data['Fare_Category'][train_data['Fare'] == (maxFare + minFare) / 2] = 'High_Mid'
train_data['Fare_Category'][train_data['Fare'] == minFare / 4 + 3 * maxFare / 4] = 'High'
train_data['Fare_Category'][train_data['Fare'] == maxFare] = 'High'

test_data['Fare_Category'] = pd.cut(test_data['Fare'], bins = [minFare, 3 * minFare / 4 + maxFare / 4, (maxFare + minFare) / 2, minFare / 4 + 3 * maxFare / 4, maxFare], labels = ['Low', 'Mid', 'High_Mid', 'High'])
test_data['Fare_Category'][test_data['Fare'] == minFare] = 'Low'
test_data['Fare_Category'][test_data['Fare'] == 3 * minFare / 4 + maxFare / 4] = 'Mid'
test_data['Fare_Category'][test_data['Fare'] == (maxFare + minFare) / 2] = 'High_Mid'
test_data['Fare_Category'][test_data['Fare'] == minFare / 4 + 3 * maxFare / 4] = 'High'
test_data['Fare_Category'][test_data['Fare'] == maxFare] = 'High'

print("Count of Passengers with Fare_Category Low: ", data['Fare_Category'][data['Fare_Category'] == 'Low'].count())
print("Count of Passengers with Fare_Category Mid: ", data['Fare_Category'][data['Fare_Category'] == 'Mid'].count())
print("Count of Passengers with Fare_Category High_Mid: ", data['Fare_Category'][data['Fare_Category'] == 'High_Mid'].count())
print("Count of Passengers with Fare_Category High: ", data['Fare_Category'][data['Fare_Category'] == 'High'].count())

# check percentage of survived Passeger of the fare

print("\nLow:")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Fare_Category'] == 'Low')].count() / data['Fare_Category'][data['Fare_Category'] == 'Low'].count())
print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Fare_Category'] == 'Low')].count() / data['Fare_Category'][data['Fare_Category'] == 'Low'].count())

print("\nMid:")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Fare_Category'] == 'Mid')].count() / data['Fare_Category'][data['Fare_Category'] == 'Mid'].count())
print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Fare_Category'] == 'Mid')].count() / data['Fare_Category'][data['Fare_Category'] == 'Mid'].count())

print("\nHigh_Mid:")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Fare_Category'] == 'High_Mid')].count() / data['Fare_Category'][data['Fare_Category'] == 'High_Mid'].count())
print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Fare_Category'] == 'High_Mid')].count() / data['Fare_Category'][data['Fare_Category'] == 'High_Mid'].count())

print("\nHigh:")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Fare_Category'] == 'High')].count() / data['Fare_Category'][data['Fare_Category'] == 'High'].count())
print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Fare_Category'] == 'High')].count() / data['Fare_Category'][data['Fare_Category'] == 'High'].count())

train_data.isnull().sum()

Count of Passengers with Fare_Category Low:  1241
Count of Passengers with Fare_Category Mid:  50
Count of Passengers with Fare_Category High_Mid:  13
Count of Passengers with Fare_Category High:  4

Low:
Percentage Survived:  0.25302175664786464
Percentage Unsurvived:  0.43432715551974216

Mid:
Percentage Survived:  0.42
Percentage Unsurvived:  0.16

High_Mid:
Percentage Survived:  0.3076923076923077
Percentage Unsurvived:  0.15384615384615385

High:
Percentage Survived:  0.75
Percentage Unsurvived:  0.0


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
Family           0
Is_Alone         0
Fare_Category    0
dtype: int64

In [466]:
# check percentage of survived Passeger of the embarked

print("Southampton (S):")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Embarked'] == 'S')].count() / data['Embarked'][data['Embarked'] == 'S'].count())
print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Embarked'] == 'S')].count() / data['Embarked'][data['Embarked'] == 'S'].count())

print("\nCherbourg (C):")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Embarked'] == 'C')].count() / data['Embarked'][data['Embarked'] == 'C'].count())
print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Embarked'] == 'C')].count() / data['Embarked'][data['Embarked'] == 'C'].count())

print("\nQueenstown (Q):")
print("Percentage Survived: ", data['Survived'][(data['Survived'] == 1) & (data['Embarked'] == 'Q')].count() / data['Embarked'][data['Embarked'] == 'Q'].count())
print("Percentage Unsurvived: ", data['Survived'][(data['Survived'] == 0) & (data['Embarked'] == 'Q')].count() / data['Embarked'][data['Embarked'] == 'Q'].count())

Southampton (S):
Percentage Survived:  0.2390829694323144
Percentage Unsurvived:  0.4661572052401747

Cherbourg (C):
Percentage Survived:  0.34444444444444444
Percentage Unsurvived:  0.2777777777777778

Queenstown (Q):
Percentage Survived:  0.24390243902439024
Percentage Unsurvived:  0.3821138211382114


In [467]:
# convert into binary feature

sex = {'female' : 0, 'male' : 1}
embarked = {'C' : 0, 'Q' : 1, 'S' : 2}
fare_category = {'Low' : 0, 'Mid' : 1, 'High_Mid' : 2, 'High' : 3}

data = data[data['Sex'].notnull()].copy()
data['Sex'] = data['Sex'].map(sex)
data['Embarked'] = data['Embarked'].map(embarked)
data['Fare_Category'] = data['Fare_Category'].map(fare_category)

train_data = train_data[train_data['Sex'].notnull()].copy()
train_data['Sex'] = train_data['Sex'].map(sex)
train_data['Embarked'] = train_data['Embarked'].map(embarked)
train_data['Fare_Category'] = train_data['Fare_Category'].map(fare_category)

test_data = test_data[test_data['Sex'].notnull()].copy()
test_data['Sex'] = test_data['Sex'].map(sex)
test_data['Embarked'] = test_data['Embarked'].map(embarked)
test_data['Fare_Category'] = test_data['Fare_Category'].map(fare_category)

test_data.isnull().sum()

PassengerId        0
Pclass             0
Name               0
Sex                0
Age                0
SibSp              0
Parch              0
Ticket             0
Fare               0
Cabin            327
Embarked           0
Family             0
Is_Alone           0
Fare_Category      0
dtype: int64

In [468]:
# split the data

y = train_data['Survived']

features = ['Pclass', 'Sex', 'Age', 'Embarked', 'Family', 'Is_Alone', 'Fare_Category']
X = train_data[features]
X_test = test_data[features]

X.isnull().sum()

Pclass           0
Sex              0
Age              0
Embarked         0
Family           0
Is_Alone         0
Fare_Category    0
dtype: int64

In [469]:
# train the decision tree model

model = DecisionTreeClassifier()
model.fit(X, y)

X_test.isnull().sum()

Pclass           0
Sex              0
Age              0
Embarked         0
Family           0
Is_Alone         0
Fare_Category    0
dtype: int64

In [470]:
# predict

predictions = model.predict(X_test)

In [471]:
# output

output = pd.DataFrame({'PassengerId' : test_data.PassengerId, 'Survived' : predictions})
output.to_csv('data/submission.csv', index = False)