In [1]:
import pandas as pd
import numpy as np
import random as rnd

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [4]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

print(train_df.head())
print(train_df.shape)
print('\n')
print(test_df.head())
print(test_df.shape)

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
(8

In [5]:
print(train_df.shape)
print(test_df.shape)

drop_columns_list = ['Ticket', 'Cabin', 'Name']
train_df = train_df.drop(drop_columns_list, axis=1)
test_df = test_df.drop(drop_columns_list, axis=1)

train_df = train_df.drop(['PassengerId'], axis=1)

print('\n')
print(train_df.shape)
print(test_df.shape)

(891, 12)
(418, 11)


(891, 8)
(418, 8)


In [6]:
print(train_df.head())

train_df['Sex'] = train_df['Sex'].map( {'female': 1, 'male': 0} ).astype(int)
test_df['Sex'] = test_df['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

print(train_df.head())

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    male  22.0      1      0   7.2500        S
1         1       1  female  38.0      1      0  71.2833        C
2         1       3  female  26.0      0      0   7.9250        S
3         1       1  female  35.0      1      0  53.1000        S
4         0       3    male  35.0      0      0   8.0500        S
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    0  22.0      1      0   7.2500        S
1         1       1    1  38.0      1      0  71.2833        C
2         1       3    1  26.0      0      0   7.9250        S
3         1       1    1  35.0      1      0  53.1000        S
4         0       3    0  35.0      0      0   8.0500        S


In [8]:
print(train_df.head())

train_df.loc[ train_df['Age'] <= 16, 'Age'] = 0
train_df.loc[(train_df['Age'] > 16) & (train_df['Age'] <= 32), 'Age'] = 1
train_df.loc[(train_df['Age'] > 32) & (train_df['Age'] <= 48), 'Age'] = 2
train_df.loc[(train_df['Age'] > 48) & (train_df['Age'] <= 64), 'Age'] = 3
train_df.loc[ train_df['Age'] > 64, 'Age']

test_df.loc[ test_df['Age'] <= 16, 'Age'] = 0
test_df.loc[(test_df['Age'] > 16) & (test_df['Age'] <= 32), 'Age'] = 1
test_df.loc[(test_df['Age'] > 32) & (test_df['Age'] <= 48), 'Age'] = 2
test_df.loc[(test_df['Age'] > 48) & (test_df['Age'] <= 64), 'Age'] = 3
test_df.loc[ test_df['Age'] > 64, 'Age']

print(train_df.head())

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    0  22.0      1      0   7.2500        S
1         1       1    1  38.0      1      0  71.2833        C
2         1       3    1  26.0      0      0   7.9250        S
3         1       1    1  35.0      1      0  53.1000        S
4         0       3    0  35.0      0      0   8.0500        S
   Survived  Pclass  Sex  Age  SibSp  Parch     Fare Embarked
0         0       3    0  1.0      1      0   7.2500        S
1         1       1    1  2.0      1      0  71.2833        C
2         1       3    1  1.0      0      0   7.9250        S
3         1       1    1  2.0      1      0  53.1000        S
4         0       3    0  2.0      0      0   8.0500        S


In [9]:
common_port = train_df['Embarked'].dropna().mode()[0]
print(common_port)
train_df['Embarked'] = train_df['Embarked'].fillna(common_port)
test_df['Embarked'] = test_df['Embarked'].fillna(common_port)

train_df['Embarked'] = train_df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
test_df['Embarked'] = test_df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

print(train_df.head())

S
   Survived  Pclass  Sex  Age  SibSp  Parch     Fare  Embarked
0         0       3    0  1.0      1      0   7.2500         0
1         1       1    1  2.0      1      0  71.2833         1
2         1       3    1  1.0      0      0   7.9250         0
3         1       1    1  2.0      1      0  53.1000         0
4         0       3    0  2.0      0      0   8.0500         0


In [10]:
print(train_df.shape)
print(test_df.shape)

train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

print('\n')
print(train_df.shape)
print(test_df.shape)

(891, 8)
(418, 8)


(714, 8)
(331, 8)


In [11]:
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop("PassengerId", axis=1).copy()

print(X_train.shape)
print(X_train.head())
print('\n')
print(Y_train.shape)
print(Y_train.head())
print('\n')
print(X_test.shape)
print(X_test.head())
print('\n')

(714, 7)
   Pclass  Sex  Age  SibSp  Parch     Fare  Embarked
0       3    0  1.0      1      0   7.2500         0
1       1    1  2.0      1      0  71.2833         1
2       3    1  1.0      0      0   7.9250         0
3       1    1  2.0      1      0  53.1000         0
4       3    0  2.0      0      0   8.0500         0


(714,)
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


(331, 7)
   Pclass  Sex  Age  SibSp  Parch     Fare  Embarked
0       3    0  2.0      0      0   7.8292         2
1       3    1  2.0      1      0   7.0000         0
2       2    0  3.0      0      0   9.6875         2
3       3    0  1.0      0      0   8.6625         0
4       3    1  1.0      1      1  12.2875         0




In [12]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = decision_tree.score(X_train, Y_train)
print('training accuracy: %.5f' % acc_decision_tree)

submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv('./submission_decision_tree.csv', index=False)

training accuracy: 0.94398


In [22]:
from IPython.display import display
from IPython.display import display_html
def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
        display_html(html_str.replace('table','table styple="display:inline"'),raw=True)