In [95]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

# Read the Titanic train/test CSV files from the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Number of rows in the training frame.
length = train_df.shape[0]
print(train_df.columns.values)


['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


### Data Dictionary

| Variable | Definition | Key |
|:---|:---|:---|
| survival | Survival | 0 = No, 1 = Yes |
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex | |
| Age | Age in years | |
| sibsp | # of siblings / spouses aboard the Titanic | |
| parch | # of parents / children aboard the Titanic | |
| ticket | Ticket number | |
| fare | Passenger fare | |
| cabin | Cabin number | |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |


In [57]:
# Preview the first rows of the training data.
train_df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [63]:
# Encode Sex as a number: "male" -> 1, "female" -> 0. Any other value
# (including one that is already encoded) is passed through unchanged,
# exactly as the original if/elif loop left it.
sex_codes = {"male": 1, "female": 0}
sex_map = [sex_codes.get(value, value) for value in train_df["Sex"]]

train_df["Sex"] = sex_map   # turn sex into number
# errors="ignore" keeps this cell re-runnable: on a second run the Ticket
# column is already gone and a plain drop would raise KeyError.
train_df = train_df.drop(["Ticket"], axis=1, errors="ignore")
train_df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,8.05,,S


In [64]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
train_df.describe()


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.647587,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,0.47799,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,1.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,1.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


In [66]:
# Summary (count, unique, top, freq) for the object-dtype columns.
train_df.describe(include=['O'])

Unnamed: 0,Name,Cabin,Embarked
count,891,204,889
unique,891,147,3
top,"Colley, Mr. Edward Pomeroy",G6,S
freq,1,4,644


In [83]:
# List the distinct Cabin values; nan marks rows where Cabin is missing.
train_df['Cabin'].unique()


array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6', 'C23 C25 C27',
       'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33', 'F G73', 'E31',
       'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101', 'F E69', 'D47',
       'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4', 'A32', 'B4',
       'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35', 'C87', 'B77',
       'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19', 'B49', 'D',
       'C22 C26', 'C106', 'C65', 'E36', 'C54', 'B57 B59 B63 B66', 'C7',
       'E34', 'C32', 'B18', 'C124', 'C91', 'E40', 'T', 'C128', 'D37',
       'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44', 'A34', 'C104', 'C111',
       'C92', 'E38', 'D21', 'E12', 'E63', 'A14', 'B37', 'C30', 'D20',
       'B79', 'E25', 'D46', 'B73', 'C95', 'B38', 'B39', 'B22', 'C86',
       'C70', 'A16', 'C101', 'C68', 'A10', 'E68', 'B41', 'A20', 'D19',
       'D50', 'D9', 'A23', 'B50', 'A26', 'D48', 'E58', 'C126', 'B71',
       'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63', 'C62 C64', 'E24',

In [109]:
# Collapse each cabin string to its first character (presumably the deck
# letter, e.g. 'C23 C25 C27' -> 'C'); rows with a missing Cabin become 'N'.
# Vectorised so the null mask is computed once instead of twice per row, and
# so the cell no longer depends on `length` or on a 0..n-1 row index.
cabin = train_df['Cabin'].fillna('N').astype(str).str[0].tolist()

train_df['Cabin2'] = cabin

# Mean survival per cabin letter, highest first.
(
    train_df[['Cabin2', 'Survived']]
    .groupby(['Cabin2'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)


Unnamed: 0,Cabin2,Survived
3,D,0.757576
4,E,0.75
1,B,0.744681
5,F,0.615385
2,C,0.59322
6,G,0.5
0,A,0.466667
7,N,0.299854
8,T,0.0


In [108]:
# Survival rate together with the number of passengers behind each rate,
# per cabin letter, so small groups can be told apart from large ones.
(
    train_df[['Cabin2', 'Survived']]
    .groupby(['Cabin2'], as_index=False)
    .agg(['mean', 'count'])
)

Unnamed: 0_level_0,Survived,Survived
Unnamed: 0_level_1,mean,count
Cabin2,Unnamed: 1_level_2,Unnamed: 2_level_2
A,0.466667,15
B,0.744681,47
C,0.59322,59
D,0.757576,33
E,0.75,32
F,0.615385,13
G,0.5,4
N,0.299854,687
T,0.0,1


In [None]:
def cabin_map(x):
    """Return the survival rate associated with a cabin letter.

    The table holds rounded per-letter mean survival values as computed by
    the Cabin2 groupby in this notebook ('N' is the label used for a missing
    cabin).

    Parameters
    ----------
    x : str
        Cabin letter, one of 'A'-'G', 'N' or 'T'.

    Returns
    -------
    float or None
        The rate for ``x``, or None when ``x`` is not in the table.
    """
    dic = {'A':0.47,'B':0.745,'C':0.59,'D':0.758,'E':0.750,'F':0.615,'G':0.5,'N':0.3,'T':0}
    # The original built the table and fell off the end, always returning None.
    return dic.get(x)

In [67]:
# Mean survival per Sex code, highest first.
(
    train_df
    .groupby('Sex', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Sex,Survived
0,0,0.742038
1,1,0.188908


In [70]:
# Mean survival per ticket class, highest first.
(
    train_df
    .groupby('Pclass', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [71]:
# Mean survival per port of embarkation, highest first.
(
    train_df
    .groupby('Embarked', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.336957


In [69]:
# Pairwise correlation of the numeric columns only. The frame also holds
# string columns (Name, Cabin, Embarked); pandas >= 2.0 no longer drops those
# silently in DataFrame.corr() and raises instead, so the numeric columns are
# selected explicitly. This form also works on older pandas releases.
train_df.select_dtypes(include=[np.number]).corr()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.042939,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.543351,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,0.1319,-0.369226,0.083081,0.018443,-0.5495
Sex,0.042939,-0.543351,0.1319,1.0,0.093254,-0.114631,-0.245489,-0.182333
Age,0.036847,-0.077221,-0.369226,0.093254,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.114631,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.245489,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,-0.182333,0.096067,0.159651,0.216225,1.0
