In [322]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv


In [323]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [324]:
from sklearn.linear_model import LogisticRegression

In [325]:
df = pd.read_csv('../input/titanic/train.csv')

In [326]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [327]:
df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


# Initial thoughts on each field:
* 1) PassengerId  891 non-null    int64  - Meaningless key: delete
* 2) Survived     891 non-null    int64  - The Boolean target to predict
* 3) Pclass       891 non-null    int64  - 1, 2, 3. Proxy for social class, so use as an integer
* 4) Name         891 non-null    object - Scope for feature engineering a person's title, also Swedish (lots of passengers) names often end in _sen
* 5) Sex          891 non-null    object - Simple male/female. No non-binary in 1912!
* 6) Age          714 non-null    float64- 80% present. Worth imputing the missing 20%
* 7) SibSp        891 non-null    int64  - Explore whether families all die/survive together
* 8) Parch        891 non-null    int64  - Explore whether families all die/survive together
* 9) Ticket       891 non-null    object - I can't imagine the ticket code has any importance: delete
* 10) Fare         891 non-null    float64- Originally LSD, but converted to decimal. Multiples of guineas (£1.05) might show upper class
* 11) Cabin        204 non-null    object - 23% present. First letter probably shows the deck, ie proximity to lifeboat
* 12) Embarked     889 non-null    object - Possible discrimination against Irish passengers

In [328]:
print(df['Survived'].sum()/891*100, '% survival rate')

38.38383838383838 % survival rate


# 3) Passenger Class
1, 2 or 3. 
Proxy for social class, so use it as an integer, not categorical Booleans

In [329]:
df['Pclass'].unique()

array([3, 1, 2])

In [330]:
passengers_by_class = df.groupby('Pclass').count()['PassengerId']

In [331]:
passengers_by_class

Pclass
1    216
2    184
3    491
Name: PassengerId, dtype: int64

In [332]:
survival_by_class = df.groupby('Pclass').sum()['Survived']

In [333]:
survival_by_class

Pclass
1    136
2     87
3    119
Name: Survived, dtype: int64

In [334]:
survival_by_class['total']=passengers_by_class

In [335]:
survival_by_class

Pclass
1                                                      136
2                                                       87
3                                                      119
total    Pclass
1    216
2    184
3    491
Name: Passen...
Name: Survived, dtype: object

In [336]:
survival_rate_by_class = np.divide(survival_by_class, passengers_by_class)

In [337]:
survival_rate_by_class

Pclass
1                                                 0.629630
2                                                 0.472826
3                                                 0.242363
total    Pclass
1   NaN
2   NaN
3   NaN
Name: Passenger...
dtype: object

## No surprise: negative correlation between class (number) and survival

# 4) Name

Scope for feature engineering a person's title. <br>
I read somewhere that there were lots of Swedish passengers, whose names often end in _sen <br>
Format: Surname, Title Forenames eg Tart, Mr Nigel

In [338]:
#df[['surame', 'title_forenames']] = 

#df[['surname', 'title', 'forenames']] = 
df['Name'].str.split(' ', expand=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,"Braund,",Mr.,Owen,Harris,,,,,,,,,,
1,"Cumings,",Mrs.,John,Bradley,(Florence,Briggs,Thayer),,,,,,,
2,"Heikkinen,",Miss.,Laina,,,,,,,,,,,
3,"Futrelle,",Mrs.,Jacques,Heath,(Lily,May,Peel),,,,,,,
4,"Allen,",Mr.,William,Henry,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,"Montvila,",Rev.,Juozas,,,,,,,,,,,
887,"Graham,",Miss.,Margaret,Edith,,,,,,,,,,
888,"Johnston,",Miss.,Catherine,Helen,"""Carrie""",,,,,,,,,
889,"Behr,",Mr.,Karl,Howell,,,,,,,,,,


In [339]:
df['Title'] = df['Name'].str.split(',').str[1].str.split('.').str[0].str.strip()

In [340]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Rev
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Miss
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Miss
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Mr


In [341]:
df['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [342]:
passengers_by_title = df.groupby('Title').count()['PassengerId']

In [343]:
passengers_by_title

Title
Capt              1
Col               2
Don               1
Dr                7
Jonkheer          1
Lady              1
Major             2
Master           40
Miss            182
Mlle              2
Mme               1
Mr              517
Mrs             125
Ms                1
Rev               6
Sir               1
the Countess      1
Name: PassengerId, dtype: int64

## Only the common English titles Mr, Mrs, Miss, Master have significant numbers (but comprising the vast majority of passengers). 

In [344]:
survival_by_title = df.groupby('Title').sum()['Survived']

In [345]:
survival_rate_by_title = np.divide(survival_by_title, passengers_by_title)

In [346]:
survival_rate_by_title

Title
Capt            0.000000
Col             0.500000
Don             0.000000
Dr              0.428571
Jonkheer        0.000000
Lady            1.000000
Major           0.500000
Master          0.575000
Miss            0.697802
Mlle            1.000000
Mme             1.000000
Mr              0.156673
Mrs             0.792000
Ms              1.000000
Rev             0.000000
Sir             1.000000
the Countess    1.000000
dtype: float64

In [347]:
df['IsMrs'] = df['Title'] == 'Mrs'

In [348]:
df['IsMr'] = df['Title'] == 'Mr'

In [349]:
df['IsMiss'] = df['Title'] == 'Miss'

In [350]:
df['IsMaster'] = df['Title'] == 'Master'

In [351]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,IsMrs,IsMr,IsMiss,IsMaster
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr,False,True,False,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,True,False,False,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss,False,False,True,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs,True,False,False,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Rev,False,False,False,False
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Miss,False,False,True,False
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Miss,False,False,True,False
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Mr,False,True,False,False


In [352]:
df = df.drop('Title', axis=1)
df = df.drop('Name', axis=1)

In [353]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,IsMrs,IsMr,IsMiss,IsMaster
0,1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S,False,True,False,False
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,True,False,False,False
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S,False,False,True,False
3,4,1,1,female,35.0,1,0,113803,53.1000,C123,S,True,False,False,False
4,5,0,3,male,35.0,0,0,373450,8.0500,,S,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,211536,13.0000,,S,False,False,False,False
887,888,1,1,female,19.0,0,0,112053,30.0000,B42,S,False,False,True,False
888,889,0,3,female,,1,2,W./C. 6607,23.4500,,S,False,False,True,False
889,890,1,1,male,26.0,0,0,111369,30.0000,C148,C,False,True,False,False


# 5) Sex

Simple male/female. No non-binary in 1912!

In [354]:
df['IsMale'] = df['Sex']=='Male'

In [355]:
df.drop('Sex', axis=1, inplace=True)

In [356]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,IsMrs,IsMr,IsMiss,IsMaster,IsMale
0,1,0,3,22.0,1,0,A/5 21171,7.2500,,S,False,True,False,False,False
1,2,1,1,38.0,1,0,PC 17599,71.2833,C85,C,True,False,False,False,False
2,3,1,3,26.0,0,0,STON/O2. 3101282,7.9250,,S,False,False,True,False,False
3,4,1,1,35.0,1,0,113803,53.1000,C123,S,True,False,False,False,False
4,5,0,3,35.0,0,0,373450,8.0500,,S,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.0,0,0,211536,13.0000,,S,False,False,False,False,False
887,888,1,1,19.0,0,0,112053,30.0000,B42,S,False,False,True,False,False
888,889,0,3,,1,2,W./C. 6607,23.4500,,S,False,False,True,False,False
889,890,1,1,26.0,0,0,111369,30.0000,C148,C,False,True,False,False,False


# 6) Age

80% present. Worth imputing the missing 20% <br>
Use mean age for Mr, Mrs, Miss, Master, then overall mean age for others

In [357]:
grouped_by_title = df.groupby(['IsMrs', 'IsMr', 'IsMiss', 'IsMaster'])

In [358]:
mean_age_by_title = grouped_by_title['Age'].mean()

In [359]:
# mean_age_by_title is joined to df on the Boolean fields
# mean_age_by_title also has a field called 'Age', so use suffixes to append '_mean' and make it distinct
df2 = pd.merge(df, mean_age_by_title, on=['IsMrs', 'IsMr', 'IsMiss', 'IsMaster'], suffixes = ('', '_mean')) 

In [360]:
df2

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,IsMrs,IsMr,IsMiss,IsMaster,IsMale,Age_mean
0,1,0,3,22.0,1,0,A/5 21171,7.2500,,S,False,True,False,False,False,32.368090
1,5,0,3,35.0,0,0,373450,8.0500,,S,False,True,False,False,False,32.368090
2,6,0,3,,0,0,330877,8.4583,,Q,False,True,False,False,False,32.368090
3,7,0,1,54.0,0,0,17463,51.8625,E46,S,False,True,False,False,False,32.368090
4,13,0,3,20.0,0,0,A/5. 2151,8.0500,,S,False,True,False,False,False,32.368090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,767,0,1,,0,0,112379,39.6000,,C,False,False,False,False,False,42.384615
887,797,1,1,49.0,0,0,17465,25.9292,D17,S,False,False,False,False,False,42.384615
888,823,0,1,38.0,0,0,19972,0.0000,,S,False,False,False,False,False,42.384615
889,849,0,2,28.0,0,1,248727,33.0000,,S,False,False,False,False,False,42.384615


In [361]:
# Now replace the missing ages with the values of Age_mean
df2['Age'].fillna(df2['Age_mean'], inplace=True)

In [362]:
df = df2.drop(['Age_mean'], axis=1)

In [363]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,IsMrs,IsMr,IsMiss,IsMaster,IsMale
0,1,0,3,22.000000,1,0,A/5 21171,7.2500,,S,False,True,False,False,False
1,5,0,3,35.000000,0,0,373450,8.0500,,S,False,True,False,False,False
2,6,0,3,32.368090,0,0,330877,8.4583,,Q,False,True,False,False,False
3,7,0,1,54.000000,0,0,17463,51.8625,E46,S,False,True,False,False,False
4,13,0,3,20.000000,0,0,A/5. 2151,8.0500,,S,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,767,0,1,42.384615,0,0,112379,39.6000,,C,False,False,False,False,False
887,797,1,1,49.000000,0,0,17465,25.9292,D17,S,False,False,False,False,False
888,823,0,1,38.000000,0,0,19972,0.0000,,S,False,False,False,False,False
889,849,0,2,28.000000,0,1,248727,33.0000,,S,False,False,False,False,False


# 7) Sibling/Spouse

Explore whether families all die/survive together

Actually, the data doesn't show whether families die together; just whether having a sibling/spouse correlates with survival/death

The data is already integers, so ready to go

# 8) Parent/Child

Explore whether families all die/survive together

As above. Ready to go

# 9) Ticket Code

I can't imagine the ticket code has any importance: delete

In [364]:
df.drop(['Ticket'], axis=1, inplace=True)

In [365]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked,IsMrs,IsMr,IsMiss,IsMaster,IsMale
0,1,0,3,22.000000,1,0,7.2500,,S,False,True,False,False,False
1,5,0,3,35.000000,0,0,8.0500,,S,False,True,False,False,False
2,6,0,3,32.368090,0,0,8.4583,,Q,False,True,False,False,False
3,7,0,1,54.000000,0,0,51.8625,E46,S,False,True,False,False,False
4,13,0,3,20.000000,0,0,8.0500,,S,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,767,0,1,42.384615,0,0,39.6000,,C,False,False,False,False,False
887,797,1,1,49.000000,0,0,25.9292,D17,S,False,False,False,False,False
888,823,0,1,38.000000,0,0,0.0000,,S,False,False,False,False,False
889,849,0,2,28.000000,0,1,33.0000,,S,False,False,False,False,False


# 10) Fare

Originally LSD, but converted to decimal. Multiples of guineas (£1.05) might show upper class

In [366]:
df['IsGuineas'] = [(fare % 1.05) == 0 for fare in df['Fare']]

In [367]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked,IsMrs,IsMr,IsMiss,IsMaster,IsMale,IsGuineas
0,1,0,3,22.000000,1,0,7.2500,,S,False,True,False,False,False,False
1,5,0,3,35.000000,0,0,8.0500,,S,False,True,False,False,False,False
2,6,0,3,32.368090,0,0,8.4583,,Q,False,True,False,False,False,False
3,7,0,1,54.000000,0,0,51.8625,E46,S,False,True,False,False,False,False
4,13,0,3,20.000000,0,0,8.0500,,S,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,767,0,1,42.384615,0,0,39.6000,,C,False,False,False,False,False,False
887,797,1,1,49.000000,0,0,25.9292,D17,S,False,False,False,False,False,False
888,823,0,1,38.000000,0,0,0.0000,,S,False,False,False,False,False,True
889,849,0,2,28.000000,0,1,33.0000,,S,False,False,False,False,False,False


In [368]:
# check out those fares which are multipleas of a Guinea - the only one in the first/last five is actually zero
df[df['IsGuineas'] == True]

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked,IsMrs,IsMr,IsMiss,IsMaster,IsMale,IsGuineas
101,180,0,3,36.0,0,0,0.0,,S,False,True,False,False,False,True
148,264,0,1,40.0,0,0,0.0,B94,S,False,True,False,False,False,True
153,272,1,3,25.0,0,0,0.0,,S,False,True,False,False,False,True
155,278,0,2,32.36809,0,0,0.0,,S,False,True,False,False,False,True
171,303,0,3,19.0,0,0,0.0,,S,False,True,False,False,False,True
226,414,0,2,32.36809,0,0,0.0,,S,False,True,False,False,False,True
259,467,0,2,32.36809,0,0,0.0,,S,False,True,False,False,False,True
268,482,0,2,32.36809,0,0,0.0,,S,False,True,False,False,False,True
338,598,0,3,49.0,0,0,0.0,,S,False,True,False,False,False,True
362,634,0,1,32.36809,0,0,0.0,,S,False,True,False,False,False,True


Nice idea, but all the multiples of £1.05 are actually zero. <br>
Interestingly, all those passengers are male (14 Mr and 1 other title)

In [369]:
df.drop(['IsGuineas'], axis=1, inplace=True)

In [370]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked,IsMrs,IsMr,IsMiss,IsMaster,IsMale
0,1,0,3,22.000000,1,0,7.2500,,S,False,True,False,False,False
1,5,0,3,35.000000,0,0,8.0500,,S,False,True,False,False,False
2,6,0,3,32.368090,0,0,8.4583,,Q,False,True,False,False,False
3,7,0,1,54.000000,0,0,51.8625,E46,S,False,True,False,False,False
4,13,0,3,20.000000,0,0,8.0500,,S,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,767,0,1,42.384615,0,0,39.6000,,C,False,False,False,False,False
887,797,1,1,49.000000,0,0,25.9292,D17,S,False,False,False,False,False
888,823,0,1,38.000000,0,0,0.0000,,S,False,False,False,False,False
889,849,0,2,28.000000,0,1,33.0000,,S,False,False,False,False,False


# 11) Cabin Number

Only 23% present, so probably drop the field, but the first letter probably shows the deck, ie proximity to lifeboat

In [371]:
df.drop(['Cabin'], axis=1, inplace=True)

In [372]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,IsMrs,IsMr,IsMiss,IsMaster,IsMale
0,1,0,3,22.000000,1,0,7.2500,S,False,True,False,False,False
1,5,0,3,35.000000,0,0,8.0500,S,False,True,False,False,False
2,6,0,3,32.368090,0,0,8.4583,Q,False,True,False,False,False
3,7,0,1,54.000000,0,0,51.8625,S,False,True,False,False,False
4,13,0,3,20.000000,0,0,8.0500,S,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,767,0,1,42.384615,0,0,39.6000,C,False,False,False,False,False
887,797,1,1,49.000000,0,0,25.9292,S,False,False,False,False,False
888,823,0,1,38.000000,0,0,0.0000,S,False,False,False,False,False
889,849,0,2,28.000000,0,1,33.0000,S,False,False,False,False,False


# 12) Place of Embarkation

Possible discrimination against Irish passengers

In [373]:
passengers_by_embarkation = df.groupby('Embarked').count()['PassengerId']

In [374]:
passengers_by_embarkation

Embarked
C    168
Q     77
S    644
Name: PassengerId, dtype: int64

In [375]:
survival_by_embarkation = df.groupby('Embarked').sum()['Survived']

In [376]:
survival_by_embarkation

Embarked
C     93
Q     30
S    217
Name: Survived, dtype: int64

In [377]:
survival_rate_by_embarkation = np.divide(survival_by_embarkation, passengers_by_embarkation)

In [378]:
survival_rate_by_embarkation

Embarked
C    0.553571
Q    0.389610
S    0.336957
dtype: float64

Wow, the English passengers were least likely to survive! 

In [379]:
df['Southampton'] = df['Embarked'] == 'S'
df['Cherbourg'] = df['Embarked'] == 'C'
df['Queenstown'] = df['Embarked'] == 'Q'

In [380]:
df.drop(['Embarked'], axis=1, inplace=True)

In [381]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,IsMrs,IsMr,IsMiss,IsMaster,IsMale,Southampton,Cherbourg,Queenstown
0,1,0,3,22.000000,1,0,7.2500,False,True,False,False,False,True,False,False
1,5,0,3,35.000000,0,0,8.0500,False,True,False,False,False,True,False,False
2,6,0,3,32.368090,0,0,8.4583,False,True,False,False,False,False,False,True
3,7,0,1,54.000000,0,0,51.8625,False,True,False,False,False,True,False,False
4,13,0,3,20.000000,0,0,8.0500,False,True,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,767,0,1,42.384615,0,0,39.6000,False,False,False,False,False,False,True,False
887,797,1,1,49.000000,0,0,25.9292,False,False,False,False,False,True,False,False
888,823,0,1,38.000000,0,0,0.0000,False,False,False,False,False,True,False,False
889,849,0,2,28.000000,0,1,33.0000,False,False,False,False,False,True,False,False


# Data now ready for the model

In [382]:
X_train = df.drop('Survived', axis=1)
y_train = df['Survived']