In [1]:
# classifier models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# modules to handle data
import pandas as pd
import numpy as np

# visualization tools
import matplotlib.pyplot as plt
import seaborn as sns

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# environment setup
sns.set()
plt.style.use('ggplot')
%matplotlib notebook

In [3]:
# load data
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [4]:
# save PassengerId for final submission
passengerId = test.PassengerId

# merge train and test
titanic = train.append(test, ignore_index=True)

In [5]:
# create indexes to separate data later on
train_idx = len(train)
test_idx = len(titanic) - len(test)

In [6]:
# view head of data
titanic.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [7]:
# get info on features
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [8]:
# PassengerId can be removed from data for now
titanic.drop('PassengerId', axis=1, inplace=True)

In [9]:
# create a new feature to extract title names from the Name column
titanic['Title'] = titanic.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())

# view the newly created feature
titanic.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171,Mr
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599,Mrs
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282,Miss
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1.0,113803,Mrs
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0.0,373450,Mr


In [10]:
# create a new feature to extract title names from the Name column
titanic['Title'] = titanic.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())

# view the newly created feature
titanic.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171,Mr
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599,Mrs
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282,Miss
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1.0,113803,Mrs
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0.0,373450,Mr


In [11]:
# show count of titles
print("There are {} unique titles.".format(titanic.Title.nunique()))

# show unique titles
print("\n", titanic.Title.unique())

There are 18 unique titles.

 ['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer' 'Dona']


In [12]:
#As we can see now, we have successfully extracted the title's from the name column. But if we inspect the titles a 
#little more, we see some that can be normalized like "Capt" and "Col" for example are a type of "Officer" 
#and "Mlle" is essentially the French version of "Miss". So now we will normalize these titles so we can 
#cut down the number of titles even more.

In [13]:
# normalize the titles
normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"
}

In [14]:
# map the normalized titles to the current titles
titanic.Title = titanic.Title.map(normalized_titles)

# view value counts for the normalized titles
print(titanic.Title.value_counts())

Mr         757
Miss       262
Mrs        200
Master      61
Officer     23
Royalty      6
Name: Title, dtype: int64


In [15]:
#For our next step, we are going to assume that their is a relationship between a person's 
#age and their title since it makes sense that someone that is younger is more likely to be a titled a "Miss" vs a "Mrs".

#With this in mind, we will group the data by Sex, Pclass, and Title and then view the median age for the grouped classes.

In [16]:
# group by Sex, Pclass, and Title
grouped = titanic.groupby(['Sex','Pclass', 'Title'])

# view the median Age by the grouped features
grouped.Age.median()

Sex     Pclass  Title  
female  1       Miss       30.0
                Mrs        45.0
                Officer    49.0
                Royalty    39.0
        2       Miss       20.0
                Mrs        30.0
        3       Miss       18.0
                Mrs        31.0
male    1       Master      6.0
                Mr         41.5
                Officer    52.0
                Royalty    40.0
        2       Master      2.0
                Mr         30.0
                Officer    41.5
        3       Master      6.0
                Mr         26.0
Name: Age, dtype: float64

In [17]:
# apply the grouped median value on the Age NaN
titanic.Age = grouped.Age.apply(lambda x: x.fillna(x.median()))

# view changes
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age         1309 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
Fare        1308 non-null float64
Name        1309 non-null object
Parch       1309 non-null int64
Pclass      1309 non-null int64
Sex         1309 non-null object
SibSp       1309 non-null int64
Survived    891 non-null float64
Ticket      1309 non-null object
Title       1309 non-null object
dtypes: float64(3), int64(3), object(6)
memory usage: 122.8+ KB


In [18]:
# fill Cabin NaN with U for unknown
titanic.Cabin = titanic.Cabin.fillna('U')

In [19]:
# find most frequent Embarked value and store in variable
most_embarked = titanic.Embarked.value_counts().index[0]

# fill NaN with most_embarked value
titanic.Embarked = titanic.Embarked.fillna(most_embarked)

In [20]:
# fill NaN with median fare
titanic.Fare = titanic.Fare.fillna(titanic.Fare.median())

# view changes
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age         1309 non-null float64
Cabin       1309 non-null object
Embarked    1309 non-null object
Fare        1309 non-null float64
Name        1309 non-null object
Parch       1309 non-null int64
Pclass      1309 non-null int64
Sex         1309 non-null object
SibSp       1309 non-null int64
Survived    891 non-null float64
Ticket      1309 non-null object
Title       1309 non-null object
dtypes: float64(3), int64(3), object(6)
memory usage: 122.8+ KB


# Exploratory Data Analysis

In [21]:
# view the percentage of those that survived vs. those that died in the Titanic
titanic.Survived.value_counts(normalize=True)

0.0    0.616162
1.0    0.383838
Name: Survived, dtype: float64

In [22]:
#Looks like only 38% of people onboard the Titanic managed to survive its fateful voyage

In [23]:
# group by sex
group_by_sex = titanic.groupby('Sex')

# survival rate by sex
group_by_sex.Survived.mean()

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [24]:
#For those of use who are familiar with the fateful story of the Titanic or who have seen the movie, 
#we know that women and children had priority for life boats so these numbers aren't all that surprising. 
#Even knowing that, it is still quite astounding to see that almost 75% of women survived the sinking of 
#the Titanic while only 19% of men did.

In [25]:
# group by passenger class and sex
group_class_sex = titanic.groupby(['Pclass', 'Sex'])

# survival rates by class and sex
group_class_sex.Survived.mean()

Pclass  Sex   
1       female    0.968085
        male      0.368852
2       female    0.921053
        male      0.157407
3       female    0.500000
        male      0.135447
Name: Survived, dtype: float64

# III. Feature Engineering

In [26]:
# size of families (including the passenger)
titanic['FamilySize'] = titanic.Parch + titanic.SibSp + 1

# -------------------------------------------

In [27]:
# create train and test data
train = titanic[ :train_idx]
test = titanic[test_idx: ]

# convert Survived back to int
train.Survived = train.Survived.astype(int)

In [28]:
#numerical_features
numerical_features = [feature for feature in train.columns if train[feature].dtypes != 'object']
train_numerical_features = train[numerical_features]
train_numerical_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Age           891 non-null float64
Fare          891 non-null float64
Parch         891 non-null int64
Pclass        891 non-null int64
SibSp         891 non-null int64
Survived      891 non-null int32
FamilySize    891 non-null int64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.3 KB


In [42]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

regressor = RandomForestRegressor()

X = train_numerical_features.drop('Survived',axis=1)
y = train_numerical_features['Survived']

In [43]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [44]:
regressor.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [45]:
regressor.score(X_test,y_test)

0.012943109564617528

In [46]:
array_of_importance = regressor.feature_importances_
array_of_importance

array([0.36218183, 0.38312239, 0.03451671, 0.12573141, 0.02881707,
       0.06563057])

In [53]:
mask = array_of_importance >= 0.12
copy = train_numerical_features.copy()
copy = copy.loc[:,mask]
copy.columns

Index(['Age', 'Fare'], dtype='object')

In [52]:
numerical_features = ['Age', 'Fare', 'Pclass']

# ---------------------------------------------------------------

In [58]:
#categorical_features
categorical_features = [feature for feature in train.columns if train[feature].dtypes == 'object']
train_categorical_features = train[categorical_features]
train_categorical_features.columns

Index(['Cabin', 'Embarked', 'Name', 'Sex', 'Ticket', 'Title'], dtype='object')

In [59]:
categorical_features = ['Cabin', 'Embarked', 'Name', 'Sex', 'Ticket', 'Title']

In [60]:
# map first letter of cabin to itself
train_categorical_features.Cabin = train_categorical_features.Cabin.map(lambda x: x[0])

# view normalized count
train_categorical_features.Cabin.value_counts(normalize=True)

U    0.771044
C    0.066218
B    0.052750
D    0.037037
E    0.035915
A    0.016835
F    0.014590
G    0.004489
T    0.001122
Name: Cabin, dtype: float64

In [61]:
# Convert the male and female groups to integer form
train_categorical_features.Sex = train_categorical_features.Sex.map({"male": 0, "female":1})

In [63]:
# create dummy variables for categorical features
#pclass_dummies = pd.get_dummies(train_categorical_features.Pclass, prefix="Pclass")
title_dummies = pd.get_dummies(train_categorical_features.Title, prefix="Title")
cabin_dummies = pd.get_dummies(train_categorical_features.Cabin, prefix="Cabin")
embarked_dummies = pd.get_dummies(train_categorical_features.Embarked, prefix="Embarked")

In [79]:
# concatenate dummy columns with main dataset
titanic_dummies = pd.concat([title_dummies, cabin_dummies, embarked_dummies], axis=1)
titanic_dummies.columns
# drop categorical fields
#titanic_dummies.drop(['Title', 'Cabin', 'Embarked', 'Name', 'Ticket'], axis=1, inplace=True)

#titanic_dummies.head()
type(titanic_dummies)

pandas.core.frame.DataFrame

In [75]:
titanic_dummies_list = ['Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer',
       'Title_Royalty', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E',
       'Cabin_F', 'Cabin_G', 'Cabin_T', 'Cabin_U', 'Embarked_C', 'Embarked_Q',
       'Embarked_S']

In [81]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

regressor = RandomForestRegressor()

X = titanic_dummies
y = train_numerical_features['Survived']
X.shape

(891, 18)

In [82]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [83]:
regressor.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [84]:
regressor.score(X_test,y_test)

0.26339831783195466

In [85]:
array_of_importance = regressor.feature_importances_
array_of_importance

array([8.63247403e-03, 1.78841933e-02, 6.43965846e-01, 1.20183494e-02,
       3.93846302e-02, 4.53819006e-03, 2.24274778e-03, 1.21306443e-02,
       8.39496940e-03, 1.70455418e-02, 2.44594562e-02, 2.19566865e-03,
       3.21838337e-03, 3.11367901e-04, 1.40888220e-01, 2.92414592e-02,
       1.00096279e-02, 2.34382308e-02])

In [86]:
mask = array_of_importance >= 0.12
copy = titanic_dummies.copy()
copy = copy.loc[:,mask]
copy.columns

Index(['Title_Mr', 'Cabin_U'], dtype='object')

In [None]:
titanic_dummies_list = ['Title_Mr', 'Cabin_U']

# ---------------------------------------------------