# Titanic - Improving Our Model 

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the data folder recursively and print the path of every file found in it
for folder, _, files_in_folder in os.walk('../data/titanic_data'):
    for file_name in files_in_folder:
        file_path = os.path.join(folder, file_name)
        print(file_path)

../data/titanic_data/home-data-for-ml-course:train.csv
../data/titanic_data/test.csv
../data/titanic_data/train.csv
../data/titanic_data/gender_submission.csv


In [2]:
# Loading the training set from its CSV file and previewing the first rows
train_csv_path = "../data/titanic_data/train.csv"
train_data = pd.read_csv(train_csv_path)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Loading the test set from its CSV file and previewing the first rows
test_csv_path = "../data/titanic_data/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Collecting the training and test frames in one list (they are NOT concatenated),
# so that the same transformation can later be applied to each of them in a loop
combine = [train_data, test_data]

In [5]:
# Printing the list shows the plain-text representation of both DataFrames, one after the other
print(combine)

[     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ...

## Exploring Ages

In [6]:
# Exploring the dataset: summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# A count below the number of rows (see Age) reveals missing values in that column.
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
# Selecting the rows of the training set whose Age is missing, to inspect them
age_is_missing = train_data['Age'].isnull()
age_nan_values = train_data.loc[age_is_missing]
age_nan_values

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


### Completing a Numerical Continuous Feature 

The `Age` column has some missing values. We could simply replace them with the median of all the other `Age` values. However, a more accurate way of guessing missing values is to use other correlated features. Let's start with `Sex` and `Pclass`. We can guess the missing `Age` values using the median values for `Age` across sets of `Sex` and `Pclass` feature combinations. 

#### Substituting Null Values in the Training Dataset

In [8]:
# Zero-filled lookup table for the guessed ages, one cell per (Sex, Pclass) combination:
# 2 rows because Sex is binary, 3 columns because Pclass takes 3 values
guess_ages = np.zeros(shape=(2, 3), dtype=float)
guess_ages

array([[0., 0., 0.],
       [0., 0., 0.]])

In [9]:
# Encoding Sex as an integer (female -> 1, male -> 0) in both frames, so that it can be
# used as an index when computing the median ages afterwards
sex_codes = {'female': 1, 'male': 0}
for frame in (train_data, test_data):
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)

In [10]:
# Step 1: for every (Sex, Pclass) combination, take the median of the known ages in the training set
for s in (0, 1): # Sex codes: 0 = male, 1 = female
    for c in (1, 2, 3): # passenger classes
        in_group = (train_data['Sex'] == s) & (train_data['Pclass'] == c)
        guess_df = train_data.loc[in_group, 'Age'].dropna() # known ages of this group only
        age_guess = guess_df.median()
        # int() truncates the median to a whole number; Pclass is 1-based while the array column is 0-based
        guess_ages[s, c - 1] = int(age_guess)

# Step 2: fill the missing ages of both the training and the test set with the medians computed above
for dataset in combine:
    for s in (0, 1):
        for c in (1, 2, 3):
            needs_guess = dataset['Age'].isnull() & (dataset['Sex'] == s) & (dataset['Pclass'] == c)
            dataset.loc[needs_guess, 'Age'] = guess_ages[s, c - 1]

In [11]:
# Displaying the guessed ages: rows are Sex (0 = male, 1 = female), columns are Pclass 1 to 3
guess_ages

array([[40., 30., 25.],
       [35., 28., 21.]])

### Creating Age Bands 

In [12]:
# Binning the continuous Age into five 16-year-wide intervals (a categorical AgeBand column) in both datasets
age_band_edges = [0, 16, 32, 48, 64, 80]
for dataset in combine:
    dataset['AgeBand'] = pd.cut(dataset['Age'], bins=age_band_edges)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeBand
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,"(16, 32]"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,"(32, 48]"
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,"(16, 32]"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,"(32, 48]"
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,"(32, 48]"


In [13]:
# Survival rate per age band: the mean of the 0/1 Survived column within each AgeBand.
# Computed on the training set only, since the test set has no Survived column.
survival_by_band = train_data[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean()
survival_by_band.sort_values(by='AgeBand', ascending=True).round(2)

Unnamed: 0,AgeBand,Survived
0,"(0, 16]",0.55
1,"(16, 32]",0.34
2,"(32, 48]",0.41
3,"(48, 64]",0.43
4,"(64, 80]",0.09


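Apparently, the highest mean among the age bands is in the (0, 16] band: about 55% of the passengers in that band survived. Note that this is a survival rate within the band; on its own it does not mean that the majority of the people who survived were aged between 0 and 16, because the bands do not contain the same number of passengers. Computing the mean might also not be the best way to assess this: since `Survived` is binary, the mean is simply the share of survivors in each band, and it can be unreliable for bands that contain only a few passengers. We'll come back to this computation to assess what's a more accurate method to use.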

In [14]:
# Replacing Age with ordinals based on the above AgeBand in both training and test set:
#   0: Age <= 16, 1: (16, 32], 2: (32, 48], 3: (48, 64], 4: Age > 64
# The replacements run from the youngest band upwards. The codes already written (0-3) are
# all <= 16, so they can never satisfy one of the later "> 16" conditions.
# Run this cell only once: on a second run every code would count as an age <= 16 and become 0.
for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    # The assignment is required: a bare selection would leave the raw ages above 64
    # mixed in with the ordinal codes
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeBand
0,892,3,"Kelly, Mr. James",0,2.0,0,0,330911,7.8292,,Q,"(32, 48]"
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,2.0,1,0,363272,7.0,,S,"(32, 48]"
2,894,2,"Myles, Mr. Thomas Francis",0,3.0,0,0,240276,9.6875,,Q,"(48, 64]"
3,895,3,"Wirz, Mr. Albert",0,1.0,0,0,315154,8.6625,,S,"(16, 32]"
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,1.0,1,1,3101298,12.2875,,S,"(16, 32]"


In [15]:
# Removing the AgeBand column from the frames held in `combine`, as it is no longer needed.
# DataFrame.drop returns a new frame instead of modifying the one it is called on, so the
# result has to be stored; errors='ignore' keeps the cell safe to re-run once the column is gone.
# Only `combine` is rebound here: the `train_data` and `test_data` names are reassigned in the following cells.
combine = [dataset.drop(columns=['AgeBand'], errors='ignore') for dataset in combine]

In [16]:
# Rebinding train_data to a copy without the AgeBand column, then refreshing the list of frames
train_data = train_data.drop(columns=['AgeBand'])
combine = [train_data, test_data]

In [17]:
# Rebinding test_data to a copy without the AgeBand column, then refreshing the list of frames
test_data = test_data.drop(columns=['AgeBand'])
combine = [train_data, test_data]

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Predictors used by the model and the target to learn
features = ["Pclass", "Sex", "SibSp", "Parch", "Age"]
y = train_data["Survived"]

# get_dummies one-hot encodes any categorical column among the selected features
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

# Random forest with a fixed random_state so that the fit is reproducible
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# Writing the submission file: one row per test passenger with its predicted Survived value
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [19]:
# Displaying the submission table: one row per test-set passenger with its predicted Survived value
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
