In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    # pair each file name with the directory currently being visited
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


In [2]:
# Load the training passengers into a DataFrame and preview the first rows
train_csv_path = '/kaggle/input/titanic/train.csv'
train_data = pd.read_csv(train_csv_path)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Load the test passengers into a DataFrame and preview the first rows
test_csv_path = "/kaggle/input/titanic/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Count the missing entries in each training column
null_mask = train_data.isnull()
missing_val_count_by_column = null_mask.sum()
print(missing_val_count_by_column)

# Show the dtype pandas inferred for each column
train_data.dtypes

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
# Count the missing entries in each test column
test_null_mask = test_data.isnull()
test_null_mask.sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [6]:
# Create new feature based on passengers' titles (ex. Mr, Ms, etc.)

# Extract the passenger's title: the first run of letters directly followed by a ".".
# .str.extract is vectorized over the whole column, so one call is enough; the
# previous per-row loop repeated this identical full-column assignment for every row.
# The raw string keeps "\." from being read as an (invalid) Python string escape, and
# expand=False returns a Series, which is the natural value for a single new column.
train_data['Title'] = train_data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

train_data['Title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Major         2
Mlle          2
Col           2
Sir           1
Countess      1
Mme           1
Jonkheer      1
Ms            1
Don           1
Capt          1
Lady          1
Name: Title, dtype: int64

In [7]:
# Transform titles which are synonyms or similar.
# 'Dona' is listed so that this mapping agrees with the one applied to the test data;
# keys that do not occur in the column are simply left unmatched by replace().
train_data['Title'] = train_data['Title'].replace({
    # synonyms of the common titles
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
    # professional / military / clerical titles
    'Dr': 'Other', 'Rev': 'Other', 'Major': 'Other', 'Col': 'Other', 'Capt': 'Other',
    # titles of nobility
    'Countess': 'Noble', 'Sir': 'Noble', 'Jonkheer': 'Noble', 'Lady': 'Noble',
    'Don': 'Noble', 'Dona': 'Noble',
})
train_data['Title'].value_counts()

Mr        517
Miss      185
Mrs       126
Master     40
Other      18
Noble       5
Name: Title, dtype: int64

In [8]:
# Also create the title feature for the test data

# Extract the title: the first run of letters directly followed by a ".".
# A single vectorized call covers the whole column; the previous per-row loop
# repeated this identical full-column assignment for every row.
# The raw string keeps "\." from being read as an (invalid) Python string escape, and
# expand=False returns a Series, which is the natural value for a single new column.
test_data['Title'] = test_data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Transform titles which are synonyms or similar
test_data['Title'] = test_data['Title'].replace({
    # synonyms of the common titles
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
    # professional / military / clerical titles
    'Dr': 'Other', 'Rev': 'Other', 'Major': 'Other', 'Col': 'Other', 'Capt': 'Other',
    # titles of nobility
    'Countess': 'Noble', 'Sir': 'Noble', 'Jonkheer': 'Noble', 'Lady': 'Noble',
    'Don': 'Noble', 'Dona': 'Noble',
})
test_data['Title'].value_counts()

Mr        240
Miss       79
Mrs        72
Master     21
Other       5
Noble       1
Name: Title, dtype: int64

In [9]:
# Replace NA values for age with median age for title category

# Median age per title; the index of this Series is the list of titles
train_median_age_by_title = train_data.groupby('Title')['Age'].median()
titles = train_median_age_by_title.index.tolist()
print(titles)

# Look up each row's title median and use it only where Age is missing
train_age_fallback = train_data['Title'].map(train_median_age_by_title)
train_data['Age'] = train_data['Age'].fillna(train_age_fallback)

print(train_data['Age'].describe())

['Master', 'Miss', 'Mr', 'Mrs', 'Noble', 'Other']
count    891.000000
mean      29.394130
std       13.270911
min        0.420000
25%       21.000000
50%       30.000000
75%       35.000000
max       80.000000
Name: Age, dtype: float64


In [10]:
# Repeat to replace NA values for age in test data

# Median age per title, computed from the test data itself
test_median_age_by_title = test_data.groupby('Title')['Age'].median()
titles = test_median_age_by_title.index.tolist()
print(titles)

# Look up each row's title median and use it only where Age is missing
test_age_fallback = test_data['Title'].map(test_median_age_by_title)
test_data['Age'] = test_data['Age'].fillna(test_age_fallback)

print(test_data['Age'].describe())

['Master', 'Miss', 'Mr', 'Mrs', 'Noble', 'Other']
count    418.000000
mean      29.660287
std       12.971976
min        0.170000
25%       22.000000
50%       28.500000
75%       36.500000
max       76.000000
Name: Age, dtype: float64


In [11]:
# Fill NA values in the test data's Fare column with the column mean

mean_test_fare = test_data['Fare'].mean()
test_data['Fare'] = test_data['Fare'].fillna(value=mean_test_fare)

# Print the remaining NA count per column to check Age and Fare are now complete
remaining_nulls = test_data.isnull().sum()
print(remaining_nulls)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Title            0
dtype: int64


In [12]:
# Compare different tree sizes

# load necessary packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Target vector: the Survived column of the training data
y = train_data['Survived']

# Feature matrix: the columns we think will influence survival outcome,
# one-hot encoded with pd.get_dummies so categorical values become indicator columns
features = ['Pclass','Sex','Title']
selected_columns = train_data[features]
x = pd.get_dummies(selected_columns)

# Hold out a validation set (fixed random_state so the split is repeatable)
split_parts = train_test_split(x, y, random_state=1)
train_X, val_X, train_y, val_y = split_parts

# Define function to retrieve mean absolute error
def get_mae(depth, train_X, val_X, train_y, val_y):
    """Fit a random forest with the given max_depth and score it on the validation set.

    Parameters:
        depth: value passed as max_depth to RandomForestClassifier.
        train_X, train_y: features and target used to fit the forest.
        val_X, val_y: features and target used to score it.

    Returns:
        The mean absolute error between val_y and the forest's predictions for val_X.
    """
    forest = RandomForestClassifier(n_estimators=100, max_depth=depth, random_state=1)
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

# Candidate values for max_depth
candidate_depth = [2, 3, 4, 5, 6, 7, 8, 10, 15, 20, 25,100,200]

# Score every candidate; keys are depths, values are the validation MAE
mae_dict = {
    depth: get_mae(depth, train_X, val_X, train_y, val_y)
    for depth in candidate_depth
}

print(mae_dict)
# Keep the depth with the lowest error (the first such depth wins on ties)
best_tree_size = min(mae_dict, key=lambda depth: mae_dict[depth])
print(best_tree_size)

{2: 0.21524663677130046, 3: 0.21076233183856502, 4: 0.26905829596412556, 5: 0.26905829596412556, 6: 0.242152466367713, 7: 0.242152466367713, 8: 0.242152466367713, 10: 0.242152466367713, 15: 0.242152466367713, 20: 0.242152466367713, 25: 0.242152466367713, 100: 0.242152466367713, 200: 0.242152466367713}
3


In [None]:
# Compare outcomes using different features
# (the target y was already created in an earlier cell)
# Create X, features we think will influence survival outcome
features = ['Pclass','Sex', 'Title', 'Fare', 'Parch']
x = pd.get_dummies(train_data[features])

# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(x, y, random_state=1)

# Specify model
model = RandomForestClassifier(n_estimators = 100, max_depth = 6, random_state = 1)
model.fit(train_X, train_y)

# Make predictions and calculate mean absolute error.
# val_X was split from the already one-hot-encoded x, so it goes to the model as is
# (running get_dummies on it a second time added nothing).
predictions = model.predict(val_X)
# Arguments follow the documented (y_true, y_pred) order
val_mae = mean_absolute_error(val_y, predictions)

print(val_mae)

In [None]:
# Create target, y
y = train_data.Survived

# Create X, features we think will influence survival outcome
# Must use pd.get_dummies for "one hot encoding" to take care of categorical vals
features = ['Pclass','Sex','Title','Parch','Fare']
x = pd.get_dummies(train_data[features])

# Specify Model - Random Forest Classifier
titanic_model = RandomForestClassifier(n_estimators = 100, max_depth = 6, random_state=1)
titanic_model.fit(x, y)

# Make predictions
# The test frame is one-hot encoded independently of the training frame, so a category
# present in only one of them would yield a different set (or order) of columns.
# Reindexing to the training columns guarantees the model sees exactly the layout it
# was fitted on: indicator columns missing from the test data are added as all-zero,
# and columns unknown to the model are dropped.
test_X = pd.get_dummies(test_data[features]).reindex(columns=x.columns, fill_value=0)
titanic_predictions = titanic_model.predict(test_X)


In [None]:
# Build the submission table (passenger id + predicted survival) and save it as CSV
output = test_data[['PassengerId']].assign(Survived=titanic_predictions)
output.to_csv('my_submission.csv', index=False)