In [36]:
#hide

from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *
from IPython.display import Image, display_svg, SVG

pd.options.display.max_rows = 20
pd.options.display.max_columns = 8

In [260]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#from pathlib import Path

datap = Path('./kaggle/titanic')

for dirname, _, filenames in os.walk(datap):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

kaggle/titanic/train.csv
kaggle/titanic/test.csv
kaggle/titanic/gender_submission.csv
kaggle/titanic/to.pkl


In [38]:
train_data = pd.read_csv(datap / "train.csv")
train_data.head()
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [39]:
test_data = pd.read_csv(datap / "test.csv")
test_data.head()
test_data.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# Kaggle method

In [114]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [40]:
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


# fastai method

In [131]:
def r_mse(pred,y): return round(math.sqrt(((pred-y)**2).mean()), 6)
def m_rmse(m, xs, y): return r_mse(m.predict(xs), y)

In [271]:
procs = [FillMissing, Categorify, Normalize]
features = [
    'Pclass', 'Sex',
    'Age', 'SibSp', 'Parch', 'Cabin', 
    'Embarked', 'Fare', 'Ticket'
]
dep_var = 'Survived'
# all_data = pd.concat([train_data, test_data], sort=False)
# It looks to me like the test data has no survival column
all_data = train_data[features + [dep_var]]

cont, cat = cont_cat_split(all_data, 1, dep_var=dep_var)

split_idx = int(math.floor(len(all_data) * 0.75))
splits = (list(range(0, split_idx)), 
          list(range(split_idx, len(all_data))))

In [272]:
to = TabularPandas(
    all_data[features + [dep_var]],
    procs,
    cat,
    cont,
    y_names=dep_var,
    splits=splits
)
to.show(10)

# save_pickle(datap / 'to.pkl', to)
# to = (datap / 'to.pkl').load()

xs,y = to.train.xs, to.train.y
valid_xs,valid_y = to.valid.xs, to.valid.y

Unnamed: 0,Sex,Cabin,Embarked,Ticket,Age_na,Pclass,Age,SibSp,Parch,Fare,Survived
0,male,#na#,S,A/5 21171,False,3.0,22.0,1.0,0.0,7.25,0
1,female,C85,C,PC 17599,False,1.0,38.0,1.0,0.0,71.283302,1
2,female,#na#,S,STON/O2. 3101282,False,3.0,26.0,0.0,0.0,7.925,1
3,female,C123,S,113803,False,1.0,35.0,1.0,0.0,53.099998,1
4,male,#na#,S,373450,False,3.0,35.0,0.0,0.0,8.05,0
5,male,#na#,Q,330877,True,3.0,28.0,0.0,0.0,8.4583,0
6,male,E46,S,17463,False,1.0,54.0,0.0,0.0,51.862499,0
7,male,#na#,S,349909,False,3.0,2.0,3.0,1.0,21.075001,0
8,female,#na#,S,347742,False,3.0,27.0,0.0,2.0,11.1333,1
9,female,#na#,C,237736,False,2.0,14.0,1.0,0.0,30.070801,1


In [273]:
m = DecisionTreeRegressor(min_samples_leaf=14)
m.fit(xs, y)

m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

(0.339971, 0.353539)

In [274]:
test_to = TabularPandas(
    test_data[features],
    procs,
    cat,
    cont
)
test_to.show(3)
test_cleaned = test_to.train.xs

Unnamed: 0,Sex,Cabin,Embarked,Ticket,Age_na,Fare_na,Pclass,Age,SibSp,Parch,Fare
0,male,#na#,Q,330911,False,False,3.0,34.5,0.0,0.0,7.8292
1,female,#na#,S,363272,False,False,3.0,47.0,1.0,0.0,7.0
2,male,#na#,Q,240276,False,False,2.0,62.0,0.0,0.0,9.6875


In [275]:
del test_cleaned['Fare_na']

In [276]:
predictions = m.predict(test_cleaned);
predictions[0:10], predictions.round()[0:10]

(array([0.        , 0.26666667, 0.38095238, 0.        , 0.65      ,
        0.        , 0.57142857, 0.38095238, 0.57142857, 0.        ]),
 array([0., 0., 0., 0., 1., 0., 1., 0., 1., 0.]))

In [277]:
output = pd.DataFrame(
    {
        'PassengerId': test_data.PassengerId,
        'Survived': predictions.round().astype(int)
    }
)
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [278]:
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
