In [36]:
#hide

from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *
from IPython.display import Image, display_svg, SVG

pd.options.display.max_rows = 20
pd.options.display.max_columns = 8

In [37]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#from pathlib import Path

datap = Path('./kaggle/titanic')

for dirname, _, filenames in os.walk(datap):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

kaggle/titanic/train.csv
kaggle/titanic/test.csv
kaggle/titanic/gender_submission.csv
kaggle/titanic/to.pkl


In [38]:
train_data = pd.read_csv(datap / "train.csv")
train_data.head()
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [39]:
test_data = pd.read_csv(datap / "test.csv")
test_data.head()
test_data.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# Kaggle method

In [114]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [40]:
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


# fastai method

In [101]:
procs = [FillMissing, Categorify]
features = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
dep_var = 'Survived'
# all_data = pd.concat([train_data, test_data], sort=False)
# It looks to me like the test data has no survival column
all_data = train_data[features + [dep_var]]

cont, cat = cont_cat_split(all_data, 1, dep_var=dep_var)

In [102]:
split_idx = int(math.floor(len(all_data) * 0.70))
splits = (list(range(0, split_idx)), 
          list(range(split_idx, len(all_data))))

In [103]:
to = TabularPandas(
    all_data[features + [dep_var]],
    procs,
    cat,
    cont,
    y_names=dep_var,
    splits=splits
)
to.show(3)

Unnamed: 0,Sex,Ticket,Cabin,Embarked,Age_na,PassengerId,Pclass,Age,SibSp,Parch,Survived
0,male,A/5 21171,#na#,S,False,1,3,22.0,1,0,0
1,female,PC 17599,C85,C,False,2,1,38.0,1,0,1
2,female,STON/O2. 3101282,#na#,S,False,3,3,26.0,0,0,1


In [104]:
save_pickle(datap / 'to.pkl', to)

In [105]:
to = (datap / 'to.pkl').load()

AttributeError: 'PosixPath' object has no attribute 'load'

In [106]:
xs,y = to.train.xs, to.train.y
valid_xs,valid_y = to.valid.xs, to.valid.y

In [78]:
def r_mse(pred,y): return round(math.sqrt(((pred-y)**2).mean()), 6)
def m_rmse(m, xs, y): return r_mse(m.predict(xs), y)

In [107]:
m = DecisionTreeRegressor(min_samples_leaf=14)
m.fit(xs, y)

m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

(0.340313, 0.361941)

In [109]:
test_to = TabularPandas(
    test_data[features],
    procs,
    cat,
    cont
)
test_to.show(3)
test_cleaned = test_to.train.xs

Unnamed: 0,Sex,Ticket,Cabin,Embarked,Age_na,PassengerId,Pclass,Age,SibSp,Parch
0,male,330911,#na#,Q,False,892,3,34.5,0,0
1,female,363272,#na#,S,False,893,3,47.0,1,0
2,male,240276,#na#,Q,False,894,2,62.0,0,0


In [117]:
predictions = m.predict(test_cleaned);
predictions[0:10], predictions.round()[0:10]

(array([0.        , 0.2173913 , 0.36842105, 0.        , 0.2173913 ,
        0.        , 0.84      , 0.36842105, 0.6       , 0.        ]),
 array([0., 0., 0., 0., 0., 0., 1., 0., 1., 0.]))

In [118]:
output = pd.DataFrame(
    {
        'PassengerId': test_cleaned.PassengerId,
        'Survived': predictions.round().astype(int)
    }
)
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [119]:
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
