Inspiration and code snippets from 
- https://www.kaggle.com/dromosys/fast-ai-titanic
- https://www.kaggle.com/gunesevitan/advanced-feature-engineering-tutorial-with-titanic

In [1]:
#!pip install fastai --upgrade

In [2]:

from fastai import *
from fastai.tabular import *
import pandas as pd

In [3]:
df_test = pd.read_csv('../input/test.csv')
df_train = pd.read_csv('../input/train.csv')

def concat_df(train_data, test_data):
    # Returns a concatenated df of training and test set on axis 0
    return pd.concat([train_data, test_data], sort=True).reset_index(drop=True)

def divide_df(all_data):
    # Returns divided dfs of training and test set
    return all_data.loc[:890], all_data.loc[891:].drop(['Survived'], axis=1)

df_all = concat_df(df_train, df_test)

df_train.name = 'Training Set'
df_test.name = 'Test Set'
df_all.name = 'All Set' 

dfs = [df_train, df_test]

#print(dfs[:5])
print(df_test[:5])

   PassengerId  Pclass   ...    Cabin Embarked
0          892       3   ...      NaN        Q
1          893       3   ...      NaN        S
2          894       2   ...      NaN        Q
3          895       3   ...      NaN        S
4          896       3   ...      NaN        S

[5 rows x 11 columns]


In [4]:
print('Training examples = {}'.format(df_train.shape[0]))
print('Test examples = {}'.format(df_test.shape[0]))

print('\nTraining columns:\n',df_train.columns)
print('\nTesting colums:\n',df_test.columns)

Training examples = 891
Test examples = 418

Training columns:
 Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Testing colums:
 Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [5]:
#Display missing values

def display_missing(df):    
    for col in df.columns.tolist():          
        print('{} column missing values: {}'.format(col, df[col].isnull().sum()))
    print('\n')
    
for df in dfs:
    print('{}'.format(df.name))
    display_missing(df)



Training Set
PassengerId column missing values: 0
Survived column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 177
SibSp column missing values: 0
Parch column missing values: 0
Ticket column missing values: 0
Fare column missing values: 0
Cabin column missing values: 687
Embarked column missing values: 2


Test Set
PassengerId column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 86
SibSp column missing values: 0
Parch column missing values: 0
Ticket column missing values: 0
Fare column missing values: 1
Cabin column missing values: 327
Embarked column missing values: 0




In [6]:
#Extract title from name
#Extract deck from the first letter in cabin number
#Fill missing values in age 

for df in [df_train, df_test]:
    df['Title'] = df['Name'].str.split(',').str[1].str.split(' ').str[1]
    df['Deck'] = df['Cabin'].str[0]

# find mean age for each Title across train and test data sets
all_df = pd.concat([df_train, df_test], sort=False)
mean_age_by_title = all_df.groupby('Title').mean()['Age']
# update missing ages
for df in [df_train,df_test]:
    for title, age in mean_age_by_title.iteritems():
        df.loc[df['Age'].isnull() & (df['Title'] == title), 'Age'] = age

In [7]:
print(mean_age_by_title)

Title
Capt.        70.000000
Col.         54.000000
Don.         40.000000
Dona.        39.000000
Dr.          43.571429
Jonkheer.    38.000000
Lady.        48.000000
Major.       48.500000
Master.       5.482642
Miss.        21.774238
Mlle.        24.000000
Mme.         24.000000
Mr.          32.252151
Mrs.         36.994118
Ms.          28.000000
Rev.         41.250000
Sir.         49.000000
the          33.000000
Name: Age, dtype: float64


In [8]:
#Two missing values for embarked in training set
df_all[df_all['Embarked'].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
61,38.0,B28,,80.0,"Icard, Miss. Amelie",0,62,1,female,0,1.0,113572
829,62.0,B28,,80.0,"Stone, Mrs. George Nelson (Martha Evelyn)",0,830,1,female,0,1.0,113572


In [9]:
#https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html
#Mrs Stone embarked from Southamptonn with her maid
df_all['Embarked'] = df_all['Embarked'].fillna('S')

In [10]:
#missing value for Fare
df_all[df_all['Fare'].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
1043,60.5,,S,,"Storey, Mr. Thomas",0,1044,3,male,0,,3701


In [11]:
#Median fare for passenger in class 3, Parch 0 and 0 Siblings&Spouse
med_fare = df_all.groupby(['Pclass','Parch','SibSp']).Fare.median()[3][0][0]
df_test.Fare.fillna(med_fare,inplace=True)

### FastAi setup

##### We still have missing cabin values, but check the accuracy first

In [12]:
dep_var = 'Survived'
cat_names = ['Pclass', 'Sex', 'Embarked', 'Title', 'Deck']
cont_names = ['Age', 'Fare', 'SibSp', 'Parch']
procs = [FillMissing, Categorify, Normalize]

test = TabularList.from_df(df_test, cat_names=cat_names, cont_names=cont_names, procs=procs)
data = (TabularList.from_df(df_train, path='.', cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_by_idx(list(range(0,200)))
                           #.split_by_idx(valid_idx=range(200,400))
                           .label_from_df(cols=dep_var)
                           .add_test(test, label=0)
                           .databunch())

#### Training

In [13]:
np.random.seed(101)

In [14]:
learn = tabular_learner(data, layers=[600,200], metrics=accuracy, emb_drop=0.1)
learn.fit_one_cycle(20)

epoch,train_loss,valid_loss,accuracy,time
0,0.667501,0.697125,0.36,00:01
1,0.616304,0.689088,0.56,00:00
2,0.585829,0.662249,0.62,00:00
3,0.553692,0.561784,0.74,00:00
4,0.527322,0.499023,0.71,00:00
5,0.506394,0.396857,0.84,00:00
6,0.482307,0.469767,0.745,00:00
7,0.465879,0.435952,0.815,00:00
8,0.445569,0.439601,0.785,00:00
9,0.432952,0.387419,0.855,00:00


In [15]:
#learn.lr_find()

In [16]:
#learn.recorder.plot()

In [17]:
#learn.fit(15, 1e-02)

In [18]:
# get predictions
preds, targets = learn.get_preds()

predictions = np.argmax(preds, axis = 1)
pd.crosstab(predictions, targets)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,115,18
1,16,51


In [19]:
predictions, *_ = learn.get_preds(DatasetType.Test)
labels = np.argmax(predictions, 1)

In [20]:
sub_df = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': labels})
sub_df.to_csv('submission.csv', index=False)

In [21]:
sub_df.tail()

Unnamed: 0,PassengerId,Survived
413,1305,0
414,1306,1
415,1307,0
416,1308,0
417,1309,1
