In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import scipy as sp
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Default size for every figure in this notebook: [width, height] = [15, 5].
plt.rcParams['figure.figsize'] = [15, 5]

In [2]:
!ls ../input_data/
train = pd.read_csv('../input_data/train.csv')
test = pd.read_csv('../input_data/test.csv')

gender_submission.csv test.csv              train.csv


# Feature engineering

In [3]:

def preprocess(T, embarked_categories=('C', 'Q', 'S')):
    """Turn a raw passenger table into a feature frame and a target frame.

    Parameters
    ----------
    T : pandas.DataFrame
        Raw table with at least the columns ``PassengerId``, ``Pclass``,
        ``Name``, ``Sex``, ``Age``, ``SibSp``, ``Parch``, ``Fare``, ``Cabin``
        and ``Embarked``. A ``Survived`` column is optional.
    embarked_categories : sequence of str, optional
        Ordered list of known ``Embarked`` values. ``ppEmbarked`` is the
        position of the value in this list, so the encoding is identical for
        every frame passed through this function. Values outside the list
        (including missing ones) are encoded as ``-1``.

    Returns
    -------
    X : pandas.DataFrame
        Indexed by ``PassengerId`` with the columns ``Pclass``, ``AgeCat``,
        ``Sp-Pa``, ``SibSp``, ``Parch``, ``FareCat``, ``ppEmbarked``,
        ``UnknownCabin``, ``UnknownAge`` and ``ppSex``.
    Y : pandas.DataFrame
        Single column ``Survived`` on the same index; all NaN when the input
        has no ``Survived`` column.
    """
    # set_index returns a new frame, so the caller's table is left untouched.
    T = T.set_index('PassengerId')

    # Markers for missing data
    unknown_age = T['Age'].isna()
    T['UnknownCabin'] = T['Cabin'].isna().astype(int)
    T['UnknownAge'] = unknown_age.astype(int)
    T['Sp-Pa'] = T['SibSp'] - T['Parch']

    # Fare categories: 0 for Fare <= 10, 2 for Fare >= 50, 1 otherwise.
    # Comparisons with NaN are False, so a missing fare stays in category 1.
    T['FareCat'] = 1
    T.loc[T['Fare'] <= 10.0, 'FareCat'] = 0
    T.loc[T['Fare'] >= 50.0, 'FareCat'] = 2

    # Age category: 0 ... below 10 (children and babies), 1 ... 10 and above.
    # Missing ages compare False here and are replaced further down.
    T['AgeCat'] = (T['Age'] >= 10.0).astype(int)

    # Estimate the age category from the title:
    # 'Miss.' and 'Master.' are treated as children, everything else as adult.
    # The title is the text between the comma and the first following period
    # ("Surname, Title. Given names"). Taking the second whitespace token
    # instead would return part of the surname for multi-word surnames.
    T['Title'] = T['Name'].str.extract(r',\s*([^,.]+\.)', expand=False).str.strip()
    T['AgeCatByTitle'] = 1
    T.loc[T['Title'].isin(['Miss.', 'Master.']), 'AgeCatByTitle'] = 0

    # For entries without an age, fall back to the title-based estimate.
    T.loc[unknown_age, 'AgeCat'] = T.loc[unknown_age, 'AgeCatByTitle']

    # Convert to easy to process values

    # 0 ... female, 1 ... male
    T['ppSex'] = (T['Sex'] == 'male').astype(int)

    # Encode against a fixed category list rather than the values that happen
    # to occur in this frame, so separate calls produce the same codes.
    port_dtype = pd.api.types.CategoricalDtype(categories=list(embarked_categories))
    T['ppEmbarked'] = T['Embarked'].astype(port_dtype).cat.codes

    if 'Survived' in T.columns:
        # Split dependent and independent variables
        Y = T[['Survived']].copy()
        T = T.drop(columns='Survived')
    else:
        # No labels available: return an all-NaN placeholder on the same index.
        Y = pd.DataFrame({'Survived': np.nan}, index=T.index)

    # Only keep some features
    keep = ['Pclass', 'AgeCat', 'Sp-Pa', 'SibSp', 'Parch', 'FareCat',
            'ppEmbarked', 'UnknownCabin', 'UnknownAge', 'ppSex']

    return T[keep], Y

In [4]:
# Run both splits through the same pipeline; each call yields (features, target).
(Xtrain, Ytrain), (Xtest, Ytest) = preprocess(train), preprocess(test)

In [5]:
# Count NaNs per training feature column and show them as a single row.
print("Number of missing values")
Xtrain.isna().sum(axis=0).to_frame().T

Number of missing values


Unnamed: 0,Pclass,AgeCat,Sp-Pa,SibSp,Parch,FareCat,ppEmbarked,UnknownCabin,UnknownAge,ppSex
0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Count NaNs per test feature column and show them as a single row.
Xtest.isna().sum(axis=0).to_frame().T

Unnamed: 0,Pclass,AgeCat,Sp-Pa,SibSp,Parch,FareCat,ppEmbarked,UnknownCabin,UnknownAge,ppSex
0,0,0,0,0,0,0,0,0,0,0


# Storing results
Write the processed feature and target frames to an HDF5 file in `../data`.

In [7]:
from pandas import HDFStore

# Write all four frames into one HDF5 file, each under its own key.
# mode='w' is passed so the file is opened for writing on every run.
with HDFStore('../data/processed.h5', mode='w') as hdf:
    for key, frame in (('Xtrain', Xtrain), ('Ytrain', Ytrain),
                       ('Xtest', Xtest), ('Ytest', Ytest)):
        hdf.put(key, frame)