In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
# Default size (width, height) for figures created after this cell runs.
plt.rcParams['figure.figsize'] = [15, 5]

In [80]:
!ls ../input_data/
train = pd.read_csv('../input_data/train.csv')
test = pd.read_csv('../input_data/test.csv')

gender_submission.csv test.csv              train.csv


# Define several proxies and new features

In [100]:

def preprocess(T, age_fill=0.5):
    """Build the feature table and the target table from a raw passenger table.

    Parameters
    ----------
    T : pandas.DataFrame
        Raw table. It must contain the columns ``PassengerId``, ``Pclass``,
        ``Sex``, ``Age``, ``SibSp``, ``Parch``, ``Fare``, ``Cabin`` and
        ``Embarked``. ``Survived`` is optional. The caller's frame is never
        modified.
    age_fill : float, default 0.5
        Placeholder written into missing ``Age`` entries. Rows that received
        the placeholder remain identifiable through ``UnknownAge``.

    Returns
    -------
    X : pandas.DataFrame
        Indexed by ``PassengerId`` with the columns ``Pclass``, ``Age``,
        ``Sp-Pa``, ``Fare``, ``ppEmbarked``, ``UnknownCabin``, ``UnknownAge``
        and ``ppSex``. Missing values in ``Fare`` or ``Pclass`` are passed
        through unchanged.
    Y : pandas.DataFrame
        Single column ``Survived`` on the same index. When ``T`` has no
        ``Survived`` column the column is all NaN.
    """
    # set_index returns a new frame, so the column assignments below do not
    # write into the DataFrame owned by the caller.
    T = T.set_index('PassengerId')

    # Markers for missing data (1 = the raw value was missing). They have to
    # be computed before Age is filled further down.
    T['UnknownCabin'] = T['Cabin'].isna().astype(int)
    T['UnknownAge'] = T['Age'].isna().astype(int)

    # Difference between the SibSp and Parch counts.
    T['Sp-Pa'] = T['SibSp'] - T['Parch']

    # Convert to easy-to-process values.

    # 1 where Sex is 'male', 0 for every other value.
    T['ppSex'] = (T['Sex'] == 'male').astype(int)

    # Integer category codes; a missing Embarked becomes -1. The codes follow
    # the categories present in *this* table, so two tables only share a
    # coding when they contain the same set of Embarked values.
    T['ppEmbarked'] = T['Embarked'].astype('category').cat.codes

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the column selection: that chained form is deprecated and is a no-op
    # under pandas Copy-on-Write, which would leave NaN ages in the features.
    T['Age'] = T['Age'].fillna(age_fill)

    # Split dependent (target) and independent (feature) variables.
    if 'Survived' in T.columns:
        Y = T[['Survived']].copy()
    else:
        # No labels available: return an all-NaN target on the same index.
        Y = pd.DataFrame({'Survived': np.full(len(T), np.nan)}, index=T.index)

    # Only keep some features.
    keep = ['Pclass', 'Age', 'Sp-Pa', 'Fare', 'ppEmbarked', 'UnknownCabin', 'UnknownAge', 'ppSex']

    # Return an independent copy so later edits to the features neither warn
    # about nor write into the intermediate frame built here.
    return T[keep].copy(), Y

In [101]:
# Apply the same preprocessing to both tables; the test targets come back as NaN.
(Xtrain, Ytrain), (Xtest, Ytest) = preprocess(train), preprocess(test)

In [72]:
# Preview the first five rows of the training features.
Xtrain.head(n=5)

Unnamed: 0_level_0,Pclass,Age,Sp-Pa,Fare,ppEmbarked,UnknownCabin,UnknownAge,ppSex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,22.0,1,7.25,2,1,0,1
2,1,38.0,1,71.2833,0,0,0,0
3,3,26.0,0,7.925,2,1,0,0
4,1,35.0,1,53.1,2,0,0,0
5,3,35.0,0,8.05,2,1,0,1


In [95]:
# Count the NaN entries per training feature and show them as a single row.
print("Number of missing values")
Xtrain.isna().sum(axis=0).to_frame().T

Number of missing values


Unnamed: 0,Pclass,Age,Sp-Pa,Fare,ppEmbarked,UnknownCabin,UnknownAge,ppSex
0,0,0,0,0,0,0,0,0


In [104]:
# Count the NaN entries per test feature and show them as a single row.
Xtest.isna().sum(axis=0).to_frame().T

Unnamed: 0,Pclass,Age,Sp-Pa,Fare,ppEmbarked,UnknownCabin,UnknownAge,ppSex
0,0,0,0,1,0,0,0,0


## Fix the missing value by hand

In [111]:
# Show every test row that still has a NaN in at least one column.
Xtest.loc[Xtest.isna().any(axis='columns')]

Unnamed: 0_level_0,Pclass,Age,Sp-Pa,Fare,ppEmbarked,UnknownCabin,UnknownAge,ppSex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1044,3,60.5,0,,2,1,0,1


In [112]:
# Mean training fare for each passenger class.
Xtrain.groupby('Pclass')[['Fare']].mean()

Unnamed: 0_level_0,Fare
Pclass,Unnamed: 1_level_1
1,84.154687
2,20.662183
3,13.67555


In [113]:
# Passenger 1044 (Pclass 3) is the only test row with a missing Fare; 13.7 is
# the rounded mean third-class fare of the training set.
# Only the Fare column is filled, so a NaN in any other column can never be
# silently replaced by a fare value. Rebinding instead of inplace=True keeps
# the cell safe to re-run.
Xtest = Xtest.fillna({'Fare': 13.7})

# Storing results
Write the results to an HDF file in `../data`.

In [114]:
# Write all four frames into one HDF file; mode='w' opens the file for writing.
with pd.HDFStore('../data/processed.h5', mode='w') as store:
    for key, frame in (
        ('Xtrain', Xtrain),
        ('Ytrain', Ytrain),
        ('Xtest', Xtest),
        ('Ytest', Ytest),
    ):
        store.put(key, frame)