<a href="https://colab.research.google.com/github/mzignis/titanic/blob/master/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install eli5



In [2]:
# Project root. The path lies under /content/drive/My Drive — presumably a
# Google Drive mount in Colab that must exist before this cell runs (TODO confirm).
HOME = '/content/drive/My Drive/ml_competition/titanic'
# Make the project root the notebook's working directory.
%cd $HOME

/content/drive/My Drive/ml_competition/titanic


In [3]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import eli5

import warnings

  import pandas.util.testing as tm


In [4]:
# Apply seaborn's default plotting theme.
sns.set()
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. about chained
# assignment) that may point at real problems further down.
warnings.filterwarnings('ignore')

In [5]:
# Data directories, all located under HOME/data.
data_dir = os.path.join(HOME, 'data')
# Folder the CSV loaders read from.
input_dir = os.path.join(data_dir, 'input')
# Folder reserved for outputs.
output_dir = os.path.join(data_dir, 'output')

In [6]:
def load_input_data():
    """Read ``train.csv`` from ``input_dir`` and return it as a DataFrame."""
    train_csv_path = os.path.join(input_dir, 'train.csv')
    return pd.read_csv(train_csv_path)

# Load the raw training set and print its column names, non-null counts and dtypes.
train_data_raw = load_input_data()
train_data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
train_data_raw.sample()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
234,235,0,2,"Leyson, Mr. Robert William Norman",male,24.0,0,0,C.A. 29566,10.5,,S


In [8]:
# Work on a copy so train_data_raw is not touched by the in-place edits made later.
train_data = train_data_raw.copy()

In [9]:
def load_test_data():
    """Read ``test.csv`` from ``input_dir`` and return it as a DataFrame."""
    test_csv_path = os.path.join(input_dir, 'test.csv')
    return pd.read_csv(test_csv_path)

# Load the raw test set and print its column names, non-null counts and dtypes.
test_data_raw = load_test_data()
test_data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [10]:
test_data_raw.sample()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
349,1241,2,"Walcroft, Miss. Nellie",female,31.0,0,0,F.C.C. 13528,21.0,,S


In [11]:
# Work on a copy so test_data_raw is not touched by later in-place edits.
test_data = test_data_raw.copy()

In [12]:
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [13]:
# Count missing values per column.
# NOTE(review): this repeats the previous cell verbatim; a check on test_data
# may have been intended here — confirm.
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [14]:
train_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [15]:
def fill_data(dataset):
    """Fill missing ``Age``, ``Fare`` and ``Embarked`` values of ``dataset`` in place.

    ``Age`` and ``Fare`` are filled with the column median, ``Embarked`` with
    its most frequent value. All statistics are computed from ``dataset``
    itself.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the ``Age``, ``Fare`` and ``Embarked`` columns.
        It is modified in place.

    Returns
    -------
    None
    """
    # Assign the filled column back to the frame. Calling
    # ``fillna(inplace=True)`` on ``dataset[col]`` operates on an intermediate
    # object and does not update the frame under pandas Copy-on-Write.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

    # ``mode()`` is empty when every value is missing; there is nothing to
    # fill with in that case, so the column is left as it is.
    embarked_mode = dataset['Embarked'].mode()
    if not embarked_mode.empty:
        dataset['Embarked'] = dataset['Embarked'].fillna(embarked_mode.iloc[0])


def drop_columns(dataset):
    """Remove the ``PassengerId``, ``Ticket`` and ``Cabin`` columns from ``dataset`` in place.

    Raises ``KeyError`` if any of the three columns is absent.
    """
    unused_columns = ['PassengerId', 'Ticket', 'Cabin']
    dataset.drop(labels=unused_columns, axis=1, inplace=True)


# Impute missing values and drop the unused columns of train_data in place,
# then confirm that no missing values remain.
# NOTE: drop_columns raises KeyError if this cell is run a second time on the
# same frame, because the columns are already gone.
fill_data(train_data)
drop_columns(train_data)
train_data.isna().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [16]:
def feature_engineering(dataset, age_bins=5, fare_bins=4, min_title_count=10):
    """Add derived feature columns to ``dataset`` in place.

    Columns added, in this order:

    * ``FamilySize`` - ``SibSp + Parch + 1``.
    * ``IsAlone``    - 1 unless ``FamilySize`` is greater than 1, else 0.
    * ``Title``      - the text between the first comma and the following
      period in ``Name``; titles occurring fewer than ``min_title_count``
      times in ``dataset`` are replaced by ``'Misc'``.
    * ``AgeBin``     - ``pd.cut`` of the integer-truncated ``Age``.
    * ``FareBin``    - ``pd.qcut`` of the integer-truncated ``Fare``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``SibSp``, ``Parch``, ``Name``, ``Age`` and ``Fare``
        columns. ``Age`` and ``Fare`` must not contain missing values because
        they are cast to ``int``. Modified in place.
    age_bins : int or sequence, default 5
        Passed to ``pd.cut``. With an int the bin edges are derived from the
        values in ``dataset``, so two frames generally get different edges;
        pass explicit edges to bin several frames identically.
    fare_bins : int or sequence, default 4
        Passed to ``pd.qcut`` (number of quantiles, or explicit quantiles).
    min_title_count : int, default 10
        Titles seen fewer times than this are collapsed into ``'Misc'``.

    Returns
    -------
    None
    """
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # One vectorised assignment instead of the chained
    # ``dataset['IsAlone'].loc[...] = 0``, which writes through an
    # intermediate object and is not guaranteed to reach the frame.
    dataset['IsAlone'] = (~(dataset['FamilySize'] > 1)).astype('int64')

    # The period is matched literally here. Splitting on the pattern '. '
    # treats '.' as "any character", which cuts e.g. "the Countess" to "th".
    dataset['Title'] = dataset['Name'].str.extract(r',\s*([^.]+)\.', expand=False)

    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), age_bins)
    dataset['FareBin'] = pd.qcut(dataset['Fare'].astype(int), fare_bins)

    # Map each row's title to its frequency and keep only the common ones.
    # Rows whose title could not be extracted have no count and become 'Misc'.
    title_counts = dataset['Title'].map(dataset['Title'].value_counts())
    dataset['Title'] = dataset['Title'].where(title_counts >= min_title_count, 'Misc')

# Add the engineered columns to train_data in place, then preview five random rows.
feature_engineering(train_data)
train_data.sample(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,AgeBin,FareBin
664,1,3,"Lindqvist, Mr. Eino William",male,20.0,1,0,7.925,S,2,0,Mr,"(16.0, 32.0]","(-0.001, 7.0]"
350,0,3,"Odahl, Mr. Nils Martin",male,23.0,0,0,9.225,S,1,1,Mr,"(16.0, 32.0]","(7.0, 14.0]"
885,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,29.125,Q,6,0,Mrs,"(32.0, 48.0]","(14.0, 31.0]"
714,0,2,"Greenberg, Mr. Samuel",male,52.0,0,0,13.0,S,1,1,Mr,"(48.0, 64.0]","(7.0, 14.0]"
256,1,1,"Thorne, Mrs. Gertrude Maybelle",female,28.0,0,0,79.2,C,1,1,Mrs,"(16.0, 32.0]","(31.0, 512.0]"


In [17]:
train_data['Title'].value_counts()

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64

In [20]:
def label_data(dataset, features=['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin']):
    """Add a label-encoded ``<feature>Code`` column for every name in ``features``.

    A single ``LabelEncoder`` is refit on each column in turn, so the codes
    reflect only the values present in ``dataset``. The frame is modified in
    place and nothing is returned.
    """
    encoder = LabelEncoder()
    for column in features:
        code_column = f'{column}Code'
        dataset[code_column] = encoder.fit_transform(dataset[column])
        

# Add the *Code columns to train_data in place and preview one random row.
# NOTE(review): this cell carries execution count In [20] while the one-hot
# cell below carries In [19], so the cells were not run in the order shown.
label_data(train_data)
train_data.sample()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,AgeBin,FareBin,SexCode,EmbarkedCode,TitleCode,AgeBinCode,FareBinCode,Pclass1,Pclass2,Pclass3,EmbarkedC,EmbarkedQ,EmbarkedS,TitleMaster,TitleMisc,TitleMiss,TitleMr,TitleMrs
421,0,3,"Charters, Mr. David",male,21.0,0,0,7.7333,Q,1,1,Mr,"(16.0, 32.0]","(-0.001, 7.0]",1,1,3,1,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
def one_hot_encode_data(dataset, features=['Pclass', 'Embarked', 'Title']):
    """Add one indicator column per category for every name in ``features``.

    For a feature ``F`` with category ``c`` the new column is named ``f'{F}{c}'``
    (e.g. ``Pclass1``, ``EmbarkedS``) and holds 1.0 where the row belongs to
    that category and 0.0 elsewhere. The encoder is refit on each column, so
    the set of generated columns depends on the values present in ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the columns listed in ``features``. Modified in place.
    features : list of str
        Columns to encode.

    Returns
    -------
    None
    """
    encoder = OneHotEncoder()
    for feature in features:
        column = dataset[feature].to_numpy().reshape(-1, 1)
        # Densify once per feature (not once per category) and use a plain
        # ndarray so every slice below is a 1-D column.
        indicators = encoder.fit_transform(column).toarray()
        for idx, category in enumerate(encoder.categories_[0]):
            dataset[f'{feature}{category}'] = indicators[:, idx]

# Add the one-hot indicator columns to train_data in place and preview one random row.
one_hot_encode_data(train_data)
train_data.sample()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,AgeBin,FareBin,SexCode,EmbarkedCode,TitleCode,AgeBinCode,FareBinCode,Pclass1,Pclass2,Pclass3,EmbarkedC,EmbarkedQ,EmbarkedS,TitleMaster,TitleMisc,TitleMiss,TitleMr,TitleMrs
523,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44.0,0,1,57.9792,C,2,0,Mrs,"(32.0, 48.0]","(31.0, 512.0]",0,0,4,2,3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [22]:
def preprocessing_data(dataset, output_name):
    """Run the full preprocessing pipeline on ``dataset`` and save the result.

    The steps run in this order, each modifying ``dataset`` in place:
    ``fill_data``, ``drop_columns``, ``feature_engineering``, ``label_data``,
    ``one_hot_encode_data``. The processed frame is then written as CSV to
    ``input_dir`` under ``output_name`` and returned.

    NOTE(review): the file is written to ``input_dir``; confirm that this,
    rather than ``output_dir``, is the intended destination.
    """
    pipeline = (
        fill_data,
        drop_columns,
        feature_engineering,
        label_data,
        one_hot_encode_data,
    )
    for step in pipeline:
        step(dataset)

    csv_path = os.path.join(input_dir, output_name)
    dataset.to_csv(csv_path)

    return dataset

In [27]:
# Rebuild train_data from a fresh read of train.csv through the full pipeline,
# replacing the frame that was processed step by step above, and write it to
# 'train_preprocessed.csv'.
train_data = preprocessing_data(load_input_data(), 'train_preprocessed.csv')
train_data.sample(1)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,AgeBin,FareBin,SexCode,EmbarkedCode,TitleCode,AgeBinCode,FareBinCode,Pclass1,Pclass2,Pclass3,EmbarkedC,EmbarkedQ,EmbarkedS,TitleMaster,TitleMisc,TitleMiss,TitleMr,TitleMrs
623,0,3,"Hansen, Mr. Henry Damsgaard",male,21.0,0,0,7.8542,S,1,1,Mr,"(16.0, 32.0]","(-0.001, 7.0]",1,2,3,1,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [25]:
# Run the same pipeline on a fresh read of test.csv and write it to
# 'test_preprocessed.csv'.
# NOTE: imputation values, bins and encoders are all computed from the test
# frame itself, so bins and codes need not match those of the training set.
test_data = preprocessing_data(load_test_data(), 'test_preprocessed.csv')
test_data.sample()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,AgeBin,FareBin,SexCode,EmbarkedCode,TitleCode,AgeBinCode,FareBinCode,Pclass1,Pclass2,Pclass3,EmbarkedC,EmbarkedQ,EmbarkedS,TitleMaster,TitleMisc,TitleMiss,TitleMr,TitleMrs
371,1,"Wilson, Miss. Helen Alice",female,31.0,0,0,134.5,C,1,1,Miss,"(30.4, 45.6]","(31.0, 512.0]",0,0,2,2,3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
