<a href="https://colab.research.google.com/github/mzignis/titanic/blob/master/preapare_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [68]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler

# Switch matplotlib to seaborn's default theme.
sns.set()

In [2]:
# Project root (a Colab Google Drive path, judging by the prefix — adjust when
# running elsewhere); the working directory is switched to it.
HOME = '/content/drive/My Drive/ml_competition/titanic'
%cd $HOME

/content/drive/My Drive/ml_competition/titanic


In [3]:
# The CSV inputs are read from the 'data' sub-folder of the project root;
# list it to check which files are available.
data_dir = os.path.join(HOME, 'data')
os.listdir(data_dir)

['test.csv', 'train.csv', 'gender_submission.csv']

In [60]:
# Load the training set and preview its first rows.
train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [61]:
def fill_age(df):
    """Fill missing ``Age`` entries with the median of the column.

    The median is computed from the ``Age`` values of ``df`` itself.
    ``df`` is modified in place and is also returned, so the call can be used
    either as a statement or in an assignment.
    """
    ages = df['Age']
    median_age = ages.median()
    df['Age'] = ages.fillna(median_age)
    return df

# fill_age modifies train_df in place, so the return value is not needed here;
# the count below checks that no Age value is left missing.
fill_age(train_df)
train_df.Age.isna().sum()

0

In [62]:
def encode_sex(df):
    """Encode the ``Sex`` column as integers: ``female`` -> 0, ``male`` -> 1.

    A fixed mapping is used instead of fitting a new encoder on every frame.
    A per-frame fit assigns codes from whatever values happen to be present,
    so a file containing a single sex would be encoded differently from one
    containing both. The codes chosen here are the alphabetical ones, i.e. the
    same codes a per-frame label encoder yields when both values occur.

    Values that are not in the mapping (including already-encoded 0/1 and
    missing values) are left unchanged, which makes re-running the step on an
    already processed frame a no-op.

    ``df`` is modified in place and also returned.
    """
    sex_codes = {'female': 0, 'male': 1}
    df['Sex'] = df['Sex'].map(lambda value: sex_codes.get(value, value))
    return df

# Encode in place, then list the distinct codes now stored in 'Sex'.
encode_sex(train_df)
train_df['Sex'].unique()

array([1, 0])

In [63]:
def scrap_digit(text):
    """Return the first purely numeric whitespace-separated token as an int.

    Parameters
    ----------
    text : str
        Raw text, e.g. a ticket string such as ``'A/5 21171'``.

    Returns
    -------
    int
        The first token made up only of decimal digits, or ``0`` when no such
        token exists (e.g. ``'LINE'`` or an empty string).
    """
    # str.isdecimal() is used rather than str.isdigit(): isdigit() is also true
    # for characters such as superscript digits, which int() cannot parse.
    # The generator stops at the first match instead of converting every token.
    return next(
        (int(token) for token in text.split() if token.isdecimal()),
        0,
    )

def scrap_ticket_number(df):
    """Replace each ``Ticket`` entry with the number extracted by ``scrap_digit``.

    ``df`` is modified in place and also returned.
    """
    ticket_numbers = df['Ticket'].apply(scrap_digit)
    df['Ticket'] = ticket_numbers
    return df

# Replace the ticket strings with their numeric part (in place) and show one
# randomly chosen row as a spot check.
scrap_ticket_number(train_df)
train_df.sample()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
890,891,0,3,"Dooley, Mr. Patrick",1,32.0,0,0,370376,7.75,,Q


In [64]:
def one_hot_encode_embarked(df, ports=('C', 'Q', 'S')):
    """Replace ``Embarked`` with one 0/1 indicator column per port.

    Missing ``Embarked`` values are first filled with the most frequent value
    of the column. If the column has no values at all, nothing is filled and
    every indicator is 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Embarked`` column. It is not modified.
    ports : iterable of str, optional
        Ports that always get an indicator column, even when absent from
        ``df``. This keeps the column layout identical across files (e.g.
        train and test). Any other value found in the data gets its own
        column as well.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Embarked`` and with the indicator columns
        appended in sorted order.
    """
    embarked = df['Embarked']
    counts = embarked.value_counts()
    if not counts.empty:
        embarked = embarked.fillna(counts.index[0])

    # Sorted union of the expected and the observed ports: the same ordering
    # get_dummies uses, but independent of which ports this file contains.
    categories = sorted(set(ports) | set(embarked.dropna()))

    # The dtype is pinned so the indicators are 0/1 integers regardless of the
    # pandas version (newer releases default to bool).
    indicators = (
        pd.get_dummies(embarked, dtype='uint8')
        .reindex(columns=categories, fill_value=0)
        .astype('uint8')
    )
    return pd.concat([df.drop(columns=['Embarked']), indicators], axis=1)

# This step returns a new frame, so the result has to be re-assigned.
train_df = one_hot_encode_embarked(train_df)
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'C', 'Q', 'S'],
      dtype='object')

In [83]:
def preprocessing_data(csv_name, scaller=None):
    """Load a CSV from ``data_dir`` and run the full preprocessing pipeline.

    Steps: fill missing ages, encode ``Sex``, reduce ``Ticket`` to a number,
    one-hot encode ``Embarked``, drop ``Cabin``/``Name``/``PassengerId`` and
    standardise the numeric columns.

    Parameters
    ----------
    csv_name : str
        File name inside ``data_dir``.
    scaller : fitted scaler or None, optional
        When ``None`` a new ``StandardScaler`` is fitted on this file's
        numeric columns. Otherwise the given scaler is used as-is; pass the
        scaler returned for the training file when processing the test file.

    Returns
    -------
    (pandas.DataFrame, scaler)
        The processed frame and the scaler that was applied to it.

    NOTE(review): only ``Age`` and ``Embarked`` are imputed by the steps
    above; missing values in the other scaled columns reach the scaler
    unchanged — confirm the input files contain none, or add imputation.
    """
    df = pd.read_csv(os.path.join(data_dir, csv_name))
    df = fill_age(df)
    df = encode_sex(df)
    df = scrap_ticket_number(df)
    df = one_hot_encode_embarked(df)
    df = df.drop(columns=['Cabin', 'Name', 'PassengerId'])

    columns = ['Age', 'Pclass', 'SibSp', 'Parch', 'Ticket', 'Fare']
    # Explicit None check: truthiness would wrongly discard a supplied scaler
    # object that happens to evaluate as falsy.
    if scaller is None:
        std_scaller = StandardScaler()
        std_scaller.fit(df[columns])
    else:
        std_scaller = scaller
    df[columns] = std_scaller.transform(df[columns])
    return df, std_scaller

# Build the processed training frame and keep the fitted scaler for the test set.
# Note: this rebinds train_df, replacing the frame built step by step above.
train_df, scaller = preprocessing_data('train.csv')
train_df.sample(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,C,Q,S
159,0,0.827377,1,-0.104637,6.784163,2.008933,-0.449108,0.751946,0,0,1
516,1,-0.369365,0,0.356462,-0.474545,-0.473674,-0.400459,-0.437007,0,0,1
339,0,-1.566107,1,1.20181,-0.474545,-0.473674,-0.279246,0.06636,0,0,1
19,1,0.827377,0,-0.104637,-0.474545,-0.473674,-0.448641,-0.502949,1,0,0
235,0,0.827377,0,-0.104637,-0.474545,-0.473674,-0.442606,-0.496405,0,0,1
687,0,0.827377,1,-0.796286,-0.474545,-0.473674,0.079624,-0.443636,0,0,1
120,0,-0.369365,1,-0.642586,1.340132,-0.473674,-0.43,0.831478,0,0,1
490,0,0.827377,1,-0.104637,0.432793,-0.473674,-0.353141,-0.246398,0,0,1
244,0,0.827377,1,0.049062,-0.474545,-0.473674,-0.448573,-0.502949,1,0,0
400,1,0.827377,1,0.740711,-0.474545,-0.473674,4.274392,-0.488854,0,0,1


In [85]:
# Pass in the scaler fitted on the training data so the test set is scaled
# with the same parameters; the returned scaler is that same object.
test_df, _ = preprocessing_data('test.csv', scaller)
test_df.sample(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,C,Q,S
218,-1.566107,0,1.586059,0.432793,0.76763,-0.279675,3.610065,1,0,0
168,-1.566107,0,-0.181487,-0.474545,-0.473674,-0.425594,-0.090272,1,0,0
316,-1.566107,1,2.124008,0.432793,-0.473674,-0.4259,2.301729,1,0,0
403,-1.566107,1,-0.949986,-0.474545,-0.473674,-0.280351,0.299922,0,0,1
175,-0.369365,0,-1.103685,-0.474545,2.008933,-0.407333,0.136831,0,0,1
217,-1.566107,1,2.124008,0.432793,0.76763,-0.396392,2.671118,0,0,1
302,-0.369365,1,1.04811,-0.474545,0.76763,-0.452676,-0.225593,0,0,1
292,0.827377,1,-0.181487,-0.474545,-0.473674,-0.448637,-0.502864,1,0,0
238,-0.369365,0,-0.873136,0.432793,0.76763,-0.070631,-0.386671,0,0,1
212,-0.369365,1,-0.949986,-0.474545,-0.473674,-0.43,0.831478,0,0,1
