In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
# Read the training CSV, keeping only the listed columns and using
# PassengerId as the row index.
selected_columns = ['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp',
                    'Parch', 'Fare', 'Cabin', 'Embarked']
df = pd.read_csv("data/train.csv", usecols=selected_columns, index_col="PassengerId")

# Preview the first ten rows.
df.head(10)

PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
1,0,3,male,22.0,1,0,7.25,,S
2,1,1,female,38.0,1,0,71.2833,C85,C
3,1,3,female,26.0,0,0,7.925,,S
4,1,1,female,35.0,1,0,53.1,C123,S
5,0,3,male,35.0,0,0,8.05,,S
6,0,3,male,,0,0,8.4583,,Q
7,0,1,male,54.0,0,0,51.8625,E46,S
8,0,3,male,2.0,3,1,21.075,,S
9,1,3,female,27.0,0,2,11.1333,,S
10,1,2,female,14.0,1,0,30.0708,,C


In [None]:
def retrieve_str_by_index_wApply(x):
    """Return the first element of ``x``, or "Missing" when there is none.

    Written for element-wise use with ``Series.apply``.

    Parameters
    ----------
    x : object
        A single cell value. ``None``, any float (a float NaN is how the
        missing entries reach this function), and empty values are all
        treated as missing.

    Returns
    -------
    object
        ``x[0]``, or the string "Missing".
    """
    if x is None or isinstance(x, float):
        return "Missing"
    # Let the sentinel pass through unchanged so that applying the function
    # to an already-processed column does not collapse "Missing" into "M".
    if isinstance(x, str) and x == "Missing":
        return "Missing"
    # An empty value has no first element; x[0] would raise IndexError.
    if len(x) == 0:
        return "Missing"
    return x[0]

# Replace every Cabin value with the result of retrieve_str_by_index_wApply,
# overwriting the column on df.
cabin_initials = df['Cabin'].apply(retrieve_str_by_index_wApply)
df['Cabin'] = cabin_initials

## Train test split

In [None]:
# Separate the predictors from the "Survived" label.
features = df.drop(columns="Survived").copy()
target = df["Survived"].copy()

# random_state is pinned so the split is reproducible; the split
# proportions are left at train_test_split's defaults.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=1
)

In [None]:
# Fix missing values: Age and Embarked
# TRAIN DATA
# Both fill values are fixed here and reused for the test split, so no
# statistic is computed from test rows.
mean_age = X_train.Age.mean().round(1)
# Hard-coded fill value.
# NOTE(review): presumably the most frequent Embarked value in X_train - confirm.
mode_Embarked = "S"

# Assign the filled columns back rather than calling fillna(inplace=True) on
# X_train.Age / X_train.Embarked: that form acts on an intermediate Series
# and, with pandas copy-on-write, leaves X_train itself unchanged.
X_train["Age"] = X_train["Age"].fillna(mean_age)
X_train["Embarked"] = X_train["Embarked"].fillna(mode_Embarked)

In [None]:
# Test data
# Fill with the values derived from the training split (mean_age,
# mode_Embarked). Assign the result back instead of using
# fillna(inplace=True) on X_test.Age / X_test.Embarked: that form acts on an
# intermediate Series and, with pandas copy-on-write, leaves X_test unchanged.
X_test["Age"] = X_test["Age"].fillna(mean_age)
X_test["Embarked"] = X_test["Embarked"].fillna(mode_Embarked)

In [None]:
# has family
def family_count(df):
    """Return the sum of the ``SibSp`` and ``Parch`` entries of ``df``."""
    sibsp = df['SibSp']
    parch = df["Parch"]
    return sibsp + parch

def has_family(df):
    """Return a boolean array that is True where ``family_count(df)`` is non-zero."""
    relatives = family_count(df)
    travelling_with_family = relatives != 0
    return np.where(travelling_with_family, True, False)

# Add the Has_family flag to both splits.
for split in (X_train, X_test):
    split["Has_family"] = has_family(split)

In [None]:
# Verification: show SibSp and Parch for the first rows flagged as having
# no family.
no_family_mask = X_train["Has_family"].eq(False)
X_train.loc[no_family_mask, ["SibSp", "Parch"]].head()

In [None]:
# Drop the features treated as redundant (SibSp, Parch) from both splits,
# modifying each frame in place.
redundant_columns = ['SibSp', 'Parch']
for split in (X_train, X_test):
    split.drop(columns=redundant_columns, inplace=True)

X_train.head(1)

In [None]:
# Conversion to number

# A category code is the position of a label within the column's sorted set
# of distinct values, so encoding each split on its own can give the same
# label different codes in train and test whenever a value is absent from
# one of them. Take the categories from the training split and reuse them
# for both splits.
for col in ['Sex', 'Cabin', 'Embarked']:
    train_categories = X_train[col].astype("category").cat.categories
    X_train[col] = pd.Categorical(X_train[col], categories=train_categories).codes
    # Labels that occur only in the test split are encoded as -1.
    X_test[col] = pd.Categorical(X_test[col], categories=train_categories).codes

In [None]:
# Save data: put the target back next to the features of each split so that
# every frame holds predictors and label together.
training_processed = pd.concat((X_train, y_train), axis="columns")
test_processed = pd.concat((X_test, y_test), axis="columns")

In [None]:
# Write the processed training split; index=False leaves the row index out
# of the file.
training_path = "data/training_processed.csv"
training_processed.to_csv(training_path, index=False)

In [None]:
# Write the processed test split; index=False leaves the row index out of
# the file.
test_path = "data/test_processed.csv"
test_processed.to_csv(test_path, index=False)

In [None]:
def retrieve_str_by_index_wApply(x):
    """Return the first element of ``x``, or "Missing" when there is none.

    Written for element-wise use with ``Series.apply``.

    Parameters
    ----------
    x : object
        A single cell value. ``None``, any float (a float NaN is how the
        missing entries reach this function), and empty values are all
        treated as missing.

    Returns
    -------
    object
        ``x[0]``, or the string "Missing".
    """
    if x is None or isinstance(x, float):
        return "Missing"
    # Let the sentinel pass through unchanged so that applying the function
    # to an already-processed column does not collapse "Missing" into "M".
    if isinstance(x, str) and x == "Missing":
        return "Missing"
    # An empty value has no first element; x[0] would raise IndexError.
    if len(x) == 0:
        return "Missing"
    return x[0]

# Same Cabin transformation as in an earlier cell of this notebook: replace
# every value with the result of retrieve_str_by_index_wApply.
cabin_initials = df['Cabin'].apply(retrieve_str_by_index_wApply)
df['Cabin'] = cabin_initials

In [None]:
def retrieve_str_by_index_wApply(x):
    """Return the first element of ``x``, or "Missing" when there is none.

    Written for element-wise use with ``Series.apply``.

    Parameters
    ----------
    x : object
        A single cell value. ``None``, any float (a float NaN is how the
        missing entries reach this function), and empty values are all
        treated as missing.

    Returns
    -------
    object
        ``x[0]``, or the string "Missing".
    """
    if x is None or isinstance(x, float):
        return "Missing"
    # Let the sentinel pass through unchanged so that applying the function
    # to an already-processed column does not collapse "Missing" into "M".
    if isinstance(x, str) and x == "Missing":
        return "Missing"
    # An empty value has no first element; x[0] would raise IndexError.
    if len(x) == 0:
        return "Missing"
    return x[0]
# Preview the per-row results as an array; nothing is assigned back to df.
df['Cabin'].apply(retrieve_str_by_index_wApply).values

In [3]:
from utils.transformers import Cabin_Extraction, FamilyPresence

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class Cabin_Extraction(BaseEstimator, TransformerMixin):
    '''Extract one character from each value of the ``Cabin`` column.

    Parameters
    ----------
    string_from_index : int, default=0
        Position of the character to take from each cabin value.
    '''

    def __init__(self, string_from_index=0):
        # Stored under the constructor argument's own name: BaseEstimator's
        # get_params()/set_params() (and so clone() and the estimator repr)
        # look the attribute up by that name.
        self.string_from_index = string_from_index

    @property
    def index(self):
        '''Alias of ``string_from_index``, kept for backward compatibility.'''
        return self.string_from_index

    @index.setter
    def index(self, value):
        self.string_from_index = value

    def fit(self, X_train, y_train=None):
        '''Nothing is learned from the data; returns ``self``.'''
        return self

    def transform(self, X):
        '''Return a 1-D array holding the extracted value for each row of ``X.Cabin``.'''
        return X.Cabin.apply(self.retrieve_str_by_index_wApply).values

    def retrieve_str_by_index_wApply(self, x):
        '''
        Return the element of ``x`` at position ``string_from_index``.

        ``None`` and any float (a float NaN is how missing entries reach
        this method) yield "Missing", as does a value too short to have an
        element at that position.
        '''
        if x is None or isinstance(x, float):
            return "Missing"
        try:
            return x[self.string_from_index]
        except IndexError:
            # No element at the requested position: report it as missing
            # rather than aborting the whole transform.
            return "Missing"

In [None]:
# Build the cabin extractor with its default constructor arguments.
cabin_ex = Cabin_Extraction()

In [None]:
# Fit on a single-column frame holding only Cabin and keep the returned object.
cabin_only = df[['Cabin']]
transformer1 = cabin_ex.fit(cabin_only)

In [None]:
# Run the transformer over the Cabin column and display the result.
cabin_input = df[['Cabin']]
transformer1.transform(cabin_input)

In [None]:
class FamilyPresence(BaseEstimator, TransformerMixin):
    '''Flag, per row, whether the passenger was travelling with family.'''

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns ``self``.'''
        return self

    def transform(self, X):
        '''Return the boolean family flag for ``X`` (see ``has_family``).'''
        return self.has_family(X)

    def family_count(self, x):
        '''Return the sum of the ``SibSp`` and ``Parch`` entries of ``x``.'''
        sibsp = x['SibSp']
        parch = x["Parch"]
        return sibsp + parch

    def has_family(self, x):
        '''Return a boolean array that is True where the family count is non-zero.'''
        travelling_with_family = self.family_count(x) != 0
        return np.where(travelling_with_family, True, False)
         

In [4]:
# Build the family-presence transformer; no constructor arguments are passed.
famil_pres = FamilyPresence()

In [5]:
# Fit on the full frame and keep the object returned by fit() as transformer2.
transformer2 = famil_pres.fit(df)

In [6]:
# Apply the transformer to df; the resulting array is shown as the cell output.
transformer2.transform(df)

array([ True,  True, False,  True, False, False, False,  True,  True,
        True,  True, False, False,  True, False, False,  True, False,
        True, False, False, False, False, False,  True,  True, False,
        True, False, False, False,  True, False, False,  True,  True,
       False, False,  True,  True,  True,  True, False,  True, False,
       False,  True, False,  True,  True,  True, False,  True,  True,
        True, False, False, False,  True,  True, False, False,  True,
        True, False,  True, False, False,  True,  True, False,  True,
       False,  True, False, False, False, False,  True, False, False,
       False, False, False, False,  True,  True, False,  True, False,
       False, False,  True,  True, False, False, False,  True,  True,
        True, False, False,  True, False,  True, False, False, False,
       False,  True, False,  True, False,  True, False, False, False,
        True,  True,  True,  True, False,  True, False,  True,  True,
       False, False,

## Target - Feature Split

In [None]:
# Predictors: every column except the "Survived" label.
features = df.drop(columns="Survived").copy()
# Label: the "Survived" column on its own.
target = df["Survived"].copy()

## Train test split

In [None]:
# Separate the predictors from the "Survived" label.
features = df.drop(columns="Survived").copy()
target = df["Survived"].copy()

# random_state is pinned so the split is reproducible; the split
# proportions are left at train_test_split's defaults.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=1
)

In [None]:
## Fix Missing

# Imputer configured with the "mean" strategy; it is only constructed here,
# not yet fitted or applied.
simple_Imputer = SimpleImputer(strategy="mean")