In [274]:
import re

import numpy as np
import pandas as pd

# scikit-learn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.preprocessing import OneHotEncoder as OneHotSK

# feature-engine
from feature_engine.encoding import OneHotEncoder as OneHotFE
from feature_engine.encoding import RareLabelEncoder
from feature_engine.imputation import (AddMissingIndicator, MeanMedianImputer, CategoricalImputer)
from feature_engine.outliers import Winsorizer, ArbitraryOutlierCapper
from feature_engine.selection import DropFeatures

# Load the Data

In [267]:
# Labelled training data and the unlabelled set to predict on; the first CSV column is the index.
df = pd.read_csv("Dados/train.csv", index_col=0)
df_pred = pd.read_csv("Dados/test.csv", index_col=0)

# Separate the predictors from the target column.
y = df['Survived']
x = df.drop(columns='Survived')

# Hold out 30% of the labelled rows; the seed is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

# Column groups by role.
num_var = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin_Size']
cat_var = ['Sex', 'Cabin_code', 'Embarked', 'Name_Title', 'Name_family']
target = ['Survived']

In [270]:
df.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

# Custom Transformers

In [252]:
class Fill_na_transf(BaseEstimator, TransformerMixin):
    """Transformer that substitutes every NaN in a frame with a constant.

    Parameters
    ----------
    fill_na
        Value written in place of each missing entry.
    """

    def __init__(self, fill_na):
        self.fill_na = fill_na

    def fit(self, x: pd.DataFrame, y: pd.Series = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, x: pd.DataFrame):
        """Return a copy of ``x`` in which NaN has been replaced by ``fill_na``."""
        filled = x.copy().replace(np.nan, self.fill_na)
        return filled

class Cabin_code(BaseEstimator, TransformerMixin):
    """Derive two columns from ``Cabin``.

    * ``Cabin_code`` - first character of the cabin string.
    * ``Cabin_Size`` - number of space-separated cabins in the string.

    Missing (non-string or empty) cabins get ``Cabin_code = NaN`` and
    ``Cabin_Size = 0``. Previously a single missing value raised inside a bare
    ``except`` and both columns were silently left out of the result.
    If the frame has no ``Cabin`` column it is returned unchanged.
    """

    def __init__(self):
        pass

    def fit(self, x: pd.DataFrame, y: pd.Series = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    @staticmethod
    def _first_char(cabin):
        """First character of a cabin string, or NaN when the value is missing."""
        if isinstance(cabin, str) and cabin:
            return cabin[0]
        return np.nan

    @staticmethod
    def _n_cabins(cabin):
        """Count of space-separated cabins, or 0 when the value is missing."""
        if isinstance(cabin, str) and cabin:
            return len(cabin.split(" "))
        return 0

    def transform(self, x: pd.DataFrame):
        """Return a copy of ``x`` with ``Cabin_code`` and ``Cabin_Size`` added."""
        x = x.copy()
        if 'Cabin' not in x.columns:
            # Best effort, as before: nothing to derive from.
            return x
        x['Cabin_code'] = x['Cabin'].apply(self._first_char)
        x['Cabin_Size'] = x['Cabin'].apply(self._n_cabins)
        return x

class Mapper(BaseEstimator, TransformerMixin):
    """Recode the values of selected columns through a lookup dictionary.

    Parameters
    ----------
    features : list
        Names of the columns to recode.
    map_dict : dict
        Mapping ``original value -> new value`` passed to ``Series.map``.
        Values that are not keys of the dictionary become NaN.

    Raises
    ------
    ValueError
        If ``features`` is not a list.
    """

    def __init__(self, features: list, map_dict: dict):
        if not isinstance(features, list):
            raise ValueError('Features should be a list')

        self.features = features
        self.map_dict = map_dict

    def fit(self, x: pd.DataFrame, y: pd.Series = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, x: pd.DataFrame):
        """Return a copy of ``x`` with every column in ``features`` recoded."""
        x = x.copy()

        for feature in self.features:
            x[feature] = x[feature].map(self.map_dict)
        return x

# Lookup used to encode the "Sex" column as an integer
sex_map = dict(male=1, female=0)

class Get_title(BaseEstimator, TransformerMixin):
    """Add a 'Title' column obtained by applying ``return_title`` to 'Name'."""

    def __init__(self):
        pass

    def fit(self, x: pd.DataFrame, y: pd.Series = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, x: pd.DataFrame, y: pd.Series = None):
        """Return a copy of ``x`` carrying the extra 'Title' column."""
        titles = x['Name'].apply(return_title)
        with_title = x.copy()
        with_title['Title'] = titles
        return with_title

class Get_family(BaseEstimator, TransformerMixin):
    """Add a 'Family' column obtained by applying ``return_family`` to 'Name'."""

    def __init__(self):
        pass

    def fit(self, x: pd.DataFrame, y: pd.Series = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, x: pd.DataFrame, y: pd.Series = None):
        """Return a copy of ``x`` carrying the extra 'Family' column."""
        x = x.copy()
        # return_family is called once per row with the value of 'Name'.
        x['Family'] = x['Name'].apply(return_family)
        return x

def return_title(passenger):
    """Return the honorific of a passenger name as one of five labels.

    Parameters
    ----------
    passenger : str
        Full name, normally formatted as ``"Family, Title. Given names"``.

    Returns
    -------
    str
        ``'Mrs'``, ``'Mr'``, ``'Miss'``, ``'Master'`` or ``'Other'``.

    The title is read from its position (between the first comma and the
    following period) rather than by searching the whole string, so text that
    merely contains "Mr"/"Mrs" elsewhere - e.g. ``Lady. (... "Mrs Morgan")`` -
    is no longer mislabelled. Names that do not follow the comma/period layout
    fall back to a whole-word search in the original priority order.
    """
    known_titles = ('Mrs', 'Mr', 'Miss', 'Master')

    positional = re.search(r',\s*([^.,]+?)\s*\.', passenger)
    if positional:
        title = positional.group(1)
        return title if title in known_titles else 'Other'

    # Fallback for free-form names: match whole words only, 'Mrs' before 'Mr'.
    for title in known_titles:
        if re.search(rf'\b{title}\b', passenger):
            return title
    return 'Other'

def return_family(passanger):
    """Return the family name, i.e. the text before the first comma of a name.

    Parameters
    ----------
    passanger : str or DataFrame-like
        Either the name string itself (this is what ``Series.apply`` passes,
        and used to raise a TypeError), or an object with a 'Name' column, in
        which case the first row's name is used as before.

    Returns
    -------
    str
        The part of the name preceding the first comma.
    """
    if isinstance(passanger, str):
        full_name = passanger
    else:
        full_name = passanger['Name'].iloc[0]
    return full_name.split(',')[0]
    
class dtype_fix(BaseEstimator, TransformerMixin):
    """Cast the initial features to the dtypes the rest of the notebook expects.

    'Pclass', 'Sex', 'Cabin' and 'Embarked' become ``object``; 'Age' and 'Fare'
    become ``float``; 'SibSp' and 'Parch' become ``int``. Columns that are not
    present are skipped. A column that cannot be cast is left unchanged and a
    warning is issued, so the transformer stays best-effort.
    """

    def __init__(self):
        pass

    def fit(self, x: pd.DataFrame = None, y: pd.Series = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, x: pd.DataFrame, y: pd.Series = None):
        """Return a copy of ``x`` with the dtypes listed above applied."""
        import warnings

        # The original copied an unbound local `df` here, which raised
        # UnboundLocalError on every call; the input frame is `x`.
        df = x.copy()

        target_dtypes = {
            'Pclass': 'object',
            'Sex': 'object',
            'Age': 'float',
            'SibSp': 'int',
            'Parch': 'int',
            'Fare': 'float',
            'Cabin': 'object',
            'Embarked': 'object',
        }
        for column, dtype in target_dtypes.items():
            if column not in df.columns:
                continue
            try:
                df[column] = df[column].astype(dtype)
            except (ValueError, TypeError) as err:
                warnings.warn(f"dtype_fix: could not cast '{column}' to {dtype}: {err}")
        return df

In [323]:
def get_cross_score(pipe, cv=5):
    """Cross-validate ``pipe`` on the notebook-level ``x_train``/``y_train`` and print a summary.

    Parameters
    ----------
    pipe : estimator handed to ``cross_val_score``.
    cv : forwarded to ``cross_val_score`` (default 5).

    Prints the mean and standard deviation of the fold scores rounded to four
    decimals; nothing is returned.
    """
    fold_scores = cross_val_score(pipe, x_train, y_train, cv=cv)
    mean_score = np.round(fold_scores.mean(), 4)
    score_spread = np.round(fold_scores.std(), 4)
    print(f'accuracy of {mean_score} and standard deviation of {score_spread}')

# Baseline

In [12]:
x_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
446,1,"Dodge, Master. Washington",male,4.0,0,2,33638,81.8583,A34,S
651,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S
173,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S
451,2,"West, Mr. Edwy Arthur",male,36.0,1,2,C.A. 34651,27.75,,S
315,2,"Hart, Mr. Benjamin",male,43.0,1,1,F.C.C. 13529,26.25,,S


In [265]:
# Baseline (feature-engine imputer): mean-impute "Age", drop the text columns, fit a random forest
pipe_base = make_pipeline(
    MeanMedianImputer(imputation_method='mean', variables=['Age']),
    DropFeatures(features_to_drop=['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']),
    RandomForestClassifier(random_state=0),
)
get_cross_score(pipe_base, cv=5)

accuracy of 0.6565 and standard deviation of 0.0223


In [263]:
# Same baseline using scikit-learn's SimpleImputer: drop the text columns first,
# then mean-impute whatever is still missing
pipe_base = make_pipeline(
    DropFeatures(features_to_drop=['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']),
    SimpleImputer(missing_values=np.nan, strategy='mean'),
    RandomForestClassifier(random_state=0),
)
get_cross_score(pipe_base, cv=5)

accuracy of 0.6565 and standard deviation of 0.0223


So we have our baseline model and score now.

# Feature Engineering

In [310]:
# ColumnTransformer expects a list of (name, transformer, columns) tuples; the original
# passed the bare SimpleImputer class, which cannot be fitted.
# NOTE(review): 'most_frequent' on object columns is assumed to be the intended
# categorical strategy (it mirrors cat_pipe further down) - confirm.
cat_pp = ColumnTransformer(
    [('categorical transformations',
      SimpleImputer(strategy='most_frequent'),
      make_column_selector(dtype_include='object'))])

# Numerical columns: feature-engine imputer with its default settings.
num_pp = make_pipeline(MeanMedianImputer())
num_ct = ColumnTransformer(
    [('numerical transformations',num_pp, make_column_selector(dtype_exclude='object'))])

## "Age"

There's only one thing to be done with "Age": impute its missing values. Let's check which imputation strategy works better.

In [318]:
# Baseline pipeline with median (instead of mean) imputation
pipe_age = make_pipeline(
    DropFeatures(features_to_drop=['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']),
    SimpleImputer(missing_values=np.nan, strategy='median'),
    RandomForestClassifier(random_state=0),
)
get_cross_score(pipe_age, cv=5)

accuracy of 0.6549 and standard deviation of 0.0084


Imputing with the median lowers the accuracy very slightly, but it also lowers the standard deviation.

## "Fare"

Tree-based algorithms usually don't benefit much from scaling numerical features, but let's try it anyway.

In [266]:
# Baseline plus standardisation of the remaining numerical columns
pipe_fare = make_pipeline(
    DropFeatures(features_to_drop=['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']),
    MeanMedianImputer(imputation_method='mean', variables=['Age', 'Fare']),
    StandardScaler(),
    RandomForestClassifier(random_state=0),
)
get_cross_score(pipe_fare, cv=5)

accuracy of 0.6597 and standard deviation of 0.0214


There was a slight improvement in acc.

## "Name"

Extract Title and Family name from feature "Name"

In [256]:
class get_name(BaseEstimator, TransformerMixin):
    """Add the title and/or family name of each passenger, parsed from 'Name'.

    Parameters
    ----------
    title : bool, default True
        If true, add 'Name_title': the text between the first comma and the
        next period, with surrounding whitespace removed (the original kept a
        leading space, producing labels such as ' Mr').
    family : bool, default True
        If true, add 'Name_family': the text before the first comma.

    If 'Name' is absent or does not hold strings, 'erro em get_name' is printed
    and the copy is returned without the new columns.
    """

    def __init__(self, title: bool = True, family: bool = True):
        self.title = title
        self.family = family

    def fit(self, x: pd.DataFrame, y: pd.Series = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, x: pd.DataFrame, y: pd.Series = None):
        """Return a copy of ``x`` with the requested name-derived columns."""
        df = x.copy()
        try:
            comma_parts = df['Name'].str.split(",")
            if self.title:
                df['Name_title'] = comma_parts.str[1].str.split(".").str[0].str.strip()
            if self.family:
                df['Name_family'] = comma_parts.str[0]
        except (KeyError, AttributeError):
            # Only the expected failures are caught: a missing 'Name' column or a
            # non-string column. Anything else should surface.
            print('erro em get_name')
        return df

# Preview of the training split with both name-derived columns added
x_train_ = get_name(title=True, family=True).fit_transform(X=x_train)
x_train_.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_title,Name_family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
858,1,"Daly, Mr. Peter Denis",male,51.0,0,0,113055,26.55,E17,S,Mr,Daly
53,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C,Mrs,Harper
387,3,"Goodwin, Master. Sidney Leonard",male,1.0,5,2,CA 2144,46.9,,S,Master,Goodwin
125,1,"White, Mr. Percival Wayland",male,54.0,0,1,35281,77.2875,D26,S,Mr,White
579,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C,Mrs,Caram


In [243]:
# Frequency of each family name in the training split. The count column is named
# explicitly because the name pandas gives the value_counts() result differs
# between versions, and the filter below looks the column up by name.
family_labels = x_train_['Name_family'].value_counts().rename('Name_family').to_frame()
print('amount of unique labels in family name:', family_labels.shape)
print('\n Family names with more than 3 occurrences:')
family_labels[family_labels["Name_family"]>3]

amount of unique labels in family name: (489, 1)

 Family names with more than 3 occurrences:


Unnamed: 0,Name_family
Sage,7
Andersson,6
Goodwin,6
Johnson,5
Panula,5
Harris,4
Rice,4
Fortune,4


"Name_family" is now a categorical feature and we'll have to encode it so the model can use it.

In [326]:
# feature engineer: "Name" (family name only)
pipe_name_family = make_pipeline(

                        # categorical features transformations
                          get_name(family=True,title=False),
                          RareLabelEncoder(max_n_categories=8,tol=0,variables=['Name_family']),
                          # feature-engine encoder; it is imported under the alias OneHotFE
                          # (the bare name OneHotEncoder is never imported)
                          OneHotFE(variables=['Name_family']),
                          DropFeatures(features_to_drop=['Name','Sex','Ticket','Cabin','Embarked']),
                        # numerical features transformation
                          MeanMedianImputer(imputation_method='mean', variables=['Age','Fare']),
                        # model
                          RandomForestClassifier(random_state=0))
get_cross_score(pipe_name_family,5)

#x_train_1 = pipe_name_family.fit_transform(x_train)
#x_train_1.head()

accuracy of 0.6581 and standard deviation of 0.0166


Now we add "Title"

In [341]:
# feature engineer: "Name" (family name and title)
pipe_name_title = make_pipeline(

                        # categorical features transformations
                          get_name(family=True,title=True),
                          RareLabelEncoder(max_n_categories=8,tol=0,variables=['Name_family']),
                          RareLabelEncoder(max_n_categories=4,tol=0,variables=['Name_title']),
                          # feature-engine encoder; it is imported under the alias OneHotFE
                          # (the bare name OneHotEncoder is never imported)
                          OneHotFE(variables=['Name_family','Name_title']),
                          DropFeatures(features_to_drop=['Name','Sex','Ticket','Cabin','Embarked']),
                        # numerical features transformation
                          MeanMedianImputer(imputation_method='mean', variables=['Age','Fare']),
                        # model
                        RandomForestClassifier(random_state=0))
get_cross_score(pipe_name_title,5)

#x_train_1 = pipe_name_title.fit_transform(x_train)
#x_train_1.head()

accuracy of 0.8026 and standard deviation of 0.0368


In [335]:
# Frequency of each title in the training split. The count column is named
# explicitly because the name pandas gives the value_counts() result differs
# between versions, and the filter below looks the column up by name.
title_labels = x_train_['Name_title'].value_counts().rename('Name_title').to_frame()
print('amount of unique labels in title:', title_labels.shape)
# The message used to say "Family names", copied from the family-name cell.
print('\n Titles with more than 3 occurrences:')
title_labels[title_labels["Name_title"]>3]

amount of unique labels in title: (14, 1)

 Family names with more than 3 occurrences:


Unnamed: 0,Name_title
Mr,367
Miss,121
Mrs,88
Master,31


In [333]:
title_labels["Name_title"]

 Mr              367
 Miss            121
 Mrs              88
 Master           31
 Rev               3
 Dr                3
 Mlle              2
 Major             2
 Ms                1
 Mme               1
 the Countess      1
 Lady              1
 Capt              1
 Sir               1
Name: Name_title, dtype: int64

Let's remove Name_family again to see if anything changes.

In [342]:
# feature engineer: "Name" (title only)
pipe_name_title_1 = make_pipeline(

                        # categorical features transformations
                          get_name(family=False,title=True),
                          RareLabelEncoder(max_n_categories=4,tol=0,variables=['Name_title']),
                          # feature-engine encoder; it is imported under the alias OneHotFE
                          # (the bare name OneHotEncoder is never imported)
                          OneHotFE(variables=['Name_title']),
                          DropFeatures(features_to_drop=['Name','Sex','Ticket','Cabin','Embarked']),
                        # numerical features transformation
                          MeanMedianImputer(imputation_method='mean', variables=['Age','Fare']),
                        # model
                        RandomForestClassifier(random_state=0))
get_cross_score(pipe_name_title_1,5)

accuracy of 0.8042 and standard deviation of 0.0437


## Scikit-learn approach

Using pipeline and column transformers

In [109]:
# Categorical branch: fill gaps with the most frequent label, then one-hot encode
# (categories unseen during fit are ignored at transform time).
cat_pipe = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='most_frequent', add_indicator=False),
    OneHotSK(handle_unknown='ignore'),
)

# Numerical branch: fill gaps with the median.
num_pipe = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='median', add_indicator=False),
)

# Route columns to a branch by dtype: object columns are categorical, the rest numerical.
pipe_merge = ColumnTransformer([
    ("numeric_transf", num_pipe, make_column_selector(dtype_exclude=object)),
    ("categorical_transf", cat_pipe, make_column_selector(dtype_include=object)),
])


In [110]:
# Every step must be a (name, estimator INSTANCE) pair. The original passed the
# dtype_fix and get_name classes themselves and contained a stray `return_title`
# line that turned the following tuple into a function call.
sk_pipe = Pipeline(steps=[
                    ('Fix dtype', dtype_fix()),
                    ('Get name infos', get_name()),
                    ('Get cabin infos', Cabin_code()),
                    ('Encode SEX', Mapper(features=['Sex'],map_dict = sex_map)),
                    ('Merge Cat and Num',pipe_merge),
                    ('RF model',RandomForestClassifier(random_state=0))])

In [111]:
get_cross_score(sk_pipe)

accuracy of 0.8121 and standard deviation of 0.0415


In [104]:
from sklearn import set_config  
set_config(display='diagram')  

In [105]:
sk_pipe

## Feature-engine approach

In [106]:
# Basic pre-processing (no model): impute, derive Cabin/Title features, encode, drop raw text
pipe_pp = make_pipeline(
                        MeanMedianImputer(imputation_method='mean', variables=['Age']),
                        Fill_na_transf(fill_na="?"),
                        Cabin_code(),
                        Get_title(),
                        Mapper(features=['Sex'],map_dict = sex_map),
                        # feature-engine encoder; it is imported under the alias OneHotFE
                        # (the bare name OneHotEncoder is never imported)
                        OneHotFE(top_categories = 3, variables=['Cabin_code','Title','Embarked']),
                        DropFeatures(features_to_drop=['Name','Ticket','Cabin']),)

In [107]:
pipe_pp

In [33]:
# pipe_base ends in a classifier and therefore has no fit_transform; the frame shown
# below (Cabin_code_*, Title_*, Embarked_* columns) is the output of the
# pre-processing pipeline pipe_pp.
x_base = pipe_pp.fit_transform(x_train)
x_base.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_Size,Cabin_code_?,Cabin_code_C,Cabin_code_B,Title_Mr,Title_Miss,Title_Mrs,Embarked_S,Embarked_C,Embarked_Q
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
446,1,1,4.0,0,2,81.8583,1,0,0,0,0,0,0,1,0,0
651,3,1,29.256353,0,0,7.8958,1,1,0,0,1,0,0,1,0,0
173,3,0,1.0,1,1,11.1333,1,1,0,0,0,1,0,1,0,0
451,2,1,36.0,1,2,27.75,1,1,0,0,1,0,0,1,0,0
315,2,1,43.0,1,1,26.25,1,1,0,0,1,0,0,1,0,0


In [38]:
# Pre-processing + model. pipe_base already ends in a RandomForestClassifier, so it
# cannot be an intermediate step; the pre-processing pipeline is pipe_pp.
pipe_rf = make_pipeline(pipe_pp,RandomForestClassifier(random_state=0))
score_base = cross_val_score(pipe_rf, x_train, y_train, cv=5)
print(f'accuracy of {np.round(score_base.mean(),4)} and standard deviation of {np.round(score_base.std(),4)}')

accuracy of 0.8203 and standard deviation of 0.0348


In [330]:
# Pre-processing as in pipe_pp, plus quantile capping of "Age" and "Fare" before the model.
# NOTE(review): the previous comment said "Age" is imputed with the median, but the code
# uses the mean (and random_state=42 instead of 0). The code is kept so the recorded
# score stays valid - confirm which was intended.
pipe_base2 = make_pipeline(
                        MeanMedianImputer(imputation_method='mean', variables=['Age']),
                        Fill_na_transf(fill_na="?"),
                        Cabin_code(),
                        Get_title(),
                        Mapper(features=['Sex'],map_dict = sex_map),
                        # feature-engine encoder; it is imported under the alias OneHotFE
                        # (the bare name OneHotEncoder is never imported)
                        OneHotFE(top_categories = 3, variables=['Cabin_code','Title','Embarked']),
                        DropFeatures(features_to_drop=['Name','Ticket','Cabin']),
                        Winsorizer(variables=["Age","Fare"],capping_method='quantiles', fold=0.05),
                        RandomForestClassifier(random_state=42))

In [331]:
# 5-fold cross-validation of pipe_base2 on the training split
score_base2 = cross_val_score(pipe_base2, x_train, y_train, cv=5)
print(
    f'accuracy of {np.round(score_base2.mean(),4)} '
    f'and standard deviation of {np.round(score_base2.std(),4)}'
)

accuracy of 0.8236 and standard deviation of 0.0357


In [473]:
# Basic pre-processing (no model), used below to score the engineered features
pipe_2 = make_pipeline(
                        MeanMedianImputer(imputation_method='mean', variables=['Age']),
                        Fill_na_transf(fill_na='?'),
                        Cabin_code(),
                        Get_title(),
                        Mapper(features=['Sex'],map_dict = sex_map),
                        # feature-engine encoder; it is imported under the alias OneHotFE
                        # (the bare name OneHotEncoder is never imported)
                        OneHotFE(top_categories = 3, variables=['Cabin_code','Title','Embarked']),
                        DropFeatures(features_to_drop=['Name','Ticket','Cabin']))

In [474]:
# Fit the pre-processing pipeline on the training split and keep the transformed frame
x_eda_ = pipe_2.fit_transform(x_train)

In [476]:
# Univariate feature scoring with f_classif; the selector keeps the 10 best-scoring columns
algoritmo = SelectKBest(score_func=f_classif, k=10)
best_k_features = algoritmo.fit_transform(x_eda_, y_train)

# One row per feature, indexed by column name; the single score column is labelled 0
scores_fclassif = pd.DataFrame(algoritmo.scores_, index=x_eda_.columns)

# Show the ten highest scores
print(
    scores_fclassif
    .sort_values(by=0, ascending=False)
    .head(10)
)

                       0
Title_Mr      251.580692
Sex           245.907141
Title_Mrs      94.181396
Pclass         67.080704
Cabin_code_?   64.614775
Title_Miss     61.333777
Fare           37.641272
Embarked_C     21.977995
Cabin_code_B   21.440811
Embarked_S     18.201298
