In [233]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Read the raw train/test CSV files from the relative input directory.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')

# Drop the same four columns from both frames. `drop` returns a new frame,
# so `train` / `test` keep the raw data untouched.
train_new, test_new = (
    raw_frame.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])
    for raw_frame in (train, test)
)

In [234]:
# Snapshots taken BEFORE any imputation or feature engineering; they are
# restored later as the input of the pipeline experiments.
train_copy = train_new.copy()
test_copy = test_new.copy()

# Forward-fill missing values. The result is assigned back to the column:
# `df[col].fillna(..., inplace=True)` is chained assignment, which does not
# modify the frame under pandas Copy-on-Write, and `fillna(method=...)` is
# deprecated in favour of `.ffill()`.
train_new['Age'] = train_new['Age'].ffill()
test_new['Age'] = test_new['Age'].ffill()
train_new['Embarked'] = train_new['Embarked'].ffill()

# Bucket Age into 5 equal-width intervals.
# NOTE(review): the bin edges are computed independently for each frame, so
# the intervals of `train_new` and `test_new` can differ.
test_new['Age_Intervals'] = pd.cut(test_new['Age'], bins=5)
train_new['Age_Intervals'] = pd.cut(train_new['Age'], bins=5)

# Snapshots taken AFTER imputation/binning but BEFORE any encoding.
train_new_copy = train_new.copy()
test_new_copy = test_new.copy()
train_new.info()

**01 - Encoding de Variáveis Categóricas**

In [235]:
def indent(db, typ):
    """Return the names of the columns of `db` whose dtype equals `typ`.

    Parameters
    ----------
    db : pandas.DataFrame
        Frame whose column dtypes are inspected.
    typ : str or dtype
        Dtype to match, e.g. ``'object'``, ``'float64'`` or ``'int64'``.

    Returns
    -------
    list
        Matching column labels, in the frame's column order.
    """
    return [column for column, column_dtype in db.dtypes.items() if column_dtype == typ]

# Column names grouped by dtype: `s` = object (treated as categorical),
# `n` = float64, `d` = int64.
# NOTE(review): `d` presumably also contains the int64 target 'Survived'
# (a later cell removes the first entry of `d`) - confirm.
s, n, d = (indent(train_new, dtype_name) for dtype_name in ('object', 'float64', 'int64'))
print('Variáveis categóricas:\n %s\nVariáveis Númericas:\n %s' %(s, n+d))




In [236]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

def ONEHOT(db, obj):
    """Encode the columns of `db` listed in `obj`, modifying `db` in place.

    * ``object`` columns are one-hot encoded: the column is replaced, at its
      original position, by one float64 indicator column per distinct value,
      named ``"<column>_<value>"``. Missing values produce an all-zero row
      (default behaviour of ``pd.get_dummies``).
    * Columns whose dtype is neither ``object``, ``float64`` nor ``int64``
      (for example the categorical output of ``pd.cut``) are replaced by the
      integer labels produced by ``LabelEncoder``.
    * ``float64`` / ``int64`` columns are left untouched.
    * Names in `obj` that are no longer columns of `db` are skipped, so
      re-running the call on an already encoded frame is a no-op.

    The encoding is derived from `db` alone, so two frames encoded separately
    only end up with the same columns if they contain the same categories.

    Parameters
    ----------
    db : pandas.DataFrame
        Frame to encode (mutated).
    obj : iterable of str
        Column names to consider for encoding.

    Returns
    -------
    None
    """
    for x in obj:
        if x not in db.columns:
            # Already expanded into indicator columns by an earlier call.
            continue

        col_dtype = db[x].dtypes
        if col_dtype == 'object':
            # A one-hot matrix has one column per category, so it cannot be
            # stored in the single column `db[x]`: depending on the pandas
            # version that assignment either raises or silently keeps only
            # the first indicator. Expand into separate columns instead.
            indicators = pd.get_dummies(db[x], prefix=x, dtype='float64')
            position = db.columns.get_loc(x)
            del db[x]
            for offset, indicator_name in enumerate(indicators.columns):
                db.insert(position + offset, indicator_name, indicators[indicator_name])

        elif (col_dtype != 'float64') and (col_dtype != 'int64'):
            label_encod = LabelEncoder()
            db[x] = label_encod.fit_transform(db[x])
       
# Encode the object columns plus the binned-age column of both frames
# (ONEHOT modifies the frame it receives).
for frame in (train_new, test_new):
    ONEHOT(frame, s + ['Age_Intervals'])

**02 - Encoding de Variáveis Categóricas**

**Com os BDs criados, vamos agora treinar um modelo com eles**

In [237]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Separate the target from the features.
Y = train_new['Survived']
X = train_new.drop(['Survived'], axis=1)

# Hold out 20% of the rows for evaluation. Both the split and the forest are
# seeded so that the printed accuracy is reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

model = RandomForestClassifier(random_state=0)
model.fit(X_train, Y_train)

predict = model.predict(X_test)

# Fraction of hold-out rows classified correctly.
score = accuracy_score(Y_test, predict)
print(score)

**01 - Pipelines**

In [238]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# Restore the pre-imputation snapshots. Copy instead of aliasing so that any
# later in-place change to `train_new` / `test_new` cannot corrupt the
# snapshots themselves.
train_new = train_copy.copy()
test_new = test_copy.copy()

● db: banco de dados a partir do qual será produzido o modelo.

● encoder: a estratégia de encoding de variáveis categóricas,
como one-hot ou label encoder.

● model: o algoritmo que irá produzir o modelo, inicialmente será
o random forest classifier.

● numerical_imputer: a estratégia usada para substituir valores
nulos nas features numéricas, como "mean" ou "median".

● categorical_imputer: a estratégia para preencher valores nulos
nas features categóricas; aqui será usada apenas "most_frequent",
que substitui cada valor nulo pelo valor que mais aparece no banco
de dados.

In [239]:
def pipelin(db, encoder, model, numerical_imputer = SimpleImputer(), categorical_imputer = SimpleImputer(strategy= 'most_frequent'), random_state = None):
    """Train `model` inside a preprocessing pipeline and return its hold-out accuracy.

    The frame is split 80/20; numerical columns go through `numerical_imputer`,
    categorical columns through `categorical_imputer` followed by `encoder`,
    and the transformed features are fed to `model`.

    The column lists come from the module-level names `n`, `d` (numerical)
    and `s` (categorical). The target 'Survived' is excluded from the
    numerical list here, so the function no longer depends on the caller
    having removed it from `d` beforehand.

    Parameters
    ----------
    db : pandas.DataFrame
        Data containing the features and the 'Survived' target column.
    encoder : transformer
        Encoder applied to the categorical columns after imputation.
    model : estimator
        Final estimator of the pipeline.
    numerical_imputer : transformer, default SimpleImputer()
        Imputer applied to the numerical columns.
    categorical_imputer : transformer, default SimpleImputer(strategy='most_frequent')
        Imputer applied to the categorical columns.
    random_state : int or None, default None
        Seed forwarded to `train_test_split`; None keeps the unseeded split.

    Returns
    -------
    float
        Accuracy on the hold-out split, as a percentage rounded to 4 decimals.
    """
    Y_new = db['Survived']
    X_new = db.drop(['Survived'], axis = 1)

    X_train_new, X_test_new, Y_train_new, Y_test_new = train_test_split(
        X_new, Y_new, test_size = 0.2, random_state = random_state
    )

    # The target is not a column of X_new, so it must never be handed to the
    # ColumnTransformer even if it is still present in `d`.
    numerical_cols = [col for col in n + d if col != 'Survived']

    categorical_transform = Pipeline(steps=[('imputer', categorical_imputer), ('encoder', encoder)])

    preprocessor = ColumnTransformer(transformers=[('num', numerical_imputer, numerical_cols), ('cat', categorical_transform, s)])

    pipeline_test = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

    pipeline_test.fit(X_train_new, Y_train_new)

    pipe_predict = pipeline_test.predict(X_test_new)

    score = accuracy_score(Y_test_new, pipe_predict)

    return round(score*100, 4)


**03 - Pipelines**

In [240]:
# Show dtypes and non-null counts of `train_new` (rebound to the
# `train_copy` snapshot in an earlier cell, assuming cells ran in order).
train_new.info()

In [241]:
# Candidate strategies for imputing the numerical columns.
mean = SimpleImputer(strategy='mean')
most_frequent = SimpleImputer(strategy='most_frequent')
median = SimpleImputer(strategy='median')
zero = SimpleImputer(strategy='constant', fill_value = 0)

# The `sparse` argument was renamed to `sparse_output` in scikit-learn 1.2
# and later removed; fall back to the old name on older releases.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)
ordinal_encoder = OrdinalEncoder()

types_of_encoders = [one_hot_encoder, ordinal_encoder]
types_of_infos = [mean, most_frequent, median, zero]

# Take the target out of the int64 column list. Unlike `d.pop(0)`, this is
# idempotent: re-running the cell does not remove a real feature.
# NOTE(review): assumes the entry previously popped was 'Survived' - confirm.
if 'Survived' in d:
    d.remove('Survived')

# Evaluate every encoder / numerical-imputer combination. Each call makes its
# own train/test split, so small differences between scores may be noise.
for encods in types_of_encoders:
    for infos in types_of_infos:
        score = pipelin(train_new, encods, RandomForestClassifier(), numerical_imputer = infos)
        print('Encoder -> %s, temos uma porcentagem de: %s, para o numerical imputer: %s' %(str(encods), str(score), str(infos)))

**01 - Cross Validation e Gradient Boosting**

In [242]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# Apply the same encoding to the post-imputation snapshots
# (ONEHOT modifies the frame it receives).
for frame in (train_new_copy, test_new_copy):
    ONEHOT(frame, s + ['Age_Intervals'])

def gradboostingclass(db, X, Y):
    """Return the mean 5-fold cross-validated accuracy of `db` on (X, Y), in percent.

    Despite its name, `db` is the estimator handed to `cross_val_score`,
    not a DataFrame.

    Parameters
    ----------
    db : estimator
        Estimator to cross-validate.
    X, Y
        Features and target passed to `cross_val_score`.

    Returns
    -------
    float
        Mean accuracy over the 5 folds, times 100, rounded to 4 decimals.
    """
    fold_scores = cross_val_score(db, X, Y, scoring = "accuracy", cv = 5)
    mean_percent = fold_scores.mean() * 100
    return round(mean_percent, 4)
    
# NOTE(review): with the default number of boosting stages,
# n_iter_no_change=100 probably never stops training early while still
# reserving a validation fraction of the training data - confirm it is wanted.
gradientboostclassi = GradientBoostingClassifier(random_state = 0, n_iter_no_change = 100)

Y_new_copy = train_new_copy['Survived']
X_new_copy = train_new_copy.drop(['Survived'],axis=1)

# Seeded 80/20 split so the reported score is reproducible.
X_new_train, X_new_test, Y_new_train, Y_new_test = train_test_split(
    X_new_copy, Y_new_copy, test_size=0.2, random_state=0
)

# Keep a model fitted on the training portion available under this name.
gradientboostclassi.fit(X_new_train, Y_new_train)

# Cross-validate on the training portion, not on the 20% hold-out: the
# hold-out is too small for 5 folds and should stay unseen for a final check.
score = gradboostingclass(gradientboostclassi, X_new_train, Y_new_train)
print('Porcentagem usando Gradient Boosting Classifier: %s' % str(score))