In [22]:
import pandas as pd

# Load the train and test CSVs straight from the course repository.
train = pd.read_csv(
    "https://raw.githubusercontent.com/lopeslopesedu/padroes/main/train.csv",
    sep=",",
)
test = pd.read_csv(
    "https://raw.githubusercontent.com/lopeslopesedu/padroes/main/test.csv",
    sep=",",
)

# Features are the columns that also exist in `test`; whatever is left over
# in `train` is treated as the target. `y` stays a DataFrame.
colunas_features = test.columns.tolist()
colunas_alvo = [coluna for coluna in train.columns if coluna not in colunas_features]
X = train[colunas_features]
y = train[colunas_alvo]

In [23]:
from sklearn.base import BaseEstimator, TransformerMixin

def extraiPronome(nome):
    """Extract the title from a name shaped like ``"Surname, Title. Given names"``.

    The title is the text between the first comma and the next period,
    stripped of surrounding whitespace.

    Parameters
    ----------
    nome : str or missing value
        The full name. Non-string values (e.g. NaN or None) are treated as
        missing.

    Returns
    -------
    str or the original missing value
        The extracted title. If the name has no comma, the text before the
        first period of the whole string is used instead. If there is no
        period either, the whole (stripped) segment is returned. Non-string
        inputs are returned unchanged.
    """
    if not isinstance(nome, str):
        # Missing values pass through untouched instead of raising
        # AttributeError, so later steps can still treat them as missing.
        return nome
    partes = nome.split(',')
    # Without a comma there is no surname prefix to skip; fall back to the
    # whole string rather than raising IndexError.
    resto = partes[1] if len(partes) > 1 else partes[0]
    return resto.split('.')[0].strip()

class AtributosDesejados(BaseEstimator, TransformerMixin):
    """Drop unwanted columns and, optionally, reduce ``Name`` to its title.

    Parameters
    ----------
    excluirName : bool, default=True
        When True, ``Name`` is dropped together with ``PassengerId``,
        ``Ticket`` and ``Cabin``. When False, ``Name`` is kept and each value
        is replaced by the result of ``extraiPronome``.
    """

    def __init__(self, excluirName=True):
        self.excluirName = excluirName

    def fit(self, X, y=None):
        """Decide which columns will be dropped; nothing is learned from ``X``."""
        descartar = ['PassengerId', 'Ticket', 'Cabin']
        # Resolved at fit time so the list follows the current excluirName value.
        self.colunasIndesejadas = (descartar + ['Name']) if self.excluirName else descartar
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame without the columns chosen in ``fit``."""
        restante = X.drop(columns=self.colunasIndesejadas)
        if 'Name' in self.colunasIndesejadas:
            return restante
        # Name was kept: collapse each full name to its title.
        restante['Name'] = restante['Name'].apply(extraiPronome)
        return restante

In [24]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    """Select the numeric columns of a DataFrame and return them as an array."""

    def fit(self, X, y=None):
        """Remember which columns of ``X`` have a numeric dtype."""
        apenas_numericas = X.select_dtypes(include='number')
        self.colunasNumericas = apenas_numericas.columns
        return self

    def transform(self, X, y=None):
        """Return the columns recorded in ``fit`` as a NumPy array."""
        selecao = X[self.colunasNumericas]
        return selecao.to_numpy()

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    """Select the object-dtype columns of a DataFrame and return them as an array."""

    def fit(self, X, y=None):
        """Remember which columns of ``X`` have the ``object`` dtype."""
        apenas_objetos = X.select_dtypes(include='object')
        self.colunasCategoricas = apenas_objetos.columns
        return self

    def transform(self, X, y=None):
        """Return the columns recorded in ``fit`` as a NumPy array."""
        selecao = X[self.colunasCategoricas]
        return selecao.to_numpy()

In [26]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

# Numeric branch: pick numeric columns, fill gaps with the median, standardize.
ramo_numerico = Pipeline(steps=[
    ('atributos_numericos', AtributosNumericos()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: pick object columns, fill gaps with the mode, one-hot
# encode (categories unseen during fit are ignored at transform time).
ramo_categorico = Pipeline(steps=[
    ('atributos_categoricos', AtributosCategoricos()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Both branches see the same input; their outputs are concatenated column-wise.
# Step names are kept as-is because they are part of the parameter paths.
trataAtributos = Pipeline(steps=[
    ('unecaracteristicas', FeatureUnion(transformer_list=[
        ('pipenum', ramo_numerico),
        ('pipecat', ramo_categorico),
    ])),
])

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold
import numpy as np

# One seed shared by the forest and the outer CV splitter, so the scores
# below can be reproduced after Restart & Run All. Without it, every run
# draws different bootstrap samples and different folds.
SEED = 42

# Full pipeline: column selection -> numeric/categorical preprocessing -> classifier.
pipetotal = Pipeline([
    ('atributosDesejados', AtributosDesejados()),
    ('trataAtributos', trataAtributos),
    ('classificador', RandomForestClassifier(random_state=SEED))
])

# Hyper-parameters searched by the inner grid search: whether to keep the
# title derived from Name, and the maximum depth of the trees.
parametros = {
    'atributosDesejados__excluirName': [True, False],
    'classificador__max_depth': [7, 14, 21]
}
modelo = GridSearchCV(pipetotal, param_grid=parametros, n_jobs=-1, scoring='roc_auc_ovo_weighted', refit=True)

# Nested cross-validation: the grid search runs inside each outer fold, so the
# reported scores are not computed on data used to pick the hyper-parameters.
scores = cross_validate(modelo, X, y.to_numpy().ravel(), cv=RepeatedKFold(random_state=SEED))
scores['test_score'], np.mean(scores['test_score']), np.std(scores['test_score'])

(array([0.87522936, 0.86434949, 0.88767123, 0.84760006, 0.88079096,
        0.8738915 , 0.88046255, 0.84186521, 0.90086898, 0.85336317,
        0.84499205, 0.85148061, 0.87277059, 0.87640374, 0.88778088,
        0.84949377, 0.92362151, 0.86102614, 0.8576269 , 0.85074013,
        0.82570513, 0.88727911, 0.85181492, 0.87979024, 0.91764535,
        0.84266409, 0.84462933, 0.8748065 , 0.90815508, 0.85086207,
        0.89942529, 0.87123441, 0.89943741, 0.83710007, 0.8529106 ,
        0.88414149, 0.87896825, 0.87409504, 0.88472222, 0.8310559 ,
        0.87238304, 0.83958585, 0.85869413, 0.9160579 , 0.82705502,
        0.84130266, 0.89029126, 0.87375648, 0.88181309, 0.83516771]),
 0.8682915700552263,
 0.02415857414674661)

In [28]:
# Fit the grid search on the full training data, then predict the test set.
alvo = y.to_numpy().ravel()
modelo.fit(X, alvo)
y_pred = modelo.predict(test)

# Submission file: one row per test passenger with its predicted label.
result = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_pred,
})
result.to_csv('submission_eduardo_Santos_lopes.csv', index=False)