<a href="https://colab.research.google.com/github/messias077/REP/blob/main/Ajustes_caracteristicas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
# Read both CSV files from the current working directory.
train, test = (pd.read_csv(arquivo) for arquivo in ('train.csv', 'test.csv'))

# X keeps the columns that also exist in the test file (in the test file's
# order); y keeps whatever columns of train are absent from the test file.
X = train.loc[:, test.columns.tolist()]
y = train.loc[:, ~train.columns.isin(test.columns)]


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

def extraiPronome(nome):
    """Return the title embedded in a name string.

    The title is the text found after the first comma and before the next
    period, with surrounding whitespace removed; for example
    ``'Braund, Mr. Owen Harris'`` yields ``'Mr'``.

    Args:
        nome: The full name as a string.

    Returns:
        The extracted title, or an empty string when ``nome`` contains no
        comma (previously this case raised ``IndexError``, which aborted
        the whole column transformation on a single malformed value).
    """
    _, virgula, resto = nome.partition(',')
    if not virgula:
        # No comma: there is no "surname, title. given names" structure.
        return ''
    # Only the segment between the first and second comma is considered.
    segmento = resto.split(',', 1)[0]
    return segmento.split('.', 1)[0].strip()

class AtributosDesejados(BaseEstimator, TransformerMixin):
    """Drop unwanted columns and optionally reduce 'Name' to its title.

    Args:
        excluirName: When true, the 'Name' column is dropped together with
            the other unwanted columns. When false, 'Name' is kept and each
            value is replaced by the result of ``extraiPronome``.
    """

    def __init__(self, excluirName=True):
        self.excluirName = excluirName

    def fit(self, X, y=None):
        """Decide which columns will be dropped; X and y are not inspected."""
        descartadas = ['PassengerId', 'Ticket', 'Cabin']
        if self.excluirName:
            descartadas = descartadas + ['Name']
        self.colunasIndesejadas = descartadas
        return self

    def transform(self, X, y=None):
        """Return a new frame without the columns chosen in ``fit``."""
        mantidas = X.drop(columns=self.colunasIndesejadas)
        if 'Name' in self.colunasIndesejadas:
            return mantidas
        # 'Name' survived the drop: replace each full name by its title.
        mantidas['Name'] = mantidas['Name'].apply(extraiPronome)
        return mantidas


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    """Keep only the columns that had a numeric dtype when fitted."""

    def fit(self, X, y=None):
        """Record the labels of the numeric-dtype columns of X."""
        somente_numericas = X.select_dtypes(include='number')
        self.colunasNumericas = somente_numericas.columns
        return self

    def transform(self, X, y=None):
        """Return the recorded columns of X as a NumPy array."""
        selecionadas = X[self.colunasNumericas]
        return selecionadas.to_numpy()


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    """Keep only the columns that had the 'object' dtype when fitted."""

    def fit(self, X, y=None):
        """Record the labels of the object-dtype columns of X."""
        somente_objetos = X.select_dtypes(include='object')
        self.colunasCategoricas = somente_objetos.columns
        return self

    def transform(self, X, y=None):
        """Return the recorded columns of X as a NumPy array."""
        selecionadas = X[self.colunasCategoricas]
        return selecionadas.to_numpy()


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

# Numeric branch: select numeric columns, fill gaps with the median,
# then standardize.
_pipe_numerico = Pipeline(steps=[
    ('atributos_numericos', AtributosNumericos()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: select object columns, fill gaps with the most
# frequent value, then one-hot encode (categories unseen during fit are
# ignored at transform time).
_pipe_categorico = Pipeline(steps=[
    ('atributos_categoricos', AtributosCategoricos()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Both branches run on the same input and their outputs are concatenated.
trataAtributos = Pipeline(steps=[
    ('unecaracteristicas', FeatureUnion(transformer_list=[
        ('pipenum', _pipe_numerico),
        ('pipecat', _pipe_categorico),
    ])),
])


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold
import numpy as np

# End-to-end pipeline: column selection, feature preparation, classifier.
pipetotal = Pipeline(steps=[
    ('atributosDesejados', AtributosDesejados()),
    ('trataAtributos', trataAtributos),
    ('classificador', RandomForestClassifier()),
])

# Grid: try the pipeline with and without the 'Name' column; tree depth is
# fixed at 5.
parametros = dict(
    atributosDesejados__excluirName=[True, False],
    classificador__max_depth=[5],
)

modelo = GridSearchCV(
    estimator=pipetotal,
    param_grid=parametros,
    scoring='roc_auc',
    n_jobs=-1,
)

# Outer evaluation of the whole grid search using repeated k-fold splits.
scores = cross_validate(modelo, X, np.ravel(y), cv=RepeatedKFold())

# Last expression of the cell: per-split scores, their mean and their
# standard deviation.
scores['test_score'], scores['test_score'].mean(), scores['test_score'].std()

(array([0.83142324, 0.86537217, 0.91450605, 0.86561129, 0.8484604 ,
        0.87128326, 0.80614887, 0.88379329, 0.89201977, 0.89953704,
        0.84539223, 0.909608  , 0.8295177 , 0.88123701, 0.87564969,
        0.82379518, 0.86609927, 0.84997219, 0.8847481 , 0.91525424,
        0.9163113 , 0.84248425, 0.8592233 , 0.85814145, 0.8439681 ,
        0.89479491, 0.8461073 , 0.8915107 , 0.88413149, 0.81644728,
        0.84065408, 0.86181076, 0.90787623, 0.84027778, 0.87331518,
        0.87503173, 0.82997533, 0.88876529, 0.84499476, 0.88737825,
        0.8802969 , 0.89761417, 0.875129  , 0.86459594, 0.79948718,
        0.83429487, 0.84987226, 0.90158991, 0.91260823, 0.85615079]),
 0.8666853540500573,
 0.02937206431221209)

In [None]:
# Fit the grid search on all training rows, then predict for the test rows.
modelo.fit(X, np.ravel(y))
y_pred = modelo.predict(test)

# Submission file: the test identifiers followed by the predicted label.
result = test[['PassengerId']].assign(Survived=y_pred)
result.to_csv('submission.csv', index=False)