In [39]:
import pandas as pd

# Toggle that controls whether every candidate classifier is (re)trained when
# searching for the best one to use.
TREINAR_TODOS = False

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Features are the columns shared with the test file. Take an explicit copy:
# later cells fill and add columns on X, and doing that on a selection of
# `train` is what triggers pandas' SettingWithCopyWarning.
X = train[list(test.columns)].copy()
# Target: the column(s) present only in the training file (kept as a DataFrame).
y = train[train.columns[~train.columns.isin(test.columns)]]

In [40]:
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [41]:
# Fill missing values: numeric columns receive their own column mean, and every
# remaining gap (e.g. Cabin) receives the placeholder "unknown" so it can be
# treated as its own category later.
# numeric_only=True restricts the mean to numeric columns; without it,
# DataFrame.mean() raises on frames that also hold text columns in recent pandas.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run.
X = X.fillna(X.mean(numeric_only=True)).fillna("unknown")
test = test.fillna(test.mean(numeric_only=True)).fillna("unknown")

In [42]:
#extraindo mais caracteristicas
def extraiPronome(nome):
    """Return the title found between the first comma and the following period.

    For "Braund, Mr. Owen Harris" this yields "Mr". Raises IndexError when the
    string contains no comma.
    """
    depois_da_virgula = nome.split(',')[1]
    titulo, _, _ = depois_da_virgula.partition('.')
    return titulo.strip()

def extraiSobrenome(name):
    """Return everything before the first comma (the surname in "Surname, Title. Given")."""
    sobrenome, _, _ = name.partition(',')
    return sobrenome

def extraiPrefixoCabine(cabine):
    """Return the first character of a cabin code, or "Z" for the "unknown" placeholder.

    "Z" is a made-up category standing in for cabins that were not filled in.
    """
    return "Z" if cabine == "unknown" else cabine[0]

# Add the title ("Pronome") and surname ("Sobrenome") columns, both derived from
# Name, to the training features and to the test set.
# The Cabin-prefix extraction is left disabled in this cell.
for _quadro in (X, test):
    _quadro["Pronome"] = _quadro["Name"].apply(extraiPronome)
    _quadro["Sobrenome"] = _quadro["Name"].apply(extraiSobrenome)

X.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pronome,Sobrenome
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,unknown,S,Mr,Braund
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Cumings
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,unknown,S,Miss,Heikkinen
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Futrelle
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,unknown,S,Mr,Allen


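A ideia é extrair apenas o primeiro caractere do campo Cabin (o que é feito mais adiante, dentro do transformador AtributosDesejados) e verificar se isso ajuda na classificação. Além disso, os campos Pronome e Sobrenome foram adicionados a partir do campo Name.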

In [43]:
from sklearn.base import BaseEstimator, TransformerMixin

def extraiPronome(nome):
    """Return the title found between the first comma and the following period.

    For "Braund, Mr. Owen Harris" this yields "Mr". Raises IndexError when the
    string contains no comma.
    """
    depois_da_virgula = nome.split(',')[1]
    titulo, _, _ = depois_da_virgula.partition('.')
    return titulo.strip()

def extraiSobrenome(name):
    """Return everything before the first comma (the surname in "Surname, Title. Given")."""
    sobrenome, _, _ = name.partition(',')
    return sobrenome

def extraiPrefixoCabine(cabine):
    """Return the first character of a cabin code, or "Z" for the "unknown" placeholder.

    "Z" is a made-up category standing in for cabins that were not filled in.
    """
    return "Z" if cabine == "unknown" else cabine[0]

class AtributosDesejados(BaseEstimator, TransformerMixin):
    """Drop unwanted columns and condense the Name / Cabin columns that remain.

    'PassengerId' and 'Ticket' are always dropped. 'Name', 'Cabin' and 'Sobrenome'
    are dropped only when the matching ``excluir*`` flag is truthy. A kept 'Name'
    is replaced by ``extraiPronome`` of it, and a kept 'Cabin' by
    ``extraiPrefixoCabine`` of it.
    """

    def __init__(self, excluirName=True, excluirCabin=False, excluirSobrenome=False):
        self.excluirName = excluirName
        self.excluirCabin = excluirCabin
        self.excluirSobrenome = excluirSobrenome

    def fit(self, X, y=None):
        """Compute the list of columns to drop from the current flags; returns self."""
        opcionais = (
            ('Name', self.excluirName),
            ('Cabin', self.excluirCabin),
            ('Sobrenome', self.excluirSobrenome),
        )
        self.colunasIndesejadas = ['PassengerId', 'Ticket'] + [
            coluna for coluna, excluir in opcionais if excluir
        ]
        return self

    def transform(self, X, y=None):
        """Return a new frame without the unwanted columns, condensing Name/Cabin if kept."""
        restante = X.drop(columns=self.colunasIndesejadas)
        for coluna, extrator in (('Name', extraiPronome), ('Cabin', extraiPrefixoCabine)):
            if coluna not in self.colunasIndesejadas:
                restante[coluna] = restante[coluna].apply(extrator)
        return restante

In [44]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    """Select the columns that were numeric at fit time and return them as an array."""

    def fit(self, X, y=None):
        """Remember the names of the numeric-dtype columns of ``X``; returns self."""
        somente_numericas = X.select_dtypes(include='number')
        self.colunasNumericas = somente_numericas.columns
        return self

    def transform(self, X, y=None):
        """Return the remembered columns of ``X`` as a NumPy array."""
        selecionadas = X[self.colunasNumericas]
        return selecionadas.to_numpy()

In [45]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    """Select the columns that had object dtype at fit time and return them as an array."""

    def fit(self, X, y=None):
        """Remember the names of the object-dtype columns of ``X``; returns self."""
        somente_objetos = X.select_dtypes(include='object')
        self.colunasCategoricas = somente_objetos.columns
        return self

    def transform(self, X, y=None):
        """Return the remembered columns of ``X`` as a NumPy array."""
        selecionadas = X[self.colunasCategoricas]
        return selecionadas.to_numpy()

In [46]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

# Shared preprocessing pipeline: a FeatureUnion that places the processed numeric
# features next to the processed categorical features.
# The step names below are part of the parameter paths used by grid searches
# (e.g. 'trataAtributos__...'), so they should not be renamed casually.
trataAtributos = Pipeline([
    ('unecaracteristicas', FeatureUnion([
        # Numeric branch: pick numeric columns, impute with the median, standardize.
        ('pipenum', Pipeline([
            ('atributos_numericos', AtributosNumericos()),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])),
        # Categorical branch: pick object columns, impute with the most frequent
        # value, one-hot encode (categories unseen at fit time are ignored).
        ('pipecat', Pipeline([
            ('atributos_categoricos', AtributosCategoricos()),
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]))
    ])),
])

In [70]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold

#classifier = RandomForestClassifier()
def TreinarPadrao(classifier):
    """Cross-validate a grid-searched pipeline built around ``classifier``.

    The pipeline chains ``AtributosDesejados``, the shared ``trataAtributos``
    preprocessing and the given classifier. The grid toggles ``excluirName`` and
    pins ``max_depth`` to 5, so ``classifier`` must accept a ``max_depth``
    parameter. The notebook-level ``X`` and ``y`` are used as data.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn classifier used as the last pipeline step.

    Returns
    -------
    tuple
        ``(test_scores, mean, std)``: the per-split test scores from
        ``cross_validate`` plus their mean and standard deviation. (Previously
        this tuple was computed and discarded, so the function returned None.)
    """
    pipetotal = Pipeline([
        ('atributosDesejados', AtributosDesejados()),
        ('trataAtributos', trataAtributos),
        ('classificador', classifier)
    ])

    parametros = {
        'atributosDesejados__excluirName': [True, False],
        'atributosDesejados__excluirCabin': [True],
        # Pinned to a single value because the search is slow (the original note
        # says this was settled after a few trial runs).
        'atributosDesejados__excluirSobrenome': [True],
        'classificador__max_depth': [5]
    }

    modelo = GridSearchCV(pipetotal, param_grid=parametros)

    # The grid search itself is cross-validated by the outer RepeatedKFold.
    scores = cross_validate(modelo, X, y, cv=RepeatedKFold())
    test_scores = scores['test_score']
    return test_scores, np.mean(test_scores), np.std(test_scores)

In [71]:
def FitAndPrintModel(modelo, filename='submission.csv'):
    """Fit ``modelo`` on the notebook-level ``X``/``y`` and write test predictions to CSV.

    Parameters
    ----------
    modelo : estimator
        Object exposing ``fit`` and ``predict``.
    filename : str, optional
        Output path; defaults to 'submission.csv', the path that used to be
        hard-coded.

    The output has two columns, 'PassengerId' (from the notebook-level ``test``
    frame) and 'Survived' (the predictions), and no index column.
    """
    modelo.fit(X, y)
    y_pred = modelo.predict(test)
    # assign() builds a new frame, which avoids setting a column on a slice of `test`.
    result = test[['PassengerId']].assign(Survived=y_pred)
    result.to_csv(filename, index=False)

### Otimizando parâmetros para SVM, LogisticRegression, RandomForest e KNN

Tentaremos otimizar os parâmetros destes quatro classificadores para depois utilizá-los em um stacking no final.

In [52]:
#separando parte da base de treino e teste para validação do modelo e otimização de parâmetros
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Hold out part of the labelled data for model validation and parameter tuning.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Only the last expression of a cell is rendered, so the shapes are printed
# explicitly; as a bare tuple in the middle of the cell they were never shown.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
X_train


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pronome,Sobrenome
298,299,1,"Saalfeld, Mr. Adolphe",male,29.699118,0,0,19988,30.5000,C106,S,Mr,Saalfeld
884,885,3,"Sutehall, Mr. Henry Jr",male,25.000000,0,0,SOTON/OQ 392076,7.0500,unknown,S,Mr,Sutehall
247,248,2,"Hamalainen, Mrs. William (Anna)",female,24.000000,0,2,250649,14.5000,unknown,S,Mrs,Hamalainen
478,479,3,"Karlsson, Mr. Nils August",male,22.000000,0,0,350060,7.5208,unknown,S,Mr,Karlsson
305,306,1,"Allison, Master. Hudson Trevor",male,0.920000,1,2,113781,151.5500,C22 C26,S,Master,Allison
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.000000,0,0,343120,7.6500,unknown,S,Miss,Salkjelsvik
270,271,1,"Cairns, Mr. Alexander",male,29.699118,0,0,113798,31.0000,unknown,S,Mr,Cairns
860,861,3,"Hansen, Mr. Claus Peter",male,41.000000,2,0,350026,14.1083,unknown,S,Mr,Hansen
435,436,1,"Carter, Miss. Lucile Polk",female,14.000000,1,2,113760,120.0000,B96 B98,S,Miss,Carter


In [86]:
#Construindo um stacking de RandomForest aplicando GridSearch para otimizar os parâmetros do classificador e utilizá-lo no final
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, mean_squared_error

def BuildTestModel(model, parameters, x_tr, y_tr):
    """Grid-search ``model`` inside the full pipeline, report its CV score, return it.

    Parameters
    ----------
    model : estimator
        Classifier placed as the last pipeline step ('classificador').
    parameters : sequence of [name, values]
        Hyperparameters of ``model`` to search; each name is prefixed with
        'classificador__' before being added to the grid.
    x_tr, y_tr
        Data used both to fit the search and to cross-validate it.

    Returns
    -------
    GridSearchCV
        The search object, already fitted on ``x_tr``/``y_tr``.

    The mean and standard deviation of the cross-validated test scores are printed.
    """
    etapas = [
        ('atributosDesejados', AtributosDesejados()),
        ('trataAtributos', trataAtributos),
        ('classificador', model),
    ]

    # Feature-selection flags are fixed; only the classifier parameters vary.
    grade = {
        'atributosDesejados__excluirName': [True],
        'atributosDesejados__excluirCabin': [False],
        'atributosDesejados__excluirSobrenome': [False],
    }
    for item in parameters:
        grade['classificador__' + item[0]] = item[1]

    busca = GridSearchCV(Pipeline(etapas), param_grid=grade)
    busca.fit(x_tr, y_tr)

    # Score the whole search procedure with an outer repeated k-fold.
    resultado_cv = cross_validate(busca, x_tr, y_tr, cv=RepeatedKFold())
    notas = resultado_cv['test_score']
    print("Score médio:", np.mean(notas), "\nDesvio P.:", np.std(notas))

    return busca

### SVM

In [59]:
if TREINAR_TODOS:
    # Hyperparameter grid for the SVC, as [name, candidate values] pairs.
    parSVM = [
        ['C', [0.1, 1, 2]],
        ['gamma', [0.01, 0.001]],
        ['kernel', ['rbf', 'poly', 'sigmoid']],
    ]

    modSVM = BuildTestModel(SVC(), parSVM, X_train, y_train)
    modSVM.best_params_
    modSVM.best_estimator_

Score médio: 0.8139131410616092 
Desvio P.: 0.030936769167001683


Pipeline(steps=[('atributosDesejados', AtributosDesejados()),
                ('trataAtributos',
                 Pipeline(steps=[('unecaracteristicas',
                                  FeatureUnion(transformer_list=[('pipenum',
                                                                  Pipeline(steps=[('atributos_numericos',
                                                                                   AtributosNumericos()),
                                                                                  ('imputer',
                                                                                   SimpleImputer(strategy='median')),
                                                                                  ('scaler',
                                                                                   StandardScaler())])),
                                                                 ('pipecat',
                                                                  Pipel

### Logistic Regression

In [60]:
from sklearn.linear_model import LogisticRegression

if TREINAR_TODOS:
    # Hyperparameter grid for LogisticRegression, as [name, candidate values] pairs.
    # NOTE(review): not every solver/penalty pair here looks supported by
    # LogisticRegression - confirm that the failing combinations are meant to be
    # skipped by the grid search rather than removed from the grid.
    parLR = [
        ['solver', ['newton-cg', 'lbfgs', 'liblinear']],
        ['penalty', ['none', 'l1', 'l2', 'elasticnet']],
        ['C', [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]],
    ]

    modLR = BuildTestModel(LogisticRegression(), parLR, X_train, y_train)
    modLR.best_params_
    modLR.best_estimator_

Score médio: 0.8323319492761755 
Desvio P.: 0.02944056791107692


Pipeline(steps=[('atributosDesejados', AtributosDesejados()),
                ('trataAtributos',
                 Pipeline(steps=[('unecaracteristicas',
                                  FeatureUnion(transformer_list=[('pipenum',
                                                                  Pipeline(steps=[('atributos_numericos',
                                                                                   AtributosNumericos()),
                                                                                  ('imputer',
                                                                                   SimpleImputer(strategy='median')),
                                                                                  ('scaler',
                                                                                   StandardScaler())])),
                                                                 ('pipecat',
                                                                  Pipel

### Random Forest

In [87]:
# This cell always runs (it is not guarded by TREINAR_TODOS).
if True:
    # Hyperparameter grid for the random forest, as [name, candidate values] pairs.
    parRF = []
    parRF.append(['max_depth', [5, 10, 15]])
    parRF.append(['criterion', ['gini', 'entropy']])
    # 'auto' was removed from the candidates: for classifiers it meant the same as
    # 'sqrt' (so it only duplicated work) and newer scikit-learn releases reject it.
    parRF.append(['max_features', ['sqrt', 'log2']])

    modRF = BuildTestModel(RandomForestClassifier(), parRF, X, y)
    #modRF.best_params_
    #modRF.best_estimator_

Score médio: 0.826274559035842 
Desvio P.: 0.02609843400632406


### KNN

In [61]:
from sklearn.neighbors import KNeighborsClassifier

if TREINAR_TODOS:

    # Hyperparameter grid for KNN, as [name, candidate values] pairs.
    parKNN = []
    parKNN.append(['n_neighbors', [13, 15, 17]])
    parKNN.append(['weights', ['uniform', 'distance']])
    # Metric name fixed: it was misspelled 'euclidian', which is not a valid
    # metric, so every candidate using it failed to fit and only 'manhattan'
    # was effectively evaluated.
    parKNN.append(['metric', ['euclidean', 'manhattan']])

    modKNN = BuildTestModel(KNeighborsClassifier(), parKNN, X_train, y_train)
    modKNN.best_estimator_
    modKNN.best_params_

Score médio: 0.8223252160251374 
Desvio P.: 0.0286493155483302


{'atributosDesejados__excluirCabin': False,
 'atributosDesejados__excluirName': True,
 'atributosDesejados__excluirSobrenome': False,
 'classificador__metric': 'manhattan',
 'classificador__n_neighbors': 17,
 'classificador__weights': 'uniform'}

LogisticRegression e RandomForest tiveram bons resultados. Então, utilizarei os parâmetros encontrados no modelo inicial (treinado com parte dos dados) para definir o modelo final com todos os dados.

In [90]:
#Vamos treinar os dados conforme o modelo definido e gerar uma saida
# Train the chosen configurations on the full data set; no extra classifier
# parameters are searched (empty list), the estimators are passed pre-configured.
print("Logistic Regression")
modLRFinal = BuildTestModel(
    LogisticRegression(solver='newton-cg', penalty='l2', C=1),
    parameters=[],
    x_tr=X,
    y_tr=y,
)

print("Random Forest")
modRFFinal = BuildTestModel(
    RandomForestClassifier(max_depth=15, max_features='log2', criterion='entropy'),
    parameters=[],
    x_tr=X,
    y_tr=y,
)

Logistic Regression
Score médio: 0.8321065846462872 
Desvio P.: 0.027270511737228956
Random Forest
Score médio: 0.8233331240976713 
Desvio P.: 0.024679437495963952


In [91]:
from sklearn.ensemble import BaggingClassifier

# Same logistic-regression configuration, wrapped in a bagging ensemble.
print("Logistic Regression")
modLRBagging = BuildTestModel(
    BaggingClassifier(LogisticRegression(solver='newton-cg', penalty='l2', C=1)),
    parameters=[],
    x_tr=X,
    y_tr=y,
)

Logistic Regression
Score médio: 0.8316527524951353 
Desvio P.: 0.02420116747138205


In [92]:
def FitPredictToCsv(model, filename, x_tr, y_tr):
    """Fit ``model``, print its cross-validated score and write test predictions to CSV.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``.
    filename : str
        Output path without extension; '.csv' is appended.
    x_tr, y_tr
        Training data used for fitting and for the cross-validation report.

    Predictions are made on the notebook-level ``test`` frame. The output has the
    columns 'PassengerId' and 'Survived' and no index column.
    """
    model.fit(x_tr, y_tr)
    y_pred = model.predict(test)
    s = cross_validate(model, x_tr, y_tr, cv=RepeatedKFold())
    print("SCORE FINAL\nMédio:", np.mean(s['test_score']), "\nDesvio P.:", np.std(s['test_score']))

    # assign() builds a new frame, which avoids setting a column on a slice of `test`.
    result = test[['PassengerId']].assign(Survived=y_pred)
    result.to_csv(filename + '.csv', index=False)

def PredictToCsv(model, filename):
    """Write the predictions of an already fitted ``model`` for ``test`` to CSV.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    filename : str
        Output path without extension; '.csv' is appended.

    Predictions are made on the notebook-level ``test`` frame. The output has the
    columns 'PassengerId' and 'Survived' and no index column.
    """
    y_pred = model.predict(test)

    # assign() builds a new frame, which avoids setting a column on a slice of `test`.
    result = test[['PassengerId']].assign(Survived=y_pred)
    result.to_csv(filename + '.csv', index=False)

In [93]:
# Export the final logistic-regression predictions as submission_v2.csv.
PredictToCsv(model=modLRFinal, filename="submission_v2")