In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Fix NumPy's global random seed so that repeated runs draw the same random numbers.
np.random.seed(0)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for input_dir, _, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(input_dir, name) for name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Load the Titanic train and test CSV files into DataFrames.
# `dftreino` is the training frame, `dfteste` the test frame.
dftreino = pd.read_csv("../input/titanic/train.csv")
dfteste = pd.read_csv("../input/titanic/test.csv")

# Categorical feature candidates: text (object) columns of the training frame
# with fewer than 10 distinct values, skipping the target and identifier-like columns.
excluded_cols = ["Survived", "Name", "PassengerId", "Ticket", "Cabin"]
catfeat = [
    coln
    for coln in dftreino.columns
    if dftreino[coln].dtype == 'object'
    and dftreino[coln].nunique() < 10
    # Plain `not in`: appending `== True` would chain the comparisons and also
    # compare the list itself with True, which is always False.
    and coln not in excluded_cols
]

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def pipelinecreator(data,encoder,model,numerical_imputer,categorical_imputer):
    """Build a preprocessing + model pipeline, fit it on a train split and report accuracy.

    Parameters
    ----------
    data : DataFrame
        Frame containing the feature columns and a 'Survived' target column.
    encoder : transformer
        Encoder applied to the categorical columns after imputation. The object
        is placed directly into the pipeline.
    model : estimator
        Final estimator of the pipeline. The object is placed directly into the
        pipeline.
    numerical_imputer : str
        `strategy` passed to the SimpleImputer used for numeric columns.
    categorical_imputer : str
        `strategy` passed to the SimpleImputer used for categorical columns.

    Returns
    -------
    Pipeline
        The pipeline fitted on the 80% training split only (not on all of `data`).

    Side effects
    ------------
    Prints the encoder, the numeric imputing strategy and the accuracy obtained
    on the 20% validation split.
    """
    # Target and identifier-like columns are never used as features.
    excluded = ['Survived', 'Name', 'PassengerId', 'Ticket', 'Cabin']

    # Categorical features: low-cardinality text columns.
    catfeat = [coln for coln in data.columns
               if data[coln].dtype == 'object'
               and data[coln].nunique() < 10
               and coln not in excluded]
    # Numeric features: float/int columns.
    numfeat = [coln for coln in data.columns
               if data[coln].dtype in ['float64', 'int64'] and coln not in excluded]
    features = catfeat + numfeat

    target = data['Survived']
    X = data[features]
    X_train, X_valid, y_train, y_valid = train_test_split(X, target, test_size=0.2, random_state=0)

    # Preprocessing: impute numeric columns; impute then encode categorical columns.
    numerical_transformer = SimpleImputer(strategy=numerical_imputer)

    categorical_transformer = Pipeline(steps=[
        # Honour the caller's strategy instead of a hard-coded 'most_frequent'.
        ('imputer', SimpleImputer(strategy=categorical_imputer)),
        ('encode', encoder)])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numfeat),
        ('cat', categorical_transformer, catfeat)])

    # Final pipeline: preprocessing followed by the model.
    pipelinef = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

    # Fit on the training split and score on the held-out validation split.
    pipelinef.fit(X_train, y_train)
    pred = pipelinef.predict(X_valid)
    score = accuracy_score(y_valid, pred)  # documented order: (y_true, y_pred)
    percscore = (score * 100)
    print('Encoder: %s , imputing: %s,score: %.3f,score em porcentagem: %d%%'%(encoder,numerical_imputer,score,percscore))
    return (pipelinef)

In [4]:
# Candidate settings to compare: numeric imputation strategies and categorical encoders.
# NOTE(review): newer scikit-learn releases appear to rename OneHotEncoder's `sparse`
# keyword to `sparse_output` - confirm against the installed version.
strat = [
    'mean',
    'median',
    'most_frequent',
    'constant',
]
encoders = [
    OneHotEncoder(handle_unknown='ignore', sparse=False),
    OrdinalEncoder(),
]

In [5]:
# Try every encoder / numeric-imputation combination with a random forest;
# pipelinecreator prints the validation accuracy of each combination.
categorical_imputer = 'most_frequent'
for candidate_encoder in encoders:
    for numeric_strategy in strat:
        pipelinecreator(
            dftreino,
            candidate_encoder,
            RandomForestClassifier(),
            numeric_strategy,
            categorical_imputer,
        )

Encoder: OneHotEncoder(handle_unknown='ignore', sparse=False) , imputing: mean,score: 0.860,score em porcentagem: 86%
Encoder: OneHotEncoder(handle_unknown='ignore', sparse=False) , imputing: median,score: 0.849,score em porcentagem: 84%
Encoder: OneHotEncoder(handle_unknown='ignore', sparse=False) , imputing: most_frequent,score: 0.855,score em porcentagem: 85%
Encoder: OneHotEncoder(handle_unknown='ignore', sparse=False) , imputing: constant,score: 0.832,score em porcentagem: 83%
Encoder: OrdinalEncoder() , imputing: mean,score: 0.816,score em porcentagem: 81%
Encoder: OrdinalEncoder() , imputing: median,score: 0.827,score em porcentagem: 82%
Encoder: OrdinalEncoder() , imputing: most_frequent,score: 0.838,score em porcentagem: 83%
Encoder: OrdinalEncoder() , imputing: constant,score: 0.804,score em porcentagem: 80%


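MODELO QUE SE SAIU MELHOR NOS TESTES
encoder: onehot
estratégia imputing: most_frequent

Observação: na execução exibida acima, a estratégia `mean` ficou ligeiramente à frente (0.860 contra 0.855 de `most_frequent`); a diferença é pequena e a ordem entre as duas pode mudar de uma execução para outra.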

In [6]:
# 8.3
# Cross-validation
from sklearn.model_selection import cross_val_score

# Feature columns and target used for cross-validation
features = ['Sex', 'Embarked', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
X = dftreino[features]
y = dftreino['Survived']  # target

# Build the pipeline with the helper function
# (the helper also fits on its own train split and prints that validation score).
my_pipeline = pipelinecreator(
    dftreino,
    OneHotEncoder(handle_unknown='ignore', sparse=False),
    RandomForestClassifier(),
    'most_frequent',
    categorical_imputer,
)

# 5-fold cross-validated accuracy; report the mean over the folds
scores = cross_val_score(my_pipeline, X, y, cv=5, scoring='accuracy')
print(scores.mean())

Encoder: OneHotEncoder(handle_unknown='ignore', sparse=False) , imputing: most_frequent,score: 0.855,score em porcentagem: 85%
0.8081036971941498


In [7]:
# 8.3
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Same preprocessing as before, with a gradient-boosting model as the final step.
# NOTE(review): n_iter_no_change=100 presumably equals the default number of boosting
# stages, in which case early stopping can never trigger - confirm this is intended.
my_pipelinexg = pipelinecreator(
    dftreino,
    OneHotEncoder(handle_unknown='ignore', sparse=False),
    GradientBoostingClassifier(n_iter_no_change=100, random_state=0),
    'most_frequent',
    categorical_imputer,
)

# 5-fold cross-validated accuracy on the X / y defined in the cross-validation cell
scoresxg = cross_val_score(my_pipelinexg, X, y, cv=5, scoring='accuracy')
print(scoresxg.mean())

Encoder: OneHotEncoder(handle_unknown='ignore', sparse=False) , imputing: most_frequent,score: 0.810,score em porcentagem: 81%
0.8406628585776159


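O GradientBoosting se saiu melhor que o RandomForest na validação cruzada de 5 folds (acurácia média de ≈0.84 contra ≈0.81), se mantendo quase sempre em 0.84. No split único de validação, porém, o RandomForest teve 0.855 contra 0.810 do GradientBoosting.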

In [8]:
# Train on the complete training data and write the Kaggle submission file.
fxgpipeline = pipelinecreator(dftreino,OneHotEncoder(handle_unknown='ignore', sparse=False),GradientBoostingClassifier(n_iter_no_change=100,random_state=0),'most_frequent',categorical_imputer)

# pipelinecreator fits only on its internal 80% training split, so refit the
# whole pipeline on every labelled row before predicting. The preprocessor
# selects its feature columns by name, so the remaining columns can be passed as-is.
fxgpipeline.fit(dftreino.drop(columns=['Survived']), dftreino['Survived'])

ind = dfteste['PassengerId']
predf = fxgpipeline.predict(dfteste)

# One row per test passenger: id and predicted survival label.
output = pd.DataFrame({'PassengerId' : ind,'Survived' : predf })
output.to_csv('submission.csv',index = False)

Encoder: OneHotEncoder(handle_unknown='ignore', sparse=False) , imputing: most_frequent,score: 0.810,score em porcentagem: 81%
