In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(0)

import os
# Show every file that ships with the attached Kaggle datasets.
input_files = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file in input_files:
    print(input_file)

# Read the two Titanic competition splits into DataFrames.
test = pd.read_csv('/kaggle/input/titanic/test.csv')
train = pd.read_csv('/kaggle/input/titanic/train.csv')

In [None]:
# Print dtypes and non-null counts for both raw splits (test first, then train).
for raw_frame in (test, train):
    raw_frame.info()

In [2]:
# Build the working frames without Ticket and Cabin; the raw frames stay untouched.
discarded_columns = ['Ticket', 'Cabin']
test2 = test.drop(discarded_columns, axis = 1)
train2 = train.drop(discarded_columns, axis = 1)

In [3]:
# Learn the fill values from the training split only and reuse them for the
# test split, so both splits go through exactly the same preprocessing.
train_age_mean = train2['Age'].mean()
train_fare_mean = train2['Fare'].mean()
train_embarked_mode = train2['Embarked'].mode()[0]

# Training split: Age and Embarked are filled.
train2['Age'] = train2['Age'].fillna(train_age_mean)
train2['Embarked'] = train2['Embarked'].fillna(train_embarked_mode)

# Test split: Age and Fare are filled, using the training statistics.
test2['Age'] = test2['Age'].fillna(train_age_mean)
test2['Fare'] = test2['Fare'].fillna(train_fare_mean)


In [4]:
# Learn the bin edges on the training split: 5 equal-width Age bins and
# 6 equal-frequency Fare bins. `retbins=True` also returns the edges.
train2['Age_group'], age_edges = pd.cut(train2['Age'], bins = 5, retbins = True)
train2['Fare_group'], fare_edges = pd.qcut(train2['Fare'], 6, retbins = True)

# Reuse the training edges for the test split so that a given group covers the
# same value range in both frames. The outermost edges are opened to +/- inf so
# test values outside the training range fall into the first/last bin instead
# of becoming NaN.
age_edges = np.concatenate(([-np.inf], age_edges[1:-1], [np.inf]))
fare_edges = np.concatenate(([-np.inf], fare_edges[1:-1], [np.inf]))
test2['Age_group'] = pd.cut(test2['Age'], bins = age_edges)
test2['Fare_group'] = pd.cut(test2['Fare'], bins = fare_edges)

In [5]:
# Display the columns still stored with the generic `object` dtype.
train2.select_dtypes(include = 'object')

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Nominal features that need one indicator column per category, and helper
# columns that are removed before modelling (the net effect of this cell has
# always been to discard Age_group, Fare_group and Name).
nominal_columns = ['Sex', 'Embarked']
columns_not_modelled = ['Age_group', 'Fare_group', 'Name']

# pd.get_dummies replaces each nominal column with float indicator columns
# (e.g. Sex_female / Sex_male). A 2-D one-hot matrix cannot be stored in a
# single DataFrame column, so the expansion into several columns is required
# to keep the full encoding.
train2 = pd.get_dummies(train2.drop(columns = columns_not_modelled),
                        columns = nominal_columns, dtype = float)
test2 = pd.get_dummies(test2.drop(columns = columns_not_modelled),
                       columns = nominal_columns, dtype = float)

# Give the test frame exactly the training feature columns, in the same order.
# An indicator that only exists in the training data is added as all zeros.
feature_columns = train2.columns.drop('Survived')
test2 = test2.reindex(columns = feature_columns, fill_value = 0.0)


In [None]:
### Model training and validation start here ###
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Separate the target from the predictors, then keep 80% of the rows for
# training and the remaining rows for validation (fixed seed for the split).
trainY = train2['Survived']
trainX = train2.drop('Survived', axis = 1)
x_train, x_val, y_train, y_val = train_test_split(
    trainX,
    trainY,
    train_size = 0.8,
    random_state = 1,
)


In [None]:
# Pin the forest's own random_state so the fitted model (and the accuracy
# reported below) is the same no matter which cells ran before this one;
# relying on the global NumPy seed only works for a strict top-to-bottom run.
mod_for = RandomForestClassifier(random_state = 0)
mod_for.fit(x_train, y_train)

In [None]:
# Predict the validation rows and report the share of correct predictions.
pred_for = mod_for.predict(x_val)
validation_accuracy = accuracy_score(y_val, pred_for)
print(validation_accuracy)

In [None]:
###Pipelines###
# Work on a copy: a plain `train2 = train` would make both names point to the
# same DataFrame, so the derived columns below would be written into the raw
# `train` frame as well.
train2 = train.copy()

# Learn the bin edges on the training data: 5 equal-width Age bins and
# 6 equal-frequency Fare bins. `retbins=True` also returns the edges.
train2['Age_group'], age_edges = pd.cut(train2['Age'], bins = 5, retbins = True)
train2['Fare_group'], fare_edges = pd.qcut(train2['Fare'], 6, retbins = True)

# Reuse the training edges for the test frame so a given group covers the same
# value range in both frames. The outermost edges are opened to +/- inf so test
# values outside the training range still fall into the first/last bin.
age_edges = np.concatenate(([-np.inf], age_edges[1:-1], [np.inf]))
fare_edges = np.concatenate(([-np.inf], fare_edges[1:-1], [np.inf]))
test2['Age_group'] = pd.cut(test2['Age'], bins = age_edges)
test2['Fare_group'] = pd.cut(test2['Fare'], bins = fare_edges)

In [None]:
# Summarise columns, dtypes and non-null counts of the frame that is passed to the pipeline experiments.
train2.info()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def pipeline(data, encoder, model, numerical_imputer = SimpleImputer(),
             categorical_imputer = SimpleImputer(strategy = "most_frequent"),
             numerical_features = ("Age",),
             categorical_features = ("Age_group", "Fare_group", "Sex", "Embarked"),
             target = "Survived"):
    """Build, fit and score a preprocessing + model pipeline on a hold-out split.

    The data is split 80/20 (``random_state=0``). Numerical features are passed
    through ``numerical_imputer``; categorical features are passed through
    ``categorical_imputer`` followed by ``encoder``. The preprocessed features
    are then fed to ``model``, which is fitted as the final pipeline step.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``target`` column and every listed feature column.
    encoder : estimator
        Encoding strategy for the categorical features (for example a one-hot
        or an ordinal encoder).
    model : estimator
        Algorithm used to produce the model (for example a random forest
        classifier).
    numerical_imputer : estimator, optional
        Strategy used to replace missing numerical values (mean by default).
    categorical_imputer : estimator, optional
        Strategy used to replace missing categorical values (most frequent
        value by default).
    numerical_features : sequence of str, optional
        Names of the numerical feature columns.
    categorical_features : sequence of str, optional
        Names of the categorical feature columns.
    target : str, optional
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(accuracy, predictions)``: the validation accuracy as a percentage
        rounded to two decimals, and the predictions for the validation rows.
    """
    # Separate features and target, then hold out 20% of the rows for validation.
    X_pipe = data.drop(columns = target)
    y_pipe = data[target]
    X_train_pipe, X_valid_pipe, y_train_pipe, y_valid_pipe = train_test_split(
        X_pipe, y_pipe, test_size = 0.2, random_state = 0)

    # Preprocessing: one branch for the numerical features, one for the
    # categorical features (impute first, then encode).
    num_transf = numerical_imputer
    cat_transf = Pipeline(steps = [("imputer", categorical_imputer), ("encoder", encoder)])
    preprocessor = ColumnTransformer(transformers = [
        ("num", num_transf, list(numerical_features)),
        ("cat", cat_transf, list(categorical_features)),
    ])

    # Chain preprocessing and model, fit on the training rows and predict the
    # validation rows.
    pipe = Pipeline(steps = [("preprocessor", preprocessor), ("model", model)])
    pipe.fit(X_train_pipe, y_train_pipe)
    preds_pipe = pipe.predict(X_valid_pipe)

    # accuracy_score expects (y_true, y_pred); the result is reported in percent.
    accuracy = round(accuracy_score(y_valid_pipe, preds_pipe) * 100, 2)

    return accuracy, preds_pipe


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# Candidate imputers tried for the numerical features.
Mean = SimpleImputer()  # default strategy
Mode = SimpleImputer(strategy = "most_frequent")
Median = SimpleImputer(strategy = "median")
zero_imputer = SimpleImputer(strategy = "constant", fill_value = 0)
imputers = [Mean, Mode, Median, zero_imputer]

# Candidate encoders tried for the categorical features.
# The dense-output switch is deliberately not passed: its keyword was renamed
# from `sparse` to `sparse_output` across scikit-learn releases, so either
# spelling fails on some versions. The random forest used in the experiments
# accepts sparse input, so the default output works everywhere.
# handle_unknown="ignore" encodes a category that was not seen during fitting
# as all zeros instead of raising at prediction time.
onehot_encoder = OneHotEncoder(handle_unknown = "ignore")
# NOTE(review): OrdinalEncoder still raises on categories unseen during fit.
ordinal_encoder = OrdinalEncoder()
encoders = [onehot_encoder, ordinal_encoder]

In [None]:
# Score every imputer/encoder combination with a fresh random forest.
# The forest gets a fixed random_state so that differences between the printed
# accuracies come from the preprocessing choice, not from a different random
# stream for each configuration.
for imputer in imputers:
    for encoder in encoders:
        # pipeline() returns (accuracy in percent, validation predictions).
        grid_accuracy, _grid_preds = pipeline(train2, encoder,
                                              RandomForestClassifier(random_state = 0),
                                              numerical_imputer = imputer)
        print("pipe_"+str(encoder)+'_'+str(imputer)+" = "+str(grid_accuracy))

In [14]:
###Gradient Boosting###
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Reload the raw data so this section starts from a clean state.
test = pd.read_csv('/kaggle/input/titanic/test.csv')
train = pd.read_csv('/kaggle/input/titanic/train.csv')

# Columns that are not used as model inputs.
# Age_group / Fare_group are not derived here: this cell used to create them
# and drop them again before modelling, so they never reached the model.
columns_not_modelled = ['Ticket', 'Cabin', 'Name']
test2 = test.drop(columns = columns_not_modelled)
train2 = train.drop(columns = columns_not_modelled)

# Learn the fill values from the training split only and reuse them for the
# test split, so both splits go through exactly the same preprocessing.
train_age_mean = train2['Age'].mean()
train_fare_mean = train2['Fare'].mean()
train_embarked_mode = train2['Embarked'].mode()[0]

train2['Age'] = train2['Age'].fillna(train_age_mean)
train2['Embarked'] = train2['Embarked'].fillna(train_embarked_mode)
test2['Age'] = test2['Age'].fillna(train_age_mean)
test2['Fare'] = test2['Fare'].fillna(train_fare_mean)

# One-hot encode the nominal features into float indicator columns
# (e.g. Sex_female / Sex_male). A 2-D one-hot matrix cannot be stored in a
# single DataFrame column, so each category gets its own column.
nominal_columns = ['Sex', 'Embarked']
train2 = pd.get_dummies(train2, columns = nominal_columns, dtype = float)
test2 = pd.get_dummies(test2, columns = nominal_columns, dtype = float)

# Give the test frame exactly the training feature columns, in the same order.
# An indicator that only exists in the training data is added as all zeros.
feature_columns = train2.columns.drop('Survived')
test2 = test2.reindex(columns = feature_columns, fill_value = 0.0)


In [15]:
def cross_validation(model, X, y):
    """Return the mean 5-fold cross-validated accuracy of ``model`` on ``(X, y)``.

    The value is expressed as a percentage rounded to two decimals.
    """
    fold_accuracies = cross_val_score(estimator = model, X = X, y = y,
                                      scoring = "accuracy", cv = 5)
    mean_percentage = fold_accuracies.mean() * 100
    return round(mean_percentage, 2)

# Separate the target from the predictors, then keep 80% of the rows for
# training and the remaining rows for validation (fixed seed for the split).
trainY = train2['Survived']
trainX = train2.drop('Survived', axis = 1)
x_train, x_val, y_train, y_val = train_test_split(
    trainX,
    trainY,
    train_size = 0.8,
    random_state = 1,
)


In [17]:
# NOTE(review): n_iter_no_change equals the default number of boosting stages,
# so early stopping presumably never triggers while part of the training data
# is still set aside for it - confirm this setting is intended.
gb = GradientBoostingClassifier(random_state = 0, n_iter_no_change = 100)
gb.fit(x_train, y_train)

# Cross-validation refits fresh copies of the estimator on each fold, so it is
# run on the training split (not on the small validation split) and does not
# use the fit above.
score = cross_validation(gb, x_train, y_train)
print("cross-validated accuracy on the training split (%):", score)

# Accuracy of the model fitted above on the rows it has never seen.
holdout_score = round(accuracy_score(y_val, gb.predict(x_val)) * 100, 2)
print("hold-out accuracy on the validation split (%):", holdout_score)