<a href="https://colab.research.google.com/github/nortonvanz/ML_Pocket_Reference/blob/main/notebooks/Titanic_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Resolução do Titanic utilizando CRISP-DM, seguindo o livro Machine Learning: Guia de Referência Rápica.

# Faça uma pergunta

Crie um modelo preditivo, que classifique se um indivíduo sobreviveu à catástrofe do Titanic (sim / não).

In [None]:
#Dataset e problema original no Kaggle: https://www.kaggle.com/competitions/titanic/data?select=train.csv

# Imports

In [125]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score


## Load Data

In [133]:
#Carrega datasets do github:
!wget --quiet --show-progress https://raw.githubusercontent.com/nortonvanz/ML_Pocket_Reference/main/datasets/gender_submission.csv
!wget --quiet --show-progress https://hbiostat.org/data/repo/titanic3.xls #-> mesma estrutura do livro, mas vem sem PassengerId, que é necessário para o submeter no Kaggle
!wget --quiet --show-progress https://raw.githubusercontent.com/nortonvanz/ML_Pocket_Reference/main/datasets/train.csv



In [134]:
df_raw = pd.read_excel('titanic3.xls')
df_sub = pd.read_csv('gender_submission.csv')
df_raw_kaggle = pd.read_csv('train.csv')

In [9]:
#mais features que o do kaggle
df_raw.head(5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [135]:
#utilizado para submeter no Kagle
df_raw_kaggle.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [10]:
df_sub.head(2)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1


In [11]:
df = df_raw.copy()

In [13]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


# Pipelines

In [None]:
#O scikit-learn utiliza a noção de pipeline.
#Ao usar a classe Pipeline, você poderá não apenas encadear transformadores e modelos, mas também tratar o processo todo como um único modelo do scikit-learn.

In [105]:
df.head(3)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [96]:
#limpeza de colunas
def tweak_titanic(df):
  df = df.drop(columns=["name", "ticket", "home.dest", "boat", "body", "cabin"]).pipe(pd.get_dummies, drop_first=True)
  return df

In [103]:
class TitanicTransformer( BaseEstimator, TransformerMixin):

  #limpa as colunas, e tira o target do X
  def transform(self, X):
    # supõe que X é o resultado da leitura de um arquivo Excel
    X = tweak_titanic(X)
    X = X.drop(columns="survived")
    return X

  #fita o modelo BaseEstimator, por padrão
  def fit(self, X, y):
    return self

In [104]:
#Cria um pipeline, contendo:
pipe = Pipeline([
  ("titan", TitanicTransformer()),
  ("impute", IterativeImputer()),
  ("std", StandardScaler()),
  ("rf", RandomForestClassifier())
])

In [106]:
#splita treino e teste
X_train, X_test, y_train, y_test = train_test_split( df,  df.survived,  test_size=0.3,  random_state=42)

In [107]:
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.8040712468193384

In [108]:
#pipeline:
pipe

In [116]:
#Usando grid search com pipelines, para melhorar a performance do modelo:
params = {
  "rf__max_features": [0.4, "sqrt"],
  "rf__n_estimators": [100, 200],
}
grid = GridSearchCV(pipe, cv=5, param_grid=params)

In [117]:
#Fitar o grid search, junto com o pipeline já criado:
grid.fit(df,  df.survived)

In [118]:
grid.best_params_

{'rf__max_features': 'sqrt', 'rf__n_estimators': 200}

In [None]:
#Agora podemos extrair os melhores parâmetros e treinar o modelo final.

In [120]:
pipe.set_params(**grid.best_params_)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.7989821882951654

In [None]:
#não houve melhoria de performance

In [127]:
#Podemos usar o pipeline no lugar em que usamos modelos do scikit-learn:
f1_score(y_test, pipe.predict(X_test) )

0.7538940809968847

# Kaggle Submissions

In [130]:
#formato
df_sub

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [132]:
X_test

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1148,3,0,"Rintamaki, Mr. Matti",male,35.0,0,0,STON/O 2. 3101273,7.1250,,S,,,
1049,3,1,"Nakid, Mr. Sahid",male,20.0,1,1,2653,15.7417,,C,C,,
982,3,0,"Lyntakoff, Mr. Stanko",male,,0,0,349235,7.8958,,S,,,
808,3,0,"Ford, Mr. Arthur",male,,0,0,A/5 1478,8.0500,,S,,,"Bridgwater, Somerset, England"
1195,3,0,"Shaughnessy, Mr. Patrick",male,,0,0,370374,7.7500,,Q,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,2,0,"Aldworth, Mr. Charles Augustus",male,30.0,0,0,248744,13.0000,,S,,,"Bryn Mawr, PA, USA"
919,3,0,"Katavelas, Mr. Vassilios (""Catavelas Vassilios"")",male,18.5,0,0,2682,7.2292,,C,,58.0,
532,2,0,"Peruschitz, Rev. Joseph Maria",male,41.0,0,0,237393,13.0000,,S,,,
1159,3,1,"Roth, Miss. Sarah A",female,,0,0,342712,8.0500,,S,C,,
