<a href="https://colab.research.google.com/github/nortonvanz/ML_Pocket_Reference/blob/main/notebooks/Titanic_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Resolução do Titanic utilizando CRISP-DM, seguindo o livro Machine Learning: Guia de Referência Rápica.

# Faça uma pergunta

Crie um modelo preditivo, que classifique se um indivíduo sobreviveu à catástrofe do Titanic (sim / não).

In [None]:
#Dataset e problema original no Kaggle: https://www.kaggle.com/competitions/titanic/data?select=train.csv

# Imports

In [125]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score

## Load Data

In [158]:
#Carrega datasets do github:
!wget --quiet --show-progress https://raw.githubusercontent.com/nortonvanz/ML_Pocket_Reference/main/datasets/gender_submission.csv #-> exemplo para submissão kaggle
!wget --quiet --show-progress https://hbiostat.org/data/repo/titanic3.xls #-> mesma estrutura do livro, mas vem sem PassengerId, que é necessário para o submeter no Kaggle
!wget --quiet --show-progress https://raw.githubusercontent.com/nortonvanz/ML_Pocket_Reference/main/datasets/train.csv
!wget --quiet --show-progress https://raw.githubusercontent.com/nortonvanz/ML_Pocket_Reference/main/datasets/test.csv



In [165]:
df_raw = pd.read_excel('titanic3.xls') #não utilizado aqui
df_sub = pd.read_csv('gender_submission.csv')
df_raw_train_kaggle = pd.read_csv('train.csv')
df_raw_test_kaggle = pd.read_csv('test.csv')

In [166]:
#mais features que o do kaggle (não utilizado aqui)
df_raw_train_kaggle.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [167]:
df = df_raw_train_kaggle.copy()

# Pipelines

In [168]:
#O scikit-learn utiliza a noção de pipeline.
#Ao usar a classe Pipeline, você poderá não apenas encadear transformadores e modelos, mas também tratar o processo todo como um único modelo do scikit-learn.

In [169]:
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [188]:
#novo
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [189]:
#limpeza de colunas
def tweak_titanic(df):
  df = df.drop(columns=["PassengerId" ,"Name", "Ticket", "Cabin"]).pipe(pd.get_dummies, drop_first=True)
  return df

In [190]:
class TitanicTransformer( BaseEstimator, TransformerMixin):

  #limpa as colunas, e tira o target do X
  def transform(self, X):
    X = tweak_titanic(X)
    return X

  #fita o modelo BaseEstimator, por padrão
  def fit(self, X, y):
    return self

In [191]:
#Cria um pipeline, contendo:
pipe = Pipeline([
  ("titan", TitanicTransformer()),
  ("impute", IterativeImputer()),
  ("std", StandardScaler()),
  ("rf", RandomForestClassifier())
])

In [192]:
#splita treino e teste
X_train, X_test, y_train, y_test = train_test_split( df.drop(columns="Survived"),  df.Survived,  test_size=0.3,  random_state=42)

In [193]:
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.7761194029850746

In [194]:
#pipeline:
pipe

In [195]:
#Usando grid search com pipelines, para melhorar a performance do modelo:
params = {
  "rf__max_features": [0.4, "sqrt"],
  "rf__n_estimators": [100, 200],
}
grid = GridSearchCV(pipe, cv=5, param_grid=params)

In [197]:
#Fitar o grid search, junto com o pipeline já criado:
grid.fit(df.drop(columns="Survived"),  df.Survived)

In [198]:
grid.best_params_

{'rf__max_features': 'sqrt', 'rf__n_estimators': 100}

In [199]:
#Agora podemos extrair os melhores parâmetros e treinar o modelo final.

In [200]:
pipe.set_params(**grid.best_params_)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.7649253731343284

In [182]:
#não houve melhoria de performance

In [201]:
#Podemos usar o pipeline no lugar em que usamos modelos do scikit-learn:
f1_score(y_test, pipe.predict(X_test) )

0.7149321266968325

# Kaggle Submissions

In [209]:
#formato para submissão, apenas como modelo:
print(df_sub.shape)
df_sub.head(3)

(418, 2)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0


In [203]:
#dataset teste, que deve ser submetido:
df_raw_test_kaggle.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [205]:
#usar o pipeline para prever os sobreviventes:
pipe_prediction = pipe.predict(df_raw_test_kaggle)

In [207]:
pipe_prediction.shape

(418,)

In [210]:
df_sub['Survived'] = pipe_prediction

In [211]:
#df_sub com as previsões do modelo:
df_sub.head(3)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0


In [217]:
#salvar arquivo
df_sub.to_csv('titanic_submission.csv', index=False)
#Kaggle score: 0.76794

# Titanic - Improvement

In [None]:
#objetivo: utilizar pipelines, implementando melhorias par aumentar a acurácia ao submeter no Kaggle

In [None]:
#Kaggle score: 0.76794
# a métrica é acurácia: Quantas predições acertei do total.
  #sofre influência com dados desbalanceados

#possíveis melhorias:
#- Balanceamento de classes: class_weight='balanced'
#- Feature selection melhor

In [246]:
df = df_raw_train_kaggle.copy()
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


## Aux Functions:

In [247]:
def standardize_cols(df):
  #converter tudo para minúsculo e com underline no lugar de espaços e pontos
  df.columns = df.columns.str.replace(' ','_').str.replace('.','_', regex=False).str.lower()
  return df

In [248]:
standardize_cols(df)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
