<a href="https://colab.research.google.com/github/marcelorandolfo/medium/blob/master/ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Códigos para ensemble

In [0]:
# importando pacotes
import pandas as pd
import numpy as np

In [24]:
# Load the Titanic training set and turn it into a purely numeric feature table.
train_path = 'https://www.dropbox.com/s/s5yislxjxdw0uti/train.csv?dl=1'
train = pd.read_csv(train_path)

# work on a copy so the raw frame stays available
train_copy = train.copy()

# drop Name and Ticket (too many unique values) and Cabin (too many missing values)
train_copy = train_copy.drop(columns=['Name', 'Ticket', 'Cabin'])

# fill missing ages with the median.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which newer pandas versions warn
# about and which does not update the frame under Copy-on-Write.
train_copy['Age'] = train_copy['Age'].fillna(train_copy['Age'].median())

# fill missing ports with the most frequent one.
# Only the Embarked column is filled; a frame-wide fillna would write the port
# label into any other column that still had missing values.
train_copy['Embarked'] = train_copy['Embarked'].fillna(
    train_copy['Embarked'].value_counts().idxmax()
)

# replace the numeric class codes with category labels
train_copy['Pclass'] = train_copy['Pclass'].map({1 : 'First', 2 : 'Second', 3 : 'Third'})

# family size = siblings/spouses + parents/children
train_copy['Family'] = train_copy['SibSp'] + train_copy['Parch']

# the two source columns are now redundant
train_copy = train_copy.drop(columns=['SibSp', 'Parch'])

# one-hot encode the remaining 'object' columns (Pclass, Sex, Embarked)
# and use PassengerId as the index
train_processed = pd.get_dummies(train_copy).set_index('PassengerId')

# preview the first rows
train_processed.head()

Unnamed: 0_level_0,Survived,Age,Fare,Family,Pclass_First,Pclass_Second,Pclass_Third,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,22.0,7.25,1,0,0,1,0,1,0,0,1
2,1,38.0,71.2833,1,1,0,0,1,0,1,0,0
3,1,26.0,7.925,0,0,0,1,1,0,0,0,1
4,1,35.0,53.1,1,1,0,0,1,0,0,0,1
5,0,35.0,8.05,0,0,0,1,0,1,0,0,1


In [0]:
# features: everything except the target and one reference dummy per category
reference_and_target = ['Survived', 'Sex_male', 'Embarked_C', 'Pclass_Second']
X = train_processed.drop(columns=reference_and_target)

# target
y = train_processed['Survived']

In [0]:
# hold out 20% of the rows as a test split, stratified on the target
from sklearn.model_selection import train_test_split

SEED = 42
# the global seed is still set because later cells rely on the name SEED and
# on estimators that draw from NumPy's global random state
np.random.seed(SEED)

# random_state is passed explicitly so the split does not depend on whatever
# has consumed the global NumPy random state before this cell runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

In [27]:
# fit each individual model and collect its predictions on the held-out split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# the scaler is fitted on the training split and then applied to the test split
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test)

model_sgd, model_svc, model_dt = SGDClassifier(), SVC(), DecisionTreeClassifier()

np.random.seed(SEED)
# first column holds the true labels; one column per model is appended below
predict = pd.DataFrame(y_test)
for clf in (model_sgd, model_svc, model_dt):
    clf.fit(X_scaled, y_train)
    predict[type(clf).__name__] = clf.predict(X_scaled_test)

predict

Unnamed: 0_level_0,Survived,SGDClassifier,SVC,DecisionTreeClassifier
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
566,0,0,0,0
161,0,0,0,0
554,1,1,0,0
861,0,0,0,0
242,1,1,1,1
...,...,...,...,...
881,1,1,1,1
92,0,0,0,0
884,0,0,0,0
474,1,1,1,1


In [28]:
# show the predictions for five different passengers (selected by position)
predict.iloc[[0, 2, 4, 7, 13], :]

Unnamed: 0_level_0,Survived,SGDClassifier,SVC,DecisionTreeClassifier
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
566,0,0,0,0
554,1,1,0,0
242,1,1,1,1
537,0,0,0,1
818,0,1,0,0


In [29]:
# combine the three models with a voting classifier
from sklearn.ensemble import VotingClassifier
np.random.seed(SEED)
named_estimators = [('sgd', model_sgd), ('svc', model_svc), ('dt', model_dt)]
voting_clf = VotingClassifier(estimators = named_estimators)

# true labels first, then one prediction column per model (ensemble last)
predict2 = pd.DataFrame(y_test)
for clf in (model_sgd, model_svc, model_dt, voting_clf):
    clf.fit(X_scaled, y_train)
    predict2[type(clf).__name__] = clf.predict(X_scaled_test)

predict2

Unnamed: 0_level_0,Survived,SGDClassifier,SVC,DecisionTreeClassifier,VotingClassifier
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
566,0,0,0,0,0
161,0,0,0,0,0
554,1,1,0,0,0
861,0,0,0,0,0
242,1,1,1,1,1
...,...,...,...,...,...
881,1,1,1,1,1
92,0,0,0,0,0
884,0,0,0,0,0
474,1,1,1,1,1


In [30]:
# show the predictions for five different passengers (selected by position)
predict2.iloc[[0, 2, 4, 7, 13], :]

Unnamed: 0_level_0,Survived,SGDClassifier,SVC,DecisionTreeClassifier,VotingClassifier
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
566,0,0,0,0,0
554,1,1,0,0,0
242,1,1,1,1,1
537,0,0,0,1,0
818,0,1,0,0,0


In [31]:
# test-set accuracy of every model, refitted from the same seed
from sklearn.metrics import accuracy_score
np.random.seed(SEED)

# class name -> accuracy, filled in the same order the models are fitted
scores = {}
for clf in (model_sgd, model_svc, model_dt, voting_clf):
    clf.fit(X_scaled, y_train)
    scores[type(clf).__name__] = accuracy_score(y_test, clf.predict(X_scaled_test))

ac = pd.DataFrame({'Acurácia': pd.Series(scores)})
ac

Unnamed: 0,Acurácia
SGDClassifier,0.709497
SVC,0.810056
DecisionTreeClassifier,0.804469
VotingClassifier,0.826816


#### Códigos para submissão no Kaggle

In [32]:
# Load the Kaggle test set and apply the same feature preparation as for training.

test_path = 'https://www.dropbox.com/s/l2l7q280dxa3jn6/test.csv?dl=1'
test = pd.read_csv(test_path)

# work on a copy so the raw frame stays available
test_copy = test.copy()

# drop Name and Ticket (too many unique values) and Cabin (too many missing values)
test_copy = test_copy.drop(columns=['Name', 'Ticket', 'Cabin'])

# fill missing ages with the median and missing fares with the mean.
# The results are assigned back to the columns: fillna(inplace=True) on a
# column selection is chained assignment, which newer pandas versions warn
# about and which does not update the frame under Copy-on-Write.
test_copy['Age'] = test_copy['Age'].fillna(test_copy['Age'].median())
test_copy['Fare'] = test_copy['Fare'].fillna(test_copy['Fare'].mean())

# replace the numeric class codes with category labels
test_copy['Pclass'] = test_copy['Pclass'].map({1 : 'First', 2 : 'Second', 3 : 'Third'})

# family size = siblings/spouses + parents/children
test_copy['Family'] = test_copy['SibSp'] + test_copy['Parch']

# the two source columns are now redundant
test_copy = test_copy.drop(columns=['SibSp', 'Parch'])

# one-hot encode the remaining 'object' columns (Pclass, Sex, Embarked)
test_processed = pd.get_dummies(test_copy)

# keep PassengerId aside for the Kaggle submission files, then remove it
# from the feature table
PassengerId = test_processed['PassengerId']
test_processed = test_processed.drop(columns=['PassengerId'])

test_processed.head()

Unnamed: 0,Age,Fare,Family,Pclass_First,Pclass_Second,Pclass_Third,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,34.5,7.8292,0,0,0,1,0,1,0,1,0
1,47.0,7.0,1,0,0,1,1,0,0,0,1
2,62.0,9.6875,0,0,1,0,0,1,0,1,0
3,27.0,8.6625,0,0,0,1,0,1,0,0,1
4,22.0,12.2875,2,0,0,1,1,0,0,0,1


In [33]:
# Build the final training matrix/target and align the test features with it.

# Move PassengerId from the index back into a column. The guard keeps the cell
# re-runnable: resetting an already-reset frame would add a spurious 'index'
# column that would then leak into X.
if train_processed.index.name == 'PassengerId':
    train_processed = train_processed.reset_index()

# features and target
X = train_processed.drop(columns=['PassengerId', 'Survived', 'Sex_male', 'Embarked_C', 'Pclass_Second'])
y = train_processed['Survived']

# drop the same reference dummies from the test set; errors='ignore' lets the
# cell be executed again after the columns are already gone
test_processed = test_processed.drop(columns=['Sex_male', 'Embarked_C', 'Pclass_Second'], errors='ignore')

# the scaler and the models are fitted on X and applied to test_processed,
# so both tables must expose the same features in the same order
assert list(X.columns) == list(test_processed.columns), 'train and test feature columns differ'

# frame sizes
print('X: ', X.shape)
print('Teste: ', test_processed.shape)

X:  (891, 8)
Teste:  (418, 8)


In [34]:
# feature columns of X (training matrix)
X.columns

Index(['Age', 'Fare', 'Family', 'Pclass_First', 'Pclass_Third', 'Sex_female',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [35]:
# feature columns of the test set, for comparison with the columns of X
test_processed.columns

Index(['Age', 'Fare', 'Family', 'Pclass_First', 'Pclass_Third', 'Sex_female',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [0]:
# standardise the features: statistics come from the full training set
# and are reused unchanged for the Kaggle test set
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled_test = scaler.transform(test_processed)

In [37]:
# Kaggle submission: SGD classifier trained on the full training set
np.random.seed(SEED)
model_sgd = SGDClassifier()
y_pred_sgd = model_sgd.fit(X_scaled, y).predict(X_scaled_test)

# two-column submission table written without the row index
values = pd.DataFrame({'PassengerId' : PassengerId, 'Survived' : y_pred_sgd})
values.to_csv('./submission_sgd.csv', index=False)

values.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [38]:
# Kaggle submission: support vector classifier trained on the full training set
np.random.seed(SEED)
model_svc = SVC()
y_pred_svc = model_svc.fit(X_scaled, y).predict(X_scaled_test)

# two-column submission table written without the row index
values = pd.DataFrame({'PassengerId' : PassengerId, 'Survived' : y_pred_svc})
values.to_csv('./submission_svc.csv', index=False)

values.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [39]:
# Kaggle submission: decision tree trained on the full training set
np.random.seed(SEED)
model_dt = DecisionTreeClassifier()
y_pred_dt = model_dt.fit(X_scaled, y).predict(X_scaled_test)

# two-column submission table written without the row index
values = pd.DataFrame({'PassengerId' : PassengerId, 'Survived' : y_pred_dt})
values.to_csv('./submission_dt.csv', index=False)

values.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,1


In [40]:
# Kaggle submission: voting ensemble of the three models above
np.random.seed(SEED)
named_estimators = [('sgd', model_sgd), ('svc', model_svc), ('dt', model_dt)]
voting_clf = VotingClassifier(estimators = named_estimators)

voting_clf.fit(X_scaled, y)
y_pred_clf = voting_clf.predict(X_scaled_test)

# two-column submission table written without the row index
values = pd.DataFrame({'PassengerId' : PassengerId, 'Survived' : y_pred_clf})
values.to_csv('./submission_clf.csv', index=False)

values.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
