### **Pede-se nesta prova prática que o candidato treine um classificador binário incorporando o conceito de stacking em sua solução, e que esteja apto a justificar as escolhas que foram feitas ao longo do desenvolvimento.**

In [1]:
import sweetviz as sv
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from statistics import mean
from sklearn.metrics import roc_auc_score
from modAL.models import Committee
from modAL.models import ActiveLearner
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import RBF
from modAL.utils.selection import multi_argmax
from modAL.utils.combination import make_linear_combination, make_product
from modAL.uncertainty import classifier_uncertainty, classifier_margin

**Analisando o conjunto de treinamento**

In [2]:
# X_train.csv
df_X_train = pd.read_csv("X_train.csv", sep=",", header=None)
print(f'Tamanho X_train: {len(df_X_train)}')
df_X_train.head()

Tamanho X_train: 800


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.356205,-1.509168,0.286767,1.945567,1.356205,0.874246,0.874246,-1.509168,1.945567,-1.760478,-0.375705,0.509557,-0.164318,-1.595699,1.356205,-3.983082,-1.595699,-4.403218,1.893802,0.945089
1,-0.899063,-0.480603,-1.670255,-2.282391,-0.899063,-1.076536,-1.076536,-0.480603,-2.282391,-0.472103,1.144313,1.061767,0.185259,-2.876827,-0.899063,-0.430605,-2.876827,1.694226,1.799027,2.771487
2,-2.323976,-3.348052,0.607788,0.425983,-2.323976,-0.387229,-0.387229,-3.348052,0.425983,-5.14231,0.164442,4.493414,-2.402873,-7.267695,-2.323976,-3.394197,-7.267695,-4.420329,2.168286,4.533912
3,0.437791,-0.362643,0.227631,-0.078306,0.437791,2.358362,2.358362,-0.362643,-0.078306,-0.445158,-1.464171,0.544595,-2.65234,0.837214,0.437791,-0.76207,0.837214,-4.422075,1.357079,0.332415
4,-1.550006,-0.540902,2.40781,2.205159,-1.550006,-1.106735,-1.106735,-0.540902,2.205159,-1.901841,0.062618,1.879379,0.171096,-3.137197,-1.550006,-0.883804,-3.137197,-1.191569,-1.025149,-0.440226


In [3]:
# #Usando sweetviz para analisar a distribuição das variáveis individualmente
# report_X_train= sv.analyze(df_X_train)
# report_X_train.show_html('df_X_train.html')

In [4]:
# Preenchendo os valores NaN
df_X_train = df_X_train.fillna(df_X_train.median())

# Eliminando os outliers
z_scores = stats.zscore(df_X_train)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
df_X_train_unlab = df_X_train[filtered_entries]

print(f'Tamanho X_train: {len(df_X_train_unlab)}')
df_X_train_unlab.head()

Tamanho X_train: 770


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.356205,-1.509168,0.286767,1.945567,1.356205,0.874246,0.874246,-1.509168,1.945567,-1.760478,-0.375705,0.509557,-0.164318,-1.595699,1.356205,-3.983082,-1.595699,-4.403218,1.893802,0.945089
1,-0.899063,-0.480603,-1.670255,-2.282391,-0.899063,-1.076536,-1.076536,-0.480603,-2.282391,-0.472103,1.144313,1.061767,0.185259,-2.876827,-0.899063,-0.430605,-2.876827,1.694226,1.799027,2.771487
3,0.437791,-0.362643,0.227631,-0.078306,0.437791,2.358362,2.358362,-0.362643,-0.078306,-0.445158,-1.464171,0.544595,-2.65234,0.837214,0.437791,-0.76207,0.837214,-4.422075,1.357079,0.332415
4,-1.550006,-0.540902,2.40781,2.205159,-1.550006,-1.106735,-1.106735,-0.540902,2.205159,-1.901841,0.062618,1.879379,0.171096,-3.137197,-1.550006,-0.883804,-3.137197,-1.191569,-1.025149,-0.440226
5,-0.853613,-0.348163,-0.61906,-0.374499,-0.853613,-0.337948,-0.337948,-0.348163,-0.374499,-0.996757,-0.130235,0.462139,0.03654,-1.388654,-0.853613,0.124742,-1.388654,0.287287,0.484055,1.810847


In [5]:
# report_X_train_unlab = sv.analyze(df_X_train_unlab)
# report_X_train_unlab.show_html('report_X_train_unlab.html')

**Analisando o conjunto de teste**

In [6]:
# X_test.csv
df_X_test = pd.read_csv("X_test.csv", sep=",", header=None)
print(f'Tamanho X_test: {len(df_X_test)}')
df_X_test.head()

Tamanho X_test: 200


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.615102,-1.631801,-3.047008,-1.374026,-0.615102,0.819881,0.819881,-1.631801,-1.374026,-2.515739,-1.068738,0.004869,-0.898613,-1.013791,-0.615102,0.020927,-1.013791,-0.735647,1.859347,5.277776
1,0.481778,-2.690799,1.929163,2.1948,0.481778,-1.374999,-1.374999,-2.690799,2.1948,-2.650589,1.968976,2.987171,0.2249,-5.592626,0.481778,-5.644713,-5.592626,-3.26192,1.293436,0.056862
2,-0.75888,0.624947,3.096758,2.366358,-0.75888,0.614977,0.614977,0.624947,2.366358,-0.041634,-1.194561,0.320281,-1.0956,1.737322,-0.75888,1.703506,1.737322,-1.503375,-2.976826,-3.347693
3,1.547708,-0.82465,0.446411,1.94627,1.547708,2.577463,2.577463,-0.82465,1.94627,-0.986832,-1.923297,-0.587897,-1.630104,2.047893,1.547708,-1.639816,2.047893,-5.171651,0.599213,-0.23689
4,-1.593411,1.549793,3.488298,3.629761,-1.593411,0.998859,0.998859,1.549793,3.629761,-0.543969,-2.78635,-0.314272,-0.619047,2.110562,-1.593411,2.709332,2.110562,-2.212294,-3.313402,-2.805601


In [7]:
# report_X_test = sv.analyze(df_X_test)
# report_X_test.show_html('report_X_test.html')

In [8]:
# Y_test.csv
df_Y_test = pd.read_csv("y_test.csv", sep=",", header=None)
print(f'Tamanho X_test: {len(df_Y_test)}')
df_Y_test.head()

Tamanho X_test: 200


Unnamed: 0,0
0,1.0
1,0.0
2,1.0
3,0.0
4,1.0


In [9]:
# report_Y_test = sv.analyze(df_Y_test)
# report_Y_test.show_html('report_Y_test.html')

# **Solução aplicando Active Learning**

In [10]:
def testa_learning(Xtest, ytest):
    """
    Recebe o conjunto de dados com labels (passado como teste).
    Divide o conjunto de dados com labels em 10 partições diferentes (treino 20% : teste 80%).
    Retorna 4 listas: X_treinamento_list, y_treinamento_list, X_teste_list, y_teste_list. 
    Cada item das listas é um numpy array correspondente a uma das 10 partições realizadas.
    """

    X_treinamento_list = []
    y_treinamento_list = []
    X_teste_list = []
    y_teste_list = []

    for i in range(10):
        df_X_train_lab, df_X_test_small, df_y_train_lab, df_y_test_small = train_test_split(Xtest, ytest, test_size=0.80, stratify=df_Y_test)
        # Resetando os índices
        df_X_train_lab = df_X_train_lab.reset_index(drop=True)
        df_y_train_lab = df_y_train_lab.reset_index(drop=True)
        df_X_test_small = df_X_test_small.reset_index(drop=True)
        df_y_test_small = df_y_test_small.reset_index(drop=True)
        # Amostras para treinamento
        X_treinamento = df_X_train_lab.to_numpy()
        y_treinamento = df_y_train_lab.to_numpy()
        y_treinamento = np.ravel(y_treinamento)
        # Amostras para teste
        X_teste = df_X_test_small.to_numpy()
        y_teste = df_y_test_small.to_numpy()
        y_teste = np.ravel(y_teste)

        X_treinamento_list.append(X_treinamento)
        y_treinamento_list.append(y_treinamento)
        X_teste_list.append(X_teste)
        y_teste_list.append(y_teste)
    
    return X_treinamento_list, y_treinamento_list, X_teste_list, y_teste_list

In [11]:
Xtest = df_X_test
ytest = df_Y_test
X_treinamento_list, y_treinamento_list, X_teste_list, y_teste_list = testa_learning(Xtest, ytest)

### **1. Active Learning com GaussianProcessClassifier**

In [12]:
score_list_1 = []

for i in range(10):
    # Inicializando o learner
    n_learners = 3
    learner_list = []
    for _ in range(n_learners):
        learner = ActiveLearner(
            estimator=GaussianProcessClassifier(1.0 * RBF(1.0)),
            X_training=X_treinamento_list[i], y_training=y_treinamento_list[i],
            bootstrap_init=True
        )
        learner_list.append(learner)

    # Montando o Committee
    committee1 = Committee(learner_list)

    # Ensemble do active learner a partir do Committee
    ensemble_learner1 = ActiveLearner(
        estimator=committee1
    )
    # Cálculo do AUC
    predicoes_1 = learner.predict(X_teste_list[i])
    et_auc_1 = roc_auc_score(y_teste_list[i], predicoes_1)
    score_list_1.append(et_auc_1)

In [13]:
print(f'Lista de scores:')
score_list_1

Lista de scores:


[0.7672292545710266,
 0.862322237849664,
 0.8110642287857478,
 0.779887482419128,
 0.7619940615721207,
 0.6999531176746366,
 0.7618377871542429,
 0.7928582591029848,
 0.7176902641037664,
 0.7752773870917331]

In [14]:
print(f'Média dos scores: {mean(score_list_1)}')

Média dos scores: 0.7730114080325051


### **2. Active Learning com RandomForestClassifier()**

In [15]:
score_list_2 = []

for i in range(10):
    # Inicializando o learner
    n_learners = 3
    learner_list = []
    for _ in range(n_learners):
        learner = ActiveLearner(
            estimator=RandomForestClassifier(),
            X_training=X_treinamento_list[i], y_training=y_treinamento_list[i],
            bootstrap_init=True
        )
        learner_list.append(learner)

    # Montando o Committee
    committee2 = Committee(learner_list)

    # Ensemble do active learner a partir do Committee
    ensemble_learner2 = ActiveLearner(
        estimator=committee2
    )
    # Cálculo do AUC
    predicoes_2 = learner.predict(X_teste_list[i])
    et_auc_2 = roc_auc_score(y_teste_list[i], predicoes_2)
    score_list_2.append(et_auc_2)

In [16]:
print(f'Lista de scores:')
score_list_2

Lista de scores:


[0.8512267541803407,
 0.8243475543053602,
 0.8303641193936554,
 0.850289107673074,
 0.8007501172058133,
 0.8050476636974527,
 0.7705110173464604,
 0.8121581497108923,
 0.7754336615096108,
 0.7290982966088452]

In [17]:
print(f'Média dos scores: {mean(score_list_2)}')

Média dos scores: 0.8049226441631505


## **Para melhorar a solução recomendo: Criar um comittee com diferentes tipos de modelos**

**Dessa forma serão obtidas previsões de todos os modelos para todos os exemplos. Os exemplos em que os modelos discordam muito, são os exemplos mais difíceis. O uso de diferentes tipos de modelos como por exemplo, linear, tree, neighbors ou bayes, garantem que a estrategia de query não esteja enviesada por um único tipo de modelo, e portanto, aumentaria o desempenho.**