# Classificador Hierárquico

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
df_pb = pd.read_csv("./pgsb.csv", encoding='utf-8')
df_rb = pd.read_csv("./repbase.csv", encoding='utf-8')

In [None]:
df_pb.head()

Unnamed: 0,id,AA,AT,AC,AG,TT,TA,TC,TG,CC,...,TTGG,TTGA,TTGC,TTAA,TTAC,TTAG,TTCC,TTCA,TTCG,classification
0,RIX_Gr_EU103440.1_10195,36,26,13,19,36,25,19,34,21,...,1,1,3,4,1,2,5,1,1,1.4
1,PRSiTERT00300001,255,288,165,199,392,210,330,265,272,...,19,30,18,16,12,17,39,17,5,1.4
2,PRSiTERT00300006,47,33,14,42,30,32,15,29,6,...,4,1,0,2,1,4,0,2,0,1.4
3,ZRSiTERT00300001,23,23,11,25,48,19,27,47,12,...,6,2,2,0,1,2,3,5,0,1.4
4,ZRSiTERT00300004,118,75,64,148,96,80,68,110,65,...,14,3,4,3,3,11,2,4,6,1.4


In [None]:
df_rb.head()

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,328,329,330,331,332,333,334,335,336,classification
0,G_DM,131,84,67,81,67,76,60,74,87,...,7,4,5,4,5,1,6,4,2,1.4.3
1,Hebe_Av,386,185,339,304,171,134,280,231,418,...,14,11,13,9,10,2,17,18,12,1.4.3
2,HELENA,743,511,418,289,388,394,361,228,160,...,10,16,18,40,29,17,19,34,20,1.4.3
3,HELENA_RT,432,292,298,279,204,230,215,235,206,...,14,19,19,17,12,2,8,19,11,1.4.3
4,HeT-A2_DVi,126,117,93,81,76,98,73,80,73,...,5,5,6,6,6,0,2,2,5,1.4.3


## Visualização dos dados

In [None]:
df_rb.value_counts("classification")

classification
1.1.2      10068
1.1.1       6313
2.1         4242
2.1.1.2     2437
2.1.1.1     2351
1.1.3       1827
1.4.4       1566
2.1.1.8      874
2.1.1.3      735
2.1.1.9      581
1.1          560
1.5.1        505
1.4.2        439
2.1.1.6      376
1.2          374
2.1.1.7      353
1.4.3        242
1.4.5        194
1.5          124
2.1.1.5      123
1.5.2         95
1.4.1         78
2.1.1.4       73
1.5.3         29
dtype: int64

# Pré-processamento

In [None]:
from sklearn.utils import shuffle

# Shuffle both datasets once, up front. A fixed random_state keeps the row order
# (and therefore every fold derived from it) identical across kernel restarts,
# in line with the fixed seeds already passed to the estimators in this notebook.
df_rb = shuffle(df_rb, random_state=0)
df_pb = shuffle(df_pb, random_state=0)

# Construção da árvore hierárquica

In [None]:
class Node:
    """One node of the classification hierarchy.

    Stores the feature table and target labels used at this level of the tree,
    plus slots for the node's name, model, metrics and child nodes.
    """

    def __init__(self, X, y):
        # Data seen by this node.
        self.X = X  # feature table
        self.y = y  # target label for each row of X
        self.count = y.value_counts()  # number of rows per label

        # Slots filled in by the caller after construction.
        self.name = None
        self.model = None
        self.score = None
        self.f1 = 0

        self.children = []

    def add_child(self, obj):
        """Append ``obj`` to this node's list of children."""
        self.children += [obj]

In [None]:
def _make_node(rows, label_of, name):
    """Build a named Node from ``rows``, deriving each target label with ``label_of``."""
    y = rows['classification'].map(label_of)
    X = rows.drop(columns=['classification', 'id'])
    node = Node(X, y)
    node.name = name
    return node


def build_hierarchy(dataset):
    """Build the tree of Node objects used by the hierarchical classifier.

    Each node receives the rows of ``dataset`` that belong to its branch and a
    target made of the classification code truncated to that node's depth.

    Rows are selected by literal code prefix (``str.startswith``). The previous
    regexes (e.g. ``^1.1.*``) left the dots unescaped, so ``.`` matched any
    character rather than a literal dot.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the string column ``classification`` and the column ``id``;
        every other column is kept as a feature.

    Returns
    -------
    tuple
        ``(classes, class_retro, class_transp, or_ltr, or_dirs, or_line,
        or_sine, or_sc1, or_tir)``, already linked through ``add_child``.
    """
    codes = dataset['classification']

    def rows_under(prefix):
        # Rows whose classification code starts with the literal prefix.
        return dataset[codes.str.startswith(prefix)]

    def truncate(length):
        # Label function keeping the first ``length`` characters of the code.
        return lambda code: code[0:length]

    # Root: every row, labelled "1" or "2" from the first character of the code.
    classes = _make_node(dataset, lambda code: "1" if code[0] == "1" else "2", "raiz")

    # Class level.
    # Class 1: every row that is not class 2, labelled by order (1.1, 1.2, 1.4, 1.5).
    class_retro = _make_node(dataset[~codes.str.startswith("2")], truncate(3), "retrotransposons")
    classes.add_child(class_retro)

    # Class 2: every row that is not class 1, labelled with the first three characters.
    class_transp = _make_node(dataset[~codes.str.startswith("1")], truncate(3), "transposons")
    classes.add_child(class_transp)

    # Order level (children of class 1), labelled with the first five characters.
    or_ltr = _make_node(rows_under("1.1"), truncate(5), "ltr")
    class_retro.add_child(or_ltr)

    or_dirs = _make_node(rows_under("1.2"), truncate(5), "dirs")
    class_retro.add_child(or_dirs)

    or_line = _make_node(rows_under("1.4"), truncate(5), "line")
    class_retro.add_child(or_line)

    or_sine = _make_node(rows_under("1.5"), truncate(5), "sine")
    class_retro.add_child(or_sine)

    # Subclass 1: rows coded exactly "2.1" get the artificial label "2.1.X", so the
    # node can separate them from rows that go deeper ("2.1.1").
    or_sc1 = _make_node(
        rows_under("2.1"),
        lambda code: "2.1.X" if code == "2.1" else code[0:5],
        "subclasse 1",
    )
    class_transp.add_child(or_sc1)

    # TIR: rows under 2.1.1, labelled with the first seven characters.
    or_tir = _make_node(rows_under("2.1.1"), truncate(7), "tir")
    or_sc1.add_child(or_tir)

    return classes, class_retro, class_transp, or_ltr, or_dirs, or_line, or_sine, or_sc1, or_tir

In [None]:
classes, class_retro, class_transp, or_ltr, or_dirs, or_line, or_sine, or_sc1, or_tir = build_hierarchy(df_rb)

Os nós que não possuem outros rótulos (isto é, têm apenas um rótulo) são **class_transp** e **or_dirs**; já em **or_sc1** foi criado o rótulo 2.1.X.

In [None]:
print(class_transp.count)
print(or_dirs.count)

2.1    12145
Name: classification, dtype: int64
1.2    374
Name: classification, dtype: int64


# Visualização 

In [None]:
classes.count

1    22414
2    12145
Name: classification, dtype: int64

In [None]:
class_retro.count

1.1    18768
1.4     2519
1.5      753
1.2      374
Name: classification, dtype: int64

In [None]:
print(or_ltr.count)
print()
print(or_dirs.count)
print()
print(or_line.count)
print()
print(or_sine.count)

1.1.2    10068
1.1.1     6313
1.1.3     1827
1.1        560
Name: classification, dtype: int64

1.2    374
Name: classification, dtype: int64

1.4.4    1566
1.4.2     439
1.4.3     242
1.4.5     194
1.4.1      78
Name: classification, dtype: int64

1.5.1    505
1.5      124
1.5.2     95
1.5.3     29
Name: classification, dtype: int64


# K-fold

In [None]:
k = 3

In [None]:
#df_rb.drop(df_rb.index[23:], inplace=True)
df_rb.shape

(34559, 338)

In [None]:
# NOTE(review): debugging leftover. This truncates df_rb IN PLACE to its first 23 rows,
# so every later cell that reads df_rb (outer k-fold, flat classifier) only sees
# 23 samples. Remove or comment out before a full run.
df_rb.drop(df_rb.index[23:], inplace=True)
df_rb.shape

(23, 338)

### K-fold interno

Tirará a média dos modelos em cada nó individual

In [None]:
def f1_scores_node(f1_scores):
    """Placeholder for averaging the model scores of a single node.

    The cell previously held only the ``def`` line, which is a SyntaxError. Until
    the averaging is written, calling this raises NotImplementedError rather than
    silently returning None.

    Parameters
    ----------
    f1_scores
        Scores to average. NOTE(review): expected layout is not established
        anywhere in the notebook; confirm before implementing.
    """
    raise NotImplementedError("f1_scores_node is not implemented yet")



In [2]:
"""
  Irá ocorrer o treinamento e o kfold, aqui será escolhido o melhor modelo dentro do kfold do kfold 
  do dataset inteiro.
"""

def k_fold_node(Node):
    """Run the inner k-fold on one hierarchy node and pick the best model.

    The node's index labels are split into exactly ``k`` contiguous folds (``k``
    is the notebook-level constant). Each fold is held out in turn, ``train_node``
    is called on the remaining rows, and the first value it returns is collected.
    The collected values are passed to ``get_best_model``.

    Parameters
    ----------
    Node : Node
        Node providing ``X`` (features) and ``y`` (labels) with a shared index.
        NOTE(review): this parameter name shadows the ``Node`` class inside the
        function; kept unchanged so existing callers keep working.

    Returns
    -------
    Whatever ``get_best_model`` returns for the collected scores.
    """
    index = list(Node.X.index)  # index labels of every row in this node

    # Split into exactly k folds whose sizes differ by at most one. Using
    # round(len / k) as a slice step could yield k + 1 folds, fewer than k folds
    # (IndexError below) or a zero step (ValueError).
    base_size, remainder = divmod(len(index), k)
    sets = []
    start = 0
    for fold_no in range(k):
        stop = start + base_size + (1 if fold_no < remainder else 0)
        sets.append(index[start:stop])
        start = stop

    f1_scores = []
    for i, test in enumerate(sets):
        try:
            # Everything except the held-out fold is used for training.
            df_train = Node.X.drop(test)
            y_train = Node.y.drop(test)

            df_test = Node.X.loc[test]
            y_test = Node.y.loc[test]

            print("Treinamento:", i)
            print(df_train.index)
            print("KFOLDS de teste:", test)
            print()

            # NOTE(review): train_node as defined in this notebook takes a single
            # node argument; this call assumes a variant accepting
            # (X_train, y_train, X_test, y_test) and returning a 2-tuple. Confirm.
            aux_f1, _ = train_node(df_train, y_train, df_test, y_test)

            f1_scores.append(aux_f1)

        except Exception as e:
            # Best effort: a failing fold is reported and skipped, not fatal.
            print("---------------")
            print("Algo deu errado:")
            print(e)
            print("---------------")

    return get_best_model(f1_scores)

### K-fold externo

In [3]:
# Outer k-fold over the whole RepBase dataset.
#
# Folds hold df_rb's own index labels. The previous version used the positions
# 0..n-1 as labels, which on a shuffled frame selects rows in pre-shuffle order
# and raises KeyError once rows have been dropped.
index = list(df_rb.index)
subset_size, _extra = divmod(len(index), k)  # base fold size; first _extra folds get one more
sets = []  # the k folds, each a list of index labels

print("Index de todos os dados:", index, "\n\n")

# Split into exactly k folds whose sizes differ by at most one.
_start = 0
for j in range(k):
    _stop = _start + subset_size + (1 if j < _extra else 0)
    sets.append(index[_start:_stop])
    _start = _stop

# kfolds[j] = (list of training folds, test fold)
kfolds = []
for test in sets:
    train = [h for h in sets if h is not test]
    kfolds.append((train, test))

# Nodes tuned in every outer fold, with the banner printed before each one.
# class_transp and or_dirs are left out: they hold a single label.
_NODE_BANNERS = (
    ("--- Classes ---", "raiz"),
    ("\n--- Retro ---", "retrotransposons"),
    ("\n--- LTR ---", "ltr"),
    ("\n--- LINE ---", "line"),
    ("\n--- SINE ---", "sine"),
    ("\n--- SC1 ---", "subclasse 1"),
    ("\n--- TIR ---", "tir"),
)

# One dict per outer fold: node name -> result of k_fold_node for that node.
# The old code called model.append(model), appending the result to itself.
fold_best_models = []

for i in range(k):

    df_train = df_rb.drop(kfolds[i][1])  # everything except the test fold
    print("Fold enviado para o nó:",df_train.index.tolist())
    df_test = df_rb.loc[kfolds[i][1]]  # the held-out test fold
    print("Fold reservado para o teste:",df_test.index.tolist(), "\n")

    classes, class_retro, class_transp, or_ltr, or_dirs, or_line, or_sine, or_sc1, or_tir = build_hierarchy(df_train)

    _nodes_by_name = {
        node.name: node
        for node in (classes, class_retro, or_ltr, or_line, or_sine, or_sc1, or_tir)
    }

    best_by_node = {}
    for _banner, _node_name in _NODE_BANNERS:
        print(_banner)
        model = k_fold_node(_nodes_by_name[_node_name])
        best_by_node[_node_name] = model

    fold_best_models.append(best_by_node)

NameError: ignored

In [None]:
def get_best_model(f1_scores):
    """Return the name of the model with the highest mean F1 score.

    Parameters
    ----------
    f1_scores : iterable of iterables of sequences
        Two nesting levels: each innermost item holds eight scores in the order
        naive bayes, decision tree, random forest, logistic regression, svm,
        knn, gradient, mlp.

    Returns
    -------
    str
        Name of the model with the highest mean. Ties go to the first model in
        the order above. With no scores at all every mean is 0.0, so
        "naive bayes" is returned.

    Notes
    -----
    The mean divides by the number of score rows actually summed, not by the
    notebook-level ``k``, so it stays correct when a fold was skipped.
    """
    model_names = [
        "naive bayes",
        "decision tree",
        "random forest",
        "logistic regression",
        "svm",
        "knn",
        "gradient",
        "mlp",
    ]

    totals = [0] * len(model_names)
    n_rows = 0
    for itens in f1_scores:
        for item in itens:
            for pos in range(len(model_names)):
                totals[pos] += item[pos]
            n_rows += 1

    means = [total / n_rows if n_rows else 0.0 for total in totals]
    print("Médias:", means)
    index = means.index(max(means))  # first maximum wins
    print(index)

    return model_names[index]

### Teste dos modelos

In [None]:
# NOTE(review): work-in-progress cell; it cannot run as written:
#   * `get_f1` is not defined (the helper in the next cell is spelled `get_F1`);
#   * nodes returned by build_hierarchy have no `X_test` attribute;
#   * `models`, `X_test`, `y_test` and `f1_scores` are first assigned in later cells.
for i in range(k):
        
    df_train = df_rb.drop(kfolds[i][1]) # drop the held-out test fold
    print("Fold enviado para o nó:",df_train.index.tolist())
    df_test = df_rb.loc[kfolds[i][1]] # select the held-out test fold

    # really necessary?
    classes, class_retro, class_transp, or_ltr, or_dirs, or_line, or_sine, or_sc1, or_tir = build_hierarchy(df_train)
    
    classes_f1 = get_f1(models, classes.X_test)
    prediction = model.predict(X_test)
    #scores.append(classification_report(y_test, prediction))
    f1 = f1_score(y_test, prediction, average='weighted')
    f1_scores.append(f1)
    

In [None]:
def get_F1(models, X_test, y_true=None):
    """Return the weighted F1 score of each fitted model on ``X_test``.

    Parameters
    ----------
    models : iterable
        Fitted estimators exposing ``predict``.
    X_test
        Features passed to every model's ``predict``.
    y_true : optional
        True labels for ``X_test``. When omitted, the notebook-level ``y_test``
        is used, which is what this function always did before.

    Returns
    -------
    list
        One weighted F1 score per model, in the order of ``models``.
    """
    if y_true is None:
        y_true = y_test  # backward-compatible fallback to the global split

    f1_scores = []
    for model in models:
        print(model)
        prediction = model.predict(X_test)
        f1_scores.append(f1_score(y_true, prediction, average='weighted'))

    return f1_scores

# Treinamento REPBASE

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score

In [None]:
def train_node(Node):
    """Pass eight candidate classifiers through ``k_fold`` for one node.

    The results are stored, in candidate order, on ``Node.models``. Nothing is
    returned.

    NOTE(review): ``k_fold`` is not defined in the visible part of this notebook,
    and other cells call ``train_node`` with three or four arguments. Confirm
    which signature is current.
    """
    candidates = (
        MultinomialNB(),                                                  # Naive Bayes
        DecisionTreeClassifier(min_samples_leaf=10, random_state=0),      # Decision Tree
        RandomForestClassifier(min_samples_leaf=10, random_state=1),      # Random Forest
        LogisticRegression(random_state=0, max_iter=10000),               # Logistic Regression
        SVC(probability=True),                                            # SVM
        KNeighborsClassifier(),                                           # KNN
        GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                   max_depth=1, random_state=0),          # Gradient Boosting
        MLPClassifier(random_state=1, max_iter=300),                      # Multi-layer perceptron
    )

    Node.models = [k_fold(Node, clf) for clf in candidates]

    return None

# Classificadores homogêneos 

In [None]:
def build_homogeneous_tree(model_classes, model_retro, model_transp, model_ltr, model_dirs, 
                           model_line, model_sine, model_sc1, model_tir):
    """Build the nine-node hierarchy with one given model attached to each node.

    Every node is created as ``Node(X, y)`` from the notebook-level ``X`` and
    ``y``; only ``name``, ``model`` and the parent/child links differ per node.

    Parameters
    ----------
    model_classes, model_retro, model_transp, model_ltr, model_dirs,
    model_line, model_sine, model_sc1, model_tir
        Model stored on the corresponding node ("raiz", "retrotransposons",
        "transposons", "ltr", "dirs", "line", "sine", "subclasse 1", "tir").

    Returns
    -------
    Node
        The root node ("raiz").
    """
    def attach(name, model, parent=None):
        # Create one named node carrying `model`, linked under `parent` if given.
        node = Node(X, y)
        node.name = name
        node.model = model
        if parent is not None:
            parent.add_child(node)
        return node

    classes = attach("raiz", model_classes)

    class_retro = attach("retrotransposons", model_retro, classes)
    class_transp = attach("transposons", model_transp, classes)

    # Child order matters: the classifier indexes class_retro.children by position.
    attach("ltr", model_ltr, class_retro)
    attach("dirs", model_dirs, class_retro)
    attach("line", model_line, class_retro)
    # Previously `or_sine = "sine"` replaced the node with a string, so the
    # following `.model` assignment raised AttributeError.
    attach("sine", model_sine, class_retro)

    or_sc1 = attach("subclasse 1", model_sc1, class_transp)
    attach("tir", model_tir, or_sc1)

    return classes

In [None]:
def general_classifier(row, classes):
    """Classify ``row`` by walking down the hierarchy rooted at ``classes``.

    Each visited node's ``model.predict(row)`` chooses the branch to follow.

    Parameters
    ----------
    row
        Sample passed unchanged to every ``predict`` call.
    classes
        Root node. ``children[0]`` is the class "1" node, whose children are
        indexed 0..3 for orders 1.1, 1.2, 1.4 and 1.5. ``children[1]`` is the
        class "2" node, whose first child is the subclass-1 node.

    Returns
    -------
    The prediction of the deepest node reached, or None when the root predicts
    neither "1" nor "2".
    """
    top = classes.model.predict(row)

    # Class 1: choose the order, then defer to that order's node.
    if (top == "1"):
        retro = classes.children[0]
        order = retro.model.predict(row)

        for code, child_pos in (("1.1", 0), ("1.2", 1), ("1.4", 2), ("1.5", 3)):
            if (order == code):
                return retro.children[child_pos].model.predict(row)

        # Unknown order: the order prediction is the final answer.
        return order

    # Class 2: every element goes through the subclass-1 node.
    if (top == "2"):
        sc1 = classes.children[1].children[0]
        order = sc1.model.predict(row)

        if (order == "2.1.1"):  # TIR
            return sc1.children[0].model.predict(row)
        return order

    return None

### Naive-Bayes

In [None]:
# Naive Bayes: run k_fold on every multi-label node, with a fresh MultinomialNB each time.
# NOTE(review): k_fold is not defined in the visible part of this notebook.
nb_classes, nb_class, nb_ltr, nb_line, nb_sine, nb_sc1, nb_tir = [
    k_fold(node, MultinomialNB())
    for node in (classes, class_retro, or_ltr, or_line, or_sine, or_sc1, or_tir)
]

In [None]:
# build_homogeneous_tree takes nine models. Only seven were passed positionally,
# which raised TypeError and would also have shifted every argument after the second.
# The single-label nodes (transposons, DIRS) have no Naive Bayes result, so None
# is passed for them explicitly.
# NOTE(review): the nb_* values are averaged as numbers in the next cell, so they
# may be F1 scores rather than fitted models. Confirm what k_fold returns.
nb_tree = build_homogeneous_tree(
    model_classes=nb_classes,
    model_retro=nb_class,
    model_transp=None,
    model_ltr=nb_ltr,
    model_dirs=None,
    model_line=nb_line,
    model_sine=nb_sine,
    model_sc1=nb_sc1,
    model_tir=nb_tir,
)

In [None]:
# Per-node Naive Bayes results followed by their plain average over the 7 nodes.
print("F1-score em cada nó com Naive-Bayes")
for _label, _value in (
    ("Classes:", nb_classes),
    ("Classe retro:", nb_class),
    ("LTR:", nb_ltr),
    ("LINE:", nb_line),
    ("SINE:", nb_sine),
    ("SC1:", nb_sc1),
    ("TIR:", nb_tir),
):
    print(_label, _value)
mean_nb = (nb_classes + nb_class + nb_ltr + nb_line + nb_sine + nb_sc1 + nb_tir) / 7
print("Média geral:", mean_nb)

F1-score em cada nó com Naive-Bayes
Classes: 0.670721235534959
Classe retro: 0.591295231495041
LTR: 0.4582801524310845
LINE: 0.6141362696486946
SINE: 0.6444825081019044
SC1: 0.5853645614242089
TIR: 0.2305449997882194
Média geral: 0.5421178512034446


### Decision Tree

In [None]:
# Decision Tree: the same estimator object is passed to k_fold for every node.
dt = DecisionTreeClassifier(min_samples_leaf=10, random_state=0)
dt_classes, dt_class, dt_ltr, dt_line, dt_sine, dt_sc1, dt_tir = [
    k_fold(node, dt)
    for node in (classes, class_retro, or_ltr, or_line, or_sine, or_sc1, or_tir)
]

In [None]:
# Per-node Decision Tree results followed by their plain average over the 7 nodes.
print("F1-score em cada nó com Decision Tree")
for _label, _value in (
    ("Classes:", dt_classes),
    ("Classe retro:", dt_class),
    ("LTR:", dt_ltr),
    ("LINE:", dt_line),
    ("SINE:", dt_sine),
    ("SC1:", dt_sc1),
    ("TIR:", dt_tir),
):
    print(_label, _value)
mean_dt = (dt_classes + dt_class + dt_ltr + dt_line + dt_sine + dt_sc1 + dt_tir) / 7
print("Média geral:", mean_dt)

F1-score em cada nó com Decision Tree
Classes: 0.7744688681818617
Classe retro: 0.8125202160587653
LTR: 0.5889063035000696
LINE: 0.5734031515471746
SINE: 0.67957851384352
SC1: 0.5814520334191178
TIR: 0.34243643934028406
Média geral: 0.6218236465558277


### Random Forest

In [None]:
# Random Forest: the same estimator object is passed to k_fold for every node.
rf = RandomForestClassifier(min_samples_leaf=10, random_state=1)
rf_classes, rf_class, rf_ltr, rf_line, rf_sine, rf_sc1, rf_tir = [
    k_fold(node, rf)
    for node in (classes, class_retro, or_ltr, or_line, or_sine, or_sc1, or_tir)
]

In [None]:
# Per-node Random Forest results followed by their plain average over the 7 nodes.
print("F1-score em cada nó com Random Forest")
for _label, _value in (
    ("Classes:", rf_classes),
    ("Classe retro:", rf_class),
    ("LTR:", rf_ltr),
    ("LINE:", rf_line),
    ("SINE:", rf_sine),
    ("SC1:", rf_sc1),
    ("TIR:", rf_tir),
):
    print(_label, _value)
mean_rf = (rf_classes + rf_class + rf_ltr + rf_line + rf_sine + rf_sc1 + rf_tir) / 7
print("Média geral:", mean_rf)

F1-score em cada nó com Random Forest
Classes: 0.8053288976947619
Classe retro: 0.7931502703603881
LTR: 0.6008116330819212
LINE: 0.5438224478804028
SINE: 0.6433352403506036
SC1: 0.5638061508783293
TIR: 0.37405558113160753
Média geral: 0.6177586030540022


### Logistic Regression

In [None]:
# Logistic Regression: the same estimator object is passed to k_fold for every node.
lr = LogisticRegression(random_state=0, max_iter=10000)
lr_classes, lr_class, lr_ltr, lr_line, lr_sine, lr_sc1, lr_tir = [
    k_fold(node, lr)
    for node in (classes, class_retro, or_ltr, or_line, or_sine, or_sc1, or_tir)
]

In [None]:
# Per-node Logistic Regression results followed by their plain average over the 7 nodes.
print("F1-score em cada nó com Logistic Regression")
for _label, _value in (
    ("Classes:", lr_classes),
    ("Classe retro:", lr_class),
    ("LTR:", lr_ltr),
    ("LINE:", lr_line),
    ("SINE:", lr_sine),
    ("SC1:", lr_sc1),
    ("TIR:", lr_tir),
):
    print(_label, _value)
mean_lr = (lr_classes + lr_class + lr_ltr + lr_line + lr_sine + lr_sc1 + lr_tir) / 7
print("Média geral:", mean_lr)

F1-score em cada nó com Logistic Regression
Classes: 0.8974013887365709
Classe retro: 0.9020015839861406
LTR: 0.6209780380161977
LINE: 0.6900691960821984
SINE: 0.7634399748282377
SC1: 0.6034799834539215
TIR: 0.4962312896752592
Média geral: 0.7105144935397895


### SVM

In [None]:
# SVM: the same estimator object is passed to k_fold for every node.
svm = SVC(probability=True)
svm_classes, svm_class, svm_ltr, svm_line, svm_sine, svm_sc1, svm_tir = [
    k_fold(node, svm)
    for node in (classes, class_retro, or_ltr, or_line, or_sine, or_sc1, or_tir)
]

In [None]:
# Per-node SVM results followed by their plain average over the 7 nodes.
print("F1-score em cada nó com SVM")
for _label, _value in (
    ("Classes:", svm_classes),
    ("Classe retro:", svm_class),
    ("LTR:", svm_ltr),
    ("LINE:", svm_line),
    ("SINE:", svm_sine),
    ("SC1:", svm_sc1),
    ("TIR:", svm_tir),
):
    print(_label, _value)
mean_svm = (svm_classes + svm_class + svm_ltr + svm_line + svm_sine + svm_sc1 + svm_tir) / 7
print("Média geral:", mean_svm)

F1-score em cada nó com SVM
Classes: 0.7809645935190895
Classe retro: 0.7835716736023723
LTR: 0.512571087069856
LINE: 0.5089973621634827
SINE: 0.6396734371557529
SC1: 0.5245867787122139
TIR: 0.1977950839389723
Média geral: 0.5640228594516772


### KNN

In [None]:
# KNN: the same estimator object is passed to k_fold for every node.
knn = KNeighborsClassifier()
knn_classes, knn_class, knn_ltr, knn_line, knn_sine, knn_sc1, knn_tir = [
    k_fold(node, knn)
    for node in (classes, class_retro, or_ltr, or_line, or_sine, or_sc1, or_tir)
]

In [None]:
# Per-node KNN results followed by their plain average over the 7 nodes.
print("F1-score em cada nó com KNN")
for _label, _value in (
    ("Classes:", knn_classes),
    ("Classe retro:", knn_class),
    ("LTR:", knn_ltr),
    ("LINE:", knn_line),
    ("SINE:", knn_sine),
    ("SC1:", knn_sc1),
    ("TIR:", knn_tir),
):
    print(_label, _value)
mean_knn = (knn_classes + knn_class + knn_ltr + knn_line + knn_sine + knn_sc1 + knn_tir) / 7
print("Média geral:", mean_knn)

F1-score em cada nó com KNN
Classes: 0.8364799037592695
Classe retro: 0.8415493709540743
LTR: 0.6883546520205058
LINE: 0.6261274502813303
SINE: 0.7616581865431158
SC1: 0.5904329887223896
TIR: 0.42954174360519976
Média geral: 0.6820206136979836


### Gradient Boosting

In [None]:
# Gradient Boosting: the same estimator object is passed to k_fold for every node.
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
gb_classes, gb_class, gb_ltr, gb_line, gb_sine, gb_sc1, gb_tir = [
    k_fold(node, gb)
    for node in (classes, class_retro, or_ltr, or_line, or_sine, or_sc1, or_tir)
]

In [None]:
# Per-node Gradient Boosting results followed by their plain average over the 7 nodes.
# The header used to say "Naive-Bayes" (copy-paste leftover), mislabelling these scores.
print("F1-score em cada nó com Gradient Boosting")
print("Classes:", gb_classes)
print("Classe retro:", gb_class)
print("LTR:", gb_ltr)
print("LINE:", gb_line)
print("SINE:", gb_sine)
print("SC1:", gb_sc1)
print("TIR:", gb_tir)
mean_gb = (gb_classes + gb_class + gb_ltr + gb_line + gb_sine + gb_sc1 + gb_tir) / 7
print("Média geral:", mean_gb)

F1-score em cada nó com Naive-Bayes
Classes: 0.8150761528703878
Classe retro: 0.7870003321617162
LTR: 0.5583346581462164
LINE: 0.6294594525044944
SINE: 0.6955342976919173
SC1: 0.5872276857393964
TIR: 0.3668925426806091
Média geral: 0.6342178745421053


### Multi-layer Perceptron

In [None]:
# Multi-layer perceptron: the same estimator object is passed to k_fold for every node.
mlp = MLPClassifier(random_state=1, max_iter=300)
mlp_classes, mlp_class, mlp_ltr, mlp_line, mlp_sine, mlp_sc1, mlp_tir = [
    k_fold(node, mlp)
    for node in (classes, class_retro, or_ltr, or_line, or_sine, or_sc1, or_tir)
]

In [None]:
# Per-node multi-layer perceptron results followed by their plain average over the 7 nodes.
# The header used to say "Naive-Bayes" (copy-paste leftover), mislabelling these scores.
print("F1-score em cada nó com Multi-layer Perceptron")
print("Classes:", mlp_classes)
print("Classe retro:", mlp_class)
print("LTR:", mlp_ltr)
print("LINE:", mlp_line)
print("SINE:", mlp_sine)
print("SC1:", mlp_sc1)
print("TIR:", mlp_tir)
mean_mlp = (mlp_classes + mlp_class + mlp_ltr + mlp_line + mlp_sine + mlp_sc1 + mlp_tir) / 7
print("Média geral:", mean_mlp)

F1-score em cada nó com Naive-Bayes
Classes: 0.893020377407468
Classe retro: 0.908902177369752
LTR: 0.6965710833615878
LINE: 0.7066001305054023
SINE: 0.7839901318027082
SC1: 0.576865906284515
TIR: 0.42014992344422214
Média geral: 0.712299961453665


# Classificador plano

In [None]:
# Flat baseline: predict the full classification code directly from the features.
X = df_rb.drop(columns=['classification', 'id'])
y = df_rb['classification']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Unfitted estimators for the flat baseline, with the same hyper-parameters as the
# hierarchical runs above.
nb = MultinomialNB()
dt = DecisionTreeClassifier(random_state=0, min_samples_leaf=10)
rf = RandomForestClassifier(random_state=1, min_samples_leaf=10)
lr = LogisticRegression(max_iter=10000, random_state=0)
svm = SVC(probability=True)
knn = KNeighborsClassifier()
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
mlp = MLPClassifier(max_iter=300, random_state=1)

In [None]:
# Fit every flat-baseline estimator on the training split, in a fixed order.
# fit() returns the estimator itself, so the original names stay bound to the
# now-fitted objects.
for _clf in (nb, dt, rf, lr, svm, knn, gb, mlp):
    _clf.fit(X_train, y_train)

models = [nb, dt, rf, lr, svm, knn, gb, mlp]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
def flat_f1(models):
    """Return the weighted F1 of each fitted model on the notebook-level test split.

    Each model is printed before it is scored. Predictions are made on the global
    ``X_test`` and compared with the global ``y_test``.
    """
    collected = []
    for clf in models:
        print(clf)
        predicted = clf.predict(X_test)
        collected.append(f1_score(y_test, predicted, average='weighted'))
    return collected

f1_scores = flat_f1(models)

MultinomialNB()
DecisionTreeClassifier(min_samples_leaf=10, random_state=0)
RandomForestClassifier(min_samples_leaf=10, random_state=1)
LogisticRegression(max_iter=10000, random_state=0)
SVC(probability=True)
KNeighborsClassifier()
GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=0)
MLPClassifier(max_iter=300, random_state=1)


In [None]:
# Flat-classifier F1 per model, in the order the models were scored.
print("-- F1-score do classificador plano --")
for _pos, _label in enumerate((
    "Média Naive-Bayes:",
    "Média Decision Tree:",
    "Média Random Forest:",
    "Média Logistic Regression:",
    "Média SVM:",
    "Média KNN:",
    "Média Gradient Boosting:",
    "Média Multi-layer perceptron:",
)):
    print(_label, f1_scores[_pos])

-- F1-score do classificador plano --
Média Naive-Bayes: 0.16353823494579833
Média Decision Tree: 0.4593496835258623
Média Random Forest: 0.5534339040421278
Média Logistic Regression: 0.6287145862403664
Média SVM: 0.4690528957085921
Média KNN: 0.6436831205660752
Média Gradient Boosting: 0.4088979609020562
Média Multi-layer perceptron: 0.6453437509971482


# Comparativo entre os classificadores

---



In [None]:
# Average per-node result of each homogeneous hierarchical classifier.
print("-- Média dos classificadores homogêneos --")
for _label, _mean in (
    ("Média Naive-Bayes:", mean_nb),
    ("Média Decision Tree:", mean_dt),
    ("Média Random Forest:", mean_rf),
    ("Média Logistic Regression:", mean_lr),
    ("Média SVM:", mean_svm),
    ("Média KNN:", mean_knn),
    ("Média Gradient Boosting:", mean_gb),
    ("Média Multi-layer perceptron:", mean_mlp),
):
    print(_label, _mean)

-- Média dos classificadores homogêneos --
Média Naive-Bayes: 0.5421178512034446
Média Decision Tree: 0.6218236465558277
Média Random Forest: 0.6177586030540022
Média Logistic Regression: 0.7105144935397895
Média SVM: 0.5640228594516772
Média KNN: 0.6820206136979836
Média Gradient Boosting: 0.6342178745421053
Média Multi-layer perceptron: 0.712299961453665


In [None]:
def par(x):
    """Return "par" when ``x`` is divisible by 2, otherwise "impar"."""
    return "impar" if x % 2 else "par"

# Treinamento PGSB

In [None]:
# Root PGSB node: every row, labelled '1' or '2' from the first character of its code.
y = df_pb["classification"].map(lambda code: '2' if code[0] != '1' else '1')
X = df_pb.drop(columns=['classification', 'id'])
classes_pb = Node(X, y)

In [None]:
# Class level (PGSB)
# ------------------
# Class 1: drop every class-2 row, label the rest with the first three characters.
_class2_labels = df_pb.loc[df_pb.classification.str.contains(r'^2.*')].index
X = df_pb.drop(_class2_labels)
y = X['classification'].map(lambda code: code[:3])
X = X.drop(columns=['classification', 'id'])
class_pb_retro = Node(X, y)
classes_pb.add_child(class_pb_retro)

# Class 2: drop every class-1 row, label the rest with the first three characters.
_class1_labels = df_pb.loc[df_pb.classification.str.contains(r'^1.*')].index
X = df_pb.drop(_class1_labels)
y = X['classification'].map(lambda code: code[:3])
X = X.drop(columns=['classification', 'id'])
class_pb_transp = Node(X, y)
classes_pb.add_child(class_pb_transp)

In [None]:
# Order level (PGSB)
# ------------------
# Rows are selected by literal code prefix; the old regexes left '.' unescaped,
# so it matched any character.

# LTR: rows under 1.1, labelled with the first five characters.
X = df_pb[df_pb.classification.str.startswith('1.1')]
y = X['classification'].map(lambda x: x[0:5])
X = X.drop(columns=['classification', 'id'])
or_pb_ltr = Node(X, y)
# Attach to the PGSB tree. This used to be added to the RepBase node `class_retro`.
class_pb_retro.add_child(or_pb_ltr)

# Subclass 1: rows under 2.1.1, labelled with the first seven characters.
X = df_pb[df_pb.classification.str.startswith('2.1.1')]
y = X['classification'].map(lambda x: x[0:7])
X = X.drop(columns=['classification', 'id'])
or_pb_sc1 = Node(X, y)
# Attach to the PGSB tree. This used to be added to the RepBase node `class_transp`.
class_pb_transp.add_child(or_pb_sc1)

In [None]:
# NOTE(review): train_node as defined in this notebook takes a single node argument
# and returns None, and Node objects have no X_train / y_train / X_test attributes,
# so this call (and the four similar cells below) cannot run as written.
results_classes_pb_node = train_node(classes_pb.X_train, classes_pb.y_train, classes_pb.X_test)

In [None]:
results_retro_pb_node = train_node(class_pb_retro.X_train, class_pb_retro.y_train, class_pb_retro.X_test)

In [None]:
results_transp_pb_node = train_node(class_pb_transp.X_train, class_pb_transp.y_train, class_pb_transp.X_test)

In [None]:
results_or_pb_ltr_node = train_node(or_pb_ltr.X_train, or_pb_ltr.y_train, or_pb_ltr.X_test)

In [None]:
results_or_pb_sc1_node = train_node(or_pb_sc1.X_train, or_pb_sc1.y_train, or_pb_sc1.X_test)

## Visualização dos dados

In [None]:
results(results_classes_pb_node, classes_pb)

In [None]:
results(results_retro_pb_node, class_pb_retro)

In [None]:
results(results_or_pb_ltr_node, or_pb_ltr)

In [None]:
results(results_or_pb_sc1_node, or_pb_sc1)

# Classificação geral

In [None]:
def classifier(row):
    """Classify ``row`` with the notebook-level ``classes`` tree.

    The body used to be a token-for-token copy of ``general_classifier`` with
    ``classes`` read from the global scope. It now delegates, so the traversal
    logic lives in one place.

    Parameters
    ----------
    row
        Sample passed unchanged to every model's ``predict``.

    Returns
    -------
    Whatever ``general_classifier(row, classes)`` returns: the prediction of the
    deepest node reached, or None when the root predicts neither "1" nor "2".
    """
    return general_classifier(row, classes)

In [None]:
# Spot-check one sample: compare the hierarchical prediction with the true label.
# The position was previously stored in `id` (shadowing the builtin) and also
# hard-coded as 4 in the .iloc call; a single name is used for both now.
# NOTE(review): Node objects built in this notebook have no `X_test` attribute, and
# `y_test` comes from the flat-classifier split. Confirm both refer to the same rows.
sample_pos = 4
predicao = classifier(classes.X_test.iloc[sample_pos].to_numpy().reshape(1,-1))
print("Valor previsto: ", predicao[0])
print("Valor original: ", y_test.iloc[sample_pos])

Valor previsto:  2.1.1.1
Valor original:  2.1.1.1




In [None]:
X_test.head(10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,327,328,329,330,331,332,333,334,335,336
6941,315,263,171,279,427,234,227,298,117,200,...,39,25,30,21,29,10,19,9,25,17
33452,37,16,8,12,31,14,14,17,6,9,...,5,0,1,1,2,0,1,3,0,0
451,8,2,4,12,2,4,6,5,11,5,...,0,0,1,0,0,0,0,0,0,1
19998,62,59,19,24,69,53,27,32,18,22,...,1,1,5,1,11,3,1,4,3,1
9473,41,31,19,23,47,31,26,23,16,22,...,3,3,2,0,3,3,4,1,4,0
19214,515,513,265,361,587,381,292,445,159,380,...,53,42,49,16,33,29,23,21,43,6
28042,15,16,22,18,18,17,14,22,18,20,...,0,1,1,0,1,1,2,2,1,0
16328,585,375,245,280,298,282,207,260,87,293,...,18,19,24,20,29,14,13,11,22,10
25463,18,10,11,13,31,10,13,14,6,10,...,4,0,4,1,2,1,0,3,3,0
15910,30,65,27,22,107,41,66,34,18,48,...,11,0,7,3,2,5,0,3,8,4


In [None]:
y_test.head(10)

6941     2.1.1.2
33452      1.1.1
451        1.5.2
19998      1.1.1
9473     2.1.1.1
19214      1.1.1
28042      1.1.2
16328      1.1.1
25463      1.1.2
15910      1.1.1
Name: classification, dtype: object