In [None]:
# IMPORTAÇÃO DE OUTRAS BIBLIOTECAS
import pandas as pd
import numpy as np
np.random.seed(10)

import matplotlib.pyplot as plt
import matplotlib.pylab as pylab



# IMPORTAÇÃO DAS BIBLIOTECAS DE MACHINE LEARNING
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.neural_network import MLPClassifier


from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, classification_report,  recall_score, accuracy_score, precision_score, confusion_matrix, f1_score


# Display name of the model: printed in the report headers and passed as the
# file-name prefix of the saved ROC figure.
classifier_name = 'Random Forest'
# Super-title passed to the ROC figure.
classifier_roc_name = 'Curva ROC - Random Forest'


In [None]:
def plot_roc_curve_folds(fp, tp, n_folds, sup_title, clf_name):
    """Plot one ROC curve per cross-validation fold, side by side, and save the figure.

    Parameters
    ----------
    fp : sequence of array-like
        False-positive rates; ``fp[i]`` holds the values of fold ``i``.
    tp : sequence of array-like
        True-positive rates; ``tp[i]`` holds the values of fold ``i``.
    n_folds : int
        Number of folds; one subplot column is drawn per fold.
    sup_title : str
        Title drawn above the whole figure.
    clf_name : str
        Prefix of the output file, which is saved as ``"<clf_name>_curves.png"``
        in the current working directory.
    """
    # squeeze=False keeps `axs` two-dimensional even when n_folds == 1,
    # so the row of axes can always be iterated.
    fig, axs = plt.subplots(nrows=1, ncols=n_folds, sharey=True, sharex=True,
                            figsize=(15, 5), squeeze=False)

    for fold, ax in enumerate(axs[0]):
        # Each subplot shows only the curve of its own fold.
        ax.plot(fp[fold], tp[fold])
        # 1-based numbering, matching the "Fold i / K" lines printed elsewhere.
        ax.set_title('Fold ' + str(fold + 1))
        ax.set_xlabel('FP rate', fontsize=12)
        ax.set_ylabel('TP rate', fontsize=12)

        ax.grid(linewidth=0.25)

        # Remove the bounding box around the graph (bottom spine is kept).
        for side in ('left', 'top', 'right'):
            ax.spines[side].set_visible(False)

    fig.tight_layout()
    # Leave room at the top for the super-title.
    fig.subplots_adjust(top=0.88)
    fig.suptitle(sup_title)

    fig.savefig(f"{clf_name}_curves.png")


In [None]:
# registros = pd.read_csv('dataset.csv', sep=';', encoding='utf-8-sig')
# Load the dataset from a bz2-compressed pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
registros = pd.read_pickle('dataset.pkl', compression='bz2')
# Display the loaded frame as the cell output.
registros

In [None]:
# Feature matrix X (multi-dimensional): every column except the class label.
X = registros.drop(columns=['ApplicationProtocolName'])

# Label vector y (one-dimensional): protocol names encoded as integers.
y = (
    registros['ApplicationProtocolName']
    .map({'53_UDP_Dns': 0, '80_TCP_HTTP': 1})
    .astype(int)
)

In [None]:
# Preview the first five rows of the feature matrix.
X.head(5)

In [None]:
# Preview the first five encoded labels.
y.head(5)

In [None]:
# Convert the pandas objects to plain NumPy arrays for positional indexing downstream.
X, y = X.to_numpy(), y.to_numpy()

In [None]:
# Hold out 20% of the samples as the final test set; the remaining 80% is the training split.
X_train_or, X_test_or, y_train_or, y_test_or = train_test_split(X, y, test_size=0.2, random_state = 40)

print("Lenght train: {} - {:.2f}%".format(len(y_train_or), len(y_train_or)/ len(y)*100))
print("Lenght test:  {} - {:.2f}%".format(len(y_test_or), len(y_test_or)/ len(y)*100))

# Data normalisation (min-max).
# The scaler is fitted on the training split only and then re-used, unchanged,
# on the test split: fitting it again on the test data would map train and test
# with different min/max values and leak test-set statistics into preprocessing.
scaler = preprocessing.MinMaxScaler()
X_train_or = scaler.fit_transform(X_train_or)
X_test_or = scaler.transform(X_test_or)

In [None]:
# Instantiate the classifier
classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    max_features=3,
    min_samples_split=3,
)

In [None]:
# Stratified k-fold validation over the training split
K_FOLDS = 5
skfold = StratifiedKFold(n_splits = K_FOLDS)

# Accumulators: one entry is appended per fold
fold_metrics = []  # dict of validation scores
FP_list = []       # false-positive rates returned by roc_curve
TP_list = []       # true-positive rates returned by roc_curve
TH_list = []       # thresholds returned by roc_curve

for fold_count, (train_index, test_index) in enumerate(skfold.split(X_train_or, y_train_or), start=1):
    print('Fold {} / {}'.format(fold_count, K_FOLDS))
    len_train, len_test = len(train_index), len(test_index)
    total = len_train + len_test

    print( 'Train: {} {:.2f}% --- Valid: {} {:.2f}%'.format( len_train, (len_train/total)*100, len_test, (len_test/total)*100 ) )

    # Slice this fold's training and validation subsets out of the training split
    X_train, X_test = X_train_or[train_index], X_train_or[test_index]
    y_train, y_test = y_train_or[train_index], y_train_or[test_index]

    print('Training...\n')
    classifier.fit(X_train, y_train.ravel())

    prediction = classifier.predict(X_test)

    # Validation scores of this fold
    acc = accuracy_score(y_test, prediction)
    f1score = f1_score(y_test, prediction)
    recall = recall_score(y_test, prediction)
    precision = precision_score(y_test, prediction)
    report = classification_report(y_test, prediction)
    confussion = confusion_matrix(y_test, prediction)

    metrics = {'acc': acc, 'f1score': f1score, 'recall': recall, 'precision': precision, 'report': report, 'confussion': confussion}
    fold_metrics.append(metrics)

    # ROC curve of this fold, built from the second column of predict_proba
    y_pred_test_prob = classifier.predict_proba(X_test)[:, 1]

    fpr, tpr, thr = roc_curve(y_test.ravel(), y_pred_test_prob)
    FP_list.append(fpr)
    TP_list.append(tpr)
    TH_list.append(thr)


In [None]:
# Print the metrics measured on each fold (validation within the training data)
print ('--------- MÉTRICAS DE VALIDAÇÃO (POR K-FOLDS) ~~~~ {}:'.format(classifier_name))
for fold_count, fold in enumerate(fold_metrics, start=1):
    print('Fold {} / {}'.format(fold_count, K_FOLDS))
    # One "name value" line per stored metric
    for metric in fold:
        print(metric, fold[metric])
    print('\n\n')

In [None]:
# Print the test metrics (uses the hold-out data, which the classifier did not see in any k-fold step).
# Refit on the complete training split first: after the k-fold loop, `classifier`
# only holds the model fitted in the last fold, i.e. trained on a subset of the
# training data. Refitting also makes this cell independent of loop leftovers.
classifier.fit(X_train_or, y_train_or.ravel())
prediction = classifier.predict(X_test_or)

print('--------- MÉTRICAS DO TESTE ~~~~ {}:'.format(classifier_name))
print('Accuracy:', accuracy_score(y_test_or, prediction))
print('F1 score:', f1_score(y_test_or, prediction))
print('Recall:', recall_score(y_test_or, prediction))
print('Precision:', precision_score(y_test_or, prediction))
print('\n Classification Report:\n', classification_report(y_test_or,prediction))
print('\n Confussion Matrix:\n',confusion_matrix(y_test_or, prediction))

In [None]:
# Draw the per-fold ROC curves collected during cross-validation and save the figure to disk.
plot_roc_curve_folds(FP_list, TP_list, K_FOLDS, classifier_roc_name, classifier_name)