In [None]:
import copy
import time
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
%matplotlib inline

from cpselftraining import CPSelfTraining
from conformalprediction import *
from stdst import StandardSelftraining

from preprocces import PreProcessing

from icpselftraining import ICPSelftraining

# Kept after the wildcard import so these names cannot be shadowed by it.
from sklearn.metrics import roc_curve, roc_auc_score, auc, classification_report
from imblearn.metrics import classification_report_imbalanced

## Dataset

In [None]:
# Column names applied to both CSV files. Because header=0 is combined with
# names=, the header row stored in each file is read and then replaced by these.
df_columns = [
    'Destination_Port', 'Flow_Duration', 'Total_Length_of_Fwd_Packets', 'Fwd_Packet_Length_Mean',
    'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std',
    'Fwd_IAT_Max', 'Bwd_IAT_Total', 'Bwd_IAT_Mean', 'Bwd_IAT_Max', 'Fwd_Header_Length', 'Fwd_Packets/s',
    'Bwd_Packets/s', 'Max_Packet_Length', 'Packet_Length_Mean', 'Packet_Length_Std', 'Packet_Length_Variance',
    'Average_Packet_Size', 'Avg_Fwd_Segment_Size', 'Subflow_Fwd_Bytes', 'Init_Win_bytes_forward',
    'Active_Min', 'Idle_Mean', 'Idle_Max', 'Idle_Min', 'Label',
]

# Forward slashes resolve on Windows, Linux and macOS alike. A backslash
# separator only works on Windows and depends on "\T" not being a recognised
# string escape sequence.
Train_ = pd.read_csv("Dataset/Train.csv", skiprows=0, header=0, names=df_columns)
Test_ = pd.read_csv("Dataset/Test.csv", skiprows=0, header=0, names=df_columns)

In [None]:
# Display the shape of the raw training frame as read from CSV.
Train_.shape

In [None]:
# Display the shape of the raw test frame as read from CSV.
Test_.shape

In [None]:
# Experiment configuration used by the preprocessing and scoring cells.

# Seed NumPy's global RNG so a fresh-kernel "Run All" is repeatable:
# scikit-learn estimators created without random_state (such as SVC with
# probability=True) draw from it.
# NOTE(review): PreProcessing presumably samples through NumPy/pandas as well;
# confirm it does not rely on a different random source.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

labeled_ratio = 0.02    # forwarded to PreProcessing; reported as "Labeled ratio"
imbalanced_ratio = 20   # forwarded to PreProcessing; reported as "IR"
num_Train = 10000       # forwarded to pre.modify_ir along with the raw training frame
num_Test = 3000         # forwarded to pre.modify_ir along with the raw test frame
pre = PreProcessing(labeled_ratio, imbalanced_ratio)

In [None]:
# Step 1: let the preprocessor derive the working train/test frames from the
# raw CSV frames and the requested sizes.
Train, Test = pre.modify_ir(Train_, Test_, num_Train, num_Test)

# Step 2: break those frames into the feature/label pieces used for
# self-training and evaluation. The unpacking order mirrors what pre.split
# returns, so it must not be rearranged.
(
    X_unlabeled, X_labeled,
    y_unlabeled, y_labeled,
    X_train, y_train,
    X_test, y_test,
) = pre.split(Train, Test)

In [None]:
# Report how many samples of each label ended up in every partition.
for _caption, _labels in (
    ("Train: ", Train.Label),
    ("\tlabeled: ", y_labeled),
    ("\tunlabeled: ", y_unlabeled),
    ("\nTest: ", Test.Label),
):
    print(_caption, Counter(_labels))

# Independent copy of the test labels, plus the share of test samples whose
# label equals 1.
y_true = y_test.copy(deep=True)
no_skill_y = len(y_true[y_true == 1]) / len(y_true)

In [None]:
# Display the per-label sample counts of y_train.
Counter(y_train)

## Base classifiers

In [None]:
class base_classifiers:
    """Namespace for the base learners handed to the self-training models.

    Each attribute is built once, when the class body runs, so every access to
    ``base_classifiers.KNN`` or ``base_classifiers.SVM`` yields the same object.
    """

    # 3-nearest-neighbour learner using Euclidean distance (n_jobs left at its default).
    KNN = KNN_model(
        KNeighborsClassifier(n_neighbors=3, metric="euclidean")
    )

    # Linear-kernel SVC with probability estimates switched on.
    SVM = SVM_model(
        SVC(kernel='linear', probability=True)
    )

    # Alternatives left disabled in the notebook, kept here for reference only:
    #   NB   = GaussianNB(priors=None)
    #   SVM  = SVC(probability=True)
    #   CART = DecisionTreeClassifier(criterion='entropy')
    
    
    
# Models compared in this experiment, in the order they are trained and scored.
#
# Each conformal self-training model gets its own deep copy of the SVM base
# learner. Handing the single ``base_classifiers.SVM`` object to all four would
# make them share one underlying estimator; if the wrappers fit the object they
# are given in place (not visible from this notebook), training one model would
# replace the fitted state held by the others.
models = [
    CPSelfTraining("CP_SelfTrain", copy.deepcopy(base_classifiers.SVM)),
    CPSelfTraining("MCP_SelfTrain", copy.deepcopy(base_classifiers.SVM), mondrian=True),
    ICPSelftraining("ICP_SelfTrain_SVM", copy.deepcopy(base_classifiers.SVM)),
    ICPSelftraining("MICP_SelfTrain_SVM", copy.deepcopy(base_classifiers.SVM), mondrian=True),
    # The standard self-training baseline takes a bare SVC rather than the SVM_model wrapper.
    StandardSelftraining("STD_SelfTrain_SVM", SVC(kernel='linear', probability=True)),
]

## Training and Scoring

In [None]:
# Train each model on the labeled/unlabeled split, score it on the test split,
# and collect the outcome in three containers keyed by model name:
#   results       one summary row per model (run settings, elapsed time, scores)
#   cnf_matrixes  cnf_matrix returned by model.score, nested under the imbalance ratio
#   predictions   [y_true, y_prob column 0, y_prob column 1, predicted labels]
results = pd.DataFrame()
result_rows = []  # one dict per finished model; `results` is rebuilt from this list
cnf_matrixes = {}
predictions = {}
exeTime = 0

for model in models:
    cnf_matrixes[model.name] = {}
    print(model.name)
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()

    model.training(X_labeled, X_unlabeled, y_labeled, y_unlabeled)

    y_prob, y_true, y_pred_label, scores, cnf_matrix = model.score(X_test, y_test)
    predictions[model.name] = [y_true, y_prob[:, 0], y_prob[:, 1], y_pred_label]

    print(classification_report_imbalanced(y_true, y_pred_label, digits=4))
    print(classification_report(y_true, y_pred_label, digits=4))
    # The timed span covers training, scoring and printing both reports.
    end = time.perf_counter()
    exeTime = end - start

    test_info = {
        "classifier": model.name,
        "Labeled ratio": labeled_ratio,
        "IR": imbalanced_ratio,
        "Time(sec)": exeTime,
    }
    # Rebuild the frame from the accumulated records instead of enlarging it
    # with .loc: column dtypes are inferred from all rows at once, a model that
    # reports a different set of score keys no longer breaks the assignment,
    # and partial results are still available if a later model is interrupted.
    result_rows.append({**test_info, **scores})
    results = pd.DataFrame(result_rows)
    cnf_matrixes[model.name][imbalanced_ratio] = cnf_matrix

    print()
    print("--------")
plt.show()

In [None]:
# Display the cnf_matrix stored for ICP_SelfTrain_SVM at the configured imbalance ratio.
cnf_matrixes['ICP_SelfTrain_SVM'][imbalanced_ratio]

In [None]:
# Display the cnf_matrix stored for MICP_SelfTrain_SVM at the configured imbalance ratio.
cnf_matrixes['MICP_SelfTrain_SVM'][imbalanced_ratio]

In [None]:
# Display the cnf_matrix stored for STD_SelfTrain_SVM at the configured imbalance ratio.
cnf_matrixes['STD_SelfTrain_SVM'][imbalanced_ratio]

In [None]:
# Display the summary table: one row per model with run settings, elapsed time and scores.
results