In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [31]:
data = pd.read_csv("Datasets/NSL_KDD/KDDTrain+.txt")

In [32]:
columns = (['duration'
,'protocol_type'
,'service'
,'flag'
,'src_bytes'
,'dst_bytes'
,'land'
,'wrong_fragment'
,'urgent'
,'hot'
,'num_failed_logins'
,'logged_in'
,'num_compromised'
,'root_shell'
,'su_attempted'
,'num_root'
,'num_file_creations'
,'num_shells'
,'num_access_files'
,'num_outbound_cmds'
,'is_host_login'
,'is_guest_login'
,'count'
,'srv_count'
,'serror_rate'
,'srv_serror_rate'
,'rerror_rate'
,'srv_rerror_rate'
,'same_srv_rate'
,'diff_srv_rate'
,'srv_diff_host_rate'
,'dst_host_count'
,'dst_host_srv_count'
,'dst_host_same_srv_rate'
,'dst_host_diff_srv_rate'
,'dst_host_same_src_port_rate'
,'dst_host_srv_diff_host_rate'
,'dst_host_serror_rate'
,'dst_host_srv_serror_rate'
,'dst_host_rerror_rate'
,'dst_host_srv_rerror_rate'
,'attack'
,'level'])

data.columns = columns
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,level
0,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
2,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
3,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21


In [33]:
unwanted_cols = ['land'
,'wrong_fragment'
,'urgent'
,'hot'
,'num_failed_logins'
,'logged_in'
,'num_compromised'
,'root_shell'
,'su_attempted'
,'num_root'
,'num_file_creations'
,'num_shells'
,'num_access_files'
,'num_outbound_cmds'
,'is_host_login'
,'is_guest_login'
,'count'
,'srv_count'
,'serror_rate'
,'srv_serror_rate'
,'rerror_rate'
,'srv_rerror_rate'
,'same_srv_rate'
,'diff_srv_rate'
,'srv_diff_host_rate'
,'dst_host_count'
,'dst_host_srv_count'
,'dst_host_same_srv_rate'
,'dst_host_diff_srv_rate'
,'dst_host_same_src_port_rate'
,'dst_host_srv_diff_host_rate'
,'dst_host_serror_rate'
,'dst_host_srv_serror_rate'
,'dst_host_rerror_rate'
,'dst_host_srv_rerror_rate'] 

data = data.drop(unwanted_cols, axis=1)
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,attack,level
0,0,udp,other,SF,146,0,normal,15
1,0,tcp,private,S0,0,0,neptune,19
2,0,tcp,http,SF,232,8153,normal,21
3,0,tcp,http,SF,199,420,normal,21
4,0,tcp,private,REJ,0,0,neptune,21


In [34]:
on_attack = data.attack.map(lambda a: 0 if a == 'normal' else 1)
data['attack_flag'] = is_attack
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,attack,level,attack_flag
0,0,udp,other,SF,146,0,normal,15,0
1,0,tcp,private,S0,0,0,neptune,19,1
2,0,tcp,http,SF,232,8153,normal,21,0
3,0,tcp,http,SF,199,420,normal,21,0
4,0,tcp,private,REJ,0,0,neptune,21,1


In [35]:
categorical_cols = ['protocol_type', 'service', 'flag']  
label_encoder = LabelEncoder()
data['encoded_label'] = label_encoder.fit_transform(data['attack'])  
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,attack,level,attack_flag,encoded_label
0,0,udp,other,SF,146,0,normal,15,0,11
1,0,tcp,private,S0,0,0,neptune,19,1,9
2,0,tcp,http,SF,232,8153,normal,21,0,11
3,0,tcp,http,SF,199,420,normal,21,0,11
4,0,tcp,private,REJ,0,0,neptune,21,1,9


In [36]:
onehot_encoder = OneHotEncoder(sparse=False)
encoded_categorical_cols = pd.DataFrame(onehot_encoder.fit_transform(data[categorical_cols]))
encoded_categorical_cols.columns = onehot_encoder.get_feature_names_out(categorical_cols)
encoded_categorical_cols

Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125967,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
125968,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
125969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
125970,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [72]:
data_encoded = pd.concat([data.drop(categorical_cols + ['attack'], axis=1), encoded_categorical_cols], axis=1)
data_encoded

Unnamed: 0,duration,src_bytes,dst_bytes,level,attack_flag,encoded_label,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,146,0,15,0,11,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,0,0,19,1,9,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,232,8153,21,0,11,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0,199,420,21,0,11,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,0,0,21,1,9,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125967,0,0,0,20,1,9,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
125968,8,105,145,21,0,11,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
125969,0,2231,384,18,0,11,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
125970,0,0,0,20,1,9,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [73]:
X = data_encoded.drop(['encoded_label'], axis=1)  # Features
y = data_encoded['encoded_label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import BaggingClassifier
import pandas as pd

#importing DCS techniques from DESlib
from deslib.dcs.ola import OLA
from deslib.dcs.a_priori import APriori
from deslib.dcs.mcb import MCB

#import DES techniques from DESlib
from deslib.des.des_p import DESP
from deslib.des.knora_u import KNORAU
from deslib.des.knora_e import KNORAE
from deslib.des.meta_des import METADES

In [75]:
# Scale the variables to have 0 mean and unit variance
#scalar = StandardScaler()
#X_train = scalar.fit_transform(X_train)
#X_test = scalar.transform(X_test)

# Split the data into training and DSEL for DS techniques
X_train, X_dsel, y_train, y_dsel = train_test_split(X_train, y_train, test_size=0.5)

In [77]:
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

classifier = CalibratedClassifierCV(Perceptron(max_iter=10))

# Train a pool of 10 classifiers
#pool_classifiers = BaggingClassifier(model, n_estimators=10)
pool_classifiers = RandomForestClassifier(n_estimators=10)
pool_classifiers.fit(X_train, y_train)

RandomForestClassifier(n_estimators=10)

In [68]:
# DCS techniques
ola = OLA()
mcb = MCB()
apriori = APriori()

# DES techniques
knorau = KNORAU()
kne = KNORAE()
desp = DESP()
meta = METADES()

In [78]:
# DCS techniques
ola = OLA(pool_classifiers)
mcb = MCB(pool_classifiers)
apriori = APriori(pool_classifiers)

# DES techniques
knorau = KNORAU(pool_classifiers)
kne = KNORAE(pool_classifiers)
desp = DESP(pool_classifiers)
meta = METADES(pool_classifiers)

In [79]:
knorau.fit(X_dsel, y_dsel)
kne.fit(X_dsel, y_dsel)
desp.fit(X_dsel, y_dsel)
ola.fit(X_dsel, y_dsel)
mcb.fit(X_dsel, y_dsel)
#apriori.fit(X_dsel, y_dsel) #Nao consegui ajeitar. Erro de shapes divergentes
#meta.fit(X_dsel, y_dsel)

MCB(pool_classifiers=RandomForestClassifier(n_estimators=10))

In [80]:
print('Classification accuracy OLA: ', ola.score(X_test, y_test))
#print('Classification accuracy A priori: ', apriori.score(X_test, y_test))
print('Classification accuracy KNORA-Union: ', knorau.score(X_test, y_test))
print('Classification accuracy KNORA-Eliminate: ', kne.score(X_test, y_test))
print('Classification accuracy DESP: ', desp.score(X_test, y_test))
#print('Classification accuracy META-DES: ', apriori.score(X_test, y_test))

Classification accuracy OLA:  0.9039888866838659
Classification accuracy KNORA-Union:  0.9040682675133955
Classification accuracy KNORA-Eliminate:  0.902996626314745
Classification accuracy DESP:  0.9040682675133955


In [86]:
predict = ola.predict(X_test)

print(f"""
Reporte de classificação completo
#{classification_report(y_test, predict)}

Acuracia
{accuracy_score(y_test, predict)}

Matriz de confusão
{confusion_matrix(y_test, predict)}
""")

NameError: name 'classification_report' is not defined

Continuação

In [59]:
import pickle

file_path = "Datasets/DTmodel.pkl"

# Save the model using pickle
with open(file_path, 'wb') as file:
    pickle.dump(classifier, file)

In [60]:
sample = ['tcp', 'ftp', 'REJ','146','1', '0','11', '15' , '0']

sample_input = [sample[:3]]
print(sample_input)
sample_input_others = sample[3:]
print(sample_input_others)

[['tcp', 'ftp', 'REJ']]
['146', '1', '0', '11', '15', '0']


In [61]:
print(sample_input)
encoded_sample_input = pd.DataFrame(onehot_encoder.transform(sample_input))
encoded_sample_input.columns = onehot_encoder.get_feature_names_out(categorical_cols)
encoded_sample_input

[['tcp', 'ftp', 'REJ']]


Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
# Add the missing columns to the encoded sample input
import itertools

missing_cols = set(data_encoded.columns) - set(encoded_sample_input.columns)
sample_input_others = ['146','1', '0','11', '15' , '0']
values_cycle = itertools.cycle(sample_input_others)
print(missing_cols)
for col in missing_cols:
    value = next(values_cycle)
    encoded_sample_input[col] = value
encoded_sample_input

{'duration', 'attack_flag', 'encoded_label', 'level', 'dst_bytes', 'src_bytes'}


Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_S2,flag_S3,flag_SF,flag_SH,duration,attack_flag,encoded_label,level,dst_bytes,src_bytes
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,146,1,0,11,15,0


In [63]:
# Reorder the columns to match the order of the encoded training data
encoded_sample_input = encoded_sample_input[X.columns]

# Make the prediction
prediction = classifier.predict(encoded_sample_input)

# Decode the predicted label
predicted_label = label_encoder.inverse_transform(prediction)

# Print the output
print("Predicted Label:", predicted_label)

NotFittedError: This CalibratedClassifierCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.