Import Library

In [22]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Memuat dataset
file_path = 'data.csv'
data = pd.read_csv(file_path, delimiter='|')
data

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
0,1.526756e+09,C9YvmJ3zxtuqxWxLW5,192.168.2.5,38792.0,200.168.87.203,59353.0,tcp,-,2.998333,0,...,-,0.0,S,3.0,180.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
1,1.526756e+09,CGsZqZ3UiQexLzPRVb,192.168.2.5,38792.0,200.168.87.203,59353.0,tcp,-,-,-,...,-,0.0,S,1.0,60.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
2,1.526756e+09,C0LkBW2VEa292Nvet8,192.168.2.5,38793.0,200.168.87.203,59353.0,tcp,-,2.997182,0,...,-,0.0,S,3.0,180.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
3,1.526756e+09,CMDLrn2cVhrqvW8gKa,192.168.2.5,38793.0,200.168.87.203,59353.0,tcp,-,-,-,...,-,0.0,S,1.0,60.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
4,1.526756e+09,C2UM8f4knuL5Vnvp3h,192.168.2.5,38794.0,200.168.87.203,59353.0,tcp,-,2.996286,0,...,-,0.0,S,3.0,180.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156098,1.526886e+09,CzeVGM1n60FIryJyBi,192.168.2.5,36646.0,200.168.87.203,59353.0,tcp,-,-,-,...,-,0.0,S,1.0,60.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
156099,1.526886e+09,Cq9Oi6YyGNTJlf356,192.168.2.5,36647.0,200.168.87.203,59353.0,tcp,-,2.996060,0,...,-,0.0,S,3.0,180.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
156100,1.526886e+09,Cc5Jwn4krKLqvFHdG1,192.168.2.5,36647.0,200.168.87.203,59353.0,tcp,-,-,-,...,-,0.0,S,1.0,60.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
156101,1.526886e+09,Ck8fNL3RpnSPu9Kn9e,192.168.2.5,36648.0,200.168.87.203,59353.0,tcp,-,0.995112,0,...,-,0.0,S,2.0,120.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan


In [24]:
# Mengatasi nilai yang hilang: yang berarti nilai yang hilang akan diisi dengan nilai terakhir yang tidak hilang sebelumnya 
data.fillna(method='ffill', inplace=True)

  data.fillna(method='ffill', inplace=True)


In [25]:
# Mengonversi kolom 'ts' ke tipe numerik, Ceorce = mengisi Nan
data['ts'] = pd.to_numeric(data['ts'], errors='coerce')
data['ts']

0         1.526756e+09
1         1.526756e+09
2         1.526756e+09
3         1.526756e+09
4         1.526756e+09
              ...     
156098    1.526886e+09
156099    1.526886e+09
156100    1.526886e+09
156101    1.526886e+09
156102    1.526793e+09
Name: ts, Length: 156103, dtype: float64

In [26]:
# Menghapus kolom yang tidak relevan
data.drop(columns=['uid', 'id.orig_h', 'id.resp_h', 'service', 'detailed-label'], inplace=True)

In [27]:
# Menggabungkan semua nilai 'Malicious' menjadi satu kategori
data['label'] = data['label'].apply(lambda x: 'Malicious' if 'Malicious' in x else x)
data['label']

0         Malicious
1         Malicious
2         Malicious
3         Malicious
4         Malicious
            ...    
156098    Malicious
156099    Malicious
156100    Malicious
156101    Malicious
156102    Malicious
Name: label, Length: 156103, dtype: object

In [28]:
# Encoding fitur kategorikal (one-hot encoding) : akan menghapus kolom primary menjadi sub kolom
data = pd.get_dummies(data, columns=['proto', 'history'])

In [29]:
# Mengonversi kolom dengan tanda '-' menjadi numerik dengan nilai nol
data.replace('-', 0, inplace=True)

  data.replace('-', 0, inplace=True)


In [30]:
# Mengonversi kolom lainnya ke tipe numerik
for col in ['duration', 'orig_bytes', 'resp_bytes', 'conn_state']:
    data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0)

In [31]:
# Encoding kolom label, akan di label kemudian tetapkan dan diubah
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])
data['label'] 

0         1
1         1
2         1
3         1
4         1
         ..
156098    1
156099    1
156100    1
156101    1
156102    1
Name: label, Length: 156103, dtype: int32

In [32]:
# Memeriksa kembali apakah ada kolom yang masih memiliki tipe data object
for col in data.columns:
    if data[col].dtype == 'object':
        print(f"Kolom {col} masih memiliki tipe data object dan berisi nilai: {data[col].unique()}")

In [33]:
# Normalisasi data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data.drop(columns=['label']))
data_scaled

array([[-1.73656863, -0.77748914,  2.38770411, ..., -0.00715897,
        -0.00253102, -0.00253102],
       [-1.73638061, -0.77748914,  2.38770411, ..., -0.00715897,
        -0.00253102, -0.00253102],
       [-1.73627322, -0.7773857 ,  2.38770411, ..., -0.00715897,
        -0.00253102, -0.00253102],
       ...,
       [ 1.75174011, -0.99936476,  2.38770411, ..., -0.00715897,
        -0.00253102, -0.00253102],
       [ 1.75184756, -0.99926133,  2.38770411, ..., -0.00715897,
        -0.00253102, -0.00253102],
       [-0.75103927, -0.49727418, -0.30602061, ..., -0.00715897,
        -0.00253102, -0.00253102]])

In [34]:
# Membagi data menjadi data pelatihan dan data pengujian
X = data_scaled
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
# Naïve Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

In [36]:
# Support Vector Machine (SVM)
svm_model = SVC(kernel='linear', probability=True)
svm_model.fit(X_train, y_train)

In [37]:
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

In [38]:
# Cross-Validation
scores_nb = cross_val_score(nb_model, X, y, cv=10)
scores_svm = cross_val_score(svm_model, X, y, cv=10)
scores_lr = cross_val_score(lr_model, X, y, cv=10)
cv_scores = {
    'Naïve Bayes': round(scores_nb.mean(), 4),
    'SVM': round(scores_svm.mean(), 4),
    'Logistic Regression': round(scores_lr.mean(), 4)
}

In [39]:
# Prediksi dan Confusion Matrix
y_pred_nb = nb_model.predict(X_test)
y_pred_svm = svm_model.predict(X_test)
y_pred_lr = lr_model.predict(X_test)

cm_nb = confusion_matrix(y_test, y_pred_nb)
cr_nb = classification_report(y_test, y_pred_nb, digits=4)

cm_svm = confusion_matrix(y_test, y_pred_svm)
cr_svm = classification_report(y_test, y_pred_svm, digits=4)

cm_lr = confusion_matrix(y_test, y_pred_lr)
cr_lr = classification_report(y_test, y_pred_lr, digits=4)

In [40]:
print("\nNaïve Bayes Confusion Matrix:")
print(cm_nb)
print("Naïve Bayes Classification Report:")
print(cr_nb)


Naïve Bayes Confusion Matrix:
[[  902     4]
 [    0 30315]]
Naïve Bayes Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9956    0.9978       906
           1     0.9999    1.0000    0.9999     30315

    accuracy                         0.9999     31221
   macro avg     0.9999    0.9978    0.9989     31221
weighted avg     0.9999    0.9999    0.9999     31221



In [41]:
print("\nSVM Confusion Matrix:")
print(cm_svm)
print("SVM Classification Report:")
print(cr_svm)


SVM Confusion Matrix:
[[  902     4]
 [    0 30315]]
SVM Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9956    0.9978       906
           1     0.9999    1.0000    0.9999     30315

    accuracy                         0.9999     31221
   macro avg     0.9999    0.9978    0.9989     31221
weighted avg     0.9999    0.9999    0.9999     31221



In [42]:
print("\nLogistic Regression Confusion Matrix:")
print(cm_lr)
print("Logistic Regression Classification Report:")
print(cr_lr)


Logistic Regression Confusion Matrix:
[[  906     0]
 [    4 30311]]
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0     0.9956    1.0000    0.9978       906
           1     1.0000    0.9999    0.9999     30315

    accuracy                         0.9999     31221
   macro avg     0.9978    0.9999    0.9989     31221
weighted avg     0.9999    0.9999    0.9999     31221

