In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix


In [5]:
# Part numbers delimiting the train / test file ranges.
# The training pair is fed to range() when train_url is built, so
# training_end is exclusive.
training_start, training_end = 1, 3
test_start, test_end = 4, 5

In [6]:
# Attack-type name -> integer code. The code is simply the position of the
# name in the list below, so the order of the list is significant.
attack_type_map = {
    name: code
    for code, name in enumerate([
        'Benign',
        'DoS',
        'scanning',
        'DDoS',
        'xss',
        'Bot',
        'Reconnaissance',
        'password',
        'Fuzzers',
        'injection',
        'Theft',
        'Brute Force',
        'Infilteration',
        'Exploits',
        'Generic',
        'Analysis',
        'Backdoor',
        'mitm',
        'Shellcode',
        'ransomware',
        'Worms',
    ])
}

# Columns removed from every frame before it is used as model input.
drop_column = {
    'Dataset',
    'IPV4_SRC_ADDR', 'IPV4_DST_ADDR',
    'L7_PROTO',
    'SRC_TO_DST_SECOND_BYTES', 'DST_TO_SRC_SECOND_BYTES',
    'FTP_COMMAND_RET_CODE',
}

# Target columns: read as labels by the training code, then dropped from the
# feature matrix.
drop_purposed_column = {'Label', 'Attack'}


In [4]:
# Load the first dataset part and keep only the model input columns:
# first the excluded columns are removed, then the two target columns.
dataset = (
    pd.read_csv('dataset-v2/part_1.csv')
    .drop(columns=drop_column)
    .drop(columns=drop_purposed_column)
)

In [5]:
# Print the column names, non-null counts and dtypes of the frame left after the drops above.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850145 entries, 0 to 850144
Data columns (total 37 columns):
 #   Column                       Non-Null Count   Dtype
---  ------                       --------------   -----
 0   L4_SRC_PORT                  850145 non-null  int64
 1   L4_DST_PORT                  850145 non-null  int64
 2   PROTOCOL                     850145 non-null  int64
 3   IN_BYTES                     850145 non-null  int64
 4   IN_PKTS                      850145 non-null  int64
 5   OUT_BYTES                    850145 non-null  int64
 6   OUT_PKTS                     850145 non-null  int64
 7   TCP_FLAGS                    850145 non-null  int64
 8   CLIENT_TCP_FLAGS             850145 non-null  int64
 9   SERVER_TCP_FLAGS             850145 non-null  int64
 10  FLOW_DURATION_MILLISECONDS   850145 non-null  int64
 11  DURATION_IN                  850145 non-null  int64
 12  DURATION_OUT                 850145 non-null  int64
 13  MIN_TTL                      

In [6]:
# Keep a handle on the current column index; a later cell uses its first entry to pick a column to drop.
list_attributes = dataset.columns

In [7]:
# Show the captured column names.
print(list_attributes)

Index(['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'IN_BYTES', 'IN_PKTS',
       'OUT_BYTES', 'OUT_PKTS', 'TCP_FLAGS', 'CLIENT_TCP_FLAGS',
       'SERVER_TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'DURATION_IN',
       'DURATION_OUT', 'MIN_TTL', 'MAX_TTL', 'LONGEST_FLOW_PKT',
       'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN', 'MAX_IP_PKT_LEN',
       'RETRANSMITTED_IN_BYTES', 'RETRANSMITTED_IN_PKTS',
       'RETRANSMITTED_OUT_BYTES', 'RETRANSMITTED_OUT_PKTS',
       'SRC_TO_DST_AVG_THROUGHPUT', 'DST_TO_SRC_AVG_THROUGHPUT',
       'NUM_PKTS_UP_TO_128_BYTES', 'NUM_PKTS_128_TO_256_BYTES',
       'NUM_PKTS_256_TO_512_BYTES', 'NUM_PKTS_512_TO_1024_BYTES',
       'NUM_PKTS_1024_TO_1514_BYTES', 'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT',
       'ICMP_TYPE', 'ICMP_IPV4_TYPE', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE',
       'DNS_TTL_ANSWER'],
      dtype='object')


In [8]:
# Remove the column that was first in the captured column list.
first_attribute = list_attributes[0]
dataset = dataset.drop(columns=first_attribute)

In [9]:
# Re-print the frame summary to confirm the column count after the drop above.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850145 entries, 0 to 850144
Data columns (total 36 columns):
 #   Column                       Non-Null Count   Dtype
---  ------                       --------------   -----
 0   L4_DST_PORT                  850145 non-null  int64
 1   PROTOCOL                     850145 non-null  int64
 2   IN_BYTES                     850145 non-null  int64
 3   IN_PKTS                      850145 non-null  int64
 4   OUT_BYTES                    850145 non-null  int64
 5   OUT_PKTS                     850145 non-null  int64
 6   TCP_FLAGS                    850145 non-null  int64
 7   CLIENT_TCP_FLAGS             850145 non-null  int64
 8   SERVER_TCP_FLAGS             850145 non-null  int64
 9   FLOW_DURATION_MILLISECONDS   850145 non-null  int64
 10  DURATION_IN                  850145 non-null  int64
 11  DURATION_OUT                 850145 non-null  int64
 12  MIN_TTL                      850145 non-null  int64
 13  MAX_TTL                      

In [12]:

# Standardize every remaining column: learn the per-column statistics first,
# then apply them to the same frame.
scaler = StandardScaler().fit(dataset)
data_normalized = scaler.transform(dataset)
print(data_normalized)

[[-0.30654098 -0.74821467 -0.0077353  ... -0.3387917  -0.13934468
  -0.00167769]
 [-0.30654098 -0.74821467 -0.00962603 ... -0.3387917  -0.13934468
  -0.00167769]
 [-0.21684028 -0.74821467 -0.01281326 ... -0.3387917  -0.13934468
  -0.00167769]
 ...
 [-0.30654098  1.19632187 -0.01151676 ... -0.3387917  -0.13934468
  -0.00167769]
 [-0.30654098 -0.74821467 -0.00854561 ... -0.3387917  -0.13934468
  -0.00167769]
 [-0.30654098 -0.74821467 -0.00634426 ... -0.3387917  -0.13934468
  -0.00167769]]


## 1. Data Preprocessing

In [17]:
# Tally how often each attack type and each binary label occurs across
# dataset parts 1..89.
total_counts = Counter()
label_count = Counter()

for i in range(1, 90):
    # Only the two tallied columns are parsed. Reading every column of every
    # part is what made this cell slow enough to be interrupted.
    data = pd.read_csv(f'dataset-v2/part_{i}.csv', usecols=['Attack', 'Label'])
    # Counter.update() with an iterable adds one count per element.
    total_counts.update(data['Attack'])
    label_count.update(data['Label'])

print("\nTổng số lượng của mỗi phần tử trong tất cả các file:", total_counts)
print("\nTổng số lượng của mỗi nhãn trong tất cả các file:", label_count)


KeyboardInterrupt: 

In [None]:
# Bar chart of the per-attack-type row counts gathered in total_counts.
plt.figure(figsize=(20,10))
plt.bar(list(total_counts.keys()), list(total_counts.values()))
# Save before show(): once the figure has been shown it may no longer be the
# current figure, and savefig() would then write an empty canvas.
plt.savefig('total_counts.png')
plt.show()


In [None]:
# Bar chart of the per-label row counts gathered in label_count.
plt.figure(figsize=(20,10))
plt.bar(list(label_count.keys()), list(label_count.values()))
# Save before show(): once the figure has been shown it may no longer be the
# current figure, and savefig() would then write an empty canvas.
plt.savefig('label_count.png')
plt.show()


In [7]:
# Shared Isolation Forest used by the standalone evaluation loop below.
# A fixed seed makes the randomly built trees, and therefore the reported
# metrics, repeatable from one run of the notebook to the next.
IsolationForest_model = IsolationForest(random_state=42)


In [10]:
# Training files: parts training_start .. training_end - 1 (range() excludes
# its upper bound).
train_url = [f'dataset-v2/part_{i}.csv' for i in range(training_start, training_end)]
# Held-out files, built the same way from test_start / test_end. The PCA
# evaluation functions further down iterate over test_url.
test_url = [f'dataset-v2/part_{i}.csv' for i in range(test_start, test_end)]

In [29]:
def pre_processing(data):
    """Strip the non-feature columns from ``data`` and standardize the rest.

    Side effect: the columns named in ``drop_column`` and
    ``drop_purposed_column`` are removed from the caller's frame in place, so
    any label column must be read from ``data`` before calling this function.

    Args:
        data: DataFrame containing every column named in ``drop_column`` and
            ``drop_purposed_column`` plus the feature columns.

    Returns:
        The array produced by ``StandardScaler().fit_transform`` on the
        remaining columns. The scaler is fitted on ``data`` itself.
    """
    for unwanted_columns in (drop_column, drop_purposed_column):
        data.drop(columns=unwanted_columns, inplace=True)
    return StandardScaler().fit_transform(data)
    

In [30]:
def train_model(processing_function):
    """Fit an Isolation Forest on each training file and score it on that file.

    For every path in ``train_url`` the file is loaded, its ``Label`` column is
    set aside, the frame is turned into a feature matrix by
    ``processing_function``, and a forest is fitted on and then used to predict
    that same matrix. The metrics are therefore in-sample, pooled over all
    files.

    Args:
        processing_function: Callable taking the loaded DataFrame and returning
            the feature matrix to fit on (for example ``pre_processing``).

    Returns:
        Tuple ``(acc, f1, roc_auc, cm)`` from scikit-learn's ``accuracy_score``,
        ``f1_score``, ``roc_auc_score`` and ``confusion_matrix``.

    Raises:
        ValueError: If ``train_url`` is empty.
    """
    if not train_url:
        raise ValueError("train_url is empty: there is nothing to train on")

    prediction = []
    original_result = []

    model = IsolationForest()

    for train_path in train_url:
        data = pd.read_csv(train_path)
        # Copy the labels out before preprocessing and recode 0 -> -1 so they
        # use the same {-1, 1} values that IsolationForest.predict() emits;
        # with raw 0/1 labels the binary metrics below cannot be computed.
        # NOTE(review): predict() returns -1 for outliers, so this recoding
        # counts an outlier verdict as a match for Label 0. It mirrors the
        # standalone evaluation cell; confirm this polarity is intended.
        label = data['Label'].replace(0, -1).to_numpy()

        features = processing_function(data)
        del data  # release the raw frame before fitting

        model.fit(features)
        prediction.append(model.predict(features))
        original_result.append(label)

    # Flatten the per-file results into single 1-D arrays.
    prediction = np.concatenate(prediction)
    original_result = np.concatenate(original_result)

    acc = accuracy_score(original_result, prediction)
    f1 = f1_score(original_result, prediction)
    roc_auc = roc_auc_score(original_result, prediction)
    cm = confusion_matrix(original_result, prediction)

    return acc, f1, roc_auc, cm

In [31]:
# Run the per-file Isolation Forest evaluation with pre_processing as the feature pipeline and print the (acc, f1, roc_auc, cm) tuple it returns.
print(train_model(processing_function=pre_processing))

MemoryError: Unable to allocate 246. MiB for an array with shape (38, 850236) and data type int64

In [21]:
# In-sample evaluation with the shared IsolationForest_model: for every
# training file, fit on its (unscaled) feature columns and predict the same rows.
prediction = []
original_result = []

for train_path in train_url:
    data = pd.read_csv(train_path)

    # Recode Label 0 -> -1 so the labels use the same {-1, 1} values that
    # IsolationForest.predict() returns.
    label = data['Label'].replace(0, -1)

    # Keep only the model input columns; rebinding lets the raw frame be freed.
    data = data.drop(columns=drop_column).drop(columns=drop_purposed_column)

    IsolationForest_model.fit(data)
    y_pred = IsolationForest_model.predict(data)

    prediction.append(y_pred)
    original_result.append(label)
    

In [22]:
#convert to 1D array
prediction = np.concatenate(prediction)
original_result = np.concatenate(original_result)

In [23]:
# Number of labelled rows collected across all evaluated files.
print(len(original_result))

1700381


In [25]:
# from docx import Document

# Calculate metrics on the pooled predictions; labels and predictions both use -1 / 1.
acc = accuracy_score(original_result, prediction)
f1 = f1_score(original_result, prediction)
# NOTE: roc_auc_score is given the hard -1 / 1 predictions here, not continuous anomaly scores.
roc_auc = roc_auc_score(original_result, prediction)
cm = confusion_matrix(original_result, prediction)
print(acc,f1,roc_auc,cm);

# Disabled export of the results to a Word document (requires python-docx):
# Create a new Document
# doc = Document()
# 
# # Add a title
# doc.add_heading('Model Evaluation Results', level=1)
# 
# # Add accuracy
# doc.add_heading('Accuracy', level=2)
# doc.add_paragraph(f"IF acc = {acc}")
# 
# # Add F1 score
# doc.add_heading('F1 Score', level=2)
# doc.add_paragraph(f"-IF f1 = {f1}")
# 
# # Add ROC AUC
# doc.add_heading('ROC AUC', level=2)
# doc.add_paragraph(f"-IF roc auc = {roc_auc}")
# 
# # Add confusion matrix
# doc.add_heading('Confusion Matrix', level=2)
# cm_text = '\n'.join(['\t'.join(map(str, row)) for row in cm])
# doc.add_paragraph(cm_text)
# 
# # Save the document
# doc.save('model_evaluation_results.docx')

0.7117510722596877 0.8178973896172461 0.5811175222424654 [[ 109554  453250]
 [  36883 1100694]]


In [44]:
# Print the accuracy currently bound to `acc` in the kernel.
print(acc)

0.7291755383598288


In [ ]:
# Draw the confusion matrix `cm` as an annotated heatmap.
# seaborn is already imported as `sns` in the notebook's import cell.
fig, ax = plt.subplots(figsize=(10, 7))

# confusion_matrix() orders classes by sorted label value, so with the -1 / 1
# encoding used for labels and predictions, index 0 is -1 and index 1 is 1.
class_labels = [-1, 1]
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)

# Add labels and title
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix Heatmap')

# Save the heatmap as a PNG file (before showing it)
fig.savefig('confusion_matrix_heatmap.png')

# Show the heatmap
plt.show()

# Feature selection (PCA)

In [17]:
from sklearn.decomposition import PCA



In [21]:
def train_and_evaluate_feature_selection(n_component):
    """Evaluate a default Isolation Forest on PCA-reduced, standardized features.

    The PCA projection, the scaler and the forest are each fitted exactly once,
    on the rows of every file in ``train_url`` combined, and then applied
    unchanged to every file in ``test_url``. Fitting inside a per-file loop
    would discard all but the last training file.

    Args:
        n_component: Passed to ``PCA(n_components=...)``.

    Returns:
        Tuple ``(acc, f1, roc_auc, cm)`` from scikit-learn's ``accuracy_score``,
        ``f1_score``, ``roc_auc_score`` and ``confusion_matrix`` over all test
        rows, with ``Label`` 0 recoded to -1.

    Raises:
        ValueError: If ``train_url`` or ``test_url`` is empty.
    """
    if not train_url:
        raise ValueError("train_url is empty: there is nothing to train on")
    if not test_url:
        raise ValueError("test_url is empty: there is nothing to evaluate on")

    # Everything that is not a model input: excluded columns plus the targets.
    non_feature_columns = [*drop_column, *drop_purposed_column]

    # Each raw frame is reduced to its feature columns as soon as it is read.
    # The combined feature matrix still has to fit in memory.
    train_features = pd.concat(
        (pd.read_csv(train_path).drop(columns=non_feature_columns)
         for train_path in train_url),
        ignore_index=True,
    )

    # NOTE(review): PCA runs on the unscaled columns and the scaler is applied
    # to the components afterwards, as in the original pipeline. Confirm that
    # this order, rather than scaling before PCA, is intended.
    pca = PCA(n_components=n_component)
    scaler = StandardScaler()
    IsolationForest_evaluate_model = IsolationForest()
    IsolationForest_evaluate_model.fit(
        scaler.fit_transform(pca.fit_transform(train_features))
    )
    del train_features  # release the training matrix before loading test files

    prediction = []
    original_result = []

    for test_path in test_url:
        test = pd.read_csv(test_path)
        # Recode Label 0 -> -1 to match the {-1, 1} output of predict().
        original_result.append(test['Label'].replace(0, -1).to_numpy())

        # Re-use the projection and scaling learned on the training data.
        test_normalized = scaler.transform(
            pca.transform(test.drop(columns=non_feature_columns))
        )
        prediction.append(IsolationForest_evaluate_model.predict(test_normalized))

    # Flatten the per-file results into single 1-D arrays.
    prediction = np.concatenate(prediction)
    original_result = np.concatenate(original_result)

    # Score the model.
    acc = accuracy_score(original_result, prediction)
    f1 = f1_score(original_result, prediction)
    roc_auc = roc_auc_score(original_result, prediction)
    cm = confusion_matrix(original_result, prediction)

    return acc, f1, roc_auc, cm

In [24]:

def train_and_evaluate_feature_selection_with_other_estimator(n_estimator,n_component):
    """Evaluate an Isolation Forest of a given size on PCA-reduced features.

    The PCA projection, the scaler and the forest are each fitted exactly once,
    on the rows of every file in ``train_url`` combined, and then applied
    unchanged to every file in ``test_url``. Fitting inside a per-file loop
    would discard all but the last training file.

    Args:
        n_estimator: Passed to ``IsolationForest(n_estimators=...)``.
        n_component: Passed to ``PCA(n_components=...)``.

    Returns:
        Tuple ``(acc, f1, roc_auc, cm)`` from scikit-learn's ``accuracy_score``,
        ``f1_score``, ``roc_auc_score`` and ``confusion_matrix`` over all test
        rows, with ``Label`` 0 recoded to -1.

    Raises:
        ValueError: If ``train_url`` or ``test_url`` is empty.
    """
    if not train_url:
        raise ValueError("train_url is empty: there is nothing to train on")
    if not test_url:
        raise ValueError("test_url is empty: there is nothing to evaluate on")

    # Everything that is not a model input: excluded columns plus the targets.
    non_feature_columns = [*drop_column, *drop_purposed_column]

    # Each raw frame is reduced to its feature columns as soon as it is read.
    # The combined feature matrix still has to fit in memory.
    train_features = pd.concat(
        (pd.read_csv(train_path).drop(columns=non_feature_columns)
         for train_path in train_url),
        ignore_index=True,
    )

    # NOTE(review): PCA runs on the unscaled columns and the scaler is applied
    # to the components afterwards, as in the original pipeline. Confirm that
    # this order, rather than scaling before PCA, is intended.
    pca = PCA(n_components=n_component)
    scaler = StandardScaler()
    IsolationForest_evaluate_model = IsolationForest(n_estimators=n_estimator)
    IsolationForest_evaluate_model.fit(
        scaler.fit_transform(pca.fit_transform(train_features))
    )
    del train_features  # release the training matrix before loading test files

    prediction = []
    original_result = []

    for test_path in test_url:
        test = pd.read_csv(test_path)
        # Recode Label 0 -> -1 to match the {-1, 1} output of predict().
        original_result.append(test['Label'].replace(0, -1).to_numpy())

        # Re-use the projection and scaling learned on the training data.
        test_normalized = scaler.transform(
            pca.transform(test.drop(columns=non_feature_columns))
        )
        prediction.append(IsolationForest_evaluate_model.predict(test_normalized))

    # Flatten the per-file results into single 1-D arrays.
    prediction = np.concatenate(prediction)
    original_result = np.concatenate(original_result)

    # Score the model.
    acc = accuracy_score(original_result, prediction)
    f1 = f1_score(original_result, prediction)
    roc_auc = roc_auc_score(original_result, prediction)
    cm = confusion_matrix(original_result, prediction)

    return acc, f1, roc_auc, cm


In [1]:
# Settings to evaluate: PCA n_components values and Isolation Forest sizes.
test = [0.4]
estimator = [1500]

# Visit every (n_component, n_estimator) pair, with the PCA value varying
# slowest, and print the metrics of each run.
for i, j in [(pca_value, tree_count) for pca_value in test for tree_count in estimator]:
    acc, f1, roc_auc, cm = train_and_evaluate_feature_selection_with_other_estimator(j, i)
    print(f"n_component = {i}")
    print(f"n_estimator = {j}")
    print(f"Accuracy: {acc}")
    print(f"F1 Score: {f1}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{cm}")
    print()
    



NameError: name 'train_and_evaluate_feature_selection_with_other_estimator' is not defined