***Importing and preprocessing CIC-DDoS-2019***

In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn import preprocessing
import random

DOWNSAMPLING AND BALANCING

In [None]:
# Attack-to-benign ratio used by the loaders below: attack rows are capped at `mult` times the benign row count
mult = 5
num_samples = 5000  # Maximum number of data rows randomly kept when reading one CSV file

def load_file(path):
    """Read a random subset of one CSV file and cap its attack/benign ratio.

    At most `num_samples` data rows are read; the rows to skip are drawn with
    `random.sample`. Every row labelled 'BENIGN' is kept. The remaining rows
    are down-sampled without replacement to `mult` times the benign count
    whenever they exceed that size.

    Parameters
    ----------
    path : str
        Path of a CSV file with a header row and a ' Label' column.

    Returns
    -------
    pandas.DataFrame
        Benign rows followed by the retained attack rows (index not reset).
    """
    # Count data rows (header excluded). The context manager closes the
    # handle; iterating a bare open() call left it open.
    with open(path) as handle:
        total_lines = sum(1 for _ in handle) - 1

    # Row 0 is the header, so data rows are numbered 1..total_lines.
    if total_lines > num_samples:
        skip_rows = sorted(random.sample(range(1, total_lines + 1), total_lines - num_samples))
    else:
        skip_rows = None  # File is small enough to be read completely

    data = pd.read_csv(path, skiprows=skip_rows, sep=',', low_memory=False)

    is_benign = data[' Label'] == 'BENIGN'
    flows_ok = data[is_benign]
    flows_ddos_full = data[~is_benign]

    sizeDownSample = len(flows_ok) * mult  # Target size for the attack class

    # Down-sample the attack class only when it is larger than the target
    if sizeDownSample < len(flows_ddos_full):
        flows_ddos_reduced = resample(
            flows_ddos_full,
            replace=False,
            n_samples=sizeDownSample,
            random_state=27
        )
    else:
        flows_ddos_reduced = flows_ddos_full

    return pd.concat([flows_ok, flows_ddos_reduced])


def load_huge_file(path):
    """Chunked variant of the CSV loader for very large files.

    At most `num_samples` randomly chosen data rows are read, in chunks of
    500000 rows. Within each chunk every 'BENIGN' row is kept and the other
    rows are down-sampled without replacement to `mult` times the benign
    count of that chunk whenever they exceed it.

    Parameters
    ----------
    path : str
        Path of a CSV file with a header row and a ' Label' column.

    Returns
    -------
    pandas.DataFrame
        All benign rows followed by all retained attack rows (index not reset).
    """
    # Count data rows (header excluded); the handle is closed on exit.
    with open(path) as handle:
        total_lines = sum(1 for _ in handle) - 1

    # Row 0 is the header, so data rows are numbered 1..total_lines.
    if total_lines > num_samples:
        skip_rows = sorted(random.sample(range(1, total_lines + 1), total_lines - num_samples))
    else:
        skip_rows = None

    chunk_list_ok = []
    chunk_list_ddos = []

    # The chunked reader is a context manager, so it is closed even if a
    # chunk fails to process.
    with pd.read_csv(path, skiprows=skip_rows, chunksize=500000, low_memory=False) as reader:
        for chunk in reader:
            is_benign = chunk[' Label'] == 'BENIGN'
            flows_ok = chunk[is_benign]
            flows_ddos_full = chunk[~is_benign]

            sizeDownSample = len(flows_ok) * mult  # Per-chunk target for the attack class

            if sizeDownSample < len(flows_ddos_full):
                flows_ddos_reduced = resample(
                    flows_ddos_full,
                    replace=False,
                    n_samples=sizeDownSample,
                    random_state=27
                )
            else:
                flows_ddos_reduced = flows_ddos_full

            chunk_list_ok.append(flows_ok)
            chunk_list_ddos.append(flows_ddos_reduced)

    flows_ok = pd.concat(chunk_list_ok)
    flows_ddos = pd.concat(chunk_list_ddos)

    return pd.concat([flows_ok, flows_ddos])

**Loading CIC-DDoS2019**

In [None]:
# Build the training export from the 01-12 capture day.
# TFTP.csv goes through the chunked loader; every other file uses load_file.
frames = [load_huge_file('/kaggle/input/cic-ddos2019-30gb-full-dataset-csv-files/01-12/TFTP.csv')]
print('file 1 loaded')

# List of remaining files
files = [
    "DrDoS_LDAP.csv", "DrDoS_MSSQL.csv", "DrDoS_NetBIOS.csv",
    "DrDoS_NTP.csv", "DrDoS_SNMP.csv", "DrDoS_SSDP.csv",
    "DrDoS_UDP.csv", "Syn.csv", "DrDoS_DNS.csv", "UDPLag.csv"
]

# Process each file; frames are only collected here
for i, file in enumerate(files, start=2):
    frames.append(load_file(f'/kaggle/input/cic-ddos2019-30gb-full-dataset-csv-files/01-12/{file}'))
    print(f'file {i} loaded')

# Concatenate once instead of re-copying the accumulated frame per file
flows = pd.concat(frames, ignore_index=True)

# Save to CSV
flows.to_csv('/kaggle/working/export_dataframe.csv', index=False, header=True)

# Delete large variables
del flows, frames

In [None]:
# Build the test export from the 03-11 capture day.
base_path = "/kaggle/input/cic-ddos2019-30gb-full-dataset-csv-files/03-11/"
files = ["LDAP.csv", "MSSQL.csv", "NetBIOS.csv", "Portmap.csv", "Syn.csv"]
# Uncomment if fixed
# files += ["UDP.csv", "UDPLag.csv"]

# Load every file with the same loader; each call returns ONE DataFrame
frames = []
for i, file in enumerate(files, start=1):
    frames.append(load_file(base_path + file))
    print(f'file {i} loaded')

# Concatenate once instead of re-copying the accumulated frame per file
flows = pd.concat(frames, ignore_index=True)

# Save to CSV
flows.to_csv('/kaggle/working/export_tests.csv', index=False, header=True)

# Free memory
del flows, frames

**CIC-DDoS2019 Data Processing**


In [None]:
import pandas as pd
import numpy as np
import hashlib

# Load the down-sampled training export (export_dataframe.csv) written by the loading cell above
samples = pd.read_csv('/kaggle/working/export_dataframe.csv', sep=',')

# Map a string to an integer derived from its MD5 digest
def string2numeric_hash(text):
    """Return the first four bytes of md5(text) as an unsigned integer.

    This equals the first eight hexadecimal digits of the digest read as a
    base-16 number.
    """
    digest = hashlib.md5(text.encode()).digest()
    return int.from_bytes(digest[:4], 'big')

# Replace infinite values: the textual 'Infinity' marker and both signs of
# floating-point infinity, so later numeric steps only see finite numbers
samples = samples.replace(['Infinity', np.inf, -np.inf], 0)

# Convert the two rate columns to numbers; unparsable entries become 0
for rate_column in (' Flow Packets/s', 'Flow Bytes/s'):
    samples[rate_column] = pd.to_numeric(samples[rate_column], errors='coerce').fillna(0)

# Convert labels to numeric: 0 = benign, 1 = any attack type
samples[' Label'] = samples[' Label'].replace({
    'BENIGN': 0, 'DrDoS_DNS': 1, 'DrDoS_LDAP': 1, 'DrDoS_MSSQL': 1,
    'DrDoS_NTP': 1, 'DrDoS_NetBIOS': 1, 'DrDoS_SNMP': 1, 'DrDoS_SSDP': 1,
    'DrDoS_UDP': 1, 'Syn': 1, 'TFTP': 1, 'UDP-lag': 1, 'WebDDoS': 1
}).astype(int)

# Ensure no NaN timestamps before splitting
samples[' Timestamp'] = samples[' Timestamp'].fillna('1970-01-01 00:00:00.000000')

# Keep only the time of day ('HH:MM:SS'). Element-wise indexing avoids the
# column-count mismatch that expand=True plus fixed column names raises
# when no row contains a date or a fractional part.
time_of_day = (
    samples[' Timestamp']
    .str.split(' ', n=1).str[1]   # drop the date
    .str.split('.', n=1).str[0]   # drop fractional seconds, if present
)
samples[' Timestamp'] = time_of_day.apply(string2numeric_hash)

# Drop identifier columns that are not used as features
samples = samples.drop(columns=[' Source IP', ' Destination IP', 'Flow ID', 'SimillarHTTP', 'Unnamed: 0'])

# Save processed dataset
samples.to_csv('/kaggle/working/export_dataframe_proc.csv', index=False, header=True)

print('Training data processed successfully!')

In [None]:
import pandas as pd
import numpy as np
import hashlib# Processing test dataset
# Load the test export written by the 03-11 loading cell
tests = pd.read_csv('/kaggle/working/export_tests.csv', sep=',')

# Replace infinite values: the textual 'Infinity' marker and both signs of
# floating-point infinity, so later numeric steps only see finite numbers
tests = tests.replace(['Infinity', np.inf, -np.inf], 0)

# Convert the two rate columns to numbers; unparsable entries become 0
for rate_column in (' Flow Packets/s', 'Flow Bytes/s'):
    tests[rate_column] = pd.to_numeric(tests[rate_column], errors='coerce').fillna(0)

# Convert labels to numeric: 0 = benign, 1 = any attack type
tests[' Label'] = tests[' Label'].replace({
    'BENIGN': 0, 'LDAP': 1, 'NetBIOS': 1, 'MSSQL': 1,
    'Portmap': 1, 'Syn': 1
}).astype(int)

# Ensure no NaN timestamps before splitting
tests[' Timestamp'] = tests[' Timestamp'].fillna('1970-01-01 00:00:00.000000')

# Keep only the time of day ('HH:MM:SS'). Element-wise indexing avoids the
# column-count mismatch that expand=True plus fixed column names raises
# when no row contains a date or a fractional part.
time_of_day = (
    tests[' Timestamp']
    .str.split(' ', n=1).str[1]   # drop the date
    .str.split('.', n=1).str[0]   # drop fractional seconds, if present
)
# string2numeric_hash is defined in the training-data processing cell
tests[' Timestamp'] = time_of_day.apply(string2numeric_hash)

# Drop identifier columns that are not used as features
tests = tests.drop(columns=[' Source IP', ' Destination IP', 'Flow ID', 'SimillarHTTP', 'Unnamed: 0'])

# Save processed dataset
tests.to_csv('/kaggle/working/export_tests_proc.csv', index=False, header=True)

print('Test data processed successfully!')

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.svm import SVC

DNN

In [None]:
def DNN_model(input_size):
    """Build the (uncompiled) feed-forward binary classifier.

    Architecture: one hidden Dense layer with 2 ReLU units followed by a
    single sigmoid output unit.

    Parameters
    ----------
    input_size : int
        Number of input features per sample.

    Returns
    -------
    Sequential
        The assembled model; compilation and training happen elsewhere.
    """
    model = Sequential()

    model.add(Dense(2, activation='relu', input_shape=(input_size,)))
    # Optional wider hidden stack, currently disabled:
    #   Dense(100, relu) -> Dense(40, relu) -> Dense(10, relu) -> Dropout(0.5)
    model.add(Dense(1, activation='sigmoid'))

    # summary() prints the table itself and returns None, so wrapping it in
    # print() emitted an extra "None" line.
    model.summary()

    return model


Support Vector Machine (SVM)

In [None]:
def SVM():
    """Return an unfitted support-vector classifier with a linear kernel."""
    classifier = SVC(kernel='linear')
    return classifier

Auxiliary Functions

Implementation of auxiliary functions, such as testing, compiling/training, 3d reshape, etc.
train_test(samples)

    Receives a group of samples and splits it into train/test sets.

In [None]:
def train_test(samples, test_size=0.33):
    """Split a frame into train/test features and targets.

    The last column of `samples` is used as the target and all preceding
    columns as features. The split uses a fixed random_state of 42.

    Returns the four objects produced by train_test_split, in order:
    X_train, X_test, y_train, y_test.
    """
    from sklearn.model_selection import train_test_split

    features, target = samples.iloc[:, :-1], samples.iloc[:, -1]
    return train_test_split(features, target, test_size=test_size, random_state=42)

In [None]:
def normalize_data(X_train,X_test):
    """Rescale both sets to the range [-1, 1] using statistics of the train set.

    The MinMaxScaler is fitted on `X_train` only and then applied to both
    inputs, so the test set never influences the scaling parameters.

    Returns the scaled (X_train, X_test) pair.
    """
    from sklearn.preprocessing import MinMaxScaler

    # StandardScaler is an alternative; min-max scaling is the one in use.
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X_train)

    return scaler.transform(X_train), scaler.transform(X_test)


compile_train(model, X_train, y_train, epochs=10, batch_size=64, deep=True)

    Compile and train learning model

    deep = False for scikit-learn ML methods



In [None]:
def compile_train(model, X_train, y_train, epochs=10, batch_size=64, deep=True):
    """Train `model` on the given data and return it.

    With `deep` truthy the model is compiled (binary cross-entropy, Adam,
    accuracy metric), fitted for `epochs` epochs with `batch_size`, and the
    per-epoch accuracy and loss curves are plotted. Otherwise only
    `model.fit(X_train, y_train)` is called, which suits scikit-learn
    estimators; `epochs` and `batch_size` are ignored in that case.
    """
    if not deep:
        model.fit(X_train, y_train)
    else:
        import matplotlib.pyplot as plt

        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy'],
        )
        fit_log = model.fit(
            X_train,
            y_train,
            epochs=epochs,
            batch_size=batch_size,
            verbose=1,
        ).history

        # One figure per tracked quantity, accuracy first and then loss
        for key, title in (('accuracy', 'Model Accuracy'), ('loss', 'Model Loss')):
            plt.plot(fit_log[key])
            plt.title(title)
            plt.show()

        print('Metrics:', model.metrics_names)

    print('Model Compiled and Trained')
    return model

In [None]:
def testes(model,X_test,y_test,y_pred, deep=True):
    """Print and return classification metrics for a set of predictions.

    When `deep` equals True the result of `model.evaluate` on the test data
    is printed first. Accuracy, precision, recall and F1 are then computed
    from `y_test` and `y_pred`, printed one by one, followed by their
    arithmetic mean.

    Returns the tuple (accuracy, precision, recall, f1, average).
    """
    if deep == True:
        print(model.evaluate(X_test, y_test, verbose=1))

    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    # Heading printed before each value, paired with the scorer producing it
    scorers = (
        ('\nAccuracy', accuracy_score),
        ('\nPrecision', precision_score),
        ('\nRecall', recall_score),
        ('\nF1 Score', f1_score),
    )

    values = []
    for heading, scorer in scorers:
        value = scorer(y_test, y_pred)
        print(heading)
        print(value)
        values.append(value)

    acc, prec, rec, f1 = values

    # Unweighted mean of the four metrics
    avrg = (acc + prec + rec + f1) / 4
    print('\nAverage (acc, prec, rec, f1)')
    print(avrg)

    return acc, prec, rec, f1, avrg

Calculate the correct classification rate of normal and attack flow records

In [None]:
def test_normal_atk(y_test, y_pred):
    import pandas as pd

    df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    # общее число нормальных и атак
    normal = (df['y_test'] == 0).sum()
    atk    = (df['y_test'] == 1).sum()
    
    # ошибки по классам
    wrong = df[df['y_test'] != df['y_pred']]
    counts = wrong.groupby('y_test').size().to_dict()
    
    # сколько ошибок у каждого класса (0 или 1)
    wrong_normal = counts.get(0, 0)
    wrong_atk    = counts.get(1, 0)
    
    normal_detect_rate = (normal - wrong_normal) / normal if normal > 0 else 0
    atk_detect_rate    = (atk    - wrong_atk)    / atk    if atk    > 0 else 0
    
    return normal_detect_rate, atk_detect_rate
print("123")

Methods for saving and loading trained models

In [None]:
def save_model(model,name):
    """Persist a Keras model as an architecture JSON plus a weights file.

    Writes 'Models/<name>.json' (from `model.to_json()`) and asks the model
    to store its weights in 'Models/<name>.h5'. The 'Models' directory is
    created when it does not exist yet.
    """
    import os

    # Without this, the first save in a fresh working directory fails
    os.makedirs('Models', exist_ok=True)

    arq_json = 'Models/' + name + '.json'
    with open(arq_json, "w") as json_file:
        json_file.write(model.to_json())

    arq_h5 = 'Models/' + name + '.h5'
    # NOTE(review): newer Keras releases may insist on a '.weights.h5'
    # suffix for save_weights - confirm against the installed version.
    model.save_weights(arq_h5)
    print('Model Saved')
    
def load_model(name):
    """Rebuild a Keras model from 'Models/<name>.json' and 'Models/<name>.h5'.

    The architecture is restored from the JSON file and the weights are then
    loaded from the .h5 file.

    Returns the restored model.
    """
    from keras.models import model_from_json

    arq_json = 'Models/' + name + '.json'
    # The context manager closes the file even if reading fails
    with open(arq_json, 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)

    arq_h5 = 'Models/' + name + '.h5'
    loaded_model.load_weights(arq_h5)

    print('Model loaded')

    return loaded_model

def save_Sklearn(model,nome):
    """Pickle `model` to 'Models/<nome>.pkl'.

    The 'Models' directory is created when it does not exist yet.
    """
    import os
    import pickle

    # Without this, the first save in a fresh working directory fails
    os.makedirs('Models', exist_ok=True)

    arquivo = 'Models/'+ nome + '.pkl'
    with open(arquivo,'wb') as file:
        pickle.dump(model,file)
    print('Model sklearn saved')

def load_Sklearn(nome):
    """Load and return the object pickled in 'Models/<nome>.pkl'."""
    import pickle

    pickle_path = 'Models/' + nome + '.pkl'
    # Unpickling executes code embedded in the file: only load trusted files.
    with open(pickle_path, 'rb') as source:
        restored = pickle.load(source)

    print('Model sklearn loaded')
    return restored

In [None]:
samples = pd.read_csv('/kaggle/working/export_dataframe_proc.csv', sep=',')

# Keep only the training part of a 67/33 split; the held-out part is discarded
train_features, _, train_labels, _ = train_test(samples, test_size=0.33)
train_frame = pd.concat([train_features, train_labels], axis=1)

is_benign = train_frame[' Label'] == 0
ddos_rows = train_frame[~is_benign]

# Balance the classes by drawing benign rows with replacement until they
# match the number of attack rows.
# NOTE(review): the sweep cell later splits upsampled_df into train and
# validation parts, so copies of one benign row can end up on both sides -
# confirm this is intended.
upsampled_df = pd.concat(
    [
        resample(train_frame[is_benign], replace=True, n_samples=len(ddos_rows), random_state=27),
        ddos_rows,
    ],
    ignore_index=True,
)

# Optional array views; the following cells work with upsampled_df
X_train_base = upsampled_df.iloc[:, :-1].values
y_train_base = upsampled_df.iloc[:, -1].values

del train_features, train_labels, train_frame, ddos_rows

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Hyper-parameter sweep. Relies on upsampled_df from the up-sampling cell and
# on the helper functions defined earlier in the notebook.

RESULT_COLUMNS = [
    'TestSize','BatchSize','Epochs','Method',
    'Accuracy','Precision','Recall','F1_Score',
    'Normal_Detect_Rate','Atk_Detect_Rate'
]

# One dict per trained model; the table is built once after the sweep
# instead of being re-concatenated for every row.
records = []

for test_size in [0.2, 0.3, 0.4]:
    # 1) Split the up-sampled DataFrame
    X_tr, X_val, y_tr, y_val = train_test(upsampled_df, test_size=test_size)
    # 2) Normalise (scaler fitted on the training part only)
    X_tr, X_val = normalize_data(X_tr, X_val)

    # DNN: sweep over batch_size and epochs
    for batch_size in [32, 64, 128]:
        for epochs in [10, 15, 20, 25, 30]:
            model = DNN_model(X_tr.shape[1])
            model = compile_train(
                model, X_tr, y_tr,
                epochs=epochs,
                batch_size=batch_size,
                deep=True
            )

            # Predictions: round the sigmoid output to 0/1 and flatten the
            # (n, 1) column so every consumer gets a 1-D label vector
            y_pred_flat = model.predict(X_val).round().ravel()
            acc, prec, rec, f1, _ = testes(model, X_val, y_val, y_pred_flat)
            norm_r, atk_r = test_normal_atk(y_val, y_pred_flat)

            # Confusion-matrix plot
            ConfusionMatrixDisplay.from_predictions(
                y_val, y_pred_flat,
                display_labels=['Benign','DDoS']
            )
            plt.title(f'DNN CM: ts={test_size}, bs={batch_size}, ep={epochs}')
            plt.show()

            # Store the metrics
            records.append({
                'TestSize': test_size,
                'BatchSize': batch_size,
                'Epochs': epochs,
                'Method': 'DNN',
                'Accuracy': acc,
                'Precision': prec,
                'Recall': rec,
                'F1_Score': f1,
                'Normal_Detect_Rate': norm_r,
                'Atk_Detect_Rate': atk_r
            })

    # SVM (one model per split size; batch size and epochs do not apply)
    model_svm = SVM()
    model_svm.fit(X_tr, y_tr)
    y_pred_svm = model_svm.predict(X_val).ravel()
    acc_s, prec_s, rec_s, f1_s, _ = testes(model_svm, X_val, y_val, y_pred_svm, deep=False)
    norm_s, atk_s = test_normal_atk(y_val, y_pred_svm)

    # Confusion-matrix plot for the SVM
    ConfusionMatrixDisplay.from_predictions(
        y_val, y_pred_svm,
        display_labels=['Benign','DDoS']
    )
    plt.title(f'SVM CM: ts={test_size}')
    plt.show()

    # Store the SVM metrics
    records.append({
        'TestSize': test_size,
        'BatchSize': None,
        'Epochs': None,
        'Method': 'SVM',
        'Accuracy': acc_s,
        'Precision': prec_s,
        'Recall': rec_s,
        'F1_Score': f1_s,
        'Normal_Detect_Rate': norm_s,
        'Atk_Detect_Rate': atk_s
    })

# Build the final table once. Nullable integers keep BatchSize/Epochs as
# whole numbers even though the SVM rows have no value for them.
results = pd.DataFrame(records, columns=RESULT_COLUMNS).astype(
    {'BatchSize': 'Int64', 'Epochs': 'Int64'}
)

# Save the final table
results.to_csv('/kaggle/working/experiment_results.csv', index=False)
print(results)

In [None]:
import pandas as pd

results = pd.read_csv('/kaggle/working/experiment_results.csv')

# Rank the runs from highest to lowest accuracy and renumber the rows so the
# displayed index matches the rank
sorted_results = (
    results
    .sort_values(by='Accuracy', ascending=False)
    .reset_index(drop=True)
)
display(sorted_results)