# Training a ML model using CICIoT2023

This notebook shows how a LogisticRegression model can be trained using the CICIoT2023 csv files.

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
DATASET_DIRECTORY = '/content/drive/MyDrive/Do_An_Tot_Nghiep/dataset/CICIoT2023/'

### Importing Dataset

In [4]:
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()
training_sets = df_sets[:int(len(df_sets)*.8)]
test_sets = df_sets[int(len(df_sets)*.8):]

In [5]:
# print out every training set name
print('Training sets:')
for k in training_sets:
    print('\t', k)

Training sets:
	 part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00014-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00015-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
	 part-00016-363d1ba3-8ab5-4f96-bc25-4d58

In [6]:
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count',
       'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
       'Radius', 'Covariance', 'Variance', 'Weight',
]
y_column = 'label'

In [7]:
# Set training_sets to be only the last entry in training_sets
training_sets = training_sets[-1:]


In [8]:
# print out all the training sets
print(training_sets)

['part-00134-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv']


### Scaling

In [9]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = StandardScaler()

In [10]:
for train_set in tqdm(training_sets):
    scaler.partial_fit(pd.read_csv(DATASET_DIRECTORY + train_set)[X_columns])

100%|██████████| 1/1 [00:03<00:00,  3.28s/it]


### Classification: 34 (33+1) classes

In [11]:
ML_models = [
        LogisticRegression(n_jobs=-1),
]

ML_neams = [
        "LogisticRegression",
]

for train_set in tqdm(training_sets):
    d = pd.read_csv(DATASET_DIRECTORY + train_set)
    d[X_columns] = scaler.transform(d[X_columns])
    for model in (ML_models):
        model.fit(d[X_columns], d[y_column])
    del d

100%|██████████| 1/1 [00:38<00:00, 38.06s/it]


In [13]:
y_test = []
preds = {i:[] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])

    y_test += list(d_test[y_column].values)

    for i in range(len(ML_models)):
        model = ML_models[i]
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i] = preds[i] + y_pred


100%|██████████| 34/34 [02:11<00:00,  3.86s/it]


In [14]:
# Import necessary modules
import pandas as pd
import time # Import the time module for delays

y_test = []
preds = {i:[] for i in range(len(ML_models))}
# Number of retries for reading files
max_retries = 5
# Delay in seconds between retries
retry_delay = 10

for test_set in tqdm(test_sets):
    d_test = None
    for retry_attempt in range(max_retries):
        try:
            d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
            # If reading is successful, break out of the retry loop
            break
        except OSError as e:
            print(f"Error reading {test_set}: {e}. Attempt {retry_attempt + 1}/{max_retries}. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    # If after all retries the file could not be read, skip this test set or handle appropriately
    if d_test is None:
        print(f"Failed to read {test_set} after {max_retries} attempts. Skipping.")
        continue # Skip to the next test_set

    d_test[X_columns] = scaler.transform(d_test[X_columns])

    y_test += list(d_test[y_column].values)

    for i in range(len(ML_models)):
        model = ML_models[i]
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i] = preds[i] + y_pred

100%|██████████| 34/34 [01:36<00:00,  2.84s/it]


In [15]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
for k,v in preds.items():
    y_pred = v
    print(f"##### {ML_neams[k]} (34 classes) #####")
    print('accuracy_score: ', accuracy_score(y_pred, y_test))
    print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
    print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
    print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
    print()
    print()
    print()

##### LogisticRegression (34 classes) #####
accuracy_score:  0.8023117821859834
recall_score:  0.5951442699599506
precision_score:  0.4867544825125931
f1_score:  0.49387518888568555





# Classification: 8 (7+1) classes

In [16]:
dict_7classes = {}
dict_7classes['DDoS-RSTFINFlood'] = 'DDoS'
dict_7classes['DDoS-PSHACK_Flood'] = 'DDoS'
dict_7classes['DDoS-SYN_Flood'] = 'DDoS'
dict_7classes['DDoS-UDP_Flood'] = 'DDoS'
dict_7classes['DDoS-TCP_Flood'] = 'DDoS'
dict_7classes['DDoS-ICMP_Flood'] = 'DDoS'
dict_7classes['DDoS-SynonymousIP_Flood'] = 'DDoS'
dict_7classes['DDoS-ACK_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-UDP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-ICMP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-SlowLoris'] = 'DDoS'
dict_7classes['DDoS-HTTP_Flood'] = 'DDoS'

dict_7classes['DoS-UDP_Flood'] = 'DoS'
dict_7classes['DoS-SYN_Flood'] = 'DoS'
dict_7classes['DoS-TCP_Flood'] = 'DoS'
dict_7classes['DoS-HTTP_Flood'] = 'DoS'


dict_7classes['Mirai-greeth_flood'] = 'Mirai'
dict_7classes['Mirai-greip_flood'] = 'Mirai'
dict_7classes['Mirai-udpplain'] = 'Mirai'

dict_7classes['Recon-PingSweep'] = 'Recon'
dict_7classes['Recon-OSScan'] = 'Recon'
dict_7classes['Recon-PortScan'] = 'Recon'
dict_7classes['VulnerabilityScan'] = 'Recon'
dict_7classes['Recon-HostDiscovery'] = 'Recon'

dict_7classes['DNS_Spoofing'] = 'Spoofing'
dict_7classes['MITM-ArpSpoofing'] = 'Spoofing'

dict_7classes['BenignTraffic'] = 'Benign'

dict_7classes['BrowserHijacking'] = 'Web'
dict_7classes['Backdoor_Malware'] = 'Web'
dict_7classes['XSS'] = 'Web'
dict_7classes['Uploading_Attack'] = 'Web'
dict_7classes['SqlInjection'] = 'Web'
dict_7classes['CommandInjection'] = 'Web'


dict_7classes['DictionaryBruteForce'] = 'BruteForce'

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


ML_models = [
        LogisticRegression(n_jobs=-1),
]

ML_neams = [
        "LogisticRegression",
]


for train_set in tqdm(training_sets):
    d = pd.read_csv(DATASET_DIRECTORY + train_set)
    d[X_columns] = scaler.transform(d[X_columns])
    new_y = [dict_7classes[k] for k in d[y_column]]
    d[y_column] = new_y

    for model in (ML_models):
        model.fit(d[X_columns], d[y_column])
    del d

100%|██████████| 1/1 [00:16<00:00, 16.49s/it]


In [18]:
y_test = []
preds = {i:[] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])
    new_y = [dict_7classes[k] for k in d_test[y_column]]
    d_test[y_column] = new_y

    y_test += list(d_test[y_column].values)

    for i in range(len(ML_models)):
        model = ML_models[i]
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i] = preds[i] + y_pred


100%|██████████| 34/34 [01:41<00:00,  2.97s/it]


In [19]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
for k,v in preds.items():
    y_pred = v
    print(f"##### {ML_neams[k]} (8 classes) #####")
    print('accuracy_score = ', accuracy_score(y_pred, y_test))
    print('recall_score = ', recall_score(y_pred, y_test, average='macro'))
    print('precision_score = ', precision_score(y_pred, y_test, average='macro'))
    print('f1_score = ', f1_score(y_pred, y_test, average='macro'))
    print()
    print()
    print()

##### LogisticRegression (8 classes) #####
accuracy_score =  0.8316742843752626
recall_score =  0.6960550750828025
precision_score =  0.5124086796923355
f1_score =  0.5394229552962799





# Classification: 2 (1+1) Classes

In [20]:
dict_2classes = {}
dict_2classes['DDoS-RSTFINFlood'] = 'Attack'
dict_2classes['DDoS-PSHACK_Flood'] = 'Attack'
dict_2classes['DDoS-SYN_Flood'] = 'Attack'
dict_2classes['DDoS-UDP_Flood'] = 'Attack'
dict_2classes['DDoS-TCP_Flood'] = 'Attack'
dict_2classes['DDoS-ICMP_Flood'] = 'Attack'
dict_2classes['DDoS-SynonymousIP_Flood'] = 'Attack'
dict_2classes['DDoS-ACK_Fragmentation'] = 'Attack'
dict_2classes['DDoS-UDP_Fragmentation'] = 'Attack'
dict_2classes['DDoS-ICMP_Fragmentation'] = 'Attack'
dict_2classes['DDoS-SlowLoris'] = 'Attack'
dict_2classes['DDoS-HTTP_Flood'] = 'Attack'

dict_2classes['DoS-UDP_Flood'] = 'Attack'
dict_2classes['DoS-SYN_Flood'] = 'Attack'
dict_2classes['DoS-TCP_Flood'] = 'Attack'
dict_2classes['DoS-HTTP_Flood'] = 'Attack'


dict_2classes['Mirai-greeth_flood'] = 'Attack'
dict_2classes['Mirai-greip_flood'] = 'Attack'
dict_2classes['Mirai-udpplain'] = 'Attack'

dict_2classes['Recon-PingSweep'] = 'Attack'
dict_2classes['Recon-OSScan'] = 'Attack'
dict_2classes['Recon-PortScan'] = 'Attack'
dict_2classes['VulnerabilityScan'] = 'Attack'
dict_2classes['Recon-HostDiscovery'] = 'Attack'

dict_2classes['DNS_Spoofing'] = 'Attack'
dict_2classes['MITM-ArpSpoofing'] = 'Attack'

dict_2classes['BenignTraffic'] = 'Benign'

dict_2classes['BrowserHijacking'] = 'Attack'
dict_2classes['Backdoor_Malware'] = 'Attack'
dict_2classes['XSS'] = 'Attack'
dict_2classes['Uploading_Attack'] = 'Attack'
dict_2classes['SqlInjection'] = 'Attack'
dict_2classes['CommandInjection'] = 'Attack'

dict_2classes['DictionaryBruteForce'] = 'Attack'

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


ML_models = [
        LogisticRegression(n_jobs=-1),
]

ML_neams = [
        "LogisticRegression",
]

data = []  # List to hold all datasets
for train_set in tqdm(training_sets):
    d = pd.read_csv(DATASET_DIRECTORY + train_set)
    d[X_columns] = scaler.transform(d[X_columns])
    new_y = [dict_2classes[k] for k in d[y_column]]
    d[y_column] = new_y
    data.append(d)  # Add the current dataset to the list

# Combine all datasets
combined_data = pd.concat(data, ignore_index=True)
del data  # Delete the list to free up memory

# Fit the models on the combined data
for model in (ML_models):
    model.fit(combined_data[X_columns], combined_data[y_column])
del combined_data  # Delete the combined data to free up memory

y_test = []
preds = {i:[] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])
    new_y = [dict_2classes[k] for k in d_test[y_column]]
    d_test[y_column] = new_y

    y_test += list(d_test[y_column].values)

    for i, model in enumerate(ML_models):
        preds[i].append(model.predict(d_test[X_columns]))

100%|██████████| 1/1 [00:02<00:00,  2.52s/it]
100%|██████████| 34/34 [01:31<00:00,  2.70s/it]


In [22]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
for k,v in preds.items():
    y_pred = v
    print(f"##### {ML_neams[k]} (8 classes) #####")
    print('accuracy_score = ', accuracy_score(y_pred, y_test))
    print('recall_score = ', recall_score(y_pred, y_test, average='macro'))
    print('precision_score = ', precision_score(y_pred, y_test, average='macro'))
    print('f1_score = ', f1_score(y_pred, y_test, average='macro'))
    print()
    print()
    print()

##### LogisticRegression (8 classes) #####


ValueError: Found input variables with inconsistent numbers of samples: [34, 10340161]

In [23]:
# Import necessary modules
import pandas as pd
import time # Import the time module for delays
import numpy as np # Import numpy for concatenation

y_test = []
preds = {i:[] for i in range(len(ML_models))}
# Number of retries for reading files
max_retries = 5
# Delay in seconds between retries
retry_delay = 10

for test_set in tqdm(test_sets):
    d_test = None
    for retry_attempt in range(max_retries):
        try:
            d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
            # If reading is successful, break out of the retry loop
            break
        except OSError as e:
            print(f"Error reading {test_set}: {e}. Attempt {retry_attempt + 1}/{max_retries}. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    # If after all retries the file could not be read, skip this test set or handle appropriately
    if d_test is None:
        print(f"Failed to read {test_set} after {max_retries} attempts. Skipping.")
        continue # Skip to the next test_set

    d_test[X_columns] = scaler.transform(d_test[X_columns])
    new_y = [dict_2classes[k] for k in d_test[y_column]]
    d_test[y_column] = new_y


    y_test.extend(list(d_test[y_column].values)) # Use extend to add elements directly

    for i, model in enumerate(ML_models):
        preds[i].append(model.predict(d_test[X_columns])) # Keep appending the arrays for now

# After the loop, concatenate the arrays for each model's predictions
for i in range(len(ML_models)):
    preds[i] = np.concatenate(preds[i])

# The rest of your evaluation code
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
for k,v in preds.items():
    y_pred = v
    print(f"##### {ML_neams[k]} (2 classes) #####") # Corrected the number of classes in the print statement
    print('accuracy_score = ', accuracy_score(y_pred, y_test))
    print('recall_score = ', recall_score(y_pred, y_test, average='macro'))
    print('precision_score = ', precision_score(y_pred, y_test, average='macro'))
    print('f1_score = ', f1_score(y_pred, y_test, average='macro'))
    print()
    print()
    print()

100%|██████████| 34/34 [01:32<00:00,  2.74s/it]


##### LogisticRegression (2 classes) #####
accuracy_score =  0.9890510408880481
recall_score =  0.8903256738863513
precision_score =  0.8641187549527014
f1_score =  0.8767412604444338



