In [None]:
!pip install pandas



In [None]:
"""
Preprocessing Script for WSN_DDoS_Attack_H-IoT2023 Dataset
----------------------------------------------------------

This notebook processes raw simulation data collected from a Wireless Sensor Network (WSN)
using the Cooja simulator to generate the WSN_DDoS_Attack_H-IoT2023 dataset. It focuses on
environmental sensor readings, specifically:

- Room temperature
- Humidity

The script performs:

- Parsing of raw timestamps
- Extraction of sensor values and node-specific communication features
- Timestamp conversion to milliseconds
- Labeling of data instances (normal or DDoS attack)

The final cleaned dataset is saved in CSV format and is intended for use in anomaly
detection and DDoS attack research within Healthcare-IoT (H-IoT) environments.

Author: Mirza Akhi
Affiliation: University of Limerick (UL), Ireland
"""



import math

def get_time(time_stamp):
  """Convert a colon-separated timestamp string into total milliseconds.

  The last colon-separated field is read as 'seconds.milliseconds', the field
  before it as minutes, and (only when at least three fields exist) the field
  before that as hours. Any further leading fields are ignored.

  Parameters:
  - time_stamp: text such as 'MM:SS.mmm' or 'HH:MM:SS.mmm'.

  Returns:
  - The timestamp expressed as an integer number of milliseconds.
  """
  fields = time_stamp.split(':')
  last = len(fields) - 1

  # Hours are optional; default to zero when the stamp has only two fields.
  hours = fields[last - 2] if len(fields) >= 3 else 0
  minutes = fields[last - 1]

  # Only the first two dot-separated pieces of the final field are used.
  second_parts = fields[last].split('.')
  seconds = second_parts[0]
  millis = second_parts[1]

  ms_per_second = 1000
  ms_per_minute = 60 * ms_per_second
  ms_per_hour = 60 * ms_per_minute

  return (
      int(hours) * ms_per_hour
      + int(minutes) * ms_per_minute
      + int(seconds) * ms_per_second
      + int(millis)
  )


def get_previous_send_time(ids, time_stamps, event_descriptions):
  """Return the timestamp of the latest earlier 'DATA send' entry from any node.

  The entry being described is the last one in the parallel lists. Earlier
  entries are scanned from newest to oldest and the first one whose event
  description contains 'DATA send' wins.

  Parameters:
  - ids: node ids seen so far (the last element is the current entry).
  - time_stamps: timestamps parallel to ids.
  - event_descriptions: event texts parallel to ids.

  Returns:
  - The matching entry of time_stamps, or -1 when the current entry belongs
    to node 1 or no earlier 'DATA send' entry exists.
  """
  newest = len(ids) - 1

  if int(ids[newest]) == 1:
    return -1

  # Walk backwards over everything that came before the current entry.
  for position in range(newest - 1, -1, -1):
    if 'DATA send' in event_descriptions[position]:
      return time_stamps[position]

  return -1


def get_interval(ids, time_stamps, event_descriptions):
  """Return the gap in milliseconds between the two latest non-node-1 sends.

  Starting at the current (last) entry and moving backwards, the two most
  recent entries that are not from node 1 and whose description contains
  'DATA send' are collected; the result is newer time minus older time.

  Parameters:
  - ids: node ids seen so far (the last element is the current entry).
  - time_stamps: timestamp strings parallel to ids.
  - event_descriptions: event texts parallel to ids.

  Returns:
  - The difference in milliseconds, or -1 when the current entry belongs to
    node 1 or fewer than two qualifying entries exist.
  """
  last = len(ids) - 1

  if int(ids[last]) == 1:
    return -1

  recent_sends = []

  for position in range(last, -1, -1):
    is_sensor_node = int(ids[position]) != 1
    if is_sensor_node and 'DATA send' in event_descriptions[position]:
      recent_sends.append(time_stamps[position])
      # Two stamps are all that is needed; stop scanning right away.
      if len(recent_sends) == 2:
        break

  if len(recent_sends) != 2:
    return -1

  newer_stamp, older_stamp = recent_sends

  return get_time(newer_stamp) - get_time(older_stamp)


def get_previous_send_time_same_node(ids, time_stamps, event_descriptions):
  """Return the timestamp of the current node's latest earlier 'DATA send' entry.

  The current entry is the last one in the parallel lists. Earlier entries are
  scanned from newest to oldest for one that has the same node id and whose
  description contains 'DATA send'.

  Parameters:
  - ids: node ids seen so far (the last element is the current entry).
  - time_stamps: timestamps parallel to ids.
  - event_descriptions: event texts parallel to ids.

  Returns:
  - The matching entry of time_stamps, or -1 when the current entry belongs
    to node 1 or the node has no earlier 'DATA send' entry.
  """
  newest = len(ids) - 1
  current_node = int(ids[newest])

  if current_node == 1:
    return -1

  for position in reversed(range(newest)):
    from_same_node = int(ids[position]) == current_node
    if from_same_node and 'DATA send' in event_descriptions[position]:
      return time_stamps[position]

  return -1


def get_previous_send_interval(ids, time_stamps, previous_send_times):
  """Return the milliseconds elapsed since the current entry's previous send.

  The current entry is the last one in ids; its own timestamp and its
  previously recorded send time are read at that same position.

  Parameters:
  - ids: node ids seen so far (strings or ints; the last one is the current entry).
  - time_stamps: timestamp strings parallel to ids.
  - previous_send_times: for each entry, the timestamp string of the earlier
    send it is compared against, or -1 when there is none.

  Returns:
  - Current time minus previous time in milliseconds, or -1 when the current
    entry belongs to node 1 or has no previous send time.
  """
  index = len(ids) - 1

  # Cast before comparing: ids may hold text such as '1', which never equals
  # the integer 1. The other helpers in this notebook cast the same way.
  if int(ids[index]) == 1:
    return -1

  previous_stamp = previous_send_times[index]

  if previous_stamp == -1:
    return -1

  return get_time(time_stamps[index]) - get_time(previous_stamp)


def get_previous_sender_node(ids, event_descriptions):
  """Return the id of the latest earlier sender that is not node 1.

  Entries before the current (last) one are scanned from newest to oldest.
  Only entries whose description contains 'DATA send' are considered, and
  entries from node 1 are passed over.

  Parameters:
  - ids: node ids seen so far (the last element is the current entry).
  - event_descriptions: event texts parallel to ids.

  Returns:
  - The sender's node id as an int, or -1 when the current entry belongs to
    node 1 or no qualifying earlier entry exists.
  """
  newest = len(ids) - 1

  if int(ids[newest]) == 1:
    return -1

  for position in range(newest - 1, -1, -1):
    if 'DATA send' not in event_descriptions[position]:
      continue

    sender = int(ids[position])
    if sender != 1:
      return sender

  return -1


def get_average_previous_send_intervals_same_node(ids, previous_send_intervals_same_node):
  """Return the floored mean of every valid interval recorded so far.

  Despite the name, the mean is taken over all entries of the list that are
  not -1, whichever node they belong to. ids is used only to locate the
  current (last) position.

  Parameters:
  - ids: node ids seen so far (the last element is the current entry).
  - previous_send_intervals_same_node: interval values parallel to ids, with
    -1 marking entries that have no interval.

  Returns:
  - math.floor of the mean of the valid intervals, or -1 when the current
    entry's own interval is -1 or no valid interval exists.
  """
  newest = len(ids) - 1

  if previous_send_intervals_same_node[newest] == -1:
    return -1

  valid_intervals = [
      value for value in previous_send_intervals_same_node if value != -1
  ]

  if not valid_intervals:
    return -1

  return math.floor(sum(valid_intervals) / len(valid_intervals))


def get_average_previous_send_intervals_normal_or_malicious_nodes(ids, malicious_nodes, average_previous_send_intervals_same_node):
  """Return the floored mean of the latest valid average of each node in a group.

  Scanning from the current (last) entry backwards, the most recent entry
  whose average is not -1 is taken for each node of the group. Scanning stops
  once as many nodes were collected as the group has members.

  Parameters:
  - ids: node ids seen so far (the last element is the current entry).
  - malicious_nodes: integer node ids forming the group of interest (the
    callers in this notebook pass either the normal or the malicious list).
  - average_previous_send_intervals_same_node: averages parallel to ids, with
    -1 marking entries that have no value.

  Returns:
  - math.floor of the mean of the collected values, or -1 when the current
    entry's node is not in the group or nothing could be collected.
  """
  newest = len(ids) - 1

  if int(ids[newest]) not in malicious_nodes:
    return -1

  # Maps node id -> its most recent valid average (first hit while scanning back).
  latest_by_node = {}

  for position in range(newest, -1, -1):
    if len(latest_by_node) == len(malicious_nodes):
      break

    node = int(ids[position])
    if node not in malicious_nodes or node in latest_by_node:
      continue

    average = average_previous_send_intervals_same_node[position]
    if average == -1:
      continue

    latest_by_node[node] = int(average)

  if not latest_by_node:
    return -1

  return math.floor(sum(latest_by_node.values()) / len(latest_by_node))

In [None]:
import pandas as pd
import re

# Location of the raw log file that the parsing loop below consumes.
# NOTE(review): the absolute '/content/...' path looks like a Google Colab
# location — confirm and adjust when running the notebook elsewhere.
file_path = '/content/Raw_WSN_DDoS_Attack_H-IoT2023.txt'

# Read the content of the text file (all lines are held in memory as a list)
with open(file_path, 'r') as file:
    lines = file.readlines()

# Create lists to store data for each column.
# The parsing loop appends exactly one value to every list for each log line
# it keeps, so all lists stay parallel (same length, same row order).
time_stamps = []  # first token of the line; converted to milliseconds after the loop
ids = []  # node id text taken from the 'id:X' token
previous_send_times = []  # result of get_previous_send_time
intervals = []  # result of get_interval
previous_send_times_same_nodes = []  # result of get_previous_send_time_same_node
previous_send_intervals_same_nodes = []  # result of get_previous_send_interval
average_previous_send_intervals_same_nodes = []  # result of get_average_previous_send_intervals_same_node
average_previous_send_intervals_normal_nodes = []  # group average computed with the normal node list
average_previous_send_intervals_malicious_nodes = []  # group average computed with the malicious node list
previous_sender_nodes = []  # result of get_previous_sender_node
received_from_nodes = []  # digits following 'from ' in the event text, or -1
event_descriptions = []  # everything after the second token, re-joined with spaces
temperatures = []  # token taken from the quoted sensor payload, or -1
humidities = []  # token taken from the quoted sensor payload, or -1
malicious = []  # label: 1 when the node id is in the malicious list, else 0

# Node groups used for the per-group interval averages and for the label.
# They never change, so they are defined once instead of on every iteration.
normal_nodes = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
malicious_nodes = [12, 13, 14]

# Lines whose second token does not carry an integer after ':' are counted and
# skipped rather than aborting the whole run with IndexError / ValueError.
malformed_line_count = 0

# Process each line in the text file
for line in lines:
    # Split the line based on whitespace
    parts = line.split()

    # A usable line needs at least: timestamp, 'id:X' token, event text
    if len(parts) < 3:
        continue

    # Extract id from the 'id:X' format
    id_info = parts[1]
    id_fields = id_info.split(':')

    if len(id_fields) < 2:
        malformed_line_count += 1
        continue

    id_value = id_fields[1]

    try:
        node_id = int(id_value)
    except ValueError:
        malformed_line_count += 1
        continue

    # Extract the relevant part of the 'Event Description'
    event_description = ' '.join(parts[2:])

    # Keep only 'DATA send to 1' events that were not emitted by node 1
    if node_id == 1 or 'DATA send to 1' not in event_description:
        continue

    # Extract data for each column.
    # The three base lists are appended first because every helper below
    # treats the LAST element of these lists as the current entry.
    time_stamps.append(parts[0])

    ids.append(id_value)

    event_descriptions.append(event_description)

    # Find previous_send_time
    previous_send_times.append(get_previous_send_time(ids, time_stamps, event_descriptions))

    # Calculate interval
    intervals.append(get_interval(ids, time_stamps, event_descriptions))

    # Find previous_send_times_same_node
    previous_send_times_same_nodes.append(get_previous_send_time_same_node(ids, time_stamps, event_descriptions))

    # Calculate previous_send_intervals_same_node
    # (must follow the append above: it reads the value that was just stored)
    previous_send_intervals_same_nodes.append(get_previous_send_interval(ids, time_stamps, previous_send_times_same_nodes))

    # Calculate average_previous_send_intervals_same_node
    average_previous_send_intervals_same_nodes.append(
        get_average_previous_send_intervals_same_node(ids, previous_send_intervals_same_nodes)
    )

    # Calculate average_previous_send_intervals_normal_node
    average_previous_send_intervals_normal_nodes.append(
        get_average_previous_send_intervals_normal_or_malicious_nodes(ids, normal_nodes, average_previous_send_intervals_same_nodes)
    )

    # Calculate average_previous_send_intervals_malicious_node
    average_previous_send_intervals_malicious_nodes.append(
        get_average_previous_send_intervals_normal_or_malicious_nodes(ids, malicious_nodes, average_previous_send_intervals_same_nodes)
    )

    # Find previous_sender
    previous_sender_nodes.append(get_previous_sender_node(ids, event_descriptions))

    # Find received From
    match_received_from = re.search(r'from (\d+)', event_description)
    received_from_nodes.append(match_received_from.group(1) if match_received_from else -1)

    # Find temperature and humidity inside the first single-quoted payload.
    # NOTE(review): tokens 1 and 4 of the payload are taken as temperature and
    # humidity — presumably '<label> <value> <unit> <label> <value> ...';
    # confirm against the raw log format.
    match_sensor_data = re.findall(r"'([^']*)'", event_description)
    splitted_sensor_data = match_sensor_data[0].split(' ') if match_sensor_data else []

    # Node 1 was already filtered out above, so no extra id check is needed.
    # Payloads with too few tokens fall back to the -1 sentinel.
    if match_sensor_data and 'temperature' in match_sensor_data[0] and len(splitted_sensor_data) > 4:
        temperatures.append(splitted_sensor_data[1])
        humidities.append(splitted_sensor_data[4])
    else:
        temperatures.append(-1)
        humidities.append(-1)

    # malicious column
    malicious.append(1 if node_id in malicious_nodes else 0)

# Surface skipped input instead of dropping it silently.
if malformed_line_count:
    print('Skipped ' + str(malformed_line_count) + ' line(s) with an unparsable node id')

# Replace every raw timestamp string with its value in milliseconds, in place.
# This runs only after the parsing loop because the helper functions called
# there expect time_stamps to still hold the original strings.
for index, time_stamp in enumerate(time_stamps):
  time_stamps[index] = get_time(time_stamp)


# Create a dictionary with column names as keys and lists as values.
# Commented-out entries are columns that are computed above but deliberately
# left out of the exported dataset.
# NOTE(review): 'malicious' is derived solely from the node id in the parsing
# loop, and 'id' is exported as well — a model given both columns can read the
# label straight from 'id'; confirm this is intended.
data_dict = {
   # 'Time Stamp': time_stamps,
    'id': ids,
    #'previous_send_time': previous_send_times,
    'interval': intervals,
    # 'previous_send_times_same_node': previous_send_times_same_nodes,
    'previous_send_intervals_same_node': previous_send_intervals_same_nodes,
    'average_previous_send_intervals_same_node': average_previous_send_intervals_same_nodes,
    # 'average_previous_send_intervals_normal_node': average_previous_send_intervals_normal_nodes,
    # 'average_previous_send_intervals_malicious_node': average_previous_send_intervals_malicious_nodes,
    'previous_sender': previous_sender_nodes,
    # 'received_from': received_from_nodes,
    # 'temperature': temperatures,
    # 'humidity': humidities,
    'malicious': malicious,
}

# Create DataFrame from the dictionary (one row per kept log line)
df = pd.DataFrame(data_dict)

# Save the DataFrame to a CSV file (primary output); the row index is not written
df.to_csv('WSN_DDoS_Attack_H-IoT2023.csv', index=False)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import itertools
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D
import os

In [None]:
df.shape

(15988, 6)

In [None]:
# Feature matrix: every column of df except the label.
# NOTE(review): the 'id' column stays in X, and the label was assigned purely
# from the node id when the dataset was built — confirm that keeping 'id' as a
# feature is intended, since it reveals the label.
X = df.drop(columns=['malicious'])

# X = X.drop(['Time Stamp'], axis=1)

# Target vector: the 0/1 'malicious' label.
y = df['malicious']

In [None]:
import numpy as np

def time_series_train_test_split(X, y, train_size):
    """
    Split ordered data (X, y) into a leading training part and a trailing test part.

    No shuffling takes place: the first `int(len(X) * train_size)` samples form
    the training set and the remainder forms the test set, so the original
    order is preserved.

    Parameters:
    - X: The features; any object supporting len() and `[start:stop]` slicing
      (e.g. a list, a NumPy array or a pandas DataFrame).
    - y: The target values; same requirements and same length as X.
    - train_size: The proportion of data to be used for training (e.g., 0.8 for 80%).

    Returns:
    - X_train: The training features.
    - X_test: The testing features.
    - y_train: The training target values.
    - y_test: The testing target values.

    Raises:
    - ValueError: if train_size is not strictly between 0 and 1, or if X and y
      differ in length.
    """
    if not (0 < train_size < 1):
        raise ValueError("train_size should be a proportion between 0 and 1.")

    data_length = len(X)

    # Slicing inputs of different length at the same cut point would silently
    # pair features with the wrong targets, so refuse them up front.
    if len(y) != data_length:
        raise ValueError(
            "X and y must have the same length (got %d and %d)." % (data_length, len(y))
        )

    # int() truncates, so any fractional sample goes to the test part.
    train_length = int(data_length * train_size)

    X_train, X_test = X[:train_length], X[train_length:]
    y_train, y_test = y[:train_length], y[train_length:]

    return X_train, X_test, y_train, y_test

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Chronological 70/30 split, with every part converted to a float32 NumPy array.
# The helper returns (X_train, X_test, y_train, y_test) in that order, so the
# unpacking below keeps each name bound to the matching part.
X_train, X_test, y_train, y_test = (
    np.asarray(part).astype('float32')
    for part in time_series_train_test_split(X, y, train_size=0.7)
)

print(X_train.shape)

# Class balance per phase; rows are ordered Train/Malicious, Train/Normal,
# Test/Malicious, Test/Normal.
sample_count = {
    'Number of samples': [
        np.sum(labels == flag)
        for labels in (y_train, y_test)
        for flag in (1, 0)
    ],
    'Class': ['Malicious', 'Normal'] * 2,
    'Phase': ['Train', 'Train', 'Test', 'Test'],
}

pd.DataFrame(sample_count).tail()

(11191, 5)


Unnamed: 0,Number of samples,Class,Phase
0,2583,Malicious,Train
1,8608,Normal,Train
2,1107,Malicious,Test
3,3690,Normal,Test
