In [2]:
import pandas as pd
from sqlalchemy import create_engine

import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides diagnostics emitted by pandas/scikit-learn in
# the cells below - consider narrowing the filter to specific categories.
warnings.filterwarnings(action="ignore")

In [324]:
# Input locations, relative to the notebook's working directory.
ml_dataset_csv_path = 'iot_data/datasets/dataset_ML_DL/ML-EdgeIIoT-dataset.csv'
distance_csv_path = 'iot_data/datasets/normal_traffic/Distance/Distance.csv'

# `df` is the frame the preprocessing cells below operate on.
df = pd.read_csv(ml_dataset_csv_path)
# CSV from the normal_traffic/Distance folder.
df_distance = pd.read_csv(distance_csv_path)

### Creating the preprocessing pipeline for binary classification

#### Column groups for each preprocessing step

In [257]:
# Column groups consumed by the preprocessing steps:
#   binary_columns  - reduced to "has data" / "doesn't have data"
#                     (as only attack frames contain data)
#   one_hot_columns - converted to one-hot arrays
#   hex_to_int      - columns containing hexadecimal strings
#   to_drop         - columns removed from the df
#   to_normalize    - numeric columns that get rescaled
#
# Note that frame time and ip data are dropped in all models except the LSTM model.

binary_columns = [
    "http.file_data",
    "http.request.uri.query",
    "http.request.full_uri",
    "mqtt.conack.flags",
    "mqtt.protoname",
    "mqtt.topic",
    "arp.hw.size",
    "mqtt.conflags",
    "mqtt.proto_len",
]

one_hot_columns = [
    "http.request.method",
    "http.referer",
    "http.request.version",
    "arp.opcode",
    "dns.qry.qu",
    "dns.retransmission",
    "mqtt.hdrflags",
    "mqtt.msgtype",
]

hex_to_int = [
    "tcp.options",
]

to_drop = [
    "tcp.payload",
    "dns.qry.name.len",
    "mqtt.msg",
    "icmp.transmit_timestamp",
    "icmp.unused",
    "http.tls_port",
    "dns.qry.type",
    "dns.retransmit_request_in",
    "mqtt.msg_decoded_as",
    "mbtcp.len",
    "mbtcp.trans_id",
    "mbtcp.unit_id",
]

to_normalize = [
    "icmp.checksum",
    "icmp.seq_le",
    "http.content_length",
    "tcp.ack_raw",
    "tcp.ack",
    "tcp.checksum",
    "tcp.dstport",
    "tcp.flags",
    "tcp.len",
    "tcp.seq",
    "udp.port",
    "udp.stream",
    "udp.time_delta",
    "dns.qry.name",
    "mqtt.len",
    "mqtt.topic_len",
    "mqtt.ver",
]

# Frame time and ip/arp address fields (flow data); see the note above.
to_drop_non_lstm_and_no_flow_data = [
    "frame.time",
    "ip.src_host",
    "ip.dst_host",
    "arp.src.proto_ipv4",
    "arp.dst.proto_ipv4",
]

#### Functions

In [344]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin


class hex_to_int_convertor(BaseEstimator, TransformerMixin):
    """Parse hexadecimal strings into floats and rescale every column.

    String cells are read as base-16 integers and converted to ``float``;
    non-string cells are left untouched. Each column is then rescaled as
    ``(value - min) / max``.

    The ``min``/``max`` statistics are learned in ``fit`` (missing values are
    skipped) and reused by ``transform``, so data transformed later is scaled
    with the statistics of the data the transformer was fitted on instead of
    its own.

    NOTE(review): the divisor is ``max`` rather than ``max - min``, so this is
    a true min-max scaling only when the column minimum is 0. The formula is
    kept unchanged so existing outputs stay the same - confirm it is intended.

    Raises:
        ValueError: if a string cell is not a valid hexadecimal number.
    """

    def __init__(self):
        pass

    @staticmethod
    def _parse_hex_column(column):
        """Return ``column`` as a float Series with hex strings decoded."""
        decoded = column.map(
            lambda cell: float(int(cell, 16)) if isinstance(cell, str) else cell
        )
        return decoded.astype(float)

    @staticmethod
    def _column_stats(numeric):
        """Return the NaN-skipping maximum and minimum of a float Series."""
        return {"max": numeric.max(), "min": numeric.min()}

    @staticmethod
    def _offset_and_divisor(stats):
        """Turn stored statistics into a safe ``(offset, divisor)`` pair."""
        offset = stats["min"]
        divisor = stats["max"]
        if pd.isna(offset):
            # Column without any numeric value: nothing to shift by.
            offset = 0.0
        if pd.isna(divisor) or divisor == 0:
            # Avoid dividing by zero (or NaN) for empty / all-zero columns.
            divisor = 1.0
        return offset, divisor

    def fit(self, X, y=None):
        """Learn per-column min/max of the decoded values of DataFrame ``X``."""
        self.columns = X.columns
        self.columns_data = {
            column_name: self._column_stats(self._parse_hex_column(X[column_name]))
            for column_name in X.columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every column decoded and rescaled."""
        self.columns = X.columns
        fitted_stats = getattr(self, "columns_data", {})
        X_copy = X.copy()  # never modify the caller's frame

        for column_name in X.columns:
            numeric = self._parse_hex_column(X_copy[column_name])
            stats = fitted_stats.get(column_name)
            if stats is None:
                # Backward compatibility: without a prior fit() for this
                # column, fall back to the statistics of the data at hand.
                stats = self._column_stats(numeric)
            offset, divisor = self._offset_and_divisor(stats)
            X_copy[column_name] = (numeric - offset) / divisor
        return X_copy

    def get_feature_names_out(self, input_features=None):
        """Return the names of the transformed columns (same as the input)."""
        return list(self.columns)

    
class convert_string_to_0(BaseEstimator, TransformerMixin):
    """Replace string cells with 0.0 and rescale every column.

    ``fit`` learns the minimum and maximum of the non-string cells of each
    column (missing values are skipped). ``transform`` replaces string cells
    with ``0.0`` and rescales the column as ``(value - min) / max`` using the
    learned statistics.

    NOTE(review): the divisor is ``max`` rather than ``max - min``, so this is
    a true min-max scaling only when the column minimum is 0. The formula is
    kept unchanged so existing outputs stay the same - confirm it is intended.
    """

    def __init__(self):
        pass

    @staticmethod
    def _is_text(column):
        """Return a boolean Series marking the string cells of ``column``."""
        return column.map(lambda cell: isinstance(cell, str)).astype(bool)

    def fit(self, X, y=None):
        """Learn per-column min/max from the non-string cells of DataFrame ``X``."""
        self.columns = X.columns
        self.columns_data = dict()

        for column_name in X.columns:
            column = X[column_name]
            numeric = column[~self._is_text(column)].astype(float)
            # Series.max()/min() skip NaN, unlike the built-in max()/min(),
            # whose result depends on where a NaN sits in the data.
            self.columns_data[column_name] = {
                "max": numeric.max(),
                "min": numeric.min()
            }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with strings zeroed and columns rescaled."""
        self.columns = X.columns
        X_copy = X.copy()  # never modify the caller's frame

        for key, value in self.columns_data.items():
            numeric = X_copy[key].map(
                lambda cell: 0.0 if isinstance(cell, str) else cell
            ).astype(float)

            offset = value["min"]
            divisor = value["max"]
            if pd.isna(offset):
                # Column had no numeric cell at fit time: nothing to shift by.
                offset = 0.0
            if pd.isna(divisor) or divisor == 0:
                # Avoid dividing by zero (or NaN) for empty / all-zero columns.
                divisor = 1.0

            X_copy[key] = (numeric - offset) / divisor
        return X_copy

    def get_feature_names_out(self, input_features=None):
        """Return the names of the transformed columns (same as the input)."""
        return list(self.columns)


def drop_columns(X):
    """Remove the configured columns from ``X`` in place and return ``X``.

    The columns listed in ``to_drop`` and ``to_drop_non_lstm_and_no_flow_data``
    are dropped; names that are not present in ``X`` are skipped silently.
    Any other problem (for example ``X`` not being a DataFrame) now raises
    instead of being swallowed.

    Args:
        X: pandas DataFrame to modify in place.

    Returns:
        The same DataFrame object, without the configured columns.
    """
    X.drop(
        columns=to_drop + to_drop_non_lstm_and_no_flow_data,
        errors="ignore",  # tolerate only missing column names
        inplace=True,
    )
    return X


def convert_to_numeric_all_columns(df):
    """Convert every cell that parses as a number to ``float``, in place.

    Columns that already have a numeric dtype are left untouched. In all other
    columns each cell is passed to ``float()``; cells that cannot be converted
    (non-numeric strings, ``None``, ...) keep their original value.

    Args:
        df: pandas DataFrame; its columns are overwritten in place.

    Returns:
        The same DataFrame object.
    """

    def _to_float_or_keep(cell):
        # TypeError covers None and other non-string, non-numeric objects.
        try:
            return float(cell)
        except (ValueError, TypeError):
            return cell

    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        # One vectorised pass per column instead of a per-cell .iloc assignment.
        df[col] = df[col].map(_to_float_or_keep)
    return df


class FactorizeTransformer(BaseEstimator, TransformerMixin):
    """Replace the values of every column with integer category codes.

    ``fit`` records the distinct values of each column in order of first
    appearance (as ``pandas.factorize`` does). ``transform`` maps each value to
    its position in that list, so the same value always gets the same code,
    whatever frame is transformed. Missing values and values that were not
    seen during ``fit`` are encoded as ``-1``.

    ``transform`` returns a new DataFrame; the input frame is not modified.
    """

    def __init__(self):
        pass

    @staticmethod
    def _column_categories(column):
        """Return the distinct non-missing values of ``column`` as an Index."""
        return pd.Index(pd.factorize(column)[1])

    def fit(self, X, y=None):
        """Learn the category list of every column of DataFrame ``X``."""
        self.columns = X.columns
        self.categories_ = {
            col: self._column_categories(X[col]) for col in X.columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every column replaced by its codes."""
        self.columns = X.columns
        fitted_categories = getattr(self, "categories_", {})
        X_copy = X.copy()  # never overwrite the caller's frame

        for col in X.columns:
            categories = fitted_categories.get(col)
            if categories is None:
                # Backward compatibility: without a prior fit() for this
                # column, factorize the data at hand as before.
                categories = self._column_categories(X[col])
            # get_indexer returns -1 for values absent from `categories`.
            X_copy[col] = categories.get_indexer(X_copy[col])
        return X_copy

    def get_feature_names_out(self, input_features=None):
        """Return the names of the transformed columns (same as the input)."""
        return list(self.columns)


    
class FactorizeAndOneHotTransformer(BaseEstimator, TransformerMixin):
    """Factorize every column, then one-hot encode the resulting codes.

    The input frame is first turned into integer codes by a
    ``FactorizeTransformer``; a dense ``OneHotEncoder`` is then fitted on /
    applied to those codes.
    """

    def __init__(self):
        self.factorizer = FactorizeTransformer()
        self.encoder = OneHotEncoder(sparse_output=False)

    def _factorize(self, X):
        """Return the integer-coded version of ``X`` without touching ``X``."""
        # Hand the factorizer a copy: if it works in place, the caller's frame
        # would otherwise be overwritten during fit() and transform() would
        # then see integer codes instead of the raw values.
        return self.factorizer.transform(X.copy())

    def fit(self, X, y=None):
        """Fit the factorizer and the one-hot encoder on DataFrame ``X``."""
        self.factorizer.fit(X)
        self.encoder.fit(self._factorize(X))
        return self

    def transform(self, X):
        """Return the dense one-hot encoding of ``X`` as an array."""
        return self.encoder.transform(self._factorize(X))

    def get_feature_names_out(self, input_features=None):
        """Return the output column names reported by the fitted encoder."""
        return list(self.encoder.get_feature_names_out())
    
    
class BinaryGroups(BaseEstimator, TransformerMixin):
    """Collapse every column into "has data" (1) versus "no data" (0).

    Each cell that compares unequal to ``0`` is replaced by ``1``; cells equal
    to ``0`` are kept as they are.

    NOTE(review): because the test is ``!= 0``, NaN and every string -
    including the text ``"0"`` - are flagged as ``1``. Confirm that the input
    columns encode "no data" as a numeric zero.
    """

    def __init__(self):
        pass

    @staticmethod
    def _has_data(cell):
        """Return 1 for any cell different from 0, otherwise the cell itself."""
        return 1 if cell != 0 else cell

    def fit(self, X, y=None):
        """Record the column names; nothing else is learned."""
        # Stored here as well so get_feature_names_out() works right after
        # fit(), not only after the first transform().
        self.columns = X.columns
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every column turned into flags."""
        self.columns = X.columns
        X_copy = X.copy()  # never modify the caller's frame

        for column_name in X_copy.columns:
            X_copy[column_name] = X_copy[column_name].apply(self._has_data)
        return X_copy

    def get_feature_names_out(self, input_features=None):
        """Return the names of the transformed columns (same as the input)."""
        return list(self.columns)
    




In [331]:
from sklearn.pipeline import Pipeline

# Target columns.
labels = ["Attack_type", "Attack_label"]

# Feature columns: everything that is neither a label nor a flow column.
columns_to_keep = [
    column
    for column in df.columns
    if not (column in to_drop_non_lstm_and_no_flow_data or column in labels)
]

# Three column-wise slices of the raw frame, selected by column name.
df_data = df.loc[:, columns_to_keep]
df_labels = df.loc[:, labels]
df_flow = df.loc[:, to_drop_non_lstm_and_no_flow_data]




In [332]:
# drop_columns() removes the `to_drop` and flow columns from `df` in place and
# returns the same object. Afterwards `df` no longer contains the flow columns,
# so the cell that builds df_data / df_labels / df_flow from `df` has to run
# before this one and cannot be re-run after it without reloading `df`.
df = drop_columns(df)
# Element-wise numeric conversion is currently disabled.
# df = convert_to_numeric_all_columns(df)

In [345]:
# Column-wise preprocessing: each entry is (step name, transformer, columns).
# Columns that are not listed in any entry are discarded, because
# ColumnTransformer's default is remainder="drop".
full_column_transform = ColumnTransformer([
    # Uses the `hex_to_int` column group instead of repeating its contents.
    ("convert_hex", hex_to_int_convertor(), hex_to_int),
    ("wrong_parse", convert_string_to_0(), ['tcp.srcport']),
    ("normalization", StandardScaler(), to_normalize),
    ("binary_groups", BinaryGroups(), binary_columns),
    ("one_hot", FactorizeAndOneHotTransformer(), one_hot_columns),
])

# drop_columns and convert_to_numeric_all_columns are intentionally not
# pipeline steps here; the column transform is the only stage.
full_pipeline = Pipeline([
    ('full_column_transform', full_column_transform)
])

# Fit on a copy so the transformers can never alter df_data itself.
processed_data = full_pipeline.fit_transform(df_data.copy())

In [352]:
# Output column names as reported by the fitted column transformer.
new_columns = full_column_transform.get_feature_names_out()

# Wrap the transformed array in a DataFrame labelled with those names.
processed_df = pd.DataFrame(processed_data, columns=new_columns)

In [369]:
# Output locations, relative to the notebook's working directory.
processed_csv_path = 'iot_data/datasets/dataset_ML_DL/ML-EdgeIIoT-dataset-processed.csv'
labels_csv_path = 'iot_data/datasets/dataset_ML_DL/ML-EdgeIIoT-dataset-labels.csv'

# to_csv() is called with its defaults, so each frame's index is written as
# the first column of the file.
processed_df.to_csv(processed_csv_path)
df_labels.to_csv(labels_csv_path)