In [37]:
import os
import re

from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession
from sklearn.preprocessing import LabelEncoder

# Single Spark session for the notebook; getOrCreate() hands back an already
# running session if one exists.
spark = SparkSession.builder.appName("Preprocess data").getOrCreate()

# Every capture is a CSV file whose first row holds the column names.
# No schema inference is requested when reading.
ddos_tf_df = spark.read.csv("ddos-tcp-syn-flood.csv", header=True)
normal_tf_df = spark.read.csv("normal-traffic.csv", header=True)
port_scan_tf_df = spark.read.csv("port-scanning.csv", header=True)

# Map each capture's base name to its DataFrame so the three frames can be
# processed uniformly in a loop.
data_frames = dict([
    ("ddos-tcp-syn-flood", ddos_tf_df),
    ("normal-traffic", normal_tf_df),
    ("port-scanning", port_scan_tf_df),
])

In [38]:
# Columns kept in the preprocessed output. The names use "-" where the raw
# CSV headers use ".", and "tcp-flags_index" is the StringIndexer output
# column rather than a column read from the CSV.
selected_columns = [
    # frame timestamp and endpoints
    "frame-time", "ip-src_host", "ip-dst_host",
    # TCP handshake indicators
    "tcp-connection-syn", "tcp-connection-synack",
    # TCP header fields
    "tcp-flags_index", "tcp-len", "tcp-seq", "tcp-dstport",
    # label
    "Attack_type",
]

In [39]:
# Preprocess every capture in `data_frames` and write one CSV per capture to
# preprocessed_data/<name>.csv.
#
# Steps per capture:
#   1. rename columns ("." -> "-") so they match `selected_columns`
#   2. index the categorical "tcp-flags" column into "tcp-flags_index"
#   3. keep only the columns listed in `selected_columns`
#   4. convert to pandas and label-encode the source/destination hosts
#   5. write the result as CSV

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before the first write.
os.makedirs("preprocessed_data", exist_ok=True)

for df_name, df in data_frames.items():

    # Rename all columns in one pass. Dots are replaced with dashes so the
    # names line up with `selected_columns` (dotted names would also need
    # backtick quoting in Spark column expressions).
    df = df.toDF(*[col_name.replace(".", "-") for col_name in df.columns])

    # Turn the "tcp-flags" strings into numeric indices. The indexer is fit
    # per capture, so index values are not comparable between captures.
    tcp_flags_indexer = StringIndexer(inputCol="tcp-flags", outputCol="tcp-flags_index")
    indexed_df = tcp_flags_indexer.fit(df).transform(df)

    # Filter on the *indexed* frame's columns: "tcp-flags_index" only exists
    # after the transform, so filtering on `df.columns` would silently drop it
    # even though it is listed in `selected_columns`.
    kept_columns = [c for c in indexed_df.columns if c in selected_columns]
    indexed_df = indexed_df.select(kept_columns)

    # Collects the whole capture onto the driver as a pandas DataFrame.
    pandas_df = indexed_df.toPandas()

    # fit_transform refits the encoder on each call, so the source and
    # destination host columns are encoded independently of each other
    # (and independently per capture).
    label_encoder = LabelEncoder()
    pandas_df["ip-src_host"] = label_encoder.fit_transform(pandas_df["ip-src_host"])
    pandas_df["ip-dst_host"] = label_encoder.fit_transform(pandas_df["ip-dst_host"])

    pandas_df.to_csv(f'preprocessed_data/{df_name}.csv', index=False)