# Introduction
This notebook builds a small set of reusable helpers that can be used to run different models:
- preprocess the data
- clean the data
- select a machine-learning model
- train the model

In [4]:
import pandas as pd
import numpy as np
from numpy.lib.tests.test__datasource import malicious_files
from sklearn.model_selection import train_test_split

In [40]:

class DataProcessor:
    """Load a CSV dataset and prepare it for model training.

    The methods are meant to be called step by step (for example one per
    notebook cell): ``load_data`` first, then any of ``clean_data``,
    ``drop_features`` and ``detect_categorical``, and finally ``split_data``.
    Each step reports its outcome with ``print`` and mutates ``self.data``
    in place.
    """

    def __init__(self, data_path):
        """Remember the CSV path; nothing is read until ``load_data`` is called."""
        self.data_path = data_path  # path handed to pd.read_csv by load_data
        self.data = None            # DataFrame, populated by load_data
        self.X = None               # feature columns, populated by split_data
        self.y = None               # target column, populated by split_data

    def load_data(self):
        """Read the CSV at ``self.data_path`` into ``self.data``.

        Requires pandas to be imported as ``pd``. A missing file is reported on
        stdout and ``self.data`` is left untouched; any other read error
        propagates to the caller.
        """
        try:
            self.data = pd.read_csv(self.data_path, low_memory=False)
            print("Data loaded successfully")
        except FileNotFoundError:
            # The f-prefix was missing, so the placeholder used to be printed literally.
            print(f"Error loading data {self.data_path}")

    def clean_data(self, drop_missing_values=False, replace_inf = True, Fill_value = True):
        """Clean ``self.data`` in place.

        Args:
            drop_missing_values (bool): drop every row that contains a NaN.
            replace_inf (bool): replace ``+inf`` / ``-inf`` with ``Fill_value``.
            Fill_value: replacement used for infinite values.

        NaN rows are dropped *before* infinities are replaced, so NaNs introduced
        through ``Fill_value=np.nan`` are not removed by the same call.
        """
        # NOTE(review): the default Fill_value=True is kept for backward
        # compatibility; a numeric value or np.nan is probably what callers
        # want -- confirm and pass it explicitly.
        if self.data is None:
            # Same guard as the other methods instead of an AttributeError on None.
            print("No data loaded. Please load the data first.")
            return

        if drop_missing_values:
            self.data.dropna(inplace=True)
        if replace_inf:
            self.data.replace([np.inf, -np.inf], Fill_value, inplace=True)

    def show_features(self):
        """Print the name of every column of the loaded dataset."""
        if self.data is None:
            print("No data loaded")
            return

        print("features in the datasets: ")
        for feature in self.data.columns:
            print(f"-{feature}")

    def drop_features(self, columns_to_drop):
        """Drop specific columns from the dataset, in place.

        Args:
            columns_to_drop (list): names of the columns to remove. Names that
                are not present are silently ignored (``errors='ignore'``).

        Use ``show_features`` to check which columns the dataset contains.
        """
        if self.data is None:
            print("No data loaded. Please load the data first.")
            return

        self.data.drop(columns_to_drop, axis=1, inplace=True, errors='ignore')
        print("Features dropped successfully")

    def split_data(self, target_column):
        """Split the dataset into features and target.

        Args:
            target_column (str): name of the column used as the target variable.

        Returns:
            None. The result is stored on the instance: ``self.X`` holds every
            column except ``target_column`` and ``self.y`` holds the target
            column. Both are left unchanged when no data is loaded or the
            column does not exist.
        """
        if self.data is None:
            print("No data loaded. Please load the data first.")
            return

        if target_column not in self.data.columns:
            print(f"Error: Column '{target_column}' not found in the dataset.")
            return

        self.X = self.data.drop(columns=[target_column])
        self.y = self.data[target_column]
        print(f"Data split into X (features) and y (target) using '{target_column}' as target.")

    def detect_categorical(self, handle_nan = "unknown"):
        """Report NaN statistics for categorical columns and optionally fix them.

        Columns of dtype ``object`` or ``category`` are treated as categorical.
        For each one the NaN count and percentage are printed; columns that
        contain NaNs are then handled in place according to ``handle_nan``.

        Args:
            handle_nan (str): how to handle NaN values.
                "drop"          -- drop the rows where the column is NaN
                "most_frequent" -- fill with the column's most frequent value
                "unknown"       -- fill with the string "Unknown"
                Any other value is reported and leaves the column untouched.
        """
        if self.data is None:
            print("No data loaded. Please load the data first.")
            return

        categorical_features = self.data.select_dtypes(include=['object', 'category']).columns
        if len(categorical_features) == 0:
            print("No categorical features detected.")
            return

        print("Categorical Features and their NaN Information:")
        for feature in categorical_features:
            total_nan = self.data[feature].isna().sum()
            # The row count is re-read every iteration because "drop" shrinks the frame.
            n_rows = len(self.data)
            percentage_nan = (total_nan / n_rows) * 100 if n_rows else 0.0

            print(f"- {feature}:")
            print(f"  NaN Count: {total_nan}")
            print(f"  Percentage of NaNs: {percentage_nan:.2f}%")

            if total_nan == 0:
                continue  # nothing to handle for this column

            if handle_nan == "drop":
                self.data.dropna(subset=[feature], inplace=True)
                print(f"  Action: Dropped rows with NaN in '{feature}'.")

            elif handle_nan == "most_frequent":
                modes = self.data[feature].mode()
                if modes.empty:
                    # Column is entirely NaN: there is no value to fill with.
                    print(f"  Action: No non-NaN value in '{feature}'. No changes made.")
                else:
                    most_frequent = modes.iloc[0]
                    # Assign back instead of a chained inplace fillna, which does
                    # not reliably modify the parent frame (Copy-on-Write).
                    self.data[feature] = self.data[feature].fillna(most_frequent)
                    print(f"  Action: Replaced NaN with most frequent value '{most_frequent}'.")

            elif handle_nan == "unknown":
                column = self.data[feature]
                if isinstance(column.dtype, pd.CategoricalDtype) and "Unknown" not in column.cat.categories:
                    # A categorical column rejects fill values that are not categories yet.
                    column = column.cat.add_categories(["Unknown"])
                self.data[feature] = column.fillna("Unknown")
                print("  Action: Replaced NaN with 'Unknown'.")

            else:
                print(f"  Action: Invalid option '{handle_nan}'. No changes made for '{feature}'.")

        print("\nCategorical NaN handling completed.")











In [33]:

""" main function to execute the script """
# Create the processor for the CSV file. The relative path is only stored here;
# the file itself is read later by load_data().
data = DataProcessor("balanced_data.csv")



In [34]:
# Read the CSV into data.data; a missing file is reported on stdout instead of raising.
data.load_data()


Data loaded successfully


In [35]:
# List every column name so the columns to drop and the target can be chosen.
data.show_features()

features in the datasets: 
-Flow ID
-SrcIP
-DstIP
-SrcPort
-DstPort
-Protocol
-mTimestampStart
-mTimestampLast
-Flow Duration
-Flow Bytes/s
-Flow Packets/s
-Tot Fwd Pkts
-Tot Bwd Pkts
-Total Length of Fwd Packet
-Total Length of Bwd Packet
-Fwd Packet Length Min
-Fwd Packet Length Max
-Fwd Packet Length Mean
-Fwd Packet Length Std
-Bwd Packet Length Min
-Bwd Packet Length Max
-Bwd Packet Length Mean
-Bwd Packet Length Std
-Flow IAT Mean
-Flow IAT Min
-Flow IAT Max
-Flow IAT Stddev
-Fwd IAT Min
-Fwd IAT Max
-Fwd IAT Mean
-Fwd IAT Std
-Fwd IAT Tot
-Bwd IAT Min
-Bwd IAT Max
-Bwd IAT Mean
-Bwd IAT Std
-Bwd IAT Tot
-Fwd PSH flags
-Bwd PSH flags
-Fwd URG flags
-Bwd URG flags
-Fwd Header Length
-Bwd Header Length
-Fwd Packets/s
-Bwd Packets/s
-Packet Length Min
-Packet Length Max
-Packet Length Mean
-Packet Length Std
-Packet Length Variance
-FIN Flag Cnt
-SYN Flag Cnt
-RST Flag Cnt
-PSH Flag Cnt
-ACK Flag Cnt
-URG Flag Cnt
-CWR Flag Cnt
-ECE Flag Cnt
-Down/Up Ratio
-Average Packet Size
-Fwd 

In [36]:
# Columns removed before training. drop_features ignores names that are not
# present in the dataset, so the list may contain columns this file lacks.
columns_to_drop = [
    'Flow ID',
    'SrcIP',
    'DstIP',
    'External_src',
    'External_dst',
    'Conn_state',
    'Segment_src',
    'Segment_dst',
    'Expoid_src',
    'Expoid_dst',
    'mTimestampStart',
    'mTimestampLast',
]
data.drop_features(columns_to_drop)

Features dropped successfully


In [37]:
# Split the dataset into the feature matrix (data.X) and the target (data.y),
# using the 'Label' column as the target.
data.split_data(target_column='Label')


Data split into X (features) and y (target) using 'Label' as target.


In [39]:
# Sanity check: feature matrix shape first, then target shape, one per line.
print(data.X.shape, data.y.shape, sep="\n")

(3289198, 82)
(3289198,)
