# Data Cleaning
**Goal**: Prepare the dataset before EDA.

The notebook consists of 3 stages:
1. Define function
2. Preprocess and store
3. Check for correctness

### Imports

In [1]:
import pandas as pd
import sys
# Put the parent directory on the import path before the `src.*` imports below.
# NOTE(review): presumably the notebook lives one level under the project root — confirm.
sys.path.append('../')
# None lifts the limit on how many columns pandas will display.
pd.set_option('display.max_columns',None)
import warnings
# Suppresses every warning for the rest of the session (deprecations included).
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 10  # Show only 10 rows by default
pd.options.display.html.border = 2  # Add border for clarity
# Helpers imported from the `src` package: cleaning steps and dataset load/store.
from src.cleaning import add_super_class, add_super_service, get_columns
from src.data_loader import store_dataset, load_test_dataset, load_train_dataset

### Define function
The function does 4 key actions:
1. Assigns column names to the dataset
2. Adds Super Service - grouping of services
3. Adds Super Class - grouping of classes of attacks
4. Saves as .csv

In [5]:

def process_dataset(target: str) -> pd.DataFrame:
    """Clean one raw split and store it for EDA.

    Loads the raw split selected by ``target``, assigns the column names
    returned by ``get_columns()``, runs ``add_super_service`` and
    ``add_super_class`` on the frame, then passes it to ``store_dataset``
    with ``purpose='eda'``.

    Args:
        target: Which split to process; must be ``'train'`` or ``'test'``.

    Returns:
        The processed DataFrame (the same object that was handed to
        ``store_dataset``).

    Raises:
        ValueError: If ``target`` is neither ``'train'`` nor ``'test'``.
    """
    # Reject unknown labels up front: without this check anything other than
    # 'test' would silently load the training split and store it under the
    # wrong target name.
    if target not in ('train', 'test'):
        raise ValueError(
            f"target must be 'train' or 'test', got {target!r}"
        )

    df = load_test_dataset(raw=True) if target == 'test' else load_train_dataset(raw=True)
    df.columns = get_columns()
    # Return values are ignored, so both helpers are expected to add their
    # column to `df` in place — NOTE(review): confirm in src.cleaning.
    add_super_service(df)
    add_super_class(df)
    store_dataset(df, target=target, purpose='eda')
    return df

In [6]:
# Run the cleaning pipeline once per split and collect the resulting frames,
# train first, then test.
datasets = []

for target in ('train', 'test'):
    df = process_dataset(target)
    datasets += [df]

## Ensure validity

In [7]:
# Print each frame's schema summary (columns, non-null counts, dtypes),
# one report per entry of `datasets`, in list order.
for ds in datasets:
    ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 45 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  int64  
 1   protocol_type                125973 non-null  object 
 2   service                      125973 non-null  object 
 3   flag                         125973 non-null  object 
 4   src_bytes                    125973 non-null  int64  
 5   dst_bytes                    125973 non-null  int64  
 6   land                         125973 non-null  int64  
 7   wrong_fragment               125973 non-null  int64  
 8   urgent                       125973 non-null  int64  
 9   hot                          125973 non-null  int64  
 10  num_failed_logins            125973 non-null  int64  
 11  logged_in                    125973 non-null  int64  
 12  num_compromised              125973 non-null  int64  
 13 