# CIC-IDS-2019 Dataset Analysis

## Importing Libraries


In [None]:
# For getting paths
import os

# For data manipulation and preprocessing
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# For feature engineering
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# For model building
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# For model evaluation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import train_test_split

# For oversampling
from imblearn.over_sampling import SMOTE

# For plotting
import matplotlib.pyplot as plt
import seaborn as sns

# For statistical analysis
import scipy

# For timing
import time

## Acquire the Data

In [None]:
# Root directory that is searched recursively for CSV files
data_dir = '/Users/lucky/GitHub/DDoS-Hybrid-Detection-System/datasets/cic_ids_2019'

# Per-day sub-directories (not used by the loader below, which walks data_dir)
day1_dir = '/Users/lucky/GitHub/DDoS-Hybrid-Detection-System/datasets/cic_ids_2019/DoS/DDoS/03-11'
day2_dir = '/Users/lucky/GitHub/DDoS-Hybrid-Detection-System/datasets/cic_ids_2019/DoS/DDoS/01-12'

data_frames = []

# True until the first CSV has been read successfully; that file's header
# becomes the column names applied to every later file.
include_header = True

# Walk through the directory and load each CSV file into a dataframe
for root, dirs, files in os.walk(data_dir):
    # os.walk yields entries in arbitrary filesystem order. Sorting `dirs` in
    # place (top-down walk) and iterating sorted file names makes both the
    # row order and the choice of "first" header reproducible across runs.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith('.csv'):
            continue
        print(f"Reading {file}")
        file_path = os.path.join(root, file)
        try:
            if include_header:
                df = pd.read_csv(file_path)
                include_header = False
            else:
                # Skip this file's own header row and reuse the first file's
                # column names so all frames line up when concatenated.
                df = pd.read_csv(file_path, header=None, skiprows=1)
                df.columns = data_frames[0].columns
            data_frames.append(df)
        except Exception as e:
            # Deliberately best-effort: report the unreadable file and keep going
            print(f"Error reading {file_path}: {e}")

# pd.concat raises an uninformative "No objects to concatenate" on an empty
# list; fail with the same exception type but say which directory was empty.
if not data_frames:
    raise ValueError(f"No CSV files could be loaded from {data_dir}")

# Combine all dataframes into a single dataframe
combined_df = pd.concat(data_frames, ignore_index=True)

## Data Preprocessing

In [None]:
# Observe the data: dimensions, the first rows, and the per-column summary
print(combined_df.shape)
print(combined_df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
combined_df.info()

### Remove Columns That Do Not Exist in Packet Capture Data or Do Not Generalize

In [None]:
# Drop, in place, the columns that could bias the model.
# Several names carry a leading space; they are matched exactly as written.
combined_df.drop(
    columns=[
        'Flow ID',
        ' Source IP',
        ' Destination IP',
        ' Timestamp',
        'SimillarHTTP',
        'Unnamed: 0',
        ' Inbound',
    ],
    inplace=True,
)

### Remove Duplicates, NaN, and +/-Inf Values

In [None]:
# Snapshot the dataframe as it is before any cleaning step runs
original_df = combined_df.copy()

# Clean in one pipeline; the steps run top to bottom in this order
combined_df = (
    combined_df
    .drop_duplicates()                     # drop exact duplicate rows
    .replace([np.inf, -np.inf], np.nan)    # treat +/-infinity as missing
    .dropna(axis=1, how='all')             # drop columns that are entirely NaN
    .dropna()                              # drop rows containing any NaN
)

# Names of the remaining columns whose dtype is object
object_cols = combined_df.select_dtypes(include=['object']).columns