In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import ipaddress
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
import joblib

In [2]:
df = pd.read_csv("./data/cybersecurity_attacks.csv")

In [3]:
columns = [
    "User Information",
    "Geo-location Data",
    "Payload Data"
]
df = df.drop(columns=columns)

In [4]:
def extract_device_info(user_agent):
    """
    Extracts Operating System (OS), Browser, and Device Type from a user-agent string.
    """
    user_agent = str(user_agent).lower()

    if "windows" in user_agent:
        os = "Windows"
    elif "mac os" in user_agent or "macintosh" in user_agent:
        os = "MacOS"
    elif "linux" in user_agent:
        os = "Linux"
    elif "android" in user_agent:
        os = "Android"
    elif "iphone" in user_agent or "ipad" in user_agent or "ios" in user_agent:
        os = "iOS"
    else:
        os = "Other"

    if "chrome" in user_agent:
        browser = "Chrome"
    elif "firefox" in user_agent:
        browser = "Firefox"
    elif "safari" in user_agent and "chrome" not in user_agent:
        browser = "Safari"
    elif "msie" in user_agent or "trident" in user_agent:
        browser = "Internet Explorer"
    elif "edge" in user_agent:
        browser = "Edge"
    else:
        browser = "Other"

    if "mobile" in user_agent or "android" in user_agent or "iphone" in user_agent:
        device_type = "Mobile"
    else:
        device_type = "Desktop"

    return pd.Series([os, browser, device_type])

def extract_ip_features(ip):
    try:
        ip_obj = ipaddress.ip_address(ip)
        octets = str(ip).split('.')
        octet_1 = int(octets[0])
        octet_2 = int(octets[1])
        octet_3 = int(octets[2])
        octet_4 = int(octets[3])
        
    except:
        octet_1, octet_2, octet_3, octet_4 = 0, 0, 0, 0

    return pd.Series([octet_1, octet_2, octet_3, octet_4])

def time_of_day(hour):
    if 5 <= hour < 12:
        return "Morning"
    elif 12 <= hour < 17:
        return "Afternoon"
    elif 17 <= hour < 21:
        return "Evening"
    else:
        return "Night"

In [5]:
missing_cols = ['Malware Indicators', 'Alerts/Warnings', 'Firewall Logs', 'IDS/IPS Alerts', 'Proxy Information']
df[missing_cols] = df[missing_cols].fillna(0)

In [6]:
df['Malware Indicators'] = df['Malware Indicators'].apply(lambda x: 1 if x == 'IoC Detected' else 0)
df['Alerts/Warnings'] = df['Alerts/Warnings'].apply(lambda x: 1 if x == 'Alert Triggered' else 0)
df['Firewall Logs'] = df['Firewall Logs'].apply(lambda x: 1 if x == 'Log Data' else 0)
df['IDS/IPS Alerts'] = df['IDS/IPS Alerts'].apply(lambda x: 1 if x == 'Alert Data' else 0)
df['Packet Type'] = df['Packet Type'].apply(lambda x: 1 if x == 'Data' else 0)
df['Log Source'] = df['Log Source'].apply(lambda x: 1 if x == 'Server' else 0)
df['Attack Signature'] = df['Attack Signature'].apply(lambda x: 1 if x == 'Known Pattern A' else 0)

In [7]:
protocol_dummies = pd.get_dummies(df['Protocol'], prefix='', prefix_sep='')
protocol_dummies = protocol_dummies[['ICMP', 'UDP']].astype(int)
df = pd.concat([df, protocol_dummies], axis=1)
df = df.drop(columns=['Protocol'])

traffic_dummies = pd.get_dummies(df['Traffic Type'], prefix='', prefix_sep='')
traffic_dummies = traffic_dummies[['HTTP', 'DNS']].astype(int)
df = pd.concat([df, traffic_dummies], axis=1)
df = df.drop(columns=['Traffic Type'])

Action_dummies = pd.get_dummies(df['Action Taken'], prefix='', prefix_sep='')
Action_dummies = Action_dummies[['Logged', 'Blocked']].astype(int)
df = pd.concat([df, Action_dummies], axis=1)
df = df.drop(columns=['Action Taken'])

sec_dummies = pd.get_dummies(df['Severity Level'], prefix='', prefix_sep='')
sec_dummies = sec_dummies[['Low', 'Medium']].astype(int)
df = pd.concat([df, sec_dummies], axis=1)
df = df.drop(columns=['Severity Level'])

net = pd.get_dummies(df['Network Segment'], prefix='', prefix_sep='')
net = net[['Segment A', 'Segment B']].astype(int)
df = pd.concat([df, net], axis=1)
df = df.drop(columns=['Network Segment'])

df[["OS", "Browser", "Device Type"]] = df["Device Information"].apply(extract_device_info)
df = df.drop(columns='Device Information')
df = pd.get_dummies(df, columns=["OS", "Browser", "Device Type"], drop_first=True, dtype=int)

df[['Src_Octet1', 'Src_Octet2', 'Src_Octet3', 'Src_Octet4']] = df['Source IP Address'].apply(extract_ip_features)
df[['Dst_Octet1', 'Dst_Octet2', 'Dst_Octet3', 'Dst_Octet4']] = df['Destination IP Address'].apply(extract_ip_features)
df['Proxy_Used'] = df['Proxy Information'].apply(lambda x: 0 if x == 0 or pd.isnull(x) else 1)


In [8]:
df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")

In [9]:
df["Year"] = df["Timestamp"].dt.year
df = pd.get_dummies(df, columns=['Year'], drop_first=True, dtype=int)
df["Month"] = df["Timestamp"].dt.month
df["Day"] = df["Timestamp"].dt.day
df["Hour"] = df["Timestamp"].dt.hour
df["Minute"] = df["Timestamp"].dt.minute
df["Second"] = df["Timestamp"].dt.second
df["Day_of_Week"] = df["Timestamp"].dt.dayofweek

df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M')

df['Hour'] = df['Timestamp'].dt.hour
df['Day_of_Week'] = df['Timestamp'].dt.dayofweek  # Monday = 0, Sunday = 6
df['Month'] = df['Timestamp'].dt.month

df['Time_of_Day'] = df['Hour'].apply(time_of_day)

df = pd.get_dummies(df, columns=['Day_of_Week', 'Month', 'Time_of_Day'], drop_first=True)

df = df.drop(columns = 'Timestamp')

In [10]:
df['Src_Port_Binned'] = pd.cut(df['Source Port'], bins=[0, 1023, 49151, 65535], labels=['Well-Known', 'Registered', 'Dynamic'])
df['Dst_Port_Binned'] = pd.cut(df['Destination Port'], bins=[0, 1023, 49151, 65535], labels=['Well-Known', 'Registered', 'Dynamic'])

df['Log_Packet_Length'] = np.log1p(df['Packet Length'])

df[['Src_Octet1', 'Src_Octet2', 'Src_Octet3', 'Src_Octet4']] = df['Source IP Address'].str.split('.', expand=True).astype(int)
df[['Dst_Octet1', 'Dst_Octet2', 'Dst_Octet3', 'Dst_Octet4']] = df['Destination IP Address'].str.split('.', expand=True).astype(int)

df['Src_Octet1*Dst_Octet1'] = df['Src_Octet1'] * df['Dst_Octet1']
df['Src_Octet2*Dst_Octet2'] = df['Src_Octet2'] * df['Dst_Octet2']
df['Src_Octet3*Dst_Octet3'] = df['Src_Octet3'] * df['Dst_Octet3']
df['Src_Octet4*Dst_Octet4'] = df['Src_Octet4'] * df['Dst_Octet4']

df = pd.get_dummies(df, columns=['Src_Port_Binned', 'Dst_Port_Binned'], drop_first=True)
df = df.drop(columns=['Source IP Address', 'Destination IP Address', 'Proxy Information'])

In [11]:
attack_mapping = {'Malware': 0, 'Intrusion': 1, 'DDoS': 2}
df['Attack Type'] = df['Attack Type'].map(attack_mapping)

In [12]:
df.dtypes

Source Port                   int64
Destination Port              int64
Packet Length                 int64
Packet Type                   int64
Malware Indicators            int64
                              ...  
Src_Octet4*Dst_Octet4         int64
Src_Port_Binned_Registered     bool
Src_Port_Binned_Dynamic        bool
Dst_Port_Binned_Registered     bool
Dst_Port_Binned_Dynamic        bool
Length: 75, dtype: object

In [13]:
bool_columns = df.select_dtypes(include=['bool']).columns
df[bool_columns] = df[bool_columns].astype(int)
numerical_columns = df.select_dtypes(include=['number']).columns
numerical_columns = numerical_columns.drop("Attack Type", errors="ignore")
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
joblib.dump(scaler, "./data/scaler.pkl")
print("Data has been standardized: Mean ~0, Std ~1")
print(df.describe())

Data has been standardized: Mean ~0, Std ~1
        Source Port  Destination Port  Packet Length   Packet Type  \
count  4.000000e+04      4.000000e+04   4.000000e+04  4.000000e+04   
mean   3.677059e-17      1.776357e-16   3.304024e-17 -2.469136e-17   
std    1.000013e+00      1.000013e+00   1.000013e+00  1.000013e+00   
min   -1.721068e+00     -1.729628e+00  -1.724484e+00 -9.882194e-01   
25%   -8.685042e-01     -8.644202e-01  -8.687953e-01 -9.882194e-01   
50%   -6.161382e-03     -7.880112e-03   1.315442e-03 -9.882194e-01   
75%    8.597913e-01      8.687279e-01   8.690226e-01  1.011921e+00   
max    1.754273e+00      1.743479e+00   1.727115e+00  1.011921e+00   

count        40000.000000    4.000000e+04     4.000000e+04  40000.000000   
mean             0.000000    5.719869e-17     9.237056e-18      1.003025   
std              1.000013    1.000013e+00     1.000013e+00      0.817547   
min             -1.000000   -1.736841e+00    -9.966556e-01      0.000000   
25%             -1.00

In [None]:
df.to_csv('./data/Processed_01.csv', index=False)

In [15]:
corr_matrix = df.corr(numeric_only=True)['Attack Type'].sort_values(ascending=False)
corr_matrix

Attack Type       1.000000
Day_of_Week_1     0.015047
Packet Type       0.011329
Browser_Other     0.010652
Month_3           0.009617
                    ...   
Browser_Safari   -0.007483
Minute           -0.008124
Day_of_Week_4    -0.008837
Month_7          -0.008946
Month_11         -0.009036
Name: Attack Type, Length: 75, dtype: float64

In [16]:
X = df.drop(columns=['Attack Type'], errors='ignore')
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)
joblib.dump(pca, "./data/pca.pkl")
pca_columns = [f'PC{i+1}' for i in range(X_pca.shape[1])]
X_pca_df = pd.DataFrame(X_pca, columns=pca_columns)
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)
print("PCA Transformation Complete")
print("Explained Variance Ratio per Component:", explained_variance)

PCA Transformation Complete
Explained Variance Ratio per Component: [0.03795335 0.03645507 0.03582283 0.02657195 0.0263606  0.02628488
 0.0260257  0.02590127 0.02575956 0.0234775  0.02060229 0.0204925
 0.0203513  0.02030523 0.0199116  0.01958447 0.01947289 0.01843674
 0.01673719 0.01598632 0.01595765 0.01581155 0.01579909 0.01576553
 0.0150825  0.01492409 0.01488914 0.01483428 0.01480335 0.01473822
 0.01469703 0.01457329 0.01446671 0.01380578 0.0137581  0.0136988
 0.01367432 0.01367192 0.01357993 0.0135046  0.01346282 0.01341657
 0.013392   0.01337337 0.01330833 0.01318556 0.01315276 0.0131255
 0.01299796 0.01240046 0.01088282 0.00767683 0.00730725 0.0068276
 0.00677631 0.00671822]


In [17]:
X_pca_df['Attack Type'] = df['Attack Type']
X_pca_df.to_csv("./data/Processed_02.csv", index=False)

In [18]:
data_pca = pd.read_csv("./data/Processed_02.csv")
X_pca = data_pca.drop(columns=['Attack Type'])
y = data_pca['Attack Type']
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_pca, y)
feature_importances = pd.Series(rf.feature_importances_, index=X_pca.columns).sort_values(ascending=False)
top_20_features = feature_importances.head(20)
print("Top 20 Most Important Principal Components:")
print(top_20_features)


Top 20 Most Important Principal Components:
PC36    0.018729
PC41    0.018668
PC49    0.018634
PC6     0.018573
PC47    0.018536
PC9     0.018528
PC40    0.018508
PC12    0.018484
PC39    0.018479
PC37    0.018474
PC8     0.018470
PC43    0.018459
PC42    0.018430
PC7     0.018424
PC48    0.018421
PC46    0.018332
PC35    0.018323
PC11    0.018311
PC34    0.018298
PC45    0.018262
dtype: float64


In [19]:
top_20_pcs = [
    "PC9", "PC41", "PC36", "PC7", "PC37", "PC6", "PC39", "PC48", "PC8", "PC11",
    "PC5", "PC44", "PC40", "PC43", "PC45", "PC4", "PC38", "PC35", "PC55", "PC49"
]
final_dataset = data_pca[top_20_pcs + ["Attack Type"]]
final_dataset.to_csv("./data/Processed_03.csv", index=False)
print("Processed dataset with the top 20 PCA features saved as 'Processed_03.csv'.")

Processed dataset with the top 20 PCA features saved as 'Processed_03.csv'.
