In [None]:
import pandas as pd
# Load the extracted dataset into the working frame `data` used by the cells below.
data = pd.read_csv(filepath_or_buffer='extracted_data_new.csv')

<h2>Get Dummies</h2>

One hot encoding categorical data.

In [69]:
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each column so only k-1 indicator columns are produced per feature.
data = pd.get_dummies(
    data,
    columns=[
        'Protocol',
        'Packet Type',
        'Traffic Type',
        'Log Source',
        'Categorical Source Port',
        'Categorical Destination Port',
    ],
    drop_first=True,
)

<h2>Dropping</h2>

Dropping columns considered useless.

In [74]:
# Remove the columns that are not used as model features.
data = data.drop(
    columns=[
        'Attack Signature',
        'Action Taken',
        'Severity Level',
        'Network Segment',
    ]
)

Encoding Proxy as a binary variable: has proxy or not.

In [77]:
# 1 when a proxy value is present, 0 when it is missing.
data['Proxy Information'] = data['Proxy Information'].notna().astype(int)

<h2>Other Preparing</h2>

One-hot encoding for variables with few categories, label encoding when there are many categories.\
Dropping rows with missing values because it makes no sense to try to infer them.

In [81]:
# Missing-value counts for the ASN / geolocation columns: source side first,
# then destination side.
print(
    data[['asn_num Source', 'country Source', 'lat Source', 'long Source', 'asn_source type']]
    .isna()
    .sum()
)
print(
    data[['asn_num Destination', 'country Destination', 'lat Destination', 'long Destination', 'asn_dest type']]
    .isna()
    .sum()
)

asn_num Source     6523
country Source     6528
lat Source         6528
long Source        6528
asn_source type       0
dtype: int64
asn_num Destination    6528
country Destination    6533
lat Destination        6533
long Destination       6533
asn_dest type             0
dtype: int64


In [83]:
# Drop every row that lacks an ASN number or a country on either side.
# A single call with the default how='any' removes the same rows as
# dropping on each subset one after the other.
data.dropna(
    subset=[
        'asn_num Destination',
        'asn_num Source',
        'country Source',
        'country Destination',
    ],
    inplace=True,
)

In [85]:
# Re-check the same ASN / geolocation columns after dropping incomplete rows.
print(
    data[['asn_num Source', 'country Source', 'lat Source', 'long Source']]
    .isna()
    .sum()
)
print(
    data[['asn_num Destination', 'country Destination', 'lat Destination', 'long Destination']]
    .isna()
    .sum()
)

asn_num Source    0
country Source    0
lat Source        0
long Source       0
dtype: int64
asn_num Destination    0
country Destination    0
lat Destination        0
long Destination       0
dtype: int64


In [87]:
# One-hot encode the ASN type of both endpoints (first level dropped).
data = pd.get_dummies(
    data,
    columns=['asn_source type', 'asn_dest type'],
    drop_first=True,
)

In [89]:
# Number of rows with no source/destination distance.
print(data['dist Source/Dest'].isnull().sum())

0


In [91]:
def modify_browser(browser):
    """Collapse a browser name onto its family: Safari, Chrome or Firefox.

    Args:
        browser: Browser name, e.g. ``"Mobile Safari"`` or ``"Firefox iOS"``.

    Returns:
        ``"Safari"``, ``"Chrome"`` or ``"Firefox"`` when the name contains that
        word (checked in that order); otherwise the input unchanged. Non-string
        input, such as ``None`` or ``NaN`` for a missing value, is also
        returned unchanged.
    """
    # A missing cell reaches this function as None or float NaN when it is
    # applied over a Series; the substring test below would raise TypeError
    # on those, so pass them through untouched.
    if not isinstance(browser, str):
        return browser
    for family in ("Safari", "Chrome", "Firefox"):
        if family in browser:
            return family
    return browser

# Show the distinct browser names before and after collapsing them into families.
print(data['Browser'].unique())
data['Browser'] = data['Browser'].map(modify_browser)
print(data['Browser'].unique())

['IE' 'Firefox' 'Opera' 'Chrome' 'Safari' 'Mobile Safari' 'Firefox iOS'
 'Firefox Mobile' 'Chrome Mobile iOS']
['IE' 'Firefox' 'Opera' 'Chrome' 'Safari']


In [93]:
# One-hot encode the browser family (first level dropped).
data = pd.get_dummies(
    data,
    columns=['Browser'],
    drop_first=True,
)

In [95]:
# Missing-value counts for the operating system columns.
print(
    data[['OS', 'OS Version']]
    .isnull()
    .sum()
)

OS               0
OS Version    4949
dtype: int64


In [97]:
# Discard rows that have no OS version.
data.dropna(axis='index', subset=['OS Version'], inplace=True)

In [99]:
# One-hot encode the operating system (first level dropped).
data = pd.get_dummies(
    data,
    columns=['OS'],
    drop_first=True,
)

In [101]:
from sklearn.preprocessing import LabelEncoder
# Replace each OS version by an integer label.
le = LabelEncoder()
le.fit(data['OS Version'])
data['OS Version'] = le.transform(data['OS Version'])

In [103]:
def modify_device(device):
    """Classify a device name as ``"Mobile"`` or ``"Desktop"``.

    The names iPhone, iPod, iPad, UG, Smartphone and Tablet map to
    ``"Mobile"``; every other value maps to ``"Desktop"``.
    """
    mobile_names = ("iPhone", "iPod", "iPad", "UG", "Smartphone", "Tablet")
    return "Mobile" if device in mobile_names else "Desktop"

# Reduce every device name to the Mobile / Desktop class.
data['Device'] = data['Device'].map(modify_device)

In [105]:
# One-hot encode the device class and the rendering engine (first level dropped).
data = pd.get_dummies(
    data,
    columns=['Device', 'Engine'],
    drop_first=True,
)

In [107]:
# Integer-label the city column, using a fresh encoder.
le = LabelEncoder()
data['city'] = le.fit(data['city']).transform(data['city'])

# Same for the state column; `le` ends up bound to the state encoder.
le = LabelEncoder()
data['state'] = le.fit(data['state']).transform(data['state'])

In [109]:
# Missing-value counts for the location columns. Note that 'city' and
# 'state' were already label-encoded in the previous cell.
print(
    data[['city', 'state', 'pop', 'lat', 'long']]
    .isna()
    .sum()
)

city        0
state       0
pop      3100
lat       516
long      516
dtype: int64


In [111]:
# Discard rows that have no latitude.
data.dropna(axis='index', subset=['lat'], inplace=True)

In [113]:
# Re-check the location columns after dropping rows without a latitude.
print(
    data[['city', 'state', 'pop', 'lat', 'long']]
    .isna()
    .sum()
)

city        0
state       0
pop      2584
lat         0
long        0
dtype: int64


We fill missing population values with 0 because they are likely to be associated with small or little-known towns and cities, whose population is small.

In [116]:
import numpy as np
# Treat an unknown population as 0.
data['pop'] = data['pop'].fillna(value=0)

In [118]:
# Remove the proxy ASN / geolocation columns.
data = data.drop(
    columns=[
        'asn_num Proxy',
        'country Proxy',
        'lat Proxy',
        'long Proxy',
        'asn_proxy type',
    ]
)

<h2>Encoding Countries</h2>

In [121]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder on the countries seen on either side, so a given country
# gets the same integer label in the source and the destination column.
all_countries = pd.unique(
    pd.concat([data['country Source'], data['country Destination']])
)

le_countries = LabelEncoder().fit(all_countries)

data['country Source'] = le_countries.transform(data['country Source'])
data['country Destination'] = le_countries.transform(data['country Destination'])

<h1>Model</h1>

In [124]:
# Remove the state, the listed IP octets and both country columns before
# modelling.
data = data.drop(
    columns=[
        'state',
        'octet4 Source',
        'octet1 Destination',
        'octet2 Destination',
        'octet3 Destination',
        'octet4 Destination',
        'country Source',
        'country Destination',
    ]
)

In [126]:
# Final check: missing-value count for every remaining column.
print(data.isnull().sum())

Source Port           0
Destination Port      0
Packet Length         0
Malware Indicators    0
Anomaly Scores        0
                     ..
OS_iOS                0
Device_Mobile         0
Engine_Gecko          0
Engine_Presto         0
Engine_Trident        0
Length: 61, dtype: int64


In [128]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier

# Target: the attack type encoded as integer class ids.
# Features: every remaining column.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Attack Type'])
X = data.drop(columns=['Attack Type'])

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Per-sample weights computed with the 'balanced' class-weight scheme.
weights = compute_sample_weight(class_weight='balanced', y=y_train)

model = XGBClassifier(
    max_depth=6,
    n_estimators=1500,
    learning_rate=0.01,
    colsample_bytree=0.8,
    subsample=0.8,
)
model.fit(X_train, y_train, sample_weight=weights)

y_pred = model.predict(X_test)

# NOTE(review): the recorded report shows about 0.35 accuracy on three
# near-equal classes, which is close to chance level.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.35      0.34      0.35      1501
           1       0.35      0.35      0.35      1505
           2       0.34      0.35      0.35      1495

    accuracy                           0.35      4501
   macro avg       0.35      0.35      0.35      4501
weighted avg       0.35      0.35      0.35      4501



<h1>Save Model</h1>

In [131]:
from joblib import dump

# Persist the fitted classifier to disk.
dump(value=model, filename='trained_model.joblib')

['trained_model.joblib']