# Data Exploration

In [109]:
import pandas as pd
import numpy as np
import sys
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

MAIN_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(MAIN_DIR, 'data')

TRAIN_DATA_NAME = 'UNSW_NB15_training-set.csv'
TEST_DATA_NAME = 'UNSW_NB15_testing-set.csv'

TRAIN_DIR = os.path.join(DATA_DIR, TRAIN_DATA_NAME)
TEST_DIR = os.path.join(DATA_DIR, TEST_DATA_NAME)

## Read Data

In [110]:
df_train = pd.read_csv(TRAIN_DIR)

df_train.shape

(82332, 45)

In [111]:
print(df_train.columns.values)
df_train.head()

['id' 'dur' 'proto' 'service' 'state' 'spkts' 'dpkts' 'sbytes' 'dbytes'
 'rate' 'sttl' 'dttl' 'sload' 'dload' 'sloss' 'dloss' 'sinpkt' 'dinpkt'
 'sjit' 'djit' 'swin' 'stcpb' 'dtcpb' 'dwin' 'tcprtt' 'synack' 'ackdat'
 'smean' 'dmean' 'trans_depth' 'response_body_len' 'ct_srv_src'
 'ct_state_ttl' 'ct_dst_ltm' 'ct_src_dport_ltm' 'ct_dst_sport_ltm'
 'ct_dst_src_ltm' 'is_ftp_login' 'ct_ftp_cmd' 'ct_flw_http_mthd'
 'ct_src_ltm' 'ct_srv_dst' 'is_sm_ips_ports' 'attack_cat' 'label']


Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,...,1,2,0,0,0,1,2,0,Normal,0
1,2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,...,1,2,0,0,0,1,2,0,Normal,0
2,3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,...,1,3,0,0,0,1,3,0,Normal,0
3,4,6e-06,udp,-,INT,2,0,900,0,166666.6608,...,1,3,0,0,0,2,3,0,Normal,0
4,5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,...,1,3,0,0,0,2,3,0,Normal,0


In [112]:
df_train.isna().sum().sum()

np.int64(0)

# Split data

In [113]:
df_train.head(2)

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,...,1,2,0,0,0,1,2,0,Normal,0
1,2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,...,1,2,0,0,0,1,2,0,Normal,0


In [140]:
print(df_train['attack_cat'].head())
df_train.groupby('attack_cat').size().sort_values(ascending=False)

0    Normal
1    Normal
2    Normal
3    Normal
4    Normal
Name: attack_cat, dtype: object


attack_cat
Normal            37000
Generic           18871
Exploits          11132
Fuzzers            6062
DoS                4089
Reconnaissance     3496
Analysis            677
Backdoor            583
Shellcode           378
Worms                44
dtype: int64

In [114]:
X = df_train.drop(columns=["attack_cat","label"])
y = df_train["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

## Clean Data

In [115]:
df_train_cleaned = X_train.dropna(axis=0, how='any')
df_test_cleaned = X_test.dropna(axis=0, how='any')

cols_drop = ['id']
df_train_cleaned = df_train_cleaned.drop(columns=cols_drop)
df_test_cleaned = df_test_cleaned.drop(columns=cols_drop)

In [116]:
df_numeric_train = df_train_cleaned.select_dtypes(include=[np.number])
df_numeric_train.describe(include='all')

df_numeric_test = df_test_cleaned.select_dtypes(include=[np.number])

In [117]:
fltr = df_numeric_train == 0
df_numeric_train = df_numeric_train.astype(float)
df_numeric_train[fltr] = df_numeric_train[fltr] + 1e-9
df_numeric_train = df_numeric_train.map(lambda x: np.log(x))
df_numeric_train

fltr = df_numeric_test == 0
df_numeric_test = df_numeric_test.astype(float)
df_numeric_test[fltr] = df_numeric_test[fltr] + 1e-9
df_numeric_test = df_numeric_test.map(lambda x: np.log(x))
df_numeric_test

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
38666,0.256716,3.951244,3.988984,7.984122,8.226841,4.397244,3.433987,3.367296,9.787576,10.030945,...,1.609438,0.000000,0.000000,1.386294,0.000000,0.000000,-20.723266,1.945910,0.000000,-20.723266
56460,-11.869600,0.693147,-20.723266,4.736198,-20.723266,11.869600,5.537334,-20.723266,17.992093,-20.723266,...,1.386294,1.609438,1.386294,1.386294,-20.723266,-20.723266,-20.723266,1.609438,1.386294,-20.723266
16266,-11.512925,0.693147,-20.723266,4.736198,-20.723266,11.512925,5.537334,-20.723266,17.635418,-20.723266,...,3.135494,3.135494,2.564949,3.931826,-20.723266,-20.723266,-20.723266,3.135494,3.931826,-20.723266
75603,-20.723266,0.000000,-20.723266,3.828641,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,...,0.000000,0.000000,0.000000,0.000000,-20.723266,-20.723266,-20.723266,0.000000,0.000000,0.000000
11200,-1.660742,2.302585,1.791759,7.286192,5.590987,4.368792,5.537334,5.529429,10.921014,9.151829,...,0.000000,0.000000,0.000000,0.000000,-20.723266,-20.723266,-20.723266,0.000000,0.000000,-20.723266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1449,-0.107345,2.639057,2.484907,6.677083,6.519147,3.326221,5.537334,5.529429,8.790730,8.619726,...,0.000000,0.000000,0.000000,1.386294,-20.723266,-20.723266,-20.723266,0.000000,1.386294,-20.723266
50900,-12.429216,0.693147,-20.723266,4.736198,-20.723266,12.429216,5.537334,-20.723266,18.551709,-20.723266,...,1.609438,1.609438,0.693147,1.609438,-20.723266,-20.723266,-20.723266,1.609438,1.945910,-20.723266
16106,-12.429216,0.693147,-20.723266,4.736198,-20.723266,12.429216,5.537334,-20.723266,18.551709,-20.723266,...,2.639057,2.639057,2.639057,3.401197,-20.723266,-20.723266,-20.723266,2.708050,3.401197,-20.723266
20440,-11.736069,0.693147,-20.723266,4.736198,-20.723266,11.736069,5.537334,-20.723266,17.858562,-20.723266,...,2.890372,2.890372,2.890372,3.044522,-20.723266,-20.723266,-20.723266,2.890372,3.044522,-20.723266


In [118]:
scaler = StandardScaler()
scaler.fit(df_numeric_train)
df_numeric_scaled_train = scaler.transform(df_numeric_train)
df_numeric_scaled_train = pd.DataFrame(df_numeric_scaled_train, columns=df_numeric_train.columns)

scaler = StandardScaler()
scaler.fit(df_numeric_test)
df_numeric_scaled_test = scaler.transform(df_numeric_test)
df_numeric_scaled_test = pd.DataFrame(df_numeric_scaled_test, columns=df_numeric_test.columns)

### Categorigal transformation

reduce categories dimensionality, select top K categories

In [119]:
class CardinalityReducer(BaseEstimator, TransformerMixin):
    def __init__(self, top_n=5, placeholder='otros'):
        self.top_n = top_n
        self.placeholder = placeholder
        self.top_categories_ = {}

    def fit(self, X, y=None):
        X = X.copy()
        for col in X.select_dtypes(include='object'):
            self.top_categories_[col] = X[col].value_counts().nlargest(self.top_n).index.tolist()
        return self

    def transform(self, X):
        X = X.copy()
        for col, top_vals in self.top_categories_.items():
            X[col] = X[col].where(X[col].isin(top_vals), self.placeholder)
        return X
    
cardinality_reducer = CardinalityReducer(top_n=5, placeholder='otros')

In [120]:
df_categorical_train = df_train_cleaned.select_dtypes(exclude=[np.number])
df_categorical_train

cardinality_reducer.fit(df_categorical_train)
df_categorical_reduced_train = cardinality_reducer.transform(df_categorical_train)
df_categorical_reduced_train.nunique()

df_categorical_test = df_test_cleaned.select_dtypes(exclude=[np.number])
df_categorical_reduced_test = cardinality_reducer.transform(df_categorical_test)

In [121]:
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
enc.fit(df_categorical_reduced_train)
df_categorical_encoded_train = enc.transform(df_categorical_reduced_train)
df_categorical_encoded_train = pd.DataFrame(df_categorical_encoded_train, columns=enc.get_feature_names_out())

df_categorical_encoded_test = enc.transform(df_categorical_reduced_test)
df_categorical_encoded_test = pd.DataFrame(df_categorical_encoded_test, columns=enc.get_feature_names_out())

In [122]:
train_df = pd.concat([df_numeric_scaled_train, df_categorical_encoded_train], axis=1)

test_df = pd.concat([df_numeric_scaled_test, df_categorical_encoded_test], axis=1)

train_df

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,service_ftp,service_http,service_otros,service_smtp,state_ACC,state_CON,state_FIN,state_INT,state_REQ,state_otros
0,1.117040,1.733455,0.994559,2.615531,0.952687,-0.630662,-0.378179,0.774446,-0.229043,0.824092,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.135925,2.599941,0.952925,3.621634,0.873623,-0.527310,0.312098,0.945215,0.036448,0.745713,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.997666,0.401653,0.869974,0.236297,0.876502,-0.804085,-0.150702,0.945215,-0.791216,0.795449,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-1.177848,-0.898467,-1.128979,-1.119428,-1.125732,1.066529,0.312098,-1.128267,0.916784,-1.120818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.209466,1.436403,0.985870,0.896110,1.065991,0.340561,-0.378179,0.774446,0.200938,1.262299,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65860,0.785638,0.401653,0.850602,0.228549,0.941254,-0.587764,-0.150702,0.945215,-0.579894,0.930131,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
65861,1.132088,2.599941,0.976214,3.638730,1.037702,-0.514452,0.312098,0.945215,0.045108,0.893819,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
65862,0.892187,0.401653,0.825628,-0.017046,0.770079,-0.731508,0.312098,0.945215,-0.756039,0.735142,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
65863,1.049066,0.401653,0.850602,0.189942,0.790130,-0.883268,0.312098,0.945215,-0.855919,0.697648,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# Modelos

In [125]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

model_performance = pd.DataFrame(columns=['Accuracy','Recall','Precision','F1-Score'])

### Logistic Regression

In [126]:
logi_adj = LogisticRegression(random_state=42, max_iter=10000, penalty=None, solver="lbfgs")
logi_adj.fit(train_df, y_train)
y_pred = logi_adj.predict(test_df)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# dict_accuracy_models["reg_log_ajustado"] = accuracy
print(f'Accuracy: {accuracy:.4f}')

#calculo Precision usado=1
prec = precision_score(y_test, y_pred)
# dict_precision_models["reg_log_ajustado"] = prec

#calculo Recall nuevo = 1
rec = recall_score(y_test, y_pred)
# dict_recall_models["reg_log_ajustado"] = rec

#calculo F1-Score
f1 = f1_score(y_test, y_pred)

model_performance.loc['Logistic Regression'] = [accuracy,
                                                rec,
                                                prec,
                                                f1]

Accuracy: 0.9453


In [133]:
pd.DataFrame(logi_adj.coef_.T, index=logi_adj.feature_names_in_, columns=['coef']).sort_values(by='coef', ascending=False)

Unnamed: 0,coef
tcprtt,23.842772
dbytes,19.190676
state_INT,12.895752
dttl,12.567733
proto_otros,12.499812
proto_unas,11.321867
proto_ospf,10.280997
ct_state_ttl,9.366992
state_REQ,7.34481
service_smtp,7.077188


### Gradient Boosting

In [134]:
xgb = GradientBoostingClassifier()

xgb.fit(train_df, y_train)
y_pred = xgb.predict(test_df)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# dict_accuracy_models["reg_log_ajustado"] = accuracy
print(f'Accuracy: {accuracy:.4f}')

#calculo Precision usado=1
prec = precision_score(y_test, y_pred)
# dict_precision_models["reg_log_ajustado"] = prec

#calculo Recall nuevo = 1
rec = recall_score(y_test, y_pred)
# dict_recall_models["reg_log_ajustado"] = rec

#calculo F1-Score
f1 = f1_score(y_test, y_pred)

model_performance.loc['XGBoost'] = [accuracy,
                                    rec,
                                    prec,
                                    f1]

Accuracy: 0.8826


In [136]:
feature_importances = xgb.feature_importances_

importance_df = pd.DataFrame({
    'Feature': train_df.columns,
    'Importance': feature_importances
})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df.head(10)

Unnamed: 0,Feature,Importance
6,sttl,0.388837
32,ct_dst_src_ltm,0.13036
31,ct_dst_sport_ltm,0.090021
3,sbytes,0.072076
21,synack,0.062278
23,smean,0.052175
20,tcprtt,0.047535
37,ct_srv_dst,0.034081
4,dbytes,0.026072
54,state_INT,0.014543


In [135]:
model_performance

Unnamed: 0,Accuracy,Recall,Precision,F1-Score
Logistic Regression,0.945345,0.953144,0.947698,0.950413
XGBoost,0.882553,0.821859,0.9585,0.884936
