In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler


# Load the four raw capture files. They are read with header=None, so the
# column names are attached afterwards from the separate features file.
path = 'UNSW-NB15_{}.csv'  # There are 4 input csv files, numbered 1..4
dfs = [
    pd.read_csv(path.format(i), header=None, low_memory=False)
    for i in range(1, 5)
]
data = pd.concat(dfs, ignore_index=True)

# This csv file contains names of all the features
df_col = pd.read_csv('NUSW-NB15_features.csv', encoding='ISO-8859-1')
# Making column names lower case, removing spaces
df_col['Name'] = (
    df_col['Name']
    .str.strip()
    .str.replace(' ', '', regex=False)
    .str.lower()
)
# Renaming our dataframe with proper column names
data.columns = df_col['Name']

# NOTE: this is an alias, not a copy - in-place edits of `dataSet` also change `data`.
dataSet = data

# Last expression of the cell, so the preview is actually rendered.
dataSet.head()

In [None]:
def _count_and_share(series):
    """Return one row per distinct value: its count and its percentage of all rows."""
    counts = series.value_counts()
    return pd.DataFrame({
        'Count': counts,
        'Percentage': (counts / len(series)) * 100,
    })


# Object-dtype columns are the ones treated as categorical here
categorical_columns = dataSet.select_dtypes(include=['object']).columns

# Column name -> frequency table for that column
result_dict = {
    column: _count_and_share(dataSet[column])
    for column in categorical_columns
}

# Show every table, followed by a blank separator line
for column, result_df in result_dict.items():
    print(f"Column: {column}", result_df, "", sep="\n")

Column: srcip
                 Count  Percentage
59.166.0.6        3111    8.691644
59.166.0.5        3029    8.462549
59.166.0.4        3011    8.412259
59.166.0.8        2972    8.303300
59.166.0.3        2955    8.255804
59.166.0.0        2949    8.239041
59.166.0.1        2932    8.191546
59.166.0.2        2856    7.979214
59.166.0.7        2805    7.836728
59.166.0.9        2692    7.521024
175.45.176.3      2453    6.853295
149.171.126.18    2320    6.481714
175.45.176.0       579    1.617635
175.45.176.1       431    1.204146
175.45.176.2       324    0.905205
10.40.85.1          69    0.192775
10.40.182.1         67    0.187187
10.40.182.6         60    0.167631
10.40.182.3         37    0.103372
10.40.170.2         37    0.103372
10.40.85.30         34    0.094991
192.168.241.243     10    0.027938
149.171.126.12       9    0.025145
10.40.85.10          8    0.022351
149.171.126.9        7    0.019557
149.171.126.5        6    0.016763
149.171.126.3        6    0.016763
149.17

In [None]:
# Normalise the attack labels: missing label -> 'normal', trim whitespace,
# lower-case, and fold the plural spelling 'backdoors' into 'backdoor'.
dataSet['attack_cat'] = (
    dataSet['attack_cat']
    .fillna('normal')
    .str.strip()
    .str.lower()
    .str.replace('backdoors', 'backdoor', regex=False)
)

# Replace '-' with 'unknown'
dataSet['service'] = dataSet['service'].replace('-', 'unknown')

# ct_ftp_cmd: trim whitespace, treat empty strings as missing, then convert to
# numbers. The result is assigned back explicitly; calling
# `.replace(..., inplace=True)` on `dataSet['ct_ftp_cmd']` only modifies a
# temporary object when pandas Copy-on-Write is active.
ftp_cmd_text = dataSet['ct_ftp_cmd'].astype(str).str.strip().replace('', np.nan)
dataSet['ct_ftp_cmd'] = pd.to_numeric(ftp_cmd_text, errors='coerce')

# Last expression of the cell, so the cleaned label set is displayed.
dataSet['attack_cat'].unique()


In [None]:
numerical_col = dataSet.select_dtypes(include=np.number).columns  # All the numerical columns list
categorical_col = dataSet.select_dtypes(exclude=np.number).columns  # All the categorical columns list

# Fill missing values column by column: the most frequent value for
# object-dtype columns, the median for everything else. The fill value used
# for each column is printed.
for col in dataSet.columns:
    if dataSet[col].dtypes == 'object':
        # A lone blank ' ' is a missing-value placeholder in these columns.
        # Convert it to NaN *before* taking the mode so the blank itself can
        # never be chosen as the fill value.
        column = dataSet[col].replace(' ', np.nan)
        modes = column.mode()
        if modes.empty:
            # Nothing but missing values: there is no sensible fill value.
            val = None
        else:
            val = modes.iloc[0]
            dataSet[col] = column.fillna(value=val)
    else:
        # Numeric columns cannot contain the ' ' placeholder, only NaN.
        val = dataSet[col].median()
        dataSet[col] = dataSet[col].fillna(value=val)
    print(col, val)


srcip 59.166.0.6
sport 31867.0
dstip 149.171.126.3
dsport 53
proto tcp
state FIN
dur 0.024907
sbytes 1540.0
dbytes 2260.0
sttl 31.0
dttl 29.0
sloss 3.0
dloss 4.0
service unknown
sload 546792.8125
dload 581699.3125
spkts 12.0
dpkts 12.0
swin 255.0
dwin 255.0
stcpb 821125705.0
dtcpb 794963492.0
smeansz 73.0
dmeansz 91.0
trans_depth 0.0
res_bdy_len 0.0
sjit 24.003180999999998
djit 13.856473000000001
stime 1424231148.0
ltime 1424231148.0
sintpkt 0.588141
dintpkt 0.5514795
tcprtt 0.000633
synack 0.000499
ackdat 0.000125
is_sm_ips_ports 0.0
ct_state_ttl 0.0
ct_flw_http_mthd 0.0
is_ftp_login 0.0
ct_ftp_cmd 0.0
ct_srv_src 6.0
ct_srv_dst 5.0
ct_dst_ltm 3.0
ct_src_ltm 4.0
ct_src_dport_ltm 1.0
ct_dst_sport_ltm 1.0
ct_dst_src_ltm 2.0
attack_cat normal
label 0.0


In [None]:
# Iterate through each column and print unique values
for col in dataSet.columns:
    unique_values = dataSet[col].unique()
    num_distinct_values = dataSet[col].nunique()
    print(f'{col} have {num_distinct_values} value : {unique_values}')

# Fixing binary columns: every value above 1 is collapsed to 1
dataSet['is_ftp_login'] = dataSet['is_ftp_login'].clip(upper=1)
# Only this column changed, so only this column is re-checked.
print('is_ftp_login', dataSet['is_ftp_login'].unique())

# Drop the columns that are not used as features. errors='ignore' keeps the
# cell re-runnable after the columns are already gone. The drop stays in place
# so that no second copy of the frame is created.
dataSet.drop(columns=['label', 'stime', 'ltime', 'srcip', 'dstip', 'dsport'],
             inplace=True, errors='ignore')

# Binary target: 'normal' -> 0, every attack category -> 1.
attack_to_binary = {'dos': 1, 'normal': 0, 'reconnaissance': 1, 'backdoor': 1, 'exploits': 1,
                    'analysis': 1, 'fuzzers': 1, 'worms': 1, 'shellcode': 1, 'generic': 1}
# Assigned back explicitly (a chained `.replace(..., inplace=True)` does not
# reach the frame under pandas Copy-on-Write). Values that are not keys of the
# mapping - including 0/1 from a previous run of this cell - are left as they are.
dataSet['attack_cat'] = dataSet['attack_cat'].map(
    lambda label: attack_to_binary.get(label, label)
)

# Report how many categories each remaining object column has
for col_name in dataSet.columns:
    if dataSet[col_name].dtypes == 'object':
        unique_cat = len(dataSet[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

# Select columns with data type 'object'
object_columns = dataSet.select_dtypes(include=['object'])

# Print or use the selected columns as needed
# print(object_columns)


srcip have 35 value : ['59.166.0.0' '59.166.0.6' '59.166.0.5' '59.166.0.3' '10.40.182.3'
 '59.166.0.7' '10.40.170.2' '59.166.0.1' '59.166.0.2' '59.166.0.4'
 '175.45.176.3' '175.45.176.2' '175.45.176.0' '59.166.0.8' '59.166.0.9'
 '175.45.176.1' '10.40.182.1' '10.40.85.1' '192.168.241.243' '10.40.85.30'
 '149.171.126.5' '149.171.126.8' '149.171.126.3' '149.171.126.6'
 '149.171.126.4' '149.171.126.2' '149.171.126.9' '149.171.126.7'
 '149.171.126.0' '149.171.126.1' '149.171.126.18' '10.40.85.10'
 '149.171.126.12' '10.40.182.6' '149.171.126.11']
sport have 24273 value : [ 1390 33661  1464 ... 55855 18606 37709]
dstip have 43 value : ['149.171.126.6' '149.171.126.9' '149.171.126.7' '149.171.126.5'
 '149.171.126.0' '149.171.126.4' '10.40.182.3' '10.40.170.2'
 '149.171.126.18' '149.171.126.16' '149.171.126.8' '149.171.126.2'
 '149.171.126.1' '149.171.126.10' '149.171.126.3' '149.171.126.15'
 '149.171.126.14' '149.171.126.12' '149.171.126.13' '149.171.126.11'
 '224.0.0.5' '149.171.126.17' '149.

In [None]:

# Column groups referenced by the preprocessing cells that build ssc_cols / new_cols.
# NOTE(review): these three are also in the list that the earlier cell drops from
# dataSet in place, so they may no longer exist in the frame at this point.
droppable_cols = ['srcip', 'dstip', 'dsport']
# Columns listed in non_ssc_cols so they are not standard-scaled.
binary_cols = ['is_sm_ips_ports', 'is_ftp_login']
# The column the earlier cell maps to 0 ('normal') / 1 (any attack category).
target_cols = ['attack_cat']


In [None]:
# Define the individual steps.
# The encoder must return a dense array. The keyword for that was renamed from
# `sparse` to `sparse_output`, and the old spelling was later removed, so try
# the new name first and fall back for older scikit-learn releases.
try:
    ohe_step = ('ohe', OneHotEncoder(sparse_output=False))
except TypeError:
    ohe_step = ('ohe', OneHotEncoder(sparse=False))
ssc_step = ('std_sclr', StandardScaler())


# Make the step part of a pipeline
ohe_pipe = Pipeline([ohe_step])
ssc_pipe = Pipeline([ssc_step])


# Columns to transform: categorical columns for encoding, numeric feature columns for standardizing
ohe_cols = ['proto', 'state', 'service']
# Everything that must NOT be standard-scaled: encoded, dropped, binary and target columns
non_ssc_cols = ohe_cols + droppable_cols + binary_cols + target_cols
ssc_cols = [col for col in dataSet.columns if col not in non_ssc_cols]

# Transformer input: tuple w/ contents ('name', SomeTransformer(Parameters), columns)
transformer = [
    ('one_hot_encoding', ohe_pipe, ohe_cols),
    ('standard_scaling', ssc_pipe, ssc_cols)
]

In [None]:
# Build the column transformer from the (name, transformer, columns) triples;
# columns not named in any triple are passed through unchanged.
ct = ColumnTransformer(
    transformers=transformer,
    remainder='passthrough',
)

In [None]:
# Labels for the one-hot encoded columns: the sorted distinct values of
# proto, then state, then service, joined into one array.
cat_cols = np.concatenate([
    np.sort(dataSet[name].unique())
    for name in ('proto', 'state', 'service')
])

# Full label list for the transformed data. Order matters here:
# encoded columns, scaled columns, binary columns, then the target column.
new_cols = np.concatenate((cat_cols, ssc_cols, binary_cols, target_cols))

In [None]:
# new_data = pd.DataFrame(ct.fit_transform(dataSet.drop(columns=droppable_cols)))
# new_data.columns = new_cols

In [None]:
# Correlation of every numeric column with the binary target.
# The frame still holds object columns (proto, state, service); restrict to
# numeric columns explicitly, because corr() on object columns warns on
# pandas 1.5 and raises on pandas >= 2.0.
corrs = dataSet.select_dtypes(include=np.number).corr()
corrs['attack_cat'].sort_values(ascending=False)

  corrs = dataSet.corr()


Name
attack_cat          1.000000
sttl                0.885745
ct_state_ttl        0.883514
ct_dst_src_ltm      0.395548
ct_dst_sport_ltm    0.385593
ct_src_dport_ltm    0.382632
dttl                0.363775
ct_src_ltm          0.333373
ct_dst_ltm          0.332360
ct_srv_src          0.331013
ct_srv_dst          0.328497
tcprtt              0.216732
ackdat              0.203641
synack              0.177947
sload               0.161582
sbytes              0.023111
trans_depth         0.021207
ct_flw_http_mthd    0.014486
sjit                0.014159
dintpkt            -0.008429
smeansz            -0.011122
dur                -0.012753
is_sm_ips_ports    -0.014592
sintpkt            -0.019649
res_bdy_len        -0.027013
sloss              -0.030401
ct_ftp_cmd         -0.032556
is_ftp_login       -0.035356
djit               -0.046827
dbytes             -0.057610
dloss              -0.072595
dpkts              -0.088977
sport              -0.093887
spkts              -0.105539
dtcpb    

In [None]:
# data_prep = ('data_prep', ColumnTransformer(transformers=transformer, remainder='passthrough'))

In [None]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
dataSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58074 entries, 0 to 58073
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   sport             58074 non-null  float64
 1   proto             58074 non-null  object 
 2   state             58074 non-null  object 
 3   dur               58074 non-null  float64
 4   sbytes            58074 non-null  float64
 5   dbytes            58074 non-null  float64
 6   sttl              58074 non-null  float64
 7   dttl              58074 non-null  float64
 8   sloss             58074 non-null  float64
 9   dloss             58074 non-null  float64
 10  service           58074 non-null  object 
 11  sload             58074 non-null  float64
 12  dload             58074 non-null  float64
 13  spkts             58074 non-null  float64
 14  dpkts             58074 non-null  float64
 15  swin              58074 non-null  float64
 16  dwin              58074 non-null  float6

In [None]:
import numpy as np
import pandas as pd
# import pickle as pkl

# Data processing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Clustering Methods
from sklearn.cluster import DBSCAN
# Assuming dataSet is your DataFrame
SAMPLE_FRACTION = 0.5  # share of the rows kept for clustering
RANDOM_SEED = 42       # fixed seed so a re-run draws the same rows

total_samples = len(dataSet)
sample_size = int(SAMPLE_FRACTION * total_samples)

# Randomly pick SAMPLE_FRACTION of the row positions, without replacement
rng = np.random.default_rng(RANDOM_SEED)
random_indices = rng.choice(total_samples, size=sample_size, replace=False)

# Select the subset of data based on the random indices
sampled_data = dataSet.iloc[random_indices]

# Split the sampled data into features (x) and labels (y).
# The target is selected by name rather than by position, so the split does not
# depend on the exact number or order of columns. drop() returns a new frame,
# so adding a column to x later does not write into a slice of sampled_data.
x = sampled_data.drop(columns='attack_cat')
y = sampled_data['attack_cat']

In [None]:
# Print the summary of the sampled feature frame: index, columns, non-null counts, dtypes.
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29037 entries, 20329 to 1843
Data columns (total 42 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   sport             29037 non-null  float64
 1   proto             29037 non-null  object 
 2   state             29037 non-null  object 
 3   dur               29037 non-null  float64
 4   sbytes            29037 non-null  float64
 5   dbytes            29037 non-null  float64
 6   sttl              29037 non-null  float64
 7   dttl              29037 non-null  float64
 8   sloss             29037 non-null  float64
 9   dloss             29037 non-null  float64
 10  service           29037 non-null  object 
 11  sload             29037 non-null  float64
 12  dload             29037 non-null  float64
 13  spkts             29037 non-null  float64
 14  dpkts             29037 non-null  float64
 15  swin              29037 non-null  float64
 16  dwin              29037 non-null  flo

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import metrics

# ... (data loading and preprocessing happen in the earlier cells)

# One-hot encode the categorical variables.
# 'sport' is deliberately not in this list: it is a numeric port with tens of
# thousands of distinct values. One-hot encoding it puts any two rows with
# different ports at least sqrt(2) apart after min-max scaling, which is larger
# than the DBSCAN eps of 0.7 used on this data.
categorical_features = ['proto', 'service', 'state']
# A 'cluster' column left in x by a previous clustering run is not a feature.
dataSet_encoded = pd.get_dummies(
    x.drop(columns=['cluster'], errors='ignore'),
    columns=categorical_features,
)

# Replace any remaining missing values with the column median
dataSet_encoded = dataSet_encoded.fillna(dataSet_encoded.median())


In [None]:
# Earlier variant, kept for reference: standardisation instead of min-max scaling
# scaler = StandardScaler()
# dataSet_scaled = scaler.fit_transform(dataSet_encoded)

from sklearn.preprocessing import MinMaxScaler
from joblib import dump
scaler = MinMaxScaler()
scaler.fit(dataSet_encoded)

#dump(scaler, 'UnswScaler.joblib')

# Scale the data, keeping the column labels and the row index of the encoded frame
dataSet_scaled = pd.DataFrame(scaler.transform(dataSet_encoded),
                              columns=dataSet_encoded.columns, index=dataSet_encoded.index)

# Run DBSCAN on the scaled features
dbscan = DBSCAN(eps=0.7, min_samples=42)
clusters = dbscan.fit_predict(dataSet_scaled)

# Attach the cluster label of every row to the feature frame
x['cluster'] = clusters

# `clusters` is an array of labels, so it cannot be concatenated to a string.
# Report the number of clusters instead; DBSCAN uses the label -1 for noise,
# which is not a cluster.
n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
print('num of clusters,', n_clusters)

# Silhouette coefficient
silhouette_score = metrics.silhouette_score(dataSet_scaled, dbscan.labels_)

# Davies-Bouldin index
davies_bouldin_score = metrics.davies_bouldin_score(dataSet_scaled, dbscan.labels_)

print(f"Silhouette Coefficient: {silhouette_score}")
print(f"Davies-Bouldin Index: {davies_bouldin_score}")


NameError: ignored

In [None]:
# # # Normalisation des données
# # scaler = StandardScaler()
# # dataSet_scaled = scaler.fit_transform(dataSet_encoded)

# from sklearn.preprocessing import MinMaxScaler
# from joblib import dump
# scaler = MinMaxScaler()
# scaler.fit(dataSet_encoded)

# #dump(scaler, 'UnswScaler.joblib')

# dataSet_scaled=pd.DataFrame(scaler.transform(dataSet_encoded),
#             columns=dataSet_encoded.columns, index=dataSet_encoded.index)

# # Application de l'algorithme DBSCAN
# dbscan = DBSCAN(eps=0.35, min_samples=)
# clusters = dbscan.fit_predict(dataSet_scaled)

# # Ajout des labels de cluster au DataFrame d'entraînement
# x['cluster'] = clusters

# # Calcul du coefficient de silhouette
# silhouette_score = metrics.silhouette_score(dataSet_scaled, dbscan.labels_)

# # Calcul de l'indice de Davies-Bouldin
# davies_bouldin_score = metrics.davies_bouldin_score(dataSet_scaled, dbscan.labels_)

# print(f"Silhouette Coefficient: {silhouette_score}")
# print(f"Davies-Bouldin Index: {davies_bouldin_score}")

In [None]:
# Define the individual steps.
# The encoder must return a dense array. The keyword for that was renamed from
# `sparse` to `sparse_output`, and the old spelling was later removed, so try
# the new name first and fall back for older scikit-learn releases.
try:
    ohe_step = ('ohe', OneHotEncoder(sparse_output=False))
except TypeError:
    ohe_step = ('ohe', OneHotEncoder(sparse=False))
ssc_step = ('std_sclr', StandardScaler())

# Make the step part of a pipeline
ohe_pipe = Pipeline([ohe_step])
ssc_pipe = Pipeline([ssc_step])

# Columns to transform: categorical columns for encoding, numeric feature columns for standardizing
ohe_cols = ['proto', 'state', 'service']
binary_cols = ['is_sm_ips_ports', 'is_ftp_login']
# The clustering cells write their labels into x['cluster']; that column is an
# output of a previous run, not a feature, so it must not be scaled and fed back in.
non_feature_cols = ['cluster']
ssc_cols = [col for col in x.columns
            if col not in ohe_cols + binary_cols + non_feature_cols]

# Transformer input: tuple w/ contents ('name', SomeTransformer(Parameters), columns)
# The binary columns are passed through by name and everything else is dropped,
# so the output is encoded + scaled + binary columns, in that order, regardless
# of any extra columns present in x.
transformer = [
    ('one_hot_encoding', ohe_pipe, ohe_cols),
    ('standard_scaling', ssc_pipe, ssc_cols),
    ('binary_passthrough', 'passthrough', binary_cols),
]
ct = ColumnTransformer(transformers=transformer, remainder='drop')

In [None]:
# # Save Column Transformer object for later use as these column names and conditions do not change between datasets
# with open('ct_ohe_ssc_xyagg.pkl', 'wb') as f:
#     pkl.dump(ct,f)

In [None]:
# Encode / scale the feature frame with the column transformer
x_transformed = ct.fit_transform(x)

# Fit DBSCAN on the transformed features and keep the per-row labels
dbscan = DBSCAN(eps=0.35, min_samples=10).fit(x_transformed)
clusters = dbscan.labels_



In [None]:
# Print the dtype of every column in the feature frame.
print(x.dtypes)

Name
srcip                object
sport               float64
dstip                object
dsport               object
proto                object
state                object
dur                 float64
sbytes              float64
dbytes              float64
sttl                float64
dttl                float64
sloss               float64
dloss               float64
service              object
sload               float64
dload               float64
spkts               float64
dpkts               float64
swin                float64
dwin                float64
stcpb               float64
dtcpb               float64
smeansz             float64
dmeansz             float64
trans_depth         float64
res_bdy_len         float64
sjit                float64
djit                float64
sintpkt             float64
dintpkt             float64
tcprtt              float64
synack              float64
ackdat              float64
is_sm_ips_ports     float64
ct_state_ttl        float64
ct_flw_http_mth

In [None]:
# Number of distinct labels produced by the clustering.
# `clusters` is the label array returned by fit_predict and has no `.labels_`
# attribute; the getattr also accepts a fitted estimator, for which the
# labels are read from `.labels_`.
labels = getattr(clusters, 'labels_', clusters)
pd.Series(labels).unique().size

AttributeError: ignored

In [None]:
# Count how many sampled rows fall into each target value.
y.value_counts()

0    7256
1     802
Name: attack_cat, dtype: int64

In [None]:
from sklearn import metrics
# Attach the cluster label of every row to the feature frame
x['cluster'] = clusters

# Both quality scores are computed on the transformed features with the fitted labels
fitted_labels = dbscan.labels_
silhouette_score = metrics.silhouette_score(x_transformed, fitted_labels)
davies_bouldin_score = metrics.davies_bouldin_score(x_transformed, fitted_labels)

print(
    f"Silhouette Coefficient: {silhouette_score}",
    f"Davies-Bouldin Index: {davies_bouldin_score}",
    sep="\n",
)

Silhouette Coefficient: -0.14501308101197735
Davies-Bouldin Index: 0.7831483391634988


In [None]:
# The ten most frequent protocol values with their row counts
dataSet['proto'].value_counts().iloc[:10]

tcp     11542
udp      5930
arp        82
ospf       82
sctp       20
icmp       14
t           1
udt         1
Name: proto, dtype: int64

In [None]:
# Correlation of every numeric column with the binary target, strongest first.
# Restrict to numeric columns explicitly: corr() on a frame that still holds
# object columns warns on pandas 1.5 and raises on pandas >= 2.0.
dataSet.select_dtypes(include=np.number).corr()['attack_cat'].sort_values(ascending=False)

  dataSet.corr()['attack_cat'].sort_values(ascending=False)


Name
attack_cat          1.000000
ct_state_ttl        0.892348
sttl                0.850975
dttl                0.404406
ct_dst_src_ltm      0.392508
ct_dst_sport_ltm    0.314059
ct_src_dport_ltm    0.298848
ct_srv_dst          0.248748
ct_srv_src          0.243432
ct_dst_ltm          0.230517
ct_src_ltm          0.227328
sload               0.216004
tcprtt              0.186899
ackdat              0.168022
synack              0.152361
smeansz             0.053155
trans_depth         0.044453
ct_flw_http_mthd    0.034687
sbytes              0.033103
dur                -0.009051
dintpkt            -0.010686
sjit               -0.011861
is_sm_ips_ports    -0.013075
sintpkt            -0.016865
res_bdy_len        -0.022635
ct_ftp_cmd         -0.023178
is_ftp_login       -0.024424
sloss              -0.033337
djit               -0.040171
dbytes             -0.056166
dloss              -0.070502
sport              -0.081724
dpkts              -0.084899
spkts              -0.095109
dtcpb    