In [1]:
#This repo is to share the code shared during the TechGig webinar. 
#We have used a dataset taken from Canadian Institute of CyberSecurity. 
#We cannot share the data here; please contact a.habibi.l@unb.ca to obtain the dataset.
import pandas as pd
import numpy as np

In [2]:
datapath = 'https://raw.githubusercontent.com/rcarmas/datasets-Internetworking/main/SelectedFeatures-10s-TOR-NonTOR.csv'
names = ['Source IP', 'Source Port', 'Destination IP', 'Destination Port', 'Protocol', 'Flow Duration', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min','Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min','Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min','Active Mean', 'Active Std', 'Active Max', 'Active Min','Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min','label']
# Read data from csv
dataframe = pd.read_csv(datapath,names=names,low_memory=False)

In [14]:
dataframe.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,...,Bwd IAT Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label
17256,131.202.240.87,53783,224.0.0.252,5355,17,419860,104.7968,4.763493,419860.0,0.0,...,0,0,0,0,0,0,0,0,0,nonTOR
3114,131.202.240.150,53803,173.194.123.72,443,6,24991,0.0,80.02881,24991.0,0.0,...,0,0,0,0,0,0,0,0,0,nonTOR
66514,175.121.47.31,49585,131.202.240.87,61009,17,333,1177177.0,6006.006006,333.0,0.0,...,0,0,0,0,0,0,0,0,0,nonTOR
65055,131.202.240.150,39400,131.202.244.5,53,17,377,663130.0,5305.039788,377.0,0.0,...,0,0,0,0,0,0,0,0,0,nonTOR
43357,131.202.240.150,55558,173.194.123.110,443,6,25157,4213.539,119.251103,12578.5,17740.60203,...,34,0,0,0,0,0,0,0,0,nonTOR


In [3]:
# Normalise the data
def dfNormalize(df):
    for feature_name in df.columns:
        df.loc[:,feature_name]= pd.to_numeric(df.loc[:,feature_name], errors='coerce').fillna(0)
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()   
        if (max_value - min_value) > 0:
            df.loc[:,feature_name] = (df.loc[:,feature_name] - min_value) / (max_value - min_value)
        else:
            df.loc[:,feature_name] = (df.loc[:,feature_name]- min_value)    
    return df

In [4]:
print (dataframe.shape)
dataframe = dataframe.reindex(np.random.permutation(dataframe.index)).copy()
print(dataframe.describe())
print (list(dataframe))

(67834, 29)
        Source Port  Destination Port      Protocol  Flow Duration  \
count  67834.000000      67834.000000  67834.000000   6.783400e+04   
mean   37912.753324      11566.395967     12.167291   2.991884e+06   
std    20986.077326      18374.765123      5.459410   4.063005e+06   
min       21.000000         21.000000      6.000000   0.000000e+00   
25%    19305.000000        137.000000      6.000000   4.435975e+04   
50%    43677.000000        443.000000     17.000000   4.108570e+05   
75%    54685.000000      16311.000000     17.000000   7.325550e+06   
max    65534.000000      65514.000000     17.000000   1.000000e+07   

       Flow Bytes/s  Flow Packets/s  Flow IAT Mean  Flow IAT Std  \
count  6.783200e+04    6.783400e+04   6.783400e+04  6.783400e+04   
mean            inf             inf   3.155927e+05  2.209662e+05   
std             NaN             NaN   6.988069e+05  6.409506e+05   
min    0.000000e+00    2.002581e-01   0.000000e+00  0.000000e+00   
25%    1.046553e+

In [5]:
keys = dataframe.keys()
# Feature Selection, Dropping Source IP, Source Port, Destination IP and Destination Port as it 
# specific to each organisation network and generic model should not contain them. 
data_to_process = dataframe[keys[4:len(keys) - 1]].copy()
#data_to_process = dataframe[[' Source Port',' Destination Port', ' Flow Duration', ' Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean','Fwd IAT Mean','Bwd IAT Mean','Active Mean','Idle Mean','label']].copy()
# do a data normalization
x_normalised = dfNormalize(data_to_process)
print(x_normalised.describe())

           Protocol  Flow Duration  Flow Bytes/s  Flow Packets/s  \
count  67834.000000   67834.000000       67830.0         67828.0   
mean       0.560663       0.299188           0.0             0.0   
std        0.496310       0.406300           0.0             0.0   
min        0.000000       0.000000           0.0             0.0   
25%        0.000000       0.004436           0.0             0.0   
50%        1.000000       0.041086           0.0             0.0   
75%        1.000000       0.732555           0.0             0.0   
max        1.000000       1.000000           0.0             0.0   

       Flow IAT Mean  Flow IAT Std  Flow IAT Max  Flow IAT Min  Fwd IAT Mean  \
count   67834.000000  67834.000000  67834.000000  67834.000000  67834.000000   
mean        0.031600      0.031363      0.089855      0.019491      0.035016   
std         0.069971      0.090973      0.173880      0.057865      0.083937   
min         0.000000      0.000000      0.000000      0.000000     

In [6]:
change_labels = lambda x: 1 if x == 'nonTOR' else 0

In [7]:
y_normalised = dataframe['label'].apply(change_labels)

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(x_normalised, y_normalised, test_size=0.3, random_state=42)

In [10]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [11]:
#Escalar los datos utilizando StandardScaler.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [12]:
#Entrenar un modelo SVM utilizando SVC de sklearn.
from sklearn.svm import SVC
svm = SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(X_train_scaled, y_train)

In [13]:
#Evaluar el modelo utilizando las métricas de precisión, recall y f1-score.
from sklearn.metrics import precision_recall_fscore_support

y_pred = svm.predict(X_test_scaled)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1_score)

Precision: 0.9568423927888555
Recall: 0.9771814327159116
F1-score: 0.9669049656352646
