In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


Import the CSV Data as Pandas DataFrame

In [3]:
data1 = pd.read_csv('data\cybersecurity_intrusion_data.csv')

Show Top 5 Records

In [4]:
data1.head()

Unnamed: 0,session_id,network_packet_size,protocol_type,login_attempts,session_duration,encryption_used,ip_reputation_score,failed_logins,browser_type,unusual_time_access,attack_detected
0,SID_00001,599,TCP,4,492.983263,DES,0.606818,1,Edge,0,1
1,SID_00002,472,TCP,3,1557.996461,DES,0.301569,0,Firefox,0,0
2,SID_00003,629,TCP,3,75.044262,DES,0.739164,2,Chrome,0,1
3,SID_00004,804,UDP,4,601.248835,DES,0.123267,0,Unknown,0,1
4,SID_00005,453,TCP,5,532.540888,AES,0.054874,1,Firefox,0,0


Shape of the dataset

In [5]:
data1.shape

(9537, 11)

3.1 Check Missing values

In [6]:
data1.isna().sum()

session_id                0
network_packet_size       0
protocol_type             0
login_attempts            0
session_duration          0
encryption_used        1966
ip_reputation_score       0
failed_logins             0
browser_type              0
unusual_time_access       0
attack_detected           0
dtype: int64

In [7]:
# Drop column 'session_id'
data = data1.drop('session_id', axis=1)
print(data)

      network_packet_size protocol_type  login_attempts  session_duration  \
0                     599           TCP               4        492.983263   
1                     472           TCP               3       1557.996461   
2                     629           TCP               3         75.044262   
3                     804           UDP               4        601.248835   
4                     453           TCP               5        532.540888   
...                   ...           ...             ...               ...   
9532                  194          ICMP               3        226.049889   
9533                  380           TCP               3        182.848475   
9534                  664           TCP               5         35.170248   
9535                  406           TCP               4         86.664703   
9536                  340           TCP               6         86.876744   

     encryption_used  ip_reputation_score  failed_logins browser_type  \
0 

In [8]:
# Fill NaNs with mode for each column
df = data.apply(lambda x: x.fillna(x.mode()[0]))

print(df)

      network_packet_size protocol_type  login_attempts  session_duration  \
0                     599           TCP               4        492.983263   
1                     472           TCP               3       1557.996461   
2                     629           TCP               3         75.044262   
3                     804           UDP               4        601.248835   
4                     453           TCP               5        532.540888   
...                   ...           ...             ...               ...   
9532                  194          ICMP               3        226.049889   
9533                  380           TCP               3        182.848475   
9534                  664           TCP               5         35.170248   
9535                  406           TCP               4         86.664703   
9536                  340           TCP               6         86.876744   

     encryption_used  ip_reputation_score  failed_logins browser_type  \
0 

3.2 Check Duplicates

In [9]:
df.duplicated().sum()

0

3.3 Check data types

In [10]:
# Check Null and Dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9537 entries, 0 to 9536
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   network_packet_size  9537 non-null   int64  
 1   protocol_type        9537 non-null   object 
 2   login_attempts       9537 non-null   int64  
 3   session_duration     9537 non-null   float64
 4   encryption_used      9537 non-null   object 
 5   ip_reputation_score  9537 non-null   float64
 6   failed_logins        9537 non-null   int64  
 7   browser_type         9537 non-null   object 
 8   unusual_time_access  9537 non-null   int64  
 9   attack_detected      9537 non-null   int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 745.2+ KB


3.4 Checking the number of unique values of each column

In [11]:
df.nunique()

network_packet_size     959
protocol_type             3
login_attempts           13
session_duration       9532
encryption_used           2
ip_reputation_score    9537
failed_logins             6
browser_type              5
unusual_time_access       2
attack_detected           2
dtype: int64

3.5 Check statistics of data set

In [12]:
df.describe()

Unnamed: 0,network_packet_size,login_attempts,session_duration,ip_reputation_score,failed_logins,unusual_time_access,attack_detected
count,9537.0,9537.0,9537.0,9537.0,9537.0,9537.0,9537.0
mean,500.430639,4.032086,792.745312,0.331338,1.517773,0.149942,0.447101
std,198.379364,1.963012,786.560144,0.177175,1.033988,0.357034,0.49722
min,64.0,1.0,0.5,0.002497,0.0,0.0,0.0
25%,365.0,3.0,231.953006,0.191946,1.0,0.0,0.0
50%,499.0,4.0,556.277457,0.314778,1.0,0.0,0.0
75%,635.0,5.0,1105.380602,0.453388,2.0,0.0,1.0
max,1285.0,13.0,7190.392213,0.924299,5.0,1.0,1.0


3.7 Exploring Data

In [13]:
df.head()

Unnamed: 0,network_packet_size,protocol_type,login_attempts,session_duration,encryption_used,ip_reputation_score,failed_logins,browser_type,unusual_time_access,attack_detected
0,599,TCP,4,492.983263,DES,0.606818,1,Edge,0,1
1,472,TCP,3,1557.996461,DES,0.301569,0,Firefox,0,0
2,629,TCP,3,75.044262,DES,0.739164,2,Chrome,0,1
3,804,UDP,4,601.248835,DES,0.123267,0,Unknown,0,1
4,453,TCP,5,532.540888,AES,0.054874,1,Firefox,0,0


In [14]:
print("Categories in 'protocol_type' variable:     ",end=" " )
print(df['protocol_type'].unique())

print("Categories in 'encryption_used' variable:  ",end=" ")
print(df['encryption_used'].unique())

print("Categories in'browser_type' variable:",end=" " )
print(df['browser_type'].unique())


Categories in 'protocol_type' variable:      ['TCP' 'UDP' 'ICMP']
Categories in 'encryption_used' variable:   ['DES' 'AES']
Categories in'browser_type' variable: ['Edge' 'Firefox' 'Chrome' 'Unknown' 'Safari']


In [15]:
# define numerical & categorical columns
numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']

# print columns
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 7 numerical features : ['network_packet_size', 'login_attempts', 'session_duration', 'ip_reputation_score', 'failed_logins', 'unusual_time_access', 'attack_detected']

We have 3 categorical features : ['protocol_type', 'encryption_used', 'browser_type']
