## import libraries

In [None]:
#%%import libraries
import numpy as np
# Print arrays in full and without wrapping. The line width is set through the
# public print-options API instead of writing to a private attribute of the
# arrayprint module.
np.set_printoptions(threshold=np.inf, linewidth=np.inf)

import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 300)
pd.set_option('display.width', 100000)

import os
import shutil
import glob

import sklearn  
# train_test_split is public API of sklearn.model_selection; the private
# sklearn.ensemble.gradient_boosting module it used to be pulled from is not
# available in newer scikit-learn releases.
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest,f_classif,chi2
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import ( Dense, Dropout,Input)
from tensorflow.keras.models import load_model

import pickle

import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# deprecation notices from pandas / scikit-learn / TensorFlow.
warnings.filterwarnings("ignore")

#%%Read Data


## Read The Data

In [None]:
# Feature-description table; the rendered output shows the columns
# No. / Name / Type / Description, one row per data-set column.
features_csv_path = '/content/drive/My Drive/ML/01 Data set/NUSW-NB15_features.csv'
features = pd.read_csv(features_csv_path, engine='python')
features

Unnamed: 0,No.,Name,Type,Description
0,1,srcip,nominal,Source IP address
1,2,sport,integer,Source port number
2,3,dstip,nominal,Destination IP address
3,4,dsport,integer,Destination port number
4,5,proto,nominal,Transaction protocol
5,6,state,nominal,Indicates to the state and its dependent proto...
6,7,dur,Float,Record total duration
7,8,sbytes,Integer,Source to destination transaction bytes
8,9,dbytes,Integer,Destination to source transaction bytes
9,10,sttl,Integer,Source to destination time to live value


In [None]:
# The raw CSV parts are read with the column names listed in the "Name"
# column of the feature table.
cols = features["Name"]

# Read the four parts in order and stack them under a fresh 0..n-1 index.
Data = pd.concat(
    [
        pd.read_csv('/content/drive/My Drive/ML/01 Data set/UNSW-NB15_{}.csv'.format(part), names=cols)
        for part in (1, 2, 3, 4)
    ],
    ignore_index=True,
)

# Cells that are empty or contain only whitespace are turned into NaN.
Data = Data.replace(r'^\s*$', np.nan, regex=True)

## Visualize the Data

In [None]:
# Quick overview of the combined frame: shape, first rows, dtypes and the
# per-column count of missing values (largest first). Sections are separated
# by a 100-character rule.
overview_sections = (
    ("Data Shape:\n", lambda: Data.shape),
    ("Data Head:\n", lambda: Data.head()),
    ("Data Types:\n", lambda: Data.dtypes),
    ("Null Data:\n", lambda: Data.isnull().sum().sort_values(ascending=False)),
)
for section_number, (title, compute) in enumerate(overview_sections):
    if section_number > 0:
        print(100 * "=")
    print(title, compute())

Data Shape:
 (2540047, 49)
Data Head:
         srcip  sport          dstip dsport proto state       dur  sbytes  dbytes  sttl  dttl  sloss  dloss service         Sload         Dload  Spkts  Dpkts  swin  dwin  stcpb  dtcpb  smeansz  dmeansz  trans_depth  res_bdy_len     Sjit       Djit       Stime       Ltime  Sintpkt   Dintpkt  tcprtt  synack  ackdat  is_sm_ips_ports  ct_state_ttl  ct_flw_http_mthd  is_ftp_login ct_ftp_cmd  ct_srv_src  ct_srv_dst  ct_dst_ltm  ct_src_ ltm  ct_src_dport_ltm  ct_dst_sport_ltm  ct_dst_src_ltm attack_cat  Label
0  59.166.0.0   1390  149.171.126.6     53   udp   CON  0.001055     132     164    31    29      0      0     dns  500473.93750  621800.93750      2      2     0     0      0      0       66       82            0            0  0.00000   0.000000  1421927414  1421927414    0.017  0.013000     0.0     0.0     0.0                0             0               0.0           0.0          0           3           7           1            3                 1

## PreProcessing

In [None]:
# Shape before and after removing duplicated rows.
# NOTE(review): keep=False removes *every* row that has a duplicate, i.e. no
# copy of a repeated record survives — confirm this is intended rather than
# keeping one representative (keep='first').
print(Data.shape)
Data = Data.drop_duplicates(keep=False)
print(Data.shape)

(2540047, 49)
(1971209, 49)


In [None]:
def StrToNum(a):
  """Convert a raw cell value to a number when possible.

  The value is first parsed as an integer literal with automatic base
  detection (``int(a, 0)``), so prefixed strings such as ``'0x000b'`` are
  accepted. If that fails the value is passed to ``float``; this is also the
  path taken by non-string numbers, for which ``int(a, 0)`` raises
  ``TypeError``. A value that cannot be converted either way is returned
  unchanged.

  Only conversion errors are handled, so unrelated exceptions (for example
  ``KeyboardInterrupt``) are no longer swallowed.

  Args:
    a: The value to convert.

  Returns:
    An ``int``, a ``float``, or ``a`` itself when no conversion applies.
  """
  try:
    return int(a, 0)
  except (TypeError, ValueError):
    pass
  try:
    return float(a)
  except (TypeError, ValueError, OverflowError):
    return a

In [None]:
# Columns that pandas loaded as text ('object' dtype) although the feature
# table declares them numeric. Column 1 of the table holds the feature name
# and column 2 its declared type; rows are assumed to be in the same order as
# the columns of Data (the names were taken from this table when reading).
#
# The declared type is compared case-insensitively against a set because the
# table spells the types inconsistently ('integer', 'Integer', 'Float').
# Writing `k == ('integer' or 'Float')` would only ever test for 'integer',
# since `'integer' or 'Float'` evaluates to 'integer'.
numeric_type_names = {'integer', 'float'}
StrList = [
    name
    for dtype, name, declared_type in zip(Data.dtypes, features.iloc[:, 1], features.iloc[:, 2])
    if dtype == 'object' and str(declared_type).strip().lower() in numeric_type_names
]
print(StrList)

['sport', 'dsport', 'ct_ftp_cmd']


In [None]:
# For each column in StrList, collect [row position, value] pairs for the
# cells whose value is a Python str, then report how many there are.
StrDict = {}
for col in StrList:
    StrDict[col] = [
        [position, value]
        for position, value in enumerate(Data[col])
        if type(value) is str
    ]
print(*(len(StrDict[col]) for col in StrList), sep='\n')


52417
1111321
31243


In [None]:
# Run every cell of the text-typed numeric columns through StrToNum; values
# it cannot convert are left as they are.
for col in StrList:
    converted_column = Data[col].map(StrToNum)
    Data[col] = converted_column
del converted_column

In [None]:
# Recount the cells that are still Python strings in each StrList column;
# each entry is a [row position, value] pair.
def _string_cells(series):
    """Return [position, value] pairs for the str-typed cells of `series`."""
    return [[position, value] for position, value in enumerate(series) if type(value) is str]

StrDict = dict((col, _string_cells(Data[col])) for col in StrList)
for col in StrList:
    remaining = len(StrDict[col])
    print(remaining)


2
7
0


In [None]:
# Show the distinct string values that are still present in each column.
# StrDict stores [row position, value] pairs, so only the value part is passed
# to np.unique; feeding the pairs directly would mix the row positions into
# the printed result.
for col in StrDict.keys():
  leftover_values = [value for _, value in StrDict[col]]
  print(np.unique(leftover_values))
  print(100*'=')

['-' '125776' '78808']
['-' '104770' '122414' '34983' '44021' '44023' '45465' '70198']
[]


In [None]:
# Make the StrList columns numeric explicitly. Whatever StrToNum could not
# convert (the '-' placeholder in this data) becomes NaN, and the column ends
# up with a numeric dtype regardless of the pandas version, instead of
# depending on dtype inference after a replace. An unanchored '-' regex would
# also have matched any string that merely contains a hyphen.
# NOTE(review): errors='coerce' turns *any* remaining non-numeric string into
# NaN, not only '-'; inspect the leftovers first if that matters.
for col in StrList:
  Data[col] = pd.to_numeric(Data[col], errors='coerce')

In [None]:
# Rebuild the per-column list of [row position, value] pairs for cells that
# are Python strings and print the size of each list.
StrDict = {
    col: [
        [row_position, cell]
        for row_position, cell in enumerate(Data[col])
        if type(cell) is str
    ]
    for col in StrList
}
string_cell_counts = [len(StrDict[col]) for col in StrList]
for count in string_cell_counts:
    print(count)

2
7
0


## Save The Data

In [None]:
#os.makedirs('/content/drive/My Drive/ML/02 Results' , exist_ok = False)

In [None]:
#Data.to_csv('/content/drive/My Drive/ML/02 Results/tot_Data.csv',index=False)

## Split The Data

In [None]:
#%%Split The Data
# Target is the 'Label' column; 'attack_cat' is dropped from the features
# together with it.
y=Data['Label']
x=Data.drop(['Label','attack_cat'],axis=1)
x_train, x_test, y_train, y_test  = train_test_split(x, y, test_size=0.2, random_state=2)

# Give features AND labels a fresh 0..n-1 index so that they stay row-aligned
# for any index-based pandas operation. Re-indexing only the feature frames
# would leave the labels on the original row labels.
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

print('x_train shape:\n',x_train.shape)
print('y_train shape:\n',y_train.shape)
print('x_test shape:\n',x_test.shape)
print('y_test  shape:\n',y_test .shape)

# Release the full frame; only the four splits are needed from here on.
del x,y,Data

x_train shape:
 (1576967, 47)
y_train shape:
 (1576967,)
x_test shape:
 (394242, 47)
y_test  shape:
 (394242,)


## Cleaning The Data

In [None]:
#%%Cleaning the Data
# Eight training columns with the most missing values, largest count first.
missing_per_column = x_train.isnull().sum()
print(missing_per_column.sort_values(ascending=False).head(8))
print('=' * 100)

ct_ftp_cmd          771646
is_ftp_login        771646
ct_flw_http_mthd    708012
dsport                   6
sport                    2
ct_dst_src_ltm           0
sloss                    0
dwin                     0
dtype: int64


In [None]:
# Training columns that contain at least one NaN, split by dtype using the
# public select_dtypes API (rather than the private _get_numeric_data()).
NanColsNames = x_train.columns[x_train.isnull().any()]
num = x_train[NanColsNames].select_dtypes(include='number').columns  # numeric columns with NaN
cat = x_train[NanColsNames].select_dtypes(include=['object']).columns  # object (text) columns with NaN
print(len(num),'\n',len(cat))


5 
 0


In [None]:

# Replace NaN in the numeric columns with the column mean. The means are
# learned from the training split only and then applied to both splits.
SIn = SimpleImputer(missing_values=np.nan, strategy='mean')
if len(num) > 0:  # nothing to fit when no numeric column has missing values
    SIn.fit(x_train[num])
    # Reuse each frame's own index so the assignment is positionally exact
    # even if the frame does not carry a default 0..n-1 index.
    x_train[num] = pd.DataFrame(SIn.transform(x_train[num]), columns=num, index=x_train.index)
    x_test[num] = pd.DataFrame(SIn.transform(x_test[num]), columns=num, index=x_test.index)


In [None]:
# Largest remaining per-column NaN counts in the train and test splits,
# separated by a 100-character rule.
train_missing_top = x_train.isnull().sum().sort_values(ascending=False).head()
print(train_missing_top)
print('=' * 100)
test_missing_top = x_test.isnull().sum().sort_values(ascending=False).head()
print(test_missing_top)


ct_dst_src_ltm    0
sloss             0
stcpb             0
dwin              0
swin              0
dtype: int64
ct_dst_src_ltm    0
sloss             0
stcpb             0
dwin              0
swin              0
dtype: int64


## Categorical Encoding

In [None]:
# One-hot encode the remaining non-numeric columns of each split.
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)

# Dummy columns that exist only in the test split; they are discarded below.
missed_features = set(x_test.columns) - set(x_train.columns)

# Numeric (including boolean dummy) columns of the encoded training frame and
# whatever is left over.
num = x_train.select_dtypes(include=['number', 'bool']).columns
cat = set(x_train.columns) - set(num)

# Put both frames on the same, alphabetically sorted set of training columns.
# A single reindex adds the categories unseen in the test split as all-zero
# columns and drops the test-only ones, without adding or dropping columns
# one at a time.
feature_order = sorted(x_train.columns)
x_train = x_train.reindex(columns=feature_order)
x_test = x_test.reindex(columns=feature_order, fill_value=0)


In [None]:
# Shapes of both encoded splits, followed by whether any column of either
# split still has the 'object' dtype.
print(*(frame.shape for frame in (x_train, x_test)), sep='\n')
print(*('object' in frame.dtypes.values for frame in (x_train, x_test)), sep='\n')

(1576967, 295)
(394242, 295)
False
False


## Feature Selection

In [None]:
#%%feature selection
k=47
SP=SelectKBest(score_func=chi2,k=k)
SP.fit(x_train,y_train)



SelectKBest(k=47, score_func=<function chi2 at 0x7fd35aafd1e0>)

In [None]:
# Boolean mask of the features kept by the selector, and the matching
# training-column names.
f = SP.get_support()
columns = x_train.columns[f]
# Should equal k.
print(columns.size)


47


In [None]:
# Reduce both splits to the selected features and label the result with the
# selected column names.
x_train, x_test = (
    pd.DataFrame(SP.transform(split), columns=columns)
    for split in (x_train, x_test)
)

# Keep the columns of both frames in alphabetical order.
alphabetical_columns = sorted(columns)
x_train = x_train.reindex(columns=alphabetical_columns)
x_test = x_test.reindex(columns=alphabetical_columns)


## Normalization

In [None]:

# Standardize every feature with the mean and standard deviation learned
# from the training split, and apply the same transform to the test split.
columns = x_train.columns
SS = StandardScaler(copy=True, with_mean=True, with_std=True).fit(x_train)

x_train, x_test = (
    pd.DataFrame(SS.transform(split), columns=columns)
    for split in (x_train, x_test)
)

In [None]:
# Per-column max / min / std / mean of the scaled train and test frames.
summary_statistics = ['max', 'min', 'std', 'mean']
train_summary = x_train.agg(summary_statistics)
test_summary = x_test.agg(summary_statistics)
print(train_summary, '\n', test_summary)

           Dintpkt          Djit         Dload         Dpkts         Ltime       Sintpkt          Sjit         Sload         Spkts         Stime        ackdat    ct_dst_ltm    ct_srv_dst    ct_srv_src  ct_state_ttl        dbytes         dloss       dmeansz        dsport  dstip_149.171.126.10  dstip_149.171.126.11  dstip_149.171.126.12  dstip_149.171.126.13  dstip_149.171.126.14  dstip_149.171.126.15  dstip_149.171.126.16  dstip_149.171.126.17  dstip_149.171.126.18  dstip_149.171.126.19         dtcpb          dttl           dur    proto_unas   res_bdy_len        sbytes  service_pop3         sloss       smeansz         sport  srcip_175.45.176.0  srcip_175.45.176.1  srcip_175.45.176.2  srcip_175.45.176.3     state_INT         stcpb          sttl        tcprtt
max   6.311241e+01  2.592689e+02  2.778130e+01  8.558914e+01  1.017763e+00  4.907015e+01  1.585119e+02  9.147994e+01  1.266483e+02  1.017763e+00  2.077777e+02  1.981695e+01  1.579756e+01  1.475701e+01  1.536775e+01  8.530219e+01  9.1

## Model

In [None]:
# Small fully connected binary classifier: 32 -> 16 -> 1 units with dropout
# after each hidden layer and a sigmoid output.
# The input shape must be a tuple: `(n)` is just the integer n, `(n,)` is the
# one-element tuple Keras expects.
input_layer = Input(shape=(x_train.shape[1],))
x = Dense(32, activation='relu')(input_layer)
x = Dropout(0.2)(x)
x = Dense(16, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(inputs=input_layer, outputs=x)
model.compile(optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate if they are later used
# for tuning.
hist = model.fit(np.array(x_train), np.array(y_train), epochs=1,
          validation_data=(np.array(x_test), np.array(y_test)))

