## Libraries

In [274]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 300)
pd.set_option('display.width', 100000)

import numpy as np
np.set_printoptions(threshold=np.inf) 
np.core.arrayprint._line_width=np.inf

import os
import shutil
import glob

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import ( Dense, Dropout,Input)
from tensorflow.keras.models import load_model

import sklearn  
from sklearn.ensemble.gradient_boosting import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest,f_classif,chi2
from sklearn.preprocessing import StandardScaler

import pickle

import warnings
warnings.filterwarnings("ignore")

## Load Data

In [195]:
features=pd.read_csv("/content/drive/MyDrive/ML/01 Data set/NUSW-NB15_features.csv",engine="python")

In [196]:
cols=features["Name"]

In [197]:
def RemoveSpace(a):
  if a==np.nan  or isinstance(a,int) or isinstance(a,float):
    return a
  if a=="":
    return np.nan
  b=""
  for i in a:
    if i==" " or i=="  " or i=="-":
      continue
    b+=i 
  if b=="":
    return np.nan
  return b

In [198]:
cols=cols.map(RemoveSpace)

In [199]:
data1=pd.read_csv('/content/drive/My Drive/ML/01 Data set/UNSW-NB15_1.csv',names=cols)
data2=pd.read_csv('/content/drive/My Drive/ML/01 Data set/UNSW-NB15_2.csv',names=cols)
data3=pd.read_csv('/content/drive/My Drive/ML/01 Data set/UNSW-NB15_3.csv',names=cols)
data4=pd.read_csv('/content/drive/My Drive/ML/01 Data set/UNSW-NB15_4.csv',names=cols)
Data=pd.concat([data1,data2,data3,data4],ignore_index=True)
del data1,data2,data3,data4

In [200]:
num=Data._get_numeric_data().columns #numeric data columns names
cat=Data.select_dtypes(include=['object']).columns#categorical data columns names

In [201]:
for col in cat:
  Data[col]=Data[col].map(RemoveSpace)

## Correct Dtypes and Remove duplicated

In [203]:
print(Data.shape)
Data.drop_duplicates(keep=False,inplace=True) 
print(Data.shape)


(2540047, 49)
(1971209, 49)


In [204]:
StrList=[j for i,j,k in zip(Data.dtypes,features.iloc[:,1],features.iloc[:,2])
if (i==('object')) and (k==('integer' or 'Float'))]
print(StrList)

['sport', 'dsport', 'ct_ftp_cmd']


In [205]:
def StrToNum(a):      
  try:
    return int(a,0)
  except:
    try:
      return float(a)
    except:
      return a

In [206]:
StrDict={col:[[i,j] for i,j in enumerate(Data[col]) if type(j)==str]
	 for col in StrList}
for col in StrList:
  print(len(StrDict[col]))

52415
1111314
31243


In [207]:

for col in StrList:
  Data[col]=Data[col].map(StrToNum)
  Data[col]=Data[col].map(RemoveSpace)

In [208]:
StrDict={col:[[i,j] for i,j in enumerate(Data[col]) if type(j)==str] for col in StrList}
for col in StrList:
  print(len(StrDict[col]))

0
0
0


In [209]:
Data.to_csv(r"/content/drive/MyDrive/ML/01 Data set/00TotData.csv",index=False)

## Select Features

In [151]:
print(Data.columns.sort_values())

Index(['Dintpkt', 'Djit', 'Dload', 'Dpkts', 'Label', 'Ltime', 'Sintpkt', 'Sjit', 'Sload', 'Spkts', 'Stime', 'ackdat', 'attack_cat', 'ct_dst_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_flw_http_mthd', 'ct_ftp_cmd', 'ct_src_dport_ltm', 'ct_src_ltm', 'ct_srv_dst', 'ct_srv_src', 'ct_state_ttl', 'dbytes', 'dloss', 'dmeansz', 'dsport', 'dstip', 'dtcpb', 'dttl', 'dur', 'dwin', 'is_ftp_login', 'is_sm_ips_ports', 'proto', 'res_bdy_len', 'sbytes', 'service', 'sloss', 'smeansz', 'sport', 'srcip', 'state', 'stcpb', 'sttl', 'swin', 'synack', 'tcprtt', 'trans_depth'], dtype='object')


In [210]:
selectedFeatures=[
                  "sttl","ct_dst_src_ltm","Spkts","Dload","sloss","dloss","ct_src_ltm","ct_srv_dst",
                  "sbytes","smeansz","ct_dst_sport_ltm",
                  ]

In [222]:
Data2=Data[selectedFeatures+['Label','attack_cat']]

In [223]:
print(Data2.shape)
Data2.drop_duplicates(keep=False,inplace=True) 
print(Data2.shape)

(1971209, 13)
(1599305, 13)


## Split

In [224]:
y=Data2[['Label','attack_cat']]
x=Data2.drop(['Label','attack_cat'],axis=1)
x_train, x_test, y_train, y_test  = train_test_split(x, y, test_size=0.2, random_state=2)
x_train.reset_index(inplace=True,drop=True)
x_test.reset_index(inplace=True,drop=True)
y_train.reset_index(inplace=True,drop=True)
y_test.reset_index(inplace=True,drop=True)
'''print('x_train shape:\n',x_train.shape)
print('y_train shape:\n',y_train.shape)
print('x_test shape:\n',x_test.shape)
print('y_test  shape:\n',y_test .shape)'''

"print('x_train shape:\n',x_train.shape)\nprint('y_train shape:\n',y_train.shape)\nprint('x_test shape:\n',x_test.shape)\nprint('y_test  shape:\n',y_test .shape)"

In [225]:
y_train1=y_train['Label']
y_train2=y_train['attack_cat']

y_test1=y_test['Label']
y_test2=y_test['attack_cat']

In [226]:
x_train.head()

Unnamed: 0,sttl,ct_dst_src_ltm,Spkts,Dload,sloss,dloss,ct_src_ltm,ct_srv_dst,sbytes,smeansz,ct_dst_sport_ltm
0,31,3,8,290869.2188,1,4,5,7,424,53,1
1,31,7,52,201305.7969,18,8,7,1,37402,719,1
2,31,3,28,50596.89063,7,7,5,9,5176,185,1
3,31,1,2,700787.375,0,0,4,6,146,73,1
4,31,4,4,383685.1563,0,0,5,7,568,142,1


## Cleaning

In [227]:
print(y_train2.isnull().sum())

1234866


In [228]:
y_train2=y_train2.fillna("Normal")
y_test2=y_test2.fillna("Normal")

In [229]:
y_train2.shape

(1279444,)

## Categorical Encoding

In [230]:
y_train2=pd.get_dummies(y_train2)
y_test2=pd.get_dummies(y_test2)

In [231]:
y_train2.columns

Index(['Analysis', 'Backdoor', 'Backdoors', 'DoS', 'Exploits', 'Fuzzers', 'Generic', 'Normal', 'Reconnaissance', 'Shellcode', 'Worms'], dtype='object')

In [232]:
y_test2.columns

Index(['Analysis', 'Backdoor', 'Backdoors', 'DoS', 'Exploits', 'Fuzzers', 'Generic', 'Normal', 'Reconnaissance', 'Shellcode', 'Worms'], dtype='object')

In [233]:
np.sum(y_train1)/y_train1.shape[0]

0.034841696862074466

In [234]:
y_train2['Backdoor']=y_train2['Backdoor']+y_train2['Backdoors']
y_test2['Backdoor']=y_test2['Backdoor']+y_test2['Backdoors']
y_train2=y_train2.drop('Backdoors',axis=1)
y_test2=y_test2.drop('Backdoors',axis=1)

In [235]:
y_train2.columns

Index(['Analysis', 'Backdoor', 'DoS', 'Exploits', 'Fuzzers', 'Generic', 'Normal', 'Reconnaissance', 'Shellcode', 'Worms'], dtype='object')

In [236]:
x_train=x_train.reindex(sorted(x_train.columns), axis=1)#sort the data again 
x_test = x_test.reindex(sorted(x_test.columns), axis=1)

y_train2=y_train2.reindex(sorted(y_train2.columns), axis=1)#sort the data again 
y_test2 = y_test2.reindex(sorted(y_test2.columns), axis=1)


## Normalization

In [160]:
#Data2.to_csv(r"/content/drive/MyDrive/ML/01 Data set/01SelectedFeature.csv",index=False)

In [237]:
columns=x_train.columns
SS=StandardScaler(copy=True, with_mean=True, with_std=True)
SS.fit(x_train)

x_train=pd.DataFrame(SS.transform(x_train),columns=columns)
x_test=pd.DataFrame(SS.transform(x_test),columns=columns)

## Model

In [290]:
input_layer= Input(shape=(x_train.shape[1]))
x=Dense(128,'relu')(input_layer)
x=Dense(64,'relu')(x)
x=Dense(32,'relu')(x)
x=Dense(16,'relu')(x)

x=Dense(y_train2.shape[1],'softmax')(x)

model = Model(inputs=input_layer, outputs=x)
model.compile(optimizer ='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'])
hist=model.fit(np.array(x_train), np.array(y_train2), epochs=1,
          validation_data=(np.array(x_test), np.array(y_test2)))




In [428]:
print(y_train1.shape
,x_train.shape)

(1279444,) (1279444, 11)


In [429]:

input_layer= Input(shape=(x_train.shape[1]))
x=Dense(32,'relu')(input_layer)
x=Dropout(0.2)(x)
x=Dense(16,'relu')(x)
x=Dropout(0.2)(x)
x=Dense(1,'sigmoid')(x)

model = Model(inputs=input_layer, outputs=x)
model.compile(optimizer ='adam',
  loss='binary_crossentropy',
  metrics=['accuracy'])
hist=model.fit(np.array(x_train), np.array(y_train1), epochs=1,
          validation_data=(np.array(x_test), np.array(y_test1)))



In [443]:
y_pred=model.predict(x_test)

In [444]:
def SetMax(Prob):
  pred=np.zeros(Prob.shape)
  for l,s in enumerate(list(Prob)):
    Max=0
    Maxi=0
    for i,j in enumerate(s):
      if j>Max:
        Max=j
        Maxi=i
    pred[l,Maxi]=1
  return pred
    


'5'

In [441]:
y_pred2=y_pred2.tostring()

In [453]:
y_pred2=SetMax(y_pred)
y_pred2=np.array([str(i) for i in y_pred2],dtype=str)

In [456]:
y_pred2=pd.get_dummies(y_pred2)

In [458]:
y_pred2.shape

(319861, 1)

In [433]:
import seaborn as sn
import matplotlib.pyplot as plt
def ConfusionMatrix(y_pred,y_real):
  y_pred=np.array(y_pred)
  y_real=np.array(y_real)
  dim=y_pred.shape[1]
  CM=np.zeros([dim,dim])
  Percesion=np.zeros(dim)
  Recall=np.zeros(dim)
  F1=np.zeros(dim)
  #diag=np.zeros(dim)
  for i in range(y_pred.shape[0]):
    CM[np.argmax(y_real[i,:])  ,  np.argmax(y_pred[i,:])]+=1
  plt.figure(figsize = (10,7))
  sn.heatmap(CM, annot=True)
  
  diag=np.array([CM[i,i] for i in range(dim)])
  Percesion=diag/CM.sum(axis=0)
  Recall=diag/CM.sum(axis=1)
  F1=2*Percesion*Recall/(Percesion+Recall)
  print("Percesion:",Percesion)
  print("Recall:",Recall)
  print("F1:",F1)
  return CM,Percesion,Recall,F1


In [434]:
#print(type(y_test2))
CM=ConfusionMatrix(y_pred2,y_test2)

IndexError: ignored