In [1]:
import pandas as pd
import socket
import numpy as np
import glob
from datetime import datetime, timedelta
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_proto_bysocket(proto_name, proto_dict):
    """Resolve a protocol name to its protocol number.

    ``socket.getprotobyname`` is tried first; if that call fails, the name is
    looked up in ``proto_dict`` instead.

    Args:
        proto_name: Protocol name to resolve.
        proto_dict: Fallback mapping from protocol name to protocol number.

    Returns:
        The result of ``socket.getprotobyname(proto_name)`` or, when that call
        raises, ``proto_dict[proto_name]``.

    Raises:
        KeyError: If the socket lookup fails and ``proto_name`` is not a key of
            ``proto_dict``.
    """
    try:
        return socket.getprotobyname(proto_name)
    except Exception:
        # `except Exception` instead of a bare `except`: every lookup failure still
        # falls back to the dictionary, but KeyboardInterrupt / SystemExit are no
        # longer swallowed while labelling a large table.
        return proto_dict[proto_name]

def get_proto_dict(path='protocol-numbers-1-1.csv'):
    """Build a mapping from lower-cased protocol keyword to protocol number.

    The CSV at ``path`` must provide the columns ``Decimal`` and ``Keyword``
    (the IANA protocol-numbers table from
    'https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml').
    A handful of extra aliases, partly taken from
    'https://datatracker.ietf.org/doc/html/rfc1340', are added at the end.

    Processing steps:
      * the range row ``144-252`` is replaced by one row per number with the
        keyword ``unas``;
      * rows 61, 63, 68, 99 and 114 get the keyword ``any`` when their keyword
        is missing; rows 253 and 254 get
        ``Use for experimentation and testing``;
      * keywords are lower-cased and the suffix `` (deprecated)`` is removed.

    When several rows end up with the same keyword, the row with the highest
    ``Decimal`` wins because the table is sorted by ``Decimal`` before the
    dictionary is built.

    Args:
        path: Location of the protocol-numbers CSV file.

    Returns:
        dict mapping keyword (str) to protocol number (int). The placeholder
        names ``zero``, ``ib`` and ``pri-enc`` map to -1.
    """
    proto_df = pd.read_csv(path)

    # Compare codes on a string view so the masks below behave the same whether
    # pandas parsed the column as text or as integers.
    decimal_str = proto_df['Decimal'].astype(str).str.strip()
    is_single_code = decimal_str != '144-252'
    proto_df = proto_df[is_single_code].copy()
    decimal_str = decimal_str[is_single_code]

    # Placeholder keywords for rows whose Keyword cell is empty in the source table.
    experimental = 'Use for experimentation and testing'
    keyword_fill = {
        '61': 'any', '63': 'any', '68': 'any', '99': 'any', '114': 'any',
        '253': experimental, '254': experimental,
    }
    # Rows whose code is not in keyword_fill map to NaN and are left untouched.
    proto_df['Keyword'] = proto_df['Keyword'].fillna(decimal_str.map(keyword_fill))
    print(proto_df.shape)

    # Expand the 144-252 range in one step rather than concatenating row by row.
    unassigned_codes = list(range(144, 253))
    unassigned = pd.DataFrame({
        'Decimal': unassigned_codes,
        'Keyword': ['unas'] * len(unassigned_codes),
    })
    proto_df = pd.concat([proto_df, unassigned])
    print(proto_df.shape)

    proto_df['Keyword'] = proto_df['Keyword'].map(
        lambda keyword: str(keyword).lower().replace(' (deprecated)', '')
    )
    proto_df['Decimal'] = proto_df['Decimal'].map(int)

    proto_df = proto_df.sort_values('Decimal').reset_index(drop=True)
    proto_dict = proto_df.set_index('Keyword')['Decimal'].to_dict()

    # Extra aliases that are not keywords of the CSV table.
    proto_dict.update({
        'ipnip': 4,
        'st2': 5,
        'bbn-rcc': 10,
        'nvp': 11,
        'dcn': 19,
        'sep': 33,
        'mhrp': 48,
        'ipv6-no': 59,
        'aes-sp3-d': 96,
        'ipx-n-ip': 111,
        'sccopmce': 128,
    })

    # Names that are deliberately mapped to the sentinel -1.
    proto_dict.update({'zero': -1, 'ib': -1, 'pri-enc': -1})
    return proto_dict

def convert_proto_num(proto_num):
    """Collapse groups of protocol numbers onto one representative number.

    * 61, 63, 68, 99 and 114 become 114
    * 253 and 254 become 254
    * 144 through 252 (inclusive) become 252

    Any other value is returned unchanged.
    """
    if proto_num in (61, 63, 68, 99, 114):
        return 114
    if proto_num in (253, 254):
        return 254
    if 144 <= proto_num <= 252:
        return 252
    return proto_num



def read_label(path, proto_dict):
    """Load the ground-truth CSV and return it with flow-style column names.

    Eight columns are taken from the file and renamed: the two time columns
    keep their names, the address/port/protocol columns become ``src_ip``,
    ``src_port``, ``dst_ip``, ``dst_port`` and ``protocol``, and
    ``Attack category`` becomes ``label``. Each protocol name is then resolved
    to a number through ``get_proto_bysocket`` and the column is cast to int.

    Args:
        path: Location of the ground-truth CSV file.
        proto_dict: Fallback name-to-number mapping handed to
            ``get_proto_bysocket``.

    Returns:
        DataFrame with the columns ``Start time``, ``Last time``, ``src_ip``,
        ``src_port``, ``dst_ip``, ``dst_port``, ``protocol`` and ``label``.
    """
    column_map = {
        'Start time': 'Start time',
        'Last time': 'Last time',
        'Source IP': 'src_ip',
        'Source Port': 'src_port',
        'Destination IP': 'dst_ip',
        'Destination Port': 'dst_port',
        'Protocol': 'protocol',
        'Attack category': 'label',
    }

    raw = pd.read_csv(path)
    label_df = raw[list(column_map)].rename(columns=column_map)

    protocol_numbers = label_df['protocol'].apply(
        lambda name: get_proto_bysocket(name, proto_dict)
    )
    label_df['protocol'] = protocol_numbers.astype('int')
    return label_df

def convert_time(time):
    """Divide ``time`` by 1000 and truncate the quotient to an ``int``."""
    return int(time / 1000)


In [3]:
def add_directionflip(lbl):
    """Prefix ``lbl`` with ``direction_flip:`` unless it is exactly "Normal"."""
    return lbl if lbl == "Normal" else "direction_flip:" + lbl

In [4]:
 def add_label(label_df, nfs_data):
     """Attach ground-truth labels to flows, once per traffic direction.

     The flows are joined to ``label_df`` on source/destination address, port
     and protocol (left join). The join is done twice: with the label table as
     given, and with its source and destination columns swapped. Labels found
     through the swapped table are prefixed by ``add_directionflip``. Flows
     without a match are labelled ``Normal``.

     In each result, every flow whose ``id`` occurs more than once after the
     join (i.e. it matched several label rows) is removed entirely
     (``keep=False``).

     ``nfs_data`` itself is not modified: the function works on a copy, so
     calling it repeatedly on the same frame gives the same result.

     Args:
         label_df: Label table with the columns ``src_ip``, ``src_port``,
             ``dst_ip``, ``dst_port``, ``protocol`` and ``label`` (other
             columns are carried through the join).
         nfs_data: Flow table with the same five key columns plus ``id``.

     Returns:
         Tuple ``(labeled_data_1, labeled_data_2)``: the direct-direction and
         the swapped-direction result. Both contain an ``ori_protocol`` column
         holding the flow's original protocol number, while ``protocol`` holds
         the value returned by ``convert_proto_num``.
     """
     # Copy instead of writing into the caller's frame: a second call would otherwise
     # store the already-collapsed protocol numbers in `ori_protocol`.
     flows = nfs_data.assign(
         ori_protocol=nfs_data['protocol'],
         protocol=nfs_data['protocol'].apply(convert_proto_num),
     )

     mer_key = ['src_ip', 'src_port', 'dst_ip', 'dst_port', 'protocol']

     def _merge_labels(labels):
         # Left-join the flows with `labels`; unmatched flows become "Normal".
         merged = pd.merge(flows, labels, on=mer_key, how='left')
         merged['label'] = merged['label'].fillna('Normal').str.strip()
         return merged

     labeled_data_1 = _merge_labels(label_df)
     labeled_data_1 = labeled_data_1.drop_duplicates(subset=['id'], keep=False)

     # Swap the endpoints of the label table to match flows seen in the opposite direction.
     label_df_2 = label_df.rename(columns={
         'src_ip': 'dst_ip',
         'dst_ip': 'src_ip',
         'src_port': 'dst_port',
         'dst_port': 'src_port',
     })
     labeled_data_2 = _merge_labels(label_df_2)
     labeled_data_2['label'] = labeled_data_2['label'].apply(add_directionflip)
     labeled_data_2 = labeled_data_2.drop_duplicates(subset=['id'], keep=False)

     # label counts
     print('-------merge_label 1-------')
     print(labeled_data_1['label'].value_counts())
     print(labeled_data_1.shape[0])

     print('-------merge_label 2-------')
     print(labeled_data_2['label'].value_counts())
     print(labeled_data_2.shape[0])
     return labeled_data_1, labeled_data_2

In [5]:
import math
def validate_row(row):
    """Check whether a labelled flow overlaps the time window of its label.

    The flow's first-seen time is divided by 1000 and rounded up, its
    last-seen time is divided by 1000 and rounded down; both are then compared
    with the row's ``Start time`` / ``Last time`` window (bounds inclusive).

    A non-"Normal" row is accepted when
      * the rounded flow start lies inside the window, or
      * the rounded flow end lies inside the window, or
      * the flow spans the whole window (starts at or before ``Start time``
        and ends at or after ``Last time``).

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``bidirectional_first_seen_ms``, ``bidirectional_last_seen_ms``,
            ``label``, ``Start time`` and ``Last time``.

    Returns:
        ``"Check Dupp Normal"`` when the label is "Normal", otherwise
        ``"valide"`` or ``"invalid"``. The spelling "valide" is kept because
        the calling code filters on exactly that string.
    """
    flow_start = math.ceil(row['bidirectional_first_seen_ms'] / 1000)
    flow_end = math.floor(row['bidirectional_last_seen_ms'] / 1000)

    if row['label'] == "Normal":
        return "Check Dupp Normal"

    window_start = row['Start time']
    window_end = row['Last time']

    starts_inside = window_start <= flow_start <= window_end
    ends_inside = window_start <= flow_end <= window_end
    # A flow that begins before the window and ends after it has neither endpoint
    # inside the window but still covers it completely.
    spans_window = flow_start <= window_start and flow_end >= window_end

    if starts_inside or ends_inside or spans_window:
        return "valide"
    return "invalid"

In [6]:
# Build the keyword -> protocol-number lookup from the default CSV path of get_proto_dict.
# The two shapes printed below are the table size before and after the 144-252 rows are added.
proto_dict = get_proto_dict()

(148, 5)
(257, 5)


In [7]:
# Preview the first ten (keyword, number) entries of the lookup.
list(proto_dict.items())[:10]

[('hopopt', 0),
 ('icmp', 1),
 ('igmp', 2),
 ('ggp', 3),
 ('ipv4', 4),
 ('st', 5),
 ('tcp', 6),
 ('cbt', 7),
 ('egp', 8),
 ('igp', 9)]

In [8]:
# Load the ground-truth label table; protocol names are resolved to numbers with proto_dict as fallback.
label_df = read_label("NUSW-NB15_GT.csv", proto_dict)

In [9]:
# Show the first rows of the renamed label table.
label_df.head()

Unnamed: 0,Start time,Last time,src_ip,src_port,dst_ip,dst_port,protocol,label
0,1421927414,1421927416,175.45.176.0,13284,149.171.126.16,80,6,Reconnaissance
1,1421927415,1421927415,175.45.176.3,21223,149.171.126.18,32780,17,Exploits
2,1421927416,1421927416,175.45.176.2,23357,149.171.126.16,80,6,Exploits
3,1421927417,1421927417,175.45.176.2,13792,149.171.126.16,5555,6,Exploits
4,1421927418,1421927418,175.45.176.2,26939,149.171.126.10,80,6,Exploits


In [10]:
# (idle, active) pairs to process; both values are interpolated as minutes into the
# per-configuration directory name used by the labelling loop. The trailing comment
# keeps the longer list of pairs for reference.
timeouts = [(3,30)]#[(0.5,2), (1, 2), (2,2), (0.5,3), (1,3), (2, 3), (3,3), (0.5,4), (1, 4), (2,4), (3,4), (4,4), (0.5,5), (1,5), (2,5), (3,5), (4,5), (5,5), (0.5, 30), (1, 30), (2,30), (3,30), (4,30), (5,30), (10, 30), (0.5, 60), (1, 60), (2,60), (3,60), (4,60), (5,60), (10, 60)]

In [11]:
# Root directory holding one sub-directory per (idle, active) timeout configuration.
base_dir = '/home/abdelkader.elmahdaou/lustre/data_sec-um6p-st-sccs-6sevvl76uja/IDS/mahdaouy/fixed_timeouts_v2'

for idle, active in timeouts:
    out_dir = f'{base_dir}/new_idle_{idle}min_active_{active}min/UNSW-NB15'

    # glob returns paths in arbitrary order; sort them so the ids assigned below
    # (which are written to the output file) are the same on every run.
    files = sorted(glob.glob(out_dir+'/temp/*.csv'))
    if not files:
        raise FileNotFoundError(f'no flow CSV files found in {out_dir}/temp')

    data_nfs = pd.concat([pd.read_csv(file) for file in files])
    data_nfs["id"] = range(0, data_nfs.shape[0])

    labeled_data_1, labeled_data_2 = add_label(label_df, data_nfs)

    df_all = pd.concat([labeled_data_1, labeled_data_2])
    df_all["valid"] = df_all.apply(validate_row, axis=1)

    # A flow can be "Normal" in both directions: keep a single Normal row per id.
    df_all_normal = (
        df_all[df_all.label == 'Normal']
        .drop_duplicates(subset=['id'], keep='first')
    )
    # Attack rows whose time window was validated; an id validated more than once
    # is dropped completely (keep=False).
    df_all_valid_attacks = (
        df_all[df_all.valid == 'valide']
        .drop_duplicates(subset=['id'], keep=False)
    )
    # A validated attack label takes precedence over a Normal row of the same flow.
    df_all_normal = df_all_normal[~df_all_normal.id.isin(df_all_valid_attacks.id)]

    # Both parts are unique by id and disjoint here; the final de-duplication is a safeguard.
    df_labeled = (
        pd.concat([df_all_normal, df_all_valid_attacks])
        .drop_duplicates(subset=['id'], keep=False)
    )
    # Restore the protocol number the flow had before convert_proto_num was applied.
    df_labeled['protocol'] = df_labeled['ori_protocol']

    df_labeled = (
        df_labeled
        .drop(['Start time', 'Last time', 'valid', 'ori_protocol'], axis=1)
        .rename(columns={'label': 'Attack'})
    )
    df_labeled['Attack'] = df_labeled['Attack'].replace({'Backdoor': 'Backdoors', 'Normal': 'Benign'})

    # Sanity check: True when every input flow received exactly one label.
    print(df_labeled.shape[0] == data_nfs.shape[0])
    print(df_labeled.Attack.value_counts())
    df_labeled.to_csv(out_dir+"/UNSW-NB15.csv", index=False, header=True)

-------merge_label 1-------
label
Normal            1979370
Exploits            22598
Fuzzers             18451
Reconnaissance      11506
Generic              3646
DoS                  3473
Shellcode            1511
Analysis              307
Backdoor              302
Worms                 158
Backdoors              46
Name: count, dtype: int64
2041368
-------merge_label 2-------
label
Normal                           2047273
direction_flip:Exploits               27
direction_flip:Fuzzers                16
direction_flip:Reconnaissance          7
direction_flip:DoS                     6
direction_flip:Generic                 5
direction_flip:Worms                   1
Name: count, dtype: int64
2047335
True
Attack
Benign                           1985340
Exploits                           22593
Fuzzers                            18415
Reconnaissance                     11504
Generic                             3646
DoS                                 3468
Shellcode                        

In [12]:
1

1

In [13]:
1

1