In [1]:
import pandas as pd
import socket
import numpy as np
import glob
from datetime import datetime, timedelta
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
import warnings
warnings.filterwarnings("ignore")

In [2]:
timeouts = ['default', 0.5, 1, 2, 3, 4, 5, 6, 10, 30, 60]#[(0.5,2), (1, 2), (2,2), (0.5,3), (1,3), (2, 3), (3,3), (0.5,4), (1, 4), (2,4), (3,4), (4,4), (0.5,5), (1,5), (2,5), (3,5), (4,5), (5,5), (0.5, 30), (1, 30), (2,30), (3,30), (4,30), (5,30), (10, 30), (0.5, 60), (1, 60), (2,60), (3,60), (4,60), (5,60), (10, 60)]

In [37]:
def get_proto_bysocket(proto_name, proto_dict):
    try:
        proto_num = socket.getprotobyname(proto_name)

    except:
        proto_num = proto_dict[proto_name]
    return proto_num

def get_proto_dict(path='protocol-numbers-1-1.csv'):
    '''
    protocol-numbers-1.csv from 'https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml'
    and a part of proto dict from 'https://datatracker.ietf.org/doc/html/rfc1340'
    '''

    proto_df = pd.read_csv(path)
    proto_df = proto_df.drop(proto_df[proto_df['Decimal']=='144-252'].index)
    proto_df[proto_df['Decimal']=='61']=proto_df[proto_df['Decimal']=='61'].fillna('any')
    proto_df[proto_df['Decimal']=='63']=proto_df[proto_df['Decimal']=='63'].fillna('any')
    proto_df[proto_df['Decimal']=='68']=proto_df[proto_df['Decimal']=='68'].fillna('any')
    proto_df[proto_df['Decimal']=='99']=proto_df[proto_df['Decimal']=='99'].fillna('any')
    proto_df[proto_df['Decimal']=='114']=proto_df[proto_df['Decimal']=='114'].fillna('any')
    proto_df[proto_df['Decimal']=='253']=proto_df[proto_df['Decimal']=='253'].fillna('Use for experimentation and testing')
    proto_df[proto_df['Decimal']=='254']=proto_df[proto_df['Decimal']=='254'].fillna('Use for experimentation and testing')
    print(proto_df.shape)
    for i in range(144, 253):
        proto_df = pd.concat([proto_df, pd.DataFrame([{'Decimal': i, 'Keyword': 'unas'}])]) #proto_df.append({'Decimal': i, 'Keyword': 'unas'}, ignore_index=True)
    print(proto_df.shape)
    proto_df['Keyword'] = proto_df['Keyword'].apply(lambda x: str(x).lower())
    proto_df['Keyword'] = proto_df['Keyword'].apply(lambda x: str(x).replace(' (deprecated)', ''))
    proto_df['Decimal'] = proto_df['Decimal'].apply(lambda x: int(x))

    proto_df = proto_df.sort_values('Decimal')
    proto_df = proto_df.reset_index()
    proto_df = proto_df[['Keyword', 'Decimal']]

    proto_dict = proto_df.set_index('Keyword')['Decimal'].to_dict()#proto_df.set_index('Keyword').to_dict('Decimal')['Decimal']

    proto_dict['ipnip'] = 4
    proto_dict['st2'] = 5
    proto_dict['bbn-rcc'] = 10
    proto_dict['nvp'] = 11
    proto_dict['dcn'] = 19
    proto_dict['sep'] = 33
    proto_dict['mhrp'] = 48
    proto_dict['ipv6-no'] = 59
    proto_dict['aes-sp3-d'] = 96
    proto_dict['ipx-n-ip'] = 111
    proto_dict['sccopmce'] = 128

    proto_dict['zero'] = -1
    proto_dict['ib'] = -1
    proto_dict['pri-enc'] = -1
    return proto_dict

def convert_proto_num(proto_num):
    if proto_num in [61, 63, 68, 99, 114]:
        proto_num = 114
    elif proto_num == 253 or proto_num == 254:
         proto_num = 254
    elif 144 <= proto_num <= 252:
        proto_num = 252
    else:
        return proto_num
    return proto_num



def read_label(path, proto_dict):
    data = pd.read_csv(path)

    data_cols = ['Start time', 'Last time', 'Source IP', 'Source Port', 'Destination IP', 'Destination Port', 'Protocol', 'Attack category']
    col_list = ['Start time', 'Last time','id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'protocol', 'label']
    
    label_df=pd.DataFrame()
    label_df[col_list] = data[data_cols]       
    label_df['protocol'] = label_df['protocol'].apply(lambda x: get_proto_bysocket(x, proto_dict))
    label_df['protocol'] = label_df['protocol'].astype('int')
    return label_df

def convert_time(time):
    new_time = time / 1000 
    return int(new_time)


In [38]:
def add_directionflip(lbl):
    if lbl =="Normal":
        return lbl
    else:
        return "direction_flip:"+lbl

In [39]:
 def add_label(label_df, nfs_data):
        
        nfs_data['ori_protocol'] = nfs_data['protocol']
        nfs_data['protocol'] = nfs_data['protocol'].apply(lambda x: convert_proto_num(x))
        #nfs_data['timestamp'] = nfs_data['bidirectional_first_seen_ms'].apply(lambda x: convert_time(x))

        mer_key = ['id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'protocol']
        labeled_data_1 = pd.merge(nfs_data, label_df, on=mer_key, how='left')
        labeled_data_1['label'] = labeled_data_1['label'].fillna('Normal')
        labeled_data_1['label'] = labeled_data_1['label'].str.strip()
                
        labeled_data_1.drop_duplicates(subset=['id'], keep=False, inplace=True) 
        
        label_df_2 = label_df.rename(columns={'id.orig_h': 'id.resp_h', 'id.resp_h': "id.orig_h", 'id.orig_p': 'id.resp_p', 'id.resp_p': "id.orig_p"})
        labeled_data_2 = pd.merge(nfs_data, label_df_2, on=mer_key, how='left')
        labeled_data_2['label'] = labeled_data_2['label'].fillna('Normal')
        labeled_data_2['label'] = labeled_data_2['label'].str.strip()
        labeled_data_2['label'] = labeled_data_2['label'].apply(lambda x: add_directionflip(x))
        
        labeled_data_2.drop_duplicates(subset=['id'], keep=False, inplace=True) 
        # label conuts
        print('-------merge_label 1-------')
        print(labeled_data_1['label'].value_counts())
        print(labeled_data_1.shape[0])
        
        print('-------merge_label 2-------')
        print(labeled_data_2['label'].value_counts())
        print(labeled_data_2.shape[0])
        return labeled_data_1, labeled_data_2

In [40]:
import math
def validate_row(row):
    flow_start = math.ceil(row['bidirectional_first_seen_ms']/1000)
    flow_end = math.floor(row['bidirectional_last_seen_ms']/1000)
    if row['label'] != "Normal":
        condition = (flow_start >= row['Start time'] and  flow_start <= row['Last time']) or (flow_end >= row['Start time'] and  flow_end <= row['Last time'])             
        if condition:
            return "valide"
        else:
            return "invalid"
    else:
        return "Check Dupp Normal"

In [41]:
import math
def validate_row(row):
    if (row['ts'] == '-' or row['duration'] == '-') :
        pass
    else:
        flow_start = math.ceil(float(row['ts']))
        flow_end = math.ceil(float(row['ts'])+float(row['duration']))

        if row['label'] != "Normal":
            #print('flow start : ', flow_start, 'Start time : ', row['Start time'])
            #print('flow end : ', flow_end, 'last time : ', row['Last time'])
            condition = (flow_start >= row['Start time'] and  flow_start <= row['Last time']) or (flow_end >= row['Start time'] and  flow_end <= row['Last time'])             
            if condition:
                return "valide"
            else:
                return "invalid"
        else:
            return "Check Dupp Normal"

In [42]:
proto_dict = get_proto_dict()

(148, 5)
(257, 5)


In [43]:
list(proto_dict.items())[:10]

[('hopopt', 0),
 ('icmp', 1),
 ('igmp', 2),
 ('ggp', 3),
 ('ipv4', 4),
 ('st', 5),
 ('tcp', 6),
 ('cbt', 7),
 ('egp', 8),
 ('igp', 9)]

In [44]:
type(proto_dict)

dict

In [45]:
file=f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/unsw/GroundTruth/NUSW-NB15_GT.csv'
label_df = read_label(file, proto_dict)

In [46]:
label_df.head()

Unnamed: 0,Start time,Last time,id.orig_h,id.orig_p,id.resp_h,id.resp_p,protocol,label
0,1421927414,1421927416,175.45.176.0,13284,149.171.126.16,80,6,Reconnaissance
1,1421927415,1421927415,175.45.176.3,21223,149.171.126.18,32780,17,Exploits
2,1421927416,1421927416,175.45.176.2,23357,149.171.126.16,80,6,Exploits
3,1421927417,1421927417,175.45.176.2,13792,149.171.126.16,5555,6,Exploits
4,1421927418,1421927418,175.45.176.2,26939,149.171.126.10,80,6,Exploits


In [15]:
def process_data_timeout(out_dir):
    files = glob.glob(out_dir+'/conn_logs/*.log')
    dfs = []
    for file in files:
        df = pd.read_csv(file, sep="\t", comment='#',
                 names=['ts', 'uid', 'id.orig_h', 'id.orig_p',
                        'id.resp_h', 'id.resp_p', 'proto', 'service',
                        'duration',  'orig_bytes', 'resp_bytes',
                        'conn_state', 'local_orig', 'local_resp',
                        'missed_bytes',  'history', 'orig_pkts',
                        'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
                        'tunnel_parents'])
        df['protocol'] = df['proto'].apply(lambda x: get_proto_bysocket(x, proto_dict))
        df['protocol'] = df['protocol'].astype('int')
        #df['ts'] = df['ts'].apply(lambda x: '{:.0f}'.format(x) if x!='-' else x)


        dfs.append(df)

    data_nfs = pd.concat(dfs)


    data_nfs["id"] = range(0, data_nfs.shape[0])

    labeled_data_1, labeled_data_2  = add_label(label_df, data_nfs)

    df_all = pd.concat([labeled_data_1, labeled_data_2])
    df_all.to_csv('df_all.csv', index=False, header=True)
    df_all["valid"] = df_all.apply(lambda x: validate_row(x), axis = 1)

    df_all_normal = df_all[df_all.label =='Normal']
    df_all_valid_attacks = df_all[df_all.valid =='valide']
    #df_all_valid_attacks = df_all[df_all.label !='Normal']

    df_all_normal.drop_duplicates(subset=['id'], keep='first', inplace=True) 
    df_all_valid_attacks.drop_duplicates(subset=['id'], keep=False, inplace=True) 
    df_valid_ids = df_all_valid_attacks.id.tolist()
    df_all_normal = df_all_normal[~df_all_normal.id.isin(df_valid_ids)]

    df_labeled = pd.concat([df_all_normal, df_all_valid_attacks])
    df_labeled.drop_duplicates(subset=['id'], keep=False, inplace=True) 
    df_labeled['protocol'] = df_labeled['ori_protocol']

    df_labeled = df_labeled.drop(['Start time', 'Last time', 'valid', 'ori_protocol', 'protocol'], axis=1)
    df_labeled = df_labeled.rename(columns={'label': 'Attack'})
    df_labeled['Attack'] = df_labeled['Attack'].replace({'Backdoor':'Backdoors'})
    df_labeled['Attack'] = df_labeled['Attack'].replace({'Normal':'Benign'})

    print(df_labeled.shape[0] == data_nfs.shape[0])
    print(df_labeled.Attack.value_counts())
    df_labeled.to_csv(f'{out_dir}/UNSW-NB15_zeek_{timeout}.csv', index=False, header=True)
    print("_____________________________")

In [60]:
for timeout in timeouts:
    print("Processing timeout ", timeout, '...')
    if timeout == 'default':
        out_dir = f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/unsw/Zeek/22-01-2015/{timeout}/'
    else:
        out_dir = f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/unsw/Zeek/22-01-2015/timeout{timeout}/'
        
    process_data_timeout(out_dir)

Processing timeout  default ...
-------merge_label 1-------
Normal            1017649
Fuzzers              3518
Exploits             3158
Reconnaissance       1718
Generic               547
DoS                   526
Shellcode             223
Analysis               47
Backdoors              46
Worms                  24
Name: label, dtype: int64
1027456
-------merge_label 2-------
Normal                           1028309
direction_flip:Exploits                2
direction_flip:Fuzzers                 1
direction_flip:Reconnaissance          1
direction_flip:Generic                 1
direction_flip:DoS                     1
Name: label, dtype: int64
1028315
True
Benign                     1019830
Fuzzers                       3029
Exploits                      2932
Reconnaissance                1328
Generic                        473
DoS                            462
Shellcode                      157
Analysis                        47
Backdoors                       33
Worms             

In [61]:
for timeout in timeouts:
    print("Processing timeout ", timeout, '...')
    if timeout == 'default':
        out_dir = f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/unsw/Zeek/17-02-2015/{timeout}/'
    else:
        out_dir = f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/unsw/Zeek/17-02-2015/timeout{timeout}/'
        
    process_data_timeout(out_dir)

Processing timeout  default ...
-------merge_label 1-------
Normal            973543
Exploits           19462
Fuzzers            15127
Reconnaissance      9792
Generic             3103
DoS                 2948
Shellcode           1288
Backdoor             302
Analysis             262
Worms                135
Name: label, dtype: int64
1025962
-------merge_label 2-------
Normal                           1029822
direction_flip:Exploits                5
direction_flip:DoS                     4
direction_flip:Reconnaissance          2
Name: label, dtype: int64
1029833
True
Benign                           984901
Exploits                          18098
Fuzzers                           12626
Reconnaissance                     7389
Generic                            2690
DoS                                2612
Shellcode                           865
Backdoors                           281
Analysis                            261
Worms                               113
direction_flip:DoS       