In [9]:
import numpy as np
import pandas as pd
import csv
import os
import glob

In [10]:
timeouts = [0.5, 1, 2, 3, 4, 5, 6, 10, 30, 60]

### Combine all conn files of Normal class

In [11]:
def process_benign(out_dir):
    # Directory containing the conn.log files
    #out_dir = f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/ustc/Zeek/timeout{timeout}'

    files = glob.glob(out_dir+'/Benign/conn_logs/*.log')
    dfs = []
    for file in files:
        df = pd.read_csv(file, sep="\t", comment='#',
                 names=['ts', 'uid', 'id.orig_h', 'id.orig_p',
                        'id.resp_h', 'id.resp_p', 'proto', 'service',
                        'duration',  'orig_bytes', 'resp_bytes',
                        'conn_state', 'local_orig', 'local_resp',
                        'missed_bytes',  'history', 'orig_pkts',
                        'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
                        'tunnel_parents'])
        dfs.append(df)


    # Combine all DataFrames into a single DataFrame
    combined_df_normal = pd.concat(dfs, ignore_index=True)
    combined_df_normal['label'] = 'normal'

    # Store the combined DataFrame in a CSV file
    #combined_df_normal.to_csv(output_file, index=False)

    print(f'Combined {len(files)} conn.log files ...')
    return combined_df_normal


### Combine all conn files of Anomaly class

In [12]:
def process_malware(out_dir):
    # Directory containing the conn.log files
    #out_dir = f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/ustc/Zeek/timeout{timeout}'

    files = glob.glob(out_dir+'/Malware/conn_logs/*.log')
    dfs = []
    for file in files:
        parts = file.split('/')
        attack_name = parts[-1].split('_')[0]
        df = pd.read_csv(file, sep="\t", comment='#',
                 names=['ts', 'uid', 'id.orig_h', 'id.orig_p',
                        'id.resp_h', 'id.resp_p', 'proto', 'service',
                        'duration',  'orig_bytes', 'resp_bytes',
                        'conn_state', 'local_orig', 'local_resp',
                        'missed_bytes',  'history', 'orig_pkts',
                        'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
                        'tunnel_parents'])
        df['label'] = attack_name
        dfs.append(df)

    # Combine all DataFrames into a single DataFrame
    combined_df_anomaly = pd.concat(dfs, ignore_index=True)

    # Store the combined DataFrame in a CSV file
    #combined_df_anomaly.to_csv(output_file, index=False)

    print(f'Combined {len(files)} conn.log files ...')
    return combined_df_anomaly


### Combine both normal and anomaly data 

In [30]:
# Combine DataFrames
for timeout in timeouts:
    print('Processing timeout : ', timeout , '... ')
    out_dir = f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/ustc/Zeek/timeout{timeout}'
    combined_df_normal = process_benign(out_dir)
    combined_df_anomaly = process_malware(out_dir)

    combined_df = pd.concat([combined_df_normal, combined_df_anomaly], ignore_index=True)

    # Save to CSV
    combined_df.to_csv(f'{out_dir}/USTC-TFC16_zeek_{timeout}.csv', index=False, header=True)
    
    print(combined_df['label'].value_counts())
    print(f'Combined DataFrame saved to {output_file}')
    print('_____________________________________________________________')

Processing timeout :  0.5 ... 


  combined_df_normal = process_benign(out_dir)


Combined 14 conn.log files ...
Combined 10 conn.log files into ...
normal     1485640
Cridex       93409
Geodo        89810
Virut        88954
Neris        73964
Miuref       25800
Zeus         22432
Shifu        11865
Tinba        10817
Htbot        10424
Nsis-ay       8452
Name: label, dtype: int64
Combined DataFrame saved to /home/janati/Desktop/Meryem/IDS-Datasets/pcaps/ustc/ustc/ustc_normal_anomaly.csv
_____________________________________________________________
Processing timeout :  1 ... 
Combined 14 conn.log files ...
Combined 10 conn.log files into ...
normal     1485640
Cridex       91771
Virut        88711
Geodo        83366
Neris        72586
Miuref       22587
Zeus         22072
Shifu        11521
Tinba        10813
Htbot         8935
Nsis-ay       7548
Name: label, dtype: int64
Combined DataFrame saved to /home/janati/Desktop/Meryem/IDS-Datasets/pcaps/ustc/ustc/ustc_normal_anomaly.csv
_____________________________________________________________
Processing timeout :  2 .

In [13]:

print('Processing timeout default: ')
out_dir = f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/ustc/Zeek/default'
combined_df_normal = process_benign(out_dir)
combined_df_anomaly = process_malware(out_dir)

combined_df = pd.concat([combined_df_normal, combined_df_anomaly], ignore_index=True)

# Save to CSV
combined_df.to_csv(f'{out_dir}/USTC-TFC16_zeek_default.csv', index=False, header=True)

print(combined_df['label'].value_counts())
print(f'Combined DataFrame saved to {out_dir}')
print('_____________________________________________________________')

Processing timeout default: 


  combined_df_normal = process_benign(out_dir)


Combined 14 conn.log files ...
Combined 10 conn.log files ...
normal     1485640
Cridex       91771
Virut        88048
Geodo        82659
Neris        71933
Zeus         22053
Miuref       20758
Shifu        11384
Tinba        10813
Htbot         8074
Nsis-ay       7517
Name: label, dtype: int64
Combined DataFrame saved to /home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/ustc/Zeek/default
_____________________________________________________________
