In [21]:
import pandas as pd

In [39]:
def _split_list_field(raw):
    """Return the whitespace-trimmed tokens of a bracketed, comma-separated field.

    Surrounding whitespace and square brackets are removed first; an empty or
    blank field yields an empty list.
    """
    inner = raw.strip().strip('[]').strip()
    if not inner:
        return []
    # Split on the bare comma and trim each token so that both "a, b" and
    # "a,b" (or "a,  b") are tokenised the same way.
    return [token.strip() for token in inner.split(',')]


def load_log_file(file_path):
    """Parse a ';'-separated log file into a list of record tuples.

    Each useful line has exactly five ';'-separated fields:
    ``id;[timestamps];[fqdns];[statuses];output``. Lines with any other
    number of fields are skipped.

    Parameters
    ----------
    file_path : str
        Path of the log file to read.

    Returns
    -------
    list of tuple
        One ``(id, timestamps, fqdn_list, status_list, output_status)`` tuple
        per accepted line, where ``id`` and ``output_status`` are stripped
        strings, ``timestamps`` is a list of ints (empty when the field is
        empty or any token is not purely digits), and ``fqdn_list`` /
        ``status_list`` are lists of strings with surrounding quotes removed.
    """
    records = []
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split(';')

            # Malformed lines are ignored rather than treated as errors.
            if len(parts) != 5:
                continue

            record_id = parts[0].strip()

            timestamp_tokens = _split_list_field(parts[1])
            try:
                if timestamp_tokens and all(token.isdigit() for token in timestamp_tokens):
                    timestamps = [int(token) for token in timestamp_tokens]
                else:
                    # A single bad token invalidates the whole timestamp list.
                    timestamps = []
            except ValueError:
                # str.isdigit() accepts some characters int() rejects (e.g. superscripts).
                timestamps = []

            fqdn_list = [token.strip("'\"") for token in _split_list_field(parts[2])]
            status_list = [token.strip("'\"") for token in _split_list_field(parts[3])]
            output_status = parts[4].strip()

            records.append((record_id, timestamps, fqdn_list, status_list, output_status))
    return records

In [41]:
def create_dataset(data):
    """Pivot a list of record tuples into a column-oriented dictionary.

    Parameters
    ----------
    data : iterable of 5-tuples
        Each entry is ``(id, timestamps, fqdn_list, status_list, output_status)``.

    Returns
    -------
    dict
        Keys 'ID', 'Timestamp', 'FQDNs', 'Status' and 'Output', each mapping to
        a list holding that field for every entry, in input order.
    """
    columns = ('ID', 'Timestamp', 'FQDNs', 'Status', 'Output')
    dataset = {column: [] for column in columns}

    for record in data:
        # Explicit unpacking keeps the "exactly five fields" requirement.
        record_id, stamps, fqdns, statuses, label = record
        for column, value in zip(columns, (record_id, stamps, fqdns, statuses, label)):
            dataset[column].append(value)

    return dataset

In [43]:
def calculate_features(dataset):
    """Derive per-record timing and status features from a column-oriented dataset.

    Parameters
    ----------
    dataset : dict
        Mapping with the keys 'ID', 'Timestamp', 'FQDNs', 'Status' and
        'Output', each holding one value per record.

    Returns
    -------
    list of tuple
        One tuple per record:
        ``(id, nb_successive_ko, avg_iat, std_iat, min_iat, max_iat,
        nb_equal_iat, output_status)``. The inter-arrival statistics are 0
        when a record has fewer than two timestamps.
    """
    rows = []
    for row in range(len(dataset['ID'])):
        record_id = dataset['ID'][row]
        stamps = dataset['Timestamp'][row]
        fqdn_list = dataset['FQDNs'][row]
        statuses = dataset['Status'][row]
        label = dataset['Output'][row]

        # Inter-arrival times: gap between each timestamp and the previous one.
        gaps = [later - earlier for earlier, later in zip(stamps, stamps[1:])]

        # A "successive KO" is a KO that immediately follows another KO.
        ko_repeats = sum(
            1
            for previous, current in zip(statuses, statuses[1:])
            if previous == "KO" and current == "KO"
        )

        # Mean, population standard deviation, min and max of the gaps.
        if gaps:
            count = len(gaps)
            mean_gap = sum(gaps) / count
            spread = (sum((gap - mean_gap) ** 2 for gap in gaps) / count) ** 0.5
            smallest = min(gaps)
            largest = max(gaps)
        else:
            mean_gap = 0
            spread = 0
            smallest = 0
            largest = 0

        # Number of gaps identical to the gap just before them.
        repeated_gaps = sum(
            1 for before, after in zip(gaps, gaps[1:]) if after - before == 0
        )

        rows.append(
            (record_id, ko_repeats, mean_gap, spread, smallest, largest, repeated_gaps, label)
        )

    return rows

In [45]:
def main():
    """Run the whole pipeline: parse the log, derive features, preview and export.

    Reads 'dga_simulation_log.txt', prints the first rows of the raw dataset
    and of the feature table, then writes them to 'dataset.csv' and
    'features.csv' without the index column.
    """
    records = load_log_file('dga_simulation_log.txt')
    dataset = create_dataset(records)
    feature_rows = calculate_features(dataset)

    raw_columns = ('ID', 'Timestamp', 'FQDNs', 'Status', 'Output')
    dataset_df = pd.DataFrame({column: dataset[column] for column in raw_columns})

    feature_columns = [
        "Scenario",
        "Nb Successive KO",
        "Avg IAT",
        "Std IAT",
        "Min IAT",
        "Max IAT",
        "Nb Equal IAT",
        "Output",
    ]
    features_df = pd.DataFrame(feature_rows, columns=feature_columns)

    # Preview both tables before exporting them.
    for heading, frame in (("Dataset:", dataset_df), ("\nFeatures:", features_df)):
        print(heading)
        print(frame.head())

    dataset_df.to_csv('dataset.csv', index=False)
    features_df.to_csv('features.csv', index=False)

# Entry point: run the pipeline when this code is executed directly
# (it is skipped when the code is imported as a module).
if __name__ == "__main__":
    main()

Dataset:
  ID                                          Timestamp  \
0  1  [0, 422, 11277, 14205, 27878, 70512, 72948, 73...   
1  2  [0, 54172, 60885, 92304, 148272, 154550, 19054...   
2  3  [0, 36645, 60765, 64820, 109930, 151481, 16181...   
3  4  [0, 14951, 30152, 65948, 114708, 133320, 18533...   
4  5  [0, 8904, 17638, 68107, 98036, 145343, 145729,...   

                                               FQDNs  \
0  [kystrzymmhfaqcqia.an, mzhqfyook.ba, mtiwodiwm...   
1  [mtiwodiwmjqamjy2ntyz.com, mtewodiwmjqaode4ntu...   
2  [mtiwodiwmjqandi1nzi2.com, mtewodiwmjqandq0mdg...   
3  [58demanda ..rec, desainadura ..af, imperturba...   
4  [udonblletjz.de, gildtoqcxer.store, jkzybdgleu...   

                                              Status Output  
0  [KO, KO, KO, KO, OK, OK, KO, KO, KO, KO, OK, O...    dga  
1  [KO, KO, KO, KO, OK, OK, KO, KO, KO, KO, OK, O...    dga  
2  [KO, KO, KO, KO, OK, OK, KO, KO, KO, KO, OK, O...    dga  
3  [KO, KO, KO, KO, OK, OK, KO, KO, KO, KO, OK, O..