In [14]:
import pandas as pd
import numpy as np
from collections import Counter

In [16]:
def load_log_file(file_path):
    """Parse a semicolon-delimited log file into a list of record tuples.

    Each usable line holds exactly five ``;``-separated fields::

        id; [t0, t1, ...]; [fqdn, ...]; [status, ...]; output

    Lines with any other number of fields are skipped.

    Args:
        file_path: Path of the text log to read.

    Returns:
        A list of ``(id, timestamps, fqdn_list, status_list, output_status)``
        tuples. ``id`` and ``output_status`` are whitespace-stripped strings.
        ``timestamps`` is a list of ints, or an empty list when the field is
        empty or any item is not made of digits. ``fqdn_list`` and
        ``status_list`` are lists of strings with surrounding single/double
        quotes removed.
    """
    def split_items(field):
        # Remove the enclosing brackets, then split on bare commas so that
        # both "a, b" and "a,b" serialisations are accepted.
        inner = field.strip().strip('[]')
        if not inner:
            return []
        return [item.strip() for item in inner.split(',')]

    data = []
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split(';')

            if len(parts) != 5:
                continue

            record_id = parts[0].strip()

            raw_timestamps = split_items(parts[1])
            try:
                if raw_timestamps and all(item.isdigit() for item in raw_timestamps):
                    timestamps = [int(item) for item in raw_timestamps]
                else:
                    # One bad item invalidates the whole timestamp list.
                    timestamps = []
            except ValueError:
                # str.isdigit() accepts some characters (e.g. superscript
                # digits) that int() rejects.
                timestamps = []

            fqdn_list = [item.strip("'\"") for item in split_items(parts[2])]
            status_list = [item.strip("'\"") for item in split_items(parts[3])]

            output_status = parts[4].strip()

            data.append((record_id, timestamps, fqdn_list, status_list, output_status))
    return data

In [18]:
def calculate_entropy(s):
    """Return the Shannon entropy, in bits, of the symbols in ``s``.

    Each distinct symbol contributes ``p * log2(p)`` where ``p`` is its
    relative frequency in ``s``; the negated total is returned. An empty
    input has no symbols and yields 0.
    """
    total = len(s)
    weighted_logs = 0
    # Counter keeps first-occurrence order, so terms are added in the same
    # order as the symbols first appear in ``s``.
    for occurrences in Counter(s).values():
        p = float(occurrences) / total
        weighted_logs = weighted_logs + p * np.log2(p)
    return -weighted_logs

In [20]:
def create_dataset(data):
    """Transpose a list of 5-tuples into a dict of parallel column lists.

    Args:
        data: Iterable of ``(id, timestamps, fqdn_list, status_list,
            output_status)`` records.

    Returns:
        A dict with keys ``'ID'``, ``'Timestamp'``, ``'FQDNs'``, ``'Status'``
        and ``'Output'``; each value is a list with one item per record, in
        input order.
    """
    column_names = ('ID', 'Timestamp', 'FQDNs', 'Status', 'Output')
    dataset = {name: [] for name in column_names}

    for record in data:
        # Explicit unpack keeps the "exactly five fields" requirement.
        record_id, stamps, fqdns, statuses, label = record
        values = (record_id, stamps, fqdns, statuses, label)
        for name, value in zip(column_names, values):
            dataset[name].append(value)

    return dataset

In [22]:
def _count_successive_ko(status_list):
    """Count "KO" entries that directly follow another "KO" entry."""
    count = 0
    previous_was_ko = False
    for status in status_list:
        is_ko = status == "KO"
        if is_ko and previous_was_ko:
            count += 1
        previous_was_ko = is_ko
    return count


def _iat_statistics(timestamps):
    """Return (mean, population std, min, max) of consecutive timestamp gaps.

    All four values are 0 when there are fewer than two timestamps.
    """
    iat = [later - earlier for earlier, later in zip(timestamps, timestamps[1:])]
    if not iat:
        return 0, 0, 0, 0
    avg_iat = sum(iat) / len(iat)
    std_iat = (sum((x - avg_iat) ** 2 for x in iat) / len(iat)) ** 0.5
    return avg_iat, std_iat, min(iat), max(iat)


def _character_class_counts(fqdn_list):
    """Return (digits, letters, non-alphanumerics) totalled over all FQDNs."""
    char_distribution = Counter()
    for fqdn in fqdn_list:
        char_distribution.update(fqdn)
    num_digits = sum(n for c, n in char_distribution.items() if c.isdigit())
    num_alpha = sum(n for c, n in char_distribution.items() if c.isalpha())
    num_special = sum(n for c, n in char_distribution.items() if not c.isalnum())
    return num_digits, num_alpha, num_special


def calculate_features(dataset):
    """Derive one feature tuple per record of a column-oriented dataset.

    Args:
        dataset: Dict with parallel lists under the keys ``'ID'``,
            ``'Timestamp'``, ``'FQDNs'``, ``'Status'`` and ``'Output'``.

    Returns:
        A list of 13-tuples, one per record, in this order:
        ``(id, nb_successive_KO, avg_iat, std_iat, min_iat, max_iat,
        num_digits, num_alpha, num_special, avg_fqdn_entropy, request_rate,
        temporal_patterns, output_status)``.

        * IAT values are statistics of the differences between consecutive
          timestamps (0 when there are fewer than two timestamps).
        * ``request_rate`` is ``len(timestamps) / (max - min)``; it is 0 when
          that span is 0 or there are no timestamps.
        * ``temporal_patterns`` is currently the mean IAT, i.e. the same
          value as ``avg_iat``.
    """
    features = []
    for i in range(len(dataset['ID'])):
        record_id = dataset['ID'][i]
        timestamps = dataset['Timestamp'][i]
        fqdn_list = dataset['FQDNs'][i]
        status_list = dataset['Status'][i]
        output_status = dataset['Output'][i]

        nb_succesive_KO = _count_successive_ko(status_list)
        avg_iat, std_iat, min_iat, max_iat = _iat_statistics(timestamps)
        num_digits, num_alpha, num_special = _character_class_counts(fqdn_list)

        # Mean per-FQDN entropy; 0 when the record has no FQDNs.
        fqdn_entropies = [calculate_entropy(fqdn) for fqdn in fqdn_list]
        avg_fqdn_entropy = sum(fqdn_entropies) / len(fqdn_entropies) if fqdn_entropies else 0

        # An empty record gets a nominal duration of 1 so the division is
        # defined; a zero-length span (single timestamp) yields a rate of 0.
        duration = (max(timestamps) - min(timestamps)) if timestamps else 1
        request_rate = len(timestamps) / duration if duration > 0 else 0

        # Placeholder temporal feature: identical to the mean IAT.
        temporal_patterns = avg_iat

        features.append((
            record_id, nb_succesive_KO, avg_iat, std_iat, min_iat, max_iat,
            num_digits, num_alpha, num_special, avg_fqdn_entropy,
            request_rate, temporal_patterns, output_status
        ))

    return features

In [24]:
def main(log_file_path='dga_simulation_log.txt',
         dataset_csv_path='dataset.csv',
         features_csv_path='features.csv'):
    """Run the full pipeline: parse the log, build features, export CSVs.

    Args:
        log_file_path: Log file passed to ``load_log_file``.
        dataset_csv_path: Destination CSV for the parsed records.
        features_csv_path: Destination CSV for the derived features.

    Side effects:
        Prints the first rows of both tables and writes the two CSV files
        (without the index column). Returns ``None``.
    """
    data = load_log_file(log_file_path)
    dataset = create_dataset(data)
    features = calculate_features(dataset)

    dataset_df = pd.DataFrame({
        'ID': dataset['ID'],
        'Timestamp': dataset['Timestamp'],
        'FQDNs': dataset['FQDNs'],
        'Status': dataset['Status'],
        'Output': dataset['Output']
    })

    # Column order must match the tuple order produced by calculate_features.
    features_df = pd.DataFrame(features, columns=[
        "Scenario", "Nb Successive KO", "Avg IAT", "Std IAT", "Min IAT", "Max IAT",
        "Num Digits", "Num Alpha", "Num Special",
        "Avg FQDN Entropy", "Request Rate", "Temporal Patterns", "Output"
    ])

    print("Dataset:")
    print(dataset_df.head())
    print("\nFeatures:")
    print(features_df.head())

    dataset_df.to_csv(dataset_csv_path, index=False)
    features_df.to_csv(features_csv_path, index=False)

# Entry point: run the pipeline when this code is executed directly. The
# printed "Dataset:" output below this cell shows the guard also passes when
# the cell is run in the notebook.
if __name__ == "__main__":
    main()

Dataset:
  ID                                          Timestamp  \
0  1  [0, 54955, 104344, 128879, 157872, 199433, 252...   
1  2  [0, 51459, 92748, 109403, 137506, 157103, 1832...   
2  3  [0, 48919, 82205, 122029, 176252, 231081, 2333...   
3  4  [0, 13702, 62165, 73898, 100539, 138132, 17751...   
4  5  [0, 17744, 44416, 89472, 93368, 136703, 141995...   

                                               FQDNs  \
0  [mtiwodiwmjqandi1nzk1.com, mtiwodiwmjqantqwmji...   
1  [mtiwodiwmjqanjg2mdg0.com, mtewodiwmjqanju5mta...   
2  [mtiwodiwmjqaotcwnzm3.com, mtewodiwmjqaotewmte...   
3  [baloncesto ..at, centelladorinso..nato, jimio...   
4  [qjnmssnpvqccb.com, mfpzomfjpmuts.gov, ugprafa...   

                                              Status Output  
0  [KO, KO, KO, KO, OK, OK, KO, KO, KO, KO, OK, O...    dga  
1  [KO, KO, KO, KO, OK, OK, KO, KO, KO, KO, OK, O...    dga  
2  [KO, KO, KO, KO, OK, OK, KO, KO, KO, KO, OK, O...    dga  
3  [KO, KO, KO, KO, OK, OK, KO, KO, KO, KO, OK, O..