<a href="https://colab.research.google.com/github/nikhil-87/ransomware-detection/blob/main/combined_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Combine Parser

In [2]:
%pip install fastparquet pyarrow

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fastparquet
Successfully installed fastparquet-2024.11.0


In [3]:
# libraries
import os
import json
import pandas as pd
from tqdm import tqdm  # Import tqdm for the progress bar
import pickle
import time  # Import time to track duration
import fastparquet
import pyarrow

# 1. API Calls Parser

This code processes JSON reports generated by Cuckoo sandbox, extracting information about API calls made during the analysis. For each report, it identifies all unique API calls under the `apistats` section and creates a feature for each, prefixed with "API:". The code generates a summary for each report, marking `1` if an API call occurred and `0` otherwise. It stores these summaries in a DataFrame, with the `sample_id` as the first column and API calls as subsequent columns. The `sample_id` is sorted numerically to ensure proper order (e.g., 10001, 10002, etc.).

In [4]:
pip install ijson


Collecting ijson
  Downloading ijson-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Downloading ijson-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (148 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/148.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.3/148.3 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ijson
Successfully installed ijson-3.4.0


In [5]:
import os
import json
import pandas as pd
from tqdm import tqdm
import ijson

def summarize_api_calls(apistats):
    """
    Summarizes API calls from the apistats section of the report.
    Returns a dictionary with keys as feature names prefixed with "API:" and values as 1.
    """
    unique_api_calls = set()
    for api_dict in apistats.values():
        unique_api_calls.update(api_dict.keys())

    summary = {f"API:{api}": 0 for api in unique_api_calls}
    for api_dict in apistats.values():
        for api in api_dict.keys():
            summary[f"API:{api}"] = 1

    return summary


def process_reports_folder(reports_folder):
    data = []
    file_ids = []

    all_files = [f for f in os.listdir(reports_folder) if f.endswith('.json')]

    for filename in tqdm(all_files, desc="Processing files", unit="file"):
        file_id_str = os.path.splitext(filename)[0]
        file_path = os.path.join(reports_folder, filename)

        with open(file_path, 'rb') as file:
            # Stream directly into apistats
            parser = ijson.items(file, 'behavior.apistats')
            apistats = next(parser, {})   # get the first object only
            summary = summarize_api_calls(apistats)

        data.append(summary)
        file_ids.append(file_id_str)

    df = pd.DataFrame(data, index=file_ids).fillna(0).astype(int).rename_axis("sample_id")
    df.sort_index(inplace=True)
    return df


if __name__ == "__main__":
    reports_folder = "json_reports"
    df1_api = process_reports_folder(reports_folder)
    display(df1_api)


Processing files: 100%|██████████| 1/1 [00:00<00:00, 27.96file/s]


Unnamed: 0_level_0,API:CreateActCtxW,API:NtSetValueKey,API:GetAdaptersAddresses,API:RegCreateKeyExA,API:NtQueryInformationFile,API:FindFirstFileExW,API:UuidCreate,API:LdrGetProcedureAddress,API:RegEnumKeyExA,API:RegSetValueExW,...,API:GetKeyState,API:GetSystemWindowsDirectoryW,API:NtOpenKey,API:RegDeleteValueW,API:CryptDecodeObjectEx,API:InternetQueryOptionA,API:DeleteFileW,API:NtOpenThread,API:GetNativeSystemInfo,API:ShellExecuteExW
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


## Functions to automate data processing

In [6]:
def rename_columns_with_numbers_index(df, start_number=1):
    """
    Rename columns (except 'sample_id') to numbers starting from the specified number.
    Args:
    - df: The input pandas DataFrame, where 'sample_id' is already set as the index.
    - start_number: The number from which the column renaming should start. Default is 1.

    Returns:
    - df: DataFrame with renamed columns.
    - column_map: Dictionary mapping numbers to original column names.
    """
    # Ensure that 'sample_id' is the index (no need to set it if already done)
    if 'sample_id' in df.index.name:
        # Get all columns except 'sample_id'
        columns_to_rename = df.columns

        # Create a dictionary mapping new column numbers to old column names
        column_map = {i + start_number: columns_to_rename[i] for i in range(len(columns_to_rename))}

        # Rename columns to start from start_number
        df.columns = [str(i + start_number) for i in range(len(columns_to_rename))]

        return df, column_map
    else:
        raise ValueError("The DataFrame does not have 'sample_id' as its index.")



def rename_columns_with_numbers(df, start_number=1):
    """
    Rename columns (except 'sample_id') to numbers starting from the specified number.
    Args:
    - df: The input pandas DataFrame.
    - start_number: The number from which the column renaming should start. Default is 1.

    Returns:
    - df: DataFrame with renamed columns.
    - column_map: Dictionary mapping numbers to original column names.
    """
    # Make sure 'sample_id' is the index, and rename only the columns after it
    df.set_index('sample_id', inplace=True)

    # Get all columns except 'sample_id'
    columns_to_rename = df.columns

    # Create a dictionary mapping new column numbers to old column names
    column_map = {i + start_number: columns_to_rename[i] for i in range(len(columns_to_rename))}

    # Rename columns to start from start_number
    df.columns = [str(i + start_number) for i in range(len(columns_to_rename))]

    return df, column_map

def save_column_map(column_map, filename):
    """
    Save the column mapping dictionary to a JSON file.
    Args:
    - column_map: Dictionary mapping numbers to original column names.
    - filename: The filename to save the dictionary.
    """
    with open(filename, 'w') as file:
        json.dump(column_map, file, indent=4)

def load_column_map(filename):
    """
    Load the column mapping dictionary from a JSON file.
    Args:
    - filename: The filename to load the dictionary from.

    Returns:
    - column_map: Dictionary mapping numbers to original column names.
    """
    with open(filename, 'r') as file:
        column_map = json.load(file)

    return column_map

def merge_column_map_to_df(df, column_map):
    """
    Merge the column map to the DataFrame to get the original column names.
    Args:
    - df: DataFrame with numbered columns.
    - column_map: Dictionary mapping numbers to original column names.

    Returns:
    - df: DataFrame with original column names restored.
    """
    # Replace numbered columns with their original names
    df = df.rename(columns={str(i): column_map[i] for i in column_map})
    return df

def print_first_and_last_10_items(dictionary):
    """
    Prints the first 10 and last 10 items of a dictionary.
    Args:
    - dictionary: The dictionary whose first and last 10 items are to be printed.
    """
    # Get the first 10 items
    first_10_items = list(dictionary.items())[:10]
    # Get the last 10 items
    last_10_items = list(dictionary.items())[-10:]

    # Print the first 10 items
    print("First 10 items:")
    for key, value in first_10_items:
        print(f"{key}: {value}")

    # Print a separator
    print("\n" + "-"*40 + "\n")

    # Print the last 10 items
    print("Last 10 items:")
    for key, value in last_10_items:
        print(f"{key}: {value}")


In [7]:
def find_constant_features(df):
    """
    Find and list the constant features (columns) in a DataFrame.
    A constant feature is a column where all values are the same.

    Args:
    - df: The input pandas DataFrame.

    Returns:
    - None: Prints a message with constant features or indicates no constant features.
    """
    # Find constant features (columns with the same value across all rows)
    constant_features = [col for col in df.columns if df[col].nunique() == 1]

    if constant_features:
        print("Constant features found:")
        for feature in constant_features:
            print(feature)
    else:
        print("No constant features found.")


In [8]:
def check_column_uniqueness(df):
    """
    Check if the column names in the DataFrame are unique or contain duplicates,
    ignoring case sensitivity by converting all column names to lowercase.

    Args:
    - df: The input pandas DataFrame.

    Returns:
    - None: Prints whether the column names are unique or contain duplicates.
    """
    # Convert all column names to lowercase
    column_names_lower = [col.lower() for col in df.columns]

    # Check if the length of column names set is equal to the length of original column names (case-insensitive)
    if len(column_names_lower) == len(set(column_names_lower)):
        print("All column names are unique.")
    else:
        print("There are duplicate column names.")

def list_and_count_duplicates(df):
    """
    List duplicate column names (case-insensitive) and count how many times each appears.

    Args:
    - df: The input pandas DataFrame.

    Returns:
    - None: Prints the duplicate column names and their counts.
    """
    # Convert all column names to lowercase for case-insensitive comparison
    column_names_lower = [col.lower() for col in df.columns]

    # Create a dictionary to count occurrences of each column name
    from collections import Counter
    column_counts = Counter(column_names_lower)

    # Filter to get only the columns that have more than 1 occurrence
    duplicates = {col: count for col, count in column_counts.items() if count > 1}

    if duplicates:
        print("Duplicate column names and their counts (case-insensitive):")
        for col, count in duplicates.items():
            print(f"{col}: {count} times")
    else:
        print("No duplicate column names found.")


In [9]:
# Call the function to find constant features
find_constant_features(df1_api)

Constant features found:
API:CreateActCtxW
API:NtSetValueKey
API:GetAdaptersAddresses
API:RegCreateKeyExA
API:NtQueryInformationFile
API:FindFirstFileExW
API:UuidCreate
API:LdrGetProcedureAddress
API:RegEnumKeyExA
API:RegSetValueExW
API:GetSystemDirectoryA
API:RegQueryInfoKeyA
API:OpenServiceW
API:NtFreeVirtualMemory
API:GetTempPathW
API:NtDeviceIoControlFile
API:DrawTextExA
API:EnumWindows
API:NtTerminateProcess
API:getsockname
API:NtDelayExecution
API:CoInitializeEx
API:NtQueryAttributesFile
API:NtClose
API:OleInitialize
API:RegQueryInfoKeyW
API:FindResourceW
API:GetVolumeNameForVolumeMountPointW
API:RegCreateKeyExW
API:RegEnumKeyW
API:CoCreateInstanceEx
API:CoCreateInstance
API:GetFileSizeEx
API:FindResourceExW
API:WriteConsoleA
API:sendto
API:__exception__
API:SetFileAttributesW
API:InternetOpenW
API:NtWriteFile
API:SetErrorMode
API:LdrGetDllHandle
API:bind
API:NtCreateSection
API:CryptHashData
API:GetSystemMetrics
API:LookupAccountSidW
API:NtQueryKey
API:CoUninitialize
API:SetUnha

In [10]:
# Call the function to check if column names are unique
check_column_uniqueness(df1_api)

All column names are unique.


In [12]:
 # Step 1: Rename columns starting from 1
df1_api2, column_map_api = rename_columns_with_numbers_index(df1_api, start_number=1)

# Step 2: Save the column map
save_column_map(column_map_api, '5_mlran_dataset/1_api_feature_names_dic.json')

In [None]:
print_first_and_last_10_items(column_map_api)

display(df1_api2.head())

df1_api2.to_csv("5_mlran_dataset/1_api_dataset.csv")

First 10 items:
1: API:GetAdaptersInfo
2: API:MessageBoxTimeoutW
3: API:NtQueryValueKey
4: API:NtOpenFile
5: API:NtProtectVirtualMemory
6: API:SetErrorMode
7: API:NtFreeVirtualMemory
8: API:CreateProcessInternalW
9: API:RegOpenKeyExW
10: API:NtOpenDirectoryObject

----------------------------------------

Last 10 items:
304: API:NtLoadKey2
305: API:NtSaveKey
306: API:NtCreateUserProcess
307: API:NtLoadDriver
308: API:InternetGetConnectedStateExW
309: API:CryptUnprotectData
310: API:WSARecvFrom
311: API:WSASendTo
312: API:DeleteUrlCacheEntryW
313: API:DnsQuery_UTF8


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,304,305,306,307,308,309,310,311,312,313
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,1,0,1,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,1,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,1,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


## Saving as a Parquet file for efficient data storage and retrieval

Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. It provides high performance compression and encoding schemes to handle complex data in bulk and is supported in many programming language and analytics tools.

References:
1. https://parquet.apache.org/
2. https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_parquet.html

In [13]:
df1_api2.to_parquet('5_mlran_dataset/1_api_dataset.parquet', compression='snappy')

# 2. Registry Keys Parser

This code processes Cuckoo sandbox reports to extract registry key operations such as `WRITTEN, READ, OPENED, DELETED`. It reads each JSON file in a specified folder, identifies registry operations from the behavior -> summary section (under sub-section: `regkey_written`, `regkey_read`, `regkey_opened`, `regkey_deleted`) and creates a dictionary where the registry operation features are labeled as REG:OPENED, REG:DELETED, REG:READ, and REG:WRITTEN for each key. It then generates a DataFrame, where each row represents a sample report and columns represent unique registry operations, with a value of 1 if the operation exists and 0 otherwise. The sample_id column is sorted numerically.

In [14]:
import ijson

# Use the C backend
ijson.backends.yajl2_c  # ensure you have it installed


<module 'ijson.backends.yajl2_c' from '/usr/local/lib/python3.12/dist-packages/ijson/backends/yajl2_c.py'>

In [15]:
pip install ijson[yajl2_c]

[0m

In [17]:

import json

def extract_registry_operations(report_path):
    """
    Extracts registry key operations (opened, deleted, read, written) from a Cuckoo report.
    Uses safe decoding to avoid UnicodeDecodeError.
    """
    registry_features = {}

    # Open in binary and decode safely
    with open(report_path, 'rb') as f:
        raw_bytes = f.read()
    text = raw_bytes.decode('utf-8', errors='ignore')  # ignore invalid bytes
    report = json.loads(text)  # load JSON from string

    behavior_summary = report.get('behavior', {}).get('summary', {})

    for regkey in behavior_summary.get('regkey_opened', []):
        registry_features[f"REG:OPENED:{regkey}"] = 1
    for regkey in behavior_summary.get('regkey_deleted', []):
        registry_features[f"REG:DELETED:{regkey}"] = 1
    for regkey in behavior_summary.get('regkey_read', []):
        registry_features[f"REG:READ:{regkey}"] = 1
    for regkey in behavior_summary.get('regkey_written', []):
        registry_features[f"REG:WRITTEN:{regkey}"] = 1

    return registry_features

def create_registry_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique registry key operation.
    The value is 1 if the operation exists in that report, otherwise 0.
    """
    # Start timing the process
    start_time = time.time()

    # List to store data for each sample
    data = []

    # Set to collect all unique feature names (registry key operations)
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        # Extract registry operations for the current report
        registry_features = extract_registry_operations(report_path)

        # Add the current sample ID and registry features to the data list
        sample_data = {"sample_id": sample_id}
        sample_data.update(registry_features)
        data.append(sample_data)

        # Add the registry features to the all_features set
        all_features.update(registry_features.keys())

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    # Ensure all features (columns) are in the DataFrame, fill missing values with 0
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)

    # Replace NaN with 0 and convert the DataFrame to integers
    df.fillna(0, inplace=True)
    df = df.astype(int)  # Ensure all values are integers (1 or 0)

    # Sort rows by sample_id in ascending order
    df['sample_id'] = df['sample_id'].astype(int)  # Convert sample_id to int for sorting
    df.sort_values(by='sample_id', inplace=True)

    # End timing the process
    end_time = time.time()

    # Calculate the total time taken and print it
    total_time = end_time - start_time
    print(f"\nTotal time to process and create the registry dataframe: {total_time:.2f} seconds")

    return df

# Implementing the code
reports_folder = "json_reports" # Folder containing Cuckoo report JSON files
df2_reg = create_registry_dataframe(reports_folder)
display(df2_reg)

Processing reports: 100%|██████████| 1/1 [00:00<00:00,  5.17file/s]



Total time to process and create the registry dataframe: 0.22 seconds


Unnamed: 0,sample_id,REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\AutoConfigURL,REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\ProxyOverride,REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\ProxyServer,REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\IntranetName,REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\ProxyBypass,REG:DELETED:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\IntranetName,REG:DELETED:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\ProxyBypass,REG:OPENED:HKEY_CLASSES_ROOT\.ade,REG:OPENED:HKEY_CLASSES_ROOT\.adp,...,REG:WRITTEN:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Tracing\mshta_RASAPI32\EnableFileTracing,REG:WRITTEN:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Tracing\mshta_RASAPI32\FileDirectory,REG:WRITTEN:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Tracing\mshta_RASAPI32\FileTracingMask,REG:WRITTEN:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Tracing\mshta_RASAPI32\MaxFileSize,REG:WRITTEN:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Tracing\mshta_RASMANCS\ConsoleTracingMask,REG:WRITTEN:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Tracing\mshta_RASMANCS\EnableConsoleTracing,REG:WRITTEN:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Tracing\mshta_RASMANCS\EnableFileTracing,REG:WRITTEN:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Tracing\mshta_RASMANCS\FileDirectory,REG:WRITTEN:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Tracing\mshta_RASMANCS\FileTracingMask,REG:WRITTEN:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Tracing\mshta_RASMANCS\MaxFileSize
0,10001,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [18]:
# Call the function to find constant features
find_constant_features(df2_reg)

Constant features found:
sample_id
REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\AutoConfigURL
REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\ProxyOverride
REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\ProxyServer
REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\IntranetName
REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\ProxyBypass
REG:DELETED:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\IntranetName
REG:DELETED:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\ProxyBypass
REG:OPENED:HKEY_CLASSES_ROOT\.ade
REG:OPENED:HKEY_CLASSES_ROOT\.adp
REG:OPENED:HKEY_CLASSES_ROOT\.app
REG:OPENED:HKEY_CLASSES_ROOT\.asp
REG:OPENED:HKEY_CLASSES_ROOT\.bas
REG:OPENED:HKEY_C

In [19]:
# Call the function to check if column names are unique
check_column_uniqueness(df2_reg)

There are duplicate column names.


In [20]:
 # Step 1: Rename columns starting from 1
df2_reg2, column_map_reg = rename_columns_with_numbers(df2_reg, start_number=314)

print_first_and_last_10_items(column_map_reg)

display(df2_reg2.head())

First 10 items:
314: REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\AutoConfigURL
315: REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\ProxyOverride
316: REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\ProxyServer
317: REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\IntranetName
318: REG:DELETED:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\ProxyBypass
319: REG:DELETED:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\IntranetName
320: REG:DELETED:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\Windows\CurrentVersion\Internet Settings\ZoneMap\ProxyBypass
321: REG:OPENED:HKEY_CLASSES_ROOT\.ade
322: REG:OPENED:HKEY_CLASSES_ROOT\.adp
323: REG:OPENED:HKEY_CLASSES_ROOT\.app

----------------------------------------

Last 10 ite

Unnamed: 0_level_0,314,315,316,317,318,319,320,321,322,323,...,1871,1872,1873,1874,1875,1876,1877,1878,1879,1880
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [21]:
# saving the feature names dictionary
save_column_map(column_map_reg, '5_mlran_dataset/2_reg_feature_names_dic.json')

In [22]:
# Saving the data
df2_reg2.to_csv("5_mlran_dataset/2_reg_dataset.csv")

In [23]:
df2_reg2.to_parquet('5_mlran_dataset/2_reg_dataset.parquet', compression='snappy')

# 3. File Operations Parser

This code processes Cuckoo report JSON files to extract file operations such as file creation, deletion, opening, and writing. It reads each report, identifies these operations from the behavior -> summary section (under sub-section, `file_created`, `file_recreated`, `file_opened`, `file_written`, `file_deleted`, `file_exists`, `file_failed`, `file_read`) and formats them into feature names like `FILE:CREATED:<filepath>`. A DataFrame is then created where each row represents a sample report, and columns represent unique file operations, with 1 indicating the presence of an operation and 0 otherwise. The sample_id column is used to identify each report, and the rows are sorted by this ID.

In [24]:
def extract_file_operations(report_path):
    """
    Extracts file operations (created, recreated, opened, written, deleted, exists, failed, read) from a Cuckoo report.
    Returns a dictionary with keys as feature names and values as 1 (since the operation exists in this report).
    """
    file_operations = {}

    try:
        with open(report_path, 'r') as f:
            report = json.load(f)

        behavior_summary = report.get('behavior', {}).get('summary', {})

        # Mapping for file operation names to the desired format
        file_op_mapping = {
            'file_created': 'CREATED',
            'file_recreated': 'RECREATED',
            'file_opened': 'OPENED',
            'file_written': 'WRITTEN',
            'file_deleted': 'DELETED',
            'file_exists': 'EXISTS',
            'file_failed': 'FAILED',
            'file_read': 'READ'
        }

        for file_op, formatted_name in file_op_mapping.items():
            for filepath in behavior_summary.get(file_op, []):
                # Construct feature name with FILE and FORMATTED NAME in uppercase
                # Convert filepath to lowercase to ensure uniqueness
                feature_name = f"FILE:{formatted_name}:{filepath.lower()}"
                file_operations[feature_name] = 1

    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error reading {report_path}: {e}")

    return file_operations

def create_file_operations_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique file operation.
    The value is 1 if the operation exists in that report, otherwise 0.
    """
    data = []
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        file_operations = extract_file_operations(report_path)

        sample_data = {"sample_id": sample_id}
        sample_data.update(file_operations)
        data.append(sample_data)

        all_features.update(file_operations.keys())

    df = pd.DataFrame(data)

    # Ensure columns are sorted and all feature names are unique (case-insensitive)
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)
    df.fillna(0, inplace=True)
    df = df.astype(int)

    df['sample_id'] = df['sample_id'].astype(int)
    df.sort_values(by='sample_id', inplace=True)

    return df

# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df3_file = create_file_operations_dataframe(reports_folder)
    display(df3_file)

Processing reports: 100%|██████████| 1/1 [00:00<00:00,  4.78file/s]


Unnamed: 0,sample_id,FILE:CREATED:c:\_r_e_a_d___t_h_i_s___e4zrh_.hta,FILE:CREATED:c:\_r_e_a_d___t_h_i_s___kf1c_.txt,FILE:CREATED:c:\tmpas6frz\_r_e_a_d___t_h_i_s___fme7q_.txt,FILE:CREATED:c:\tmpas6frz\_r_e_a_d___t_h_i_s___u4ohd_.hta,FILE:CREATED:c:\tmpas6frz\lib\api\_r_e_a_d___t_h_i_s___lqdqz7ec_.txt,FILE:CREATED:c:\tmpas6frz\lib\api\_r_e_a_d___t_h_i_s___nd2o_.hta,FILE:CREATED:c:\tmpas6frz\lib\common\_r_e_a_d___t_h_i_s___4eq4su_.txt,FILE:CREATED:c:\tmpas6frz\lib\common\_r_e_a_d___t_h_i_s___nm9uvr_.hta,FILE:CREATED:c:\tmpas6frz\lib\core\_r_e_a_d___t_h_i_s___k6tuq_.txt,...,FILE:WRITTEN:c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___rwpkix_.txt,FILE:WRITTEN:c:\users\administrator\documents\_r_e_a_d___t_h_i_s___f8bj_.txt,FILE:WRITTEN:c:\users\administrator\documents\_r_e_a_d___t_h_i_s___km45_.hta,FILE:WRITTEN:c:\users\administrator\documents\gnjxbdyhko.ppt,FILE:WRITTEN:c:\users\administrator\documents\maexbzzamvxtyx.pptx,FILE:WRITTEN:c:\users\administrator\documents\rmrokfgpietvyopm.pptx,FILE:WRITTEN:c:\users\administrator\documents\vfgwpcnjonf.ppt,FILE:WRITTEN:c:\users\administrator\documents\vglzhncbfphx.rtf,FILE:WRITTEN:c:\users\administrator\documents\xvawwrdctuhen.rtf,FILE:WRITTEN:c:\users\administrator\documents\zaxvdgipih.rtf
0,10001,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [25]:
# Call the function to find constant features
find_constant_features(df3_file)

Constant features found:
sample_id
FILE:CREATED:c:\_r_e_a_d___t_h_i_s___e4zrh_.hta
FILE:CREATED:c:\_r_e_a_d___t_h_i_s___kf1c_.txt
FILE:CREATED:c:\tmpas6frz\_r_e_a_d___t_h_i_s___fme7q_.txt
FILE:CREATED:c:\tmpas6frz\_r_e_a_d___t_h_i_s___u4ohd_.hta
FILE:CREATED:c:\tmpas6frz\lib\api\_r_e_a_d___t_h_i_s___lqdqz7ec_.txt
FILE:CREATED:c:\tmpas6frz\lib\api\_r_e_a_d___t_h_i_s___nd2o_.hta
FILE:CREATED:c:\tmpas6frz\lib\common\_r_e_a_d___t_h_i_s___4eq4su_.txt
FILE:CREATED:c:\tmpas6frz\lib\common\_r_e_a_d___t_h_i_s___nm9uvr_.hta
FILE:CREATED:c:\tmpas6frz\lib\core\_r_e_a_d___t_h_i_s___k6tuq_.txt
FILE:CREATED:c:\tmpas6frz\lib\core\_r_e_a_d___t_h_i_s___z2y416k_.hta
FILE:CREATED:c:\tmpas6frz\modules\auxiliary\_r_e_a_d___t_h_i_s___18f6_.hta
FILE:CREATED:c:\tmpas6frz\modules\auxiliary\_r_e_a_d___t_h_i_s___ulc4_.txt
FILE:CREATED:c:\tmpas6frz\modules\packages\_r_e_a_d___t_h_i_s___kfj1v_.hta
FILE:CREATED:c:\tmpas6frz\modules\packages\_r_e_a_d___t_h_i_s___vmh6_.txt
FILE:CREATED:c:\users\administrator\appdata\l

In [26]:
# Call the function to check if column names are unique
check_column_uniqueness(df3_file)

All column names are unique.


In [27]:
list_and_count_duplicates(df3_file)

No duplicate column names found.


In [28]:
 # Step 1: Rename columns starting from 525819
df3_file2, column_map_file = rename_columns_with_numbers(df3_file, start_number=525819)

print_first_and_last_10_items(column_map_file)

display(df3_file2.head())

First 10 items:
525819: FILE:CREATED:c:\_r_e_a_d___t_h_i_s___e4zrh_.hta
525820: FILE:CREATED:c:\_r_e_a_d___t_h_i_s___kf1c_.txt
525821: FILE:CREATED:c:\tmpas6frz\_r_e_a_d___t_h_i_s___fme7q_.txt
525822: FILE:CREATED:c:\tmpas6frz\_r_e_a_d___t_h_i_s___u4ohd_.hta
525823: FILE:CREATED:c:\tmpas6frz\lib\api\_r_e_a_d___t_h_i_s___lqdqz7ec_.txt
525824: FILE:CREATED:c:\tmpas6frz\lib\api\_r_e_a_d___t_h_i_s___nd2o_.hta
525825: FILE:CREATED:c:\tmpas6frz\lib\common\_r_e_a_d___t_h_i_s___4eq4su_.txt
525826: FILE:CREATED:c:\tmpas6frz\lib\common\_r_e_a_d___t_h_i_s___nm9uvr_.hta
525827: FILE:CREATED:c:\tmpas6frz\lib\core\_r_e_a_d___t_h_i_s___k6tuq_.txt
525828: FILE:CREATED:c:\tmpas6frz\lib\core\_r_e_a_d___t_h_i_s___z2y416k_.hta

----------------------------------------

Last 10 items:
526325: FILE:WRITTEN:c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___rwpkix_.txt
526326: FILE:WRITTEN:c:\users\administrator\documents\_r_e_a_d___t_h_i_s___f8bj_.txt
526327: FILE:WRITTEN:c:\users\administrator\documents\_

Unnamed: 0_level_0,525819,525820,525821,525822,525823,525824,525825,525826,525827,525828,...,526325,526326,526327,526328,526329,526330,526331,526332,526333,526334
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [29]:
# saving the feature names dictionary
save_column_map(column_map_file, '5_mlran_dataset/3_file_feature_names_dic.json')

In [30]:
# Saving the data
df3_file2.to_csv("5_mlran_dataset/3_file_dataset.csv")

In [31]:
df3_file2.to_parquet('5_mlran_dataset/3_file_dataset.parquet', compression='snappy')

# 4. Directory Operations Parser

This code extracts directory operations (such as directory creation and enumeration) from Cuckoo report JSON files. It reads each report, identifies directory-related activities from the `behavior -> summary` section, and formats them into feature names like `DIRECTORY:CREATED:<dirpath>`. It creates a DataFrame where each row represents a sample report, and the columns represent unique directory operations. A value of `1` indicates the presence of an operation, and `0` otherwise. The `sample_id` column identifies each report, and the DataFrame is sorted by `sample_id`.

In [32]:
def extract_directory_operations(report_path):
    """
    Extracts directory operations (created and enumerated) from a Cuckoo report.
    Returns a dictionary with keys as feature names and values as 1 (since the operation exists in this report).
    """
    directory_operations = {}

    try:
        with open(report_path, 'r') as f:
            report = json.load(f)

        behavior_summary = report.get('behavior', {}).get('summary', {})

        # Mapping for directory operation names to the desired format
        directory_op_mapping = {
            'directory_created': 'CREATED',
            'directory_enumerated': 'ENUMERATED'
        }

        # Process directory operations
        for dir_op, formatted_name in directory_op_mapping.items():
            for dirpath in behavior_summary.get(dir_op, []):
                feature_name = f"DIRECTORY:{formatted_name}:{dirpath.lower()}"
                directory_operations[feature_name] = 1

    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error reading {report_path}: {e}")

    return directory_operations

def create_directory_operations_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique directory operation.
    The value is 1 if the operation exists in that report, otherwise 0.
    """
    data = []
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        directory_operations = extract_directory_operations(report_path)

        sample_data = {"sample_id": sample_id}
        sample_data.update(directory_operations)
        data.append(sample_data)

        all_features.update(directory_operations.keys())

    df = pd.DataFrame(data)

    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)
    df.fillna(0, inplace=True)
    df = df.astype(int)

    df['sample_id'] = df['sample_id'].astype(int)
    df.sort_values(by='sample_id', inplace=True)

    return df


# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df4_dir = create_directory_operations_dataframe(reports_folder)
    display(df4_dir)

Processing reports: 100%|██████████| 1/1 [00:00<00:00,  4.99file/s]


Unnamed: 0,sample_id,DIRECTORY:CREATED:c:\users\administrator\appdata\local\microsoft\windows\caches,DIRECTORY:CREATED:c:\users\administrator\appdata\local\temp\73f0eaed,DIRECTORY:ENUMERATED:c:\*,DIRECTORY:ENUMERATED:c:\program files (x86)\bitcoin\*,DIRECTORY:ENUMERATED:c:\program files (x86)\excel\*,DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft sql server\*,DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\excel\*,DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\microsoft sql server\*,DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\office\*,...,DIRECTORY:ENUMERATED:c:\windows\system32\config\systemprofile\appdata\roaming\word\*,DIRECTORY:ENUMERATED:c:\windows\system32\config\systemprofile\desktop\*,DIRECTORY:ENUMERATED:c:\windows\system32\config\systemprofile\documents\*,DIRECTORY:ENUMERATED:c:\windows\system32\ping.*,DIRECTORY:ENUMERATED:c:\windows\system32\ping.com,DIRECTORY:ENUMERATED:c:\windows\system32\ping.exe,DIRECTORY:ENUMERATED:c:\windows\system32\ras\*.pbk,DIRECTORY:ENUMERATED:c:\windows\system32\taskkill.*,DIRECTORY:ENUMERATED:c:\windows\system32\taskkill.com,DIRECTORY:ENUMERATED:c:\windows\system32\taskkill.exe
0,10001,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [33]:
# Call the function to find constant features
find_constant_features(df4_dir)

# Call the function to check if column names are unique
check_column_uniqueness(df4_dir)

list_and_count_duplicates(df4_dir)

Constant features found:
sample_id
DIRECTORY:CREATED:c:\users\administrator\appdata\local\microsoft\windows\caches
DIRECTORY:CREATED:c:\users\administrator\appdata\local\temp\73f0eaed
DIRECTORY:ENUMERATED:c:\*
DIRECTORY:ENUMERATED:c:\program files (x86)\bitcoin\*
DIRECTORY:ENUMERATED:c:\program files (x86)\excel\*
DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft sql server\*
DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\excel\*
DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\microsoft sql server\*
DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\office\*
DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\onenote\*
DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\outlook\*
DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\powerpoint\*
DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\word\*
DIRECTORY:ENUMERATED:c:\program files (x86)\office\*
DIRECTORY:ENUMERATED:c:\program files (x86)\onenote\*
DIRECTORY:ENUMERATED:c:\program files (x86)\outlook\

In [34]:
# Rename columns starting
df4_dir2, column_map_dir = rename_columns_with_numbers(df4_dir, start_number=2604137)

print_first_and_last_10_items(column_map_dir)

display(df4_dir2.head())

First 10 items:
2604137: DIRECTORY:CREATED:c:\users\administrator\appdata\local\microsoft\windows\caches
2604138: DIRECTORY:CREATED:c:\users\administrator\appdata\local\temp\73f0eaed
2604139: DIRECTORY:ENUMERATED:c:\*
2604140: DIRECTORY:ENUMERATED:c:\program files (x86)\bitcoin\*
2604141: DIRECTORY:ENUMERATED:c:\program files (x86)\excel\*
2604142: DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft sql server\*
2604143: DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\excel\*
2604144: DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\microsoft sql server\*
2604145: DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\office\*
2604146: DIRECTORY:ENUMERATED:c:\program files (x86)\microsoft\onenote\*

----------------------------------------

Last 10 items:
2604363: DIRECTORY:ENUMERATED:c:\windows\system32\config\systemprofile\appdata\roaming\word\*
2604364: DIRECTORY:ENUMERATED:c:\windows\system32\config\systemprofile\desktop\*
2604365: DIRECTORY:ENUMERATED:c:\windows\system

Unnamed: 0_level_0,2604137,2604138,2604139,2604140,2604141,2604142,2604143,2604144,2604145,2604146,...,2604363,2604364,2604365,2604366,2604367,2604368,2604369,2604370,2604371,2604372
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [35]:
# saving the feature names dictionary
save_column_map(column_map_dir, '5_mlran_dataset/4_dir_feature_names_dic.json')

In [36]:
# Saving the data
df4_dir2.to_csv("5_mlran_dataset/4_dir_dataset.csv")

In [37]:
df4_dir2.to_parquet('5_mlran_dataset/4_dir_dataset.parquet', compression='snappy')

# 5. Strings Parser

This code extracts unique strings from the strings section of Cuckoo report JSON files. It creates feature names in the format `STRING:<string>` and assigns a value of 1 if the string is present in the report. The function then creates a DataFrame where each row corresponds to a sample report and each column represents a unique string. A value of 1 indicates the presence of a string, and 0 indicates its absence. The DataFrame is sorted by sample_id, which identifies each report.

In [38]:
def extract_strings(report_path):
    """
    Extracts strings from a Cuckoo report without transforming them.
    Returns a dictionary with unique strings as keys and 1 as values (indicating the presence of the string).
    """
    strings_found = {}

    try:
        with open(report_path, 'r') as f:
            report = json.load(f)

        # Get the strings section from the report
        strings_section = report.get('strings', [])

        # Use a set to ensure unique strings
        unique_strings = set(strings_section)

        # Process each unique string in the strings section
        for string in unique_strings:
            string = string.lower()
            feature_name = f"STRING:{string}"
            strings_found[feature_name] = 1

    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error reading {report_path}: {e}")

    return strings_found


def create_strings_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique string.
    The value is 1 if the string exists in that report, otherwise 0.
    """
    data = []
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        # Extract strings for the current report
        strings_found = extract_strings(report_path)

        # Add the current sample ID and strings to the data list
        sample_data = {"sample_id": sample_id}
        sample_data.update(strings_found)
        data.append(sample_data)

        # Add the strings to the all_features set
        all_features.update(strings_found.keys())

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    # Ensure all features (columns) are in the DataFrame, fill missing values with 0
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)

    # Replace NaN with 0 and convert the DataFrame to integers
    df.fillna(0, inplace=True)
    df = df.astype(int)  # Ensure all values are integers (1 or 0)

    # Sort rows by sample_id in ascending order
    df['sample_id'] = df['sample_id'].astype(int)  # Convert sample_id to int for sorting
    df.sort_values(by='sample_id', inplace=True)

    return df

# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df5_str = create_strings_dataframe(reports_folder)
    display(df5_str)

Processing reports: 100%|██████████| 1/1 [00:00<00:00,  5.34file/s]


Unnamed: 0,sample_id,STRING:#5%gg9,STRING:3)rc+,STRING:azn-x,STRING:n6p4n,STRING:rqqqttuuwwvwuuutqqr,STRING:ttttttudwwwwuuttttt,STRING:utttttuuwwwwuuttqqq,"STRING: language=""*""","STRING: name=""microsoft.windows.common-controls""",...,STRING:}qogogggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggogigooo},STRING:}qpplpolppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppplplppq,STRING:}vz'(|,STRING:}zyy}zh}o}yyy}o}yyy}o}yyy}o}yyy}o}yy}zhhhhh,STRING:}}rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr}},STRING:~hilvvvvvvvvvvvvvvvlvvvvvvvlvvvvvvvlvvvvvvvlvvvvvvvlvvvvvvvlvvvvvvvlvvvvvvvlvvvvvvvvvvvllih~,STRING:~nnk^k^mmk^kmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm,STRING:~pllggyyyggkk,STRING:~ppkkkkyyykykkkkyyyykkkkkkyyyykkkkkkkkkkkkkyykkkkkkkkkkykyyyyyyyykkkkkyyykkyyyyyyyyyymmmr,STRING:~u{yl{
0,10001,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [39]:
# Call the function to find constant features
find_constant_features(df5_str)

# Call the function to check if column names are unique
check_column_uniqueness(df5_str)

list_and_count_duplicates(df5_str)

Constant features found:
sample_id
STRING:#5%gg9
STRING:3)rc+
STRING:azn-x
STRING:n6p4n
STRING:rqqqttuuwwvwuuutqqr
STRING:ttttttudwwwwuuttttt
STRING:utttttuuwwwwuuttqqq
STRING:            language="*"
STRING:            name="microsoft.windows.common-controls"
STRING:            processorarchitecture="x86"
STRING:            publickeytoken="6595b64144ccf1df"
STRING:            type="win32"
STRING:            version="6.0.0.0"
STRING:        />
STRING:        <assemblyidentity
STRING:    </dependentassembly>
STRING:    <dependentassembly>
STRING:    name="siber.systems.roboform"
STRING:    processorarchitecture="x86"
STRING:    type="win32"
STRING:    version="5.0.0.0"
STRING: *"hbt
STRING: owwwwwwwwee
STRING: r8l4{
STRING: tjtjvlv|
STRING: xmlns="http://schemas.microsoft.com/smi/2005/windowssettings">
STRING: z@rfk
STRING: |j,hm
STRING:!f(ol|
STRING:!n)(h 
STRING:!this program cannot be run in dos mode.
STRING:!tttccccdwwwwdcccctt!
STRING:!tttccccdwwwwddctttt!
STRING:!tttttcdd

In [40]:
# Rename columns starting
df5_str2, column_map_str = rename_columns_with_numbers(df5_str, start_number=2762260)

print_first_and_last_10_items(column_map_str)

display(df5_str2.head())

First 10 items:
2762260: STRING:#5%gg9
2762261: STRING:3)rc+
2762262: STRING:azn-x
2762263: STRING:n6p4n
2762264: STRING:rqqqttuuwwvwuuutqqr
2762265: STRING:ttttttudwwwwuuttttt
2762266: STRING:utttttuuwwwwuuttqqq
2762267: STRING:            language="*"
2762268: STRING:            name="microsoft.windows.common-controls"
2762269: STRING:            processorarchitecture="x86"

----------------------------------------

Last 10 items:
2763333: STRING:}qogogggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggogigooo}
2763334: STRING:}qpplpolppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppplplppq
2763335: STRING:}vz'(|
2763336: STRING:}zyy}zh}o}yyy}o}yyy}o}yyy}o}yyy}o}yy}zhhhhh
2763337: STRING:}}rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr}}
2763338: STRING:~hilvvvvvvvvvvvvvvvlvvvvvvvlvvvvvvvlvvvvvvvlvvvvvvvlvvvvvvvlvvvvvvvlvvvvvvvlvvvvvvvvvvvllih~
2763339: STRING:~nnk^k^mmk^kmmmmmmmmmmmmmmmmmmmm

Unnamed: 0_level_0,2762260,2762261,2762262,2762263,2762264,2762265,2762266,2762267,2762268,2762269,...,2763333,2763334,2763335,2763336,2763337,2763338,2763339,2763340,2763341,2763342
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [41]:
# saving the feature names dictionary
save_column_map(column_map_str, '5_mlran_dataset/5_str_feature_names_dic.json')

In [42]:
df5_str2.to_parquet('5_mlran_dataset/5_str_dataset.parquet', compression='snappy')

In [43]:
# Saving the data
df5_str2.to_csv("5_mlran_dataset/5_str_dataset.csv")

# 6. Network

This code extracts network-related operations such as IP connections, host connections, and DNS resolutions from Cuckoo report JSON files. For each report, it creates feature names like `NETWORK:CONNECTS_IP:<ip_address>`, `NETWORK:CONNECTS_HOST:<host>`, and `NETWORK:RESOLVES_HOST:<host>` with values of `1` indicating the presence of these operations. A DataFrame is then created where each row corresponds to a report, and each column represents a unique network operation. Missing values are filled with `0`, and the rows are sorted by `sample_id`.

In [44]:
def extract_network_operations(report_path):
    """
    Extracts network operations (connects_ip, connects_host, resolves_host)
    from a Cuckoo report.
    Returns a dictionary with keys as feature names and values as 1 (since the operation exists in this report).
    """
    network_operations = {}

    try:
        with open(report_path, 'r') as f:
            report = json.load(f)

        # Get the relevant network operations from behavior -> summary
        behavior_summary = report.get('behavior', {}).get('summary', {})

        # Process each network operation category
        for net_op in ['connects_ip', 'connects_host', 'resolves_host']:
            for network_item in behavior_summary.get(net_op, []):
                network_item = network_item.lower()
                feature_name = f"NETWORK:{net_op.upper()}:{network_item}"
                network_operations[feature_name] = 1

    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error reading {report_path}: {e}")

    return network_operations


def create_network_operations_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique network operation.
    The value is 1 if the operation exists in that report, otherwise 0.
    """
    data = []
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        # Extract network operations for the current report
        network_operations = extract_network_operations(report_path)

        # Add the current sample ID and network operations to the data list
        sample_data = {"sample_id": sample_id}
        sample_data.update(network_operations)
        data.append(sample_data)

        # Add the network operations to the all_features set
        all_features.update(network_operations.keys())

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    # Ensure all features (columns) are in the DataFrame, fill missing values with 0
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)

    # Replace NaN with 0 and convert the DataFrame to integers
    df.fillna(0, inplace=True)
    df = df.astype(int)  # Ensure all values are integers (1 or 0)

    # Sort rows by sample_id in ascending order
    df['sample_id'] = df['sample_id'].astype(int)  # Convert sample_id to int for sorting
    df.sort_values(by='sample_id', inplace=True)

    return df


# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df6_net = create_network_operations_dataframe(reports_folder)
    display(df6_net)

Processing reports: 100%|██████████| 1/1 [00:00<00:00,  4.94file/s]


Unnamed: 0,sample_id,NETWORK:CONNECTS_HOST:api.blockcypher.com,NETWORK:CONNECTS_HOST:bitaps.com,NETWORK:CONNECTS_HOST:btc.blockr.io,NETWORK:CONNECTS_HOST:chain.so,NETWORK:CONNECTS_IP:127.0.0.1,NETWORK:RESOLVES_HOST:127.0.0.1,NETWORK:RESOLVES_HOST:api.blockcypher.com,NETWORK:RESOLVES_HOST:bitaps.com,NETWORK:RESOLVES_HOST:btc.blockr.io,NETWORK:RESOLVES_HOST:chain.so,NETWORK:RESOLVES_HOST:none,NETWORK:RESOLVES_HOST:wpad
0,10001,1,1,1,1,1,1,1,1,1,1,1,1


In [45]:
# Call the function to find constant features
find_constant_features(df6_net)

# Call the function to check if column names are unique
check_column_uniqueness(df6_net)

list_and_count_duplicates(df6_net)

Constant features found:
sample_id
NETWORK:CONNECTS_HOST:api.blockcypher.com
NETWORK:CONNECTS_HOST:bitaps.com
NETWORK:CONNECTS_HOST:btc.blockr.io
NETWORK:CONNECTS_HOST:chain.so
NETWORK:CONNECTS_IP:127.0.0.1
NETWORK:RESOLVES_HOST:127.0.0.1
NETWORK:RESOLVES_HOST:api.blockcypher.com
NETWORK:RESOLVES_HOST:bitaps.com
NETWORK:RESOLVES_HOST:btc.blockr.io
NETWORK:RESOLVES_HOST:chain.so
NETWORK:RESOLVES_HOST:none
NETWORK:RESOLVES_HOST:wpad
All column names are unique.
No duplicate column names found.


In [46]:
# Rename columns starting
df6_net2, column_map_net = rename_columns_with_numbers(df6_net, start_number=6394693)

print_first_and_last_10_items(column_map_net)

display(df6_net2.head())

First 10 items:
6394693: NETWORK:CONNECTS_HOST:api.blockcypher.com
6394694: NETWORK:CONNECTS_HOST:bitaps.com
6394695: NETWORK:CONNECTS_HOST:btc.blockr.io
6394696: NETWORK:CONNECTS_HOST:chain.so
6394697: NETWORK:CONNECTS_IP:127.0.0.1
6394698: NETWORK:RESOLVES_HOST:127.0.0.1
6394699: NETWORK:RESOLVES_HOST:api.blockcypher.com
6394700: NETWORK:RESOLVES_HOST:bitaps.com
6394701: NETWORK:RESOLVES_HOST:btc.blockr.io
6394702: NETWORK:RESOLVES_HOST:chain.so

----------------------------------------

Last 10 items:
6394695: NETWORK:CONNECTS_HOST:btc.blockr.io
6394696: NETWORK:CONNECTS_HOST:chain.so
6394697: NETWORK:CONNECTS_IP:127.0.0.1
6394698: NETWORK:RESOLVES_HOST:127.0.0.1
6394699: NETWORK:RESOLVES_HOST:api.blockcypher.com
6394700: NETWORK:RESOLVES_HOST:bitaps.com
6394701: NETWORK:RESOLVES_HOST:btc.blockr.io
6394702: NETWORK:RESOLVES_HOST:chain.so
6394703: NETWORK:RESOLVES_HOST:none
6394704: NETWORK:RESOLVES_HOST:wpad


Unnamed: 0_level_0,6394693,6394694,6394695,6394696,6394697,6394698,6394699,6394700,6394701,6394702,6394703,6394704
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10001,1,1,1,1,1,1,1,1,1,1,1,1


In [47]:
# saving the feature names dictionary
save_column_map(column_map_net, '5_mlran_dataset/6_net_feature_names_dic.json')

In [48]:
df6_net2.to_parquet('5_mlran_dataset/6_net_dataset.parquet', compression='snappy')

In [49]:
# Saving the data
df6_net2.to_csv("5_mlran_dataset/6_net_dataset.csv")

# 7. System

This code extracts system-related operations such as DLLs loaded, command-line executions, mutexes, and GUIDs from Cuckoo report JSON files. For each report, it generates feature names like `SYSTEM:DLL_LOADED:<dll_name>` and assigns a value of `1` if the operation is present in the report. The data is compiled into a DataFrame where each row corresponds to a report, and each column represents a unique system operation. Missing values are filled with `0`, and the DataFrame is sorted by `sample_id`.

In [50]:
def extract_system_operations(report_path):
    """
    Extracts system operations (dll_loaded, command_line, mutex, guid) from a Cuckoo report.
    Returns a dictionary with keys as feature names and values as 1 (since the operation exists in this report).
    """
    system_operations = {}

    try:
        with open(report_path, 'r') as f:
            report = json.load(f)

        # Get the relevant system operations from behavior -> summary
        behavior_summary = report.get('behavior', {}).get('summary', {})

        # Process each system operation category
        for sys_op in ['dll_loaded', 'command_line', 'mutex', 'guid']:
            for system_item in behavior_summary.get(sys_op, []):
                system_item = system_item.lower()
                # Generate the feature name directly
                feature_name = f"SYSTEM:{sys_op.upper()}:{system_item}"
                system_operations[feature_name] = 1

    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error reading {report_path}: {e}")

    return system_operations


def create_system_operations_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique system operation.
    The value is 1 if the operation exists in that report, otherwise 0.
    """
    data = []
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        # Extract system operations for the current report
        system_operations = extract_system_operations(report_path)

        # Add the current sample ID and system operations to the data list
        sample_data = {"sample_id": sample_id}
        sample_data.update(system_operations)
        data.append(sample_data)

        # Add the system operations to the all_features set
        all_features.update(system_operations.keys())

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    # Ensure all features (columns) are in the DataFrame, fill missing values with 0
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)

    # Replace NaN with 0 and convert the DataFrame to integers
    df.fillna(0, inplace=True)
    df = df.astype(int)  # Ensure all values are integers (1 or 0)

    # Sort rows by sample_id in ascending order
    df['sample_id'] = df['sample_id'].astype(int)  # Convert sample_id to int for sorting
    df.sort_values(by='sample_id', inplace=True)

    return df


# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df7_sys = create_system_operations_dataframe(reports_folder)
    display(df7_sys)

Processing reports: 100%|██████████| 1/1 [00:00<00:00,  4.49file/s]


Unnamed: 0,sample_id,"SYSTEM:COMMAND_LINE:""c:\windows\system32\notepad.exe"" c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___rwpkix_.txt","SYSTEM:COMMAND_LINE:""c:\windows\syswow64\mshta.exe"" ""c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___4c2dhxkl_.hta""",SYSTEM:COMMAND_LINE:c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___4c2dhxkl_.hta,SYSTEM:COMMAND_LINE:c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___rwpkix_.txt,SYSTEM:COMMAND_LINE:c:\windows\system32\cmd.exe,SYSTEM:COMMAND_LINE:c:\windows\system32\netsh.exe advfirewall reset,SYSTEM:COMMAND_LINE:c:\windows\system32\netsh.exe advfirewall set allprofiles state on,SYSTEM:COMMAND_LINE:ping -n 1 127.0.0.1,"SYSTEM:COMMAND_LINE:taskkill /f /im ""cerber.exe""",...,SYSTEM:MUTEX:global\3a886eb8-fe40-4d0a-b78b-9e0bcb683fb7,SYSTEM:MUTEX:iesqmmutex_0_208,SYSTEM:MUTEX:local\!ietld!mutex,SYSTEM:MUTEX:local\!privacie!sharedmemory!mutex,SYSTEM:MUTEX:local\c:!users!administrator!appdata!roaming!microsoft!windows!ietldcache!,SYSTEM:MUTEX:local\zoneattributecachecountermutex,SYSTEM:MUTEX:local\zonescachecountermutex,SYSTEM:MUTEX:local\zoneslockedcachecountermutex,SYSTEM:MUTEX:raspbfile,SYSTEM:MUTEX:shell.{bf1f13ad-4a9e-2252-cdcc-30a63128fe68}
0,10001,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [51]:
# Call the function to find constant features
find_constant_features(df7_sys)

# Call the function to check if column names are unique
check_column_uniqueness(df7_sys)

list_and_count_duplicates(df7_sys)

Constant features found:
sample_id
SYSTEM:COMMAND_LINE:"c:\windows\system32\notepad.exe" c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___rwpkix_.txt
SYSTEM:COMMAND_LINE:"c:\windows\syswow64\mshta.exe" "c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___4c2dhxkl_.hta" 
SYSTEM:COMMAND_LINE:c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___4c2dhxkl_.hta
SYSTEM:COMMAND_LINE:c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___rwpkix_.txt
SYSTEM:COMMAND_LINE:c:\windows\system32\cmd.exe
SYSTEM:COMMAND_LINE:c:\windows\system32\netsh.exe advfirewall reset
SYSTEM:COMMAND_LINE:c:\windows\system32\netsh.exe advfirewall set allprofiles state on
SYSTEM:COMMAND_LINE:ping  -n 1 127.0.0.1  
SYSTEM:COMMAND_LINE:taskkill  /f /im "cerber.exe"  
SYSTEM:DLL_LOADED:advapi32.dll
SYSTEM:DLL_LOADED:api-ms-win-core-localregistry-l1-1-0.dll
SYSTEM:DLL_LOADED:api-ms-win-security-sddl-l1-1-0.dll
SYSTEM:DLL_LOADED:api-ms-win-service-management-l1-1-0.dll
SYSTEM:DLL_LOADED:api-ms-win-service-management-l2-1-0.

In [52]:
# Rename columns starting
df7_sys2, column_map_sys = rename_columns_with_numbers(df7_sys, start_number=6399507)

print_first_and_last_10_items(column_map_sys)

display(df7_sys2.head())

First 10 items:
6399507: SYSTEM:COMMAND_LINE:"c:\windows\system32\notepad.exe" c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___rwpkix_.txt
6399508: SYSTEM:COMMAND_LINE:"c:\windows\syswow64\mshta.exe" "c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___4c2dhxkl_.hta" 
6399509: SYSTEM:COMMAND_LINE:c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___4c2dhxkl_.hta
6399510: SYSTEM:COMMAND_LINE:c:\users\administrator\desktop\_r_e_a_d___t_h_i_s___rwpkix_.txt
6399511: SYSTEM:COMMAND_LINE:c:\windows\system32\cmd.exe
6399512: SYSTEM:COMMAND_LINE:c:\windows\system32\netsh.exe advfirewall reset
6399513: SYSTEM:COMMAND_LINE:c:\windows\system32\netsh.exe advfirewall set allprofiles state on
6399514: SYSTEM:COMMAND_LINE:ping  -n 1 127.0.0.1  
6399515: SYSTEM:COMMAND_LINE:taskkill  /f /im "cerber.exe"  
6399516: SYSTEM:DLL_LOADED:advapi32.dll

----------------------------------------

Last 10 items:
6399650: SYSTEM:MUTEX:global\3a886eb8-fe40-4d0a-b78b-9e0bcb683fb7
6399651: SYSTEM:MUTEX:iesqmmutex

Unnamed: 0_level_0,6399507,6399508,6399509,6399510,6399511,6399512,6399513,6399514,6399515,6399516,...,6399650,6399651,6399652,6399653,6399654,6399655,6399656,6399657,6399658,6399659
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [53]:
# saving the feature names dictionary
save_column_map(column_map_sys, '5_mlran_dataset/7_sys_feature_names_dic.json')

In [54]:
df7_sys2.to_parquet('5_mlran_dataset/7_sys_dataset.parquet', compression='snappy')

In [55]:
# Saving the data
df7_sys2.to_csv("5_mlran_dataset/7_sys_dataset.csv")

# 8. Dropped File Extensions and Types

This code extracts dropped file information (extensions and types) from Cuckoo report JSON files. It identifies unique file extensions and types for each dropped file, then generates feature names in the format `DROP:EXTENSION:<extension>` and `DROP:TYPE:<file_type>`, assigning a value of `1` to indicate their presence. The data is stored in a DataFrame where each row represents a report and each column represents a unique dropped file extension or type. Missing values are filled with `0`, and the DataFrame is sorted by `sample_id`.

In [56]:
def extract_dropped_file_features(report_path):
    """
    Extracts dropped file extensions and types from the dropped section of a Cuckoo report.
    Returns a dictionary with keys as feature names and values as 1.
    The feature names start with 'DROP'.
    """
    dropped_features = {}

    try:
        with open(report_path, 'r') as f:
            report = json.load(f)

        # Get the dropped files section
        dropped_files = report.get('dropped', [])

        # Initialize sets for unique file extensions and types
        file_extension_set = set()
        file_type_set = set()

        # Process each dropped file
        for file_info in dropped_files:
            file_name = file_info.get('name', '')
            file_type = file_info.get('type', '')

            # Extract file extension
            if '.' in file_name:
                file_extension = file_name.split('.')[-1].lower()
                file_extension_set.add(file_extension)

            # Extract file type
            if file_type:
                file_type_set.add(file_type)

        # Create features for unique file extensions and types
        for ext in file_extension_set:
            dropped_features[f"DROP:EXTENSION:{ext.lower()}"] = 1
        for ftype in file_type_set:
            dropped_features[f"DROP:TYPE:{ftype.replace(' ', '_').lower()}"] = 1

    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error reading {report_path}: {e}")

    return dropped_features


def create_dropped_file_features_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique dropped file extension.
    The value is 1 if the extension exists in that report, otherwise 0.
    """
    data = []
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        # Extract dropped file extensions for the current report
        dropped_extensions = extract_dropped_file_features(report_path)

        # Add the current sample ID and dropped file extensions to the data list
        sample_data = {"sample_id": sample_id}
        sample_data.update(dropped_extensions)
        data.append(sample_data)

        # Add the dropped file extensions to the all_features set
        all_features.update(dropped_extensions.keys())

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    # Ensure all features (columns) are in the DataFrame, fill missing values with 0
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)

    # Replace NaN with 0 and convert the DataFrame to integers
    df.fillna(0, inplace=True)
    df = df.astype(int)  # Ensure all values are integers (1 or 0)

    # Sort rows by sample_id in ascending order
    df['sample_id'] = df['sample_id'].astype(int)  # Convert sample_id to int for sorting
    df.sort_values(by='sample_id', inplace=True)

    return df


# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df8_drop = create_dropped_file_features_dataframe(reports_folder)
    display(df8_drop)

Processing reports: 100%|██████████| 1/1 [00:00<00:00,  5.16file/s]


Unnamed: 0,sample_id,DROP:EXTENSION:8158,DROP:EXTENSION:bmp,DROP:EXTENSION:exe,DROP:EXTENSION:hta,DROP:EXTENSION:tmp,DROP:EXTENSION:txt,DROP:TYPE:a_/usr/bin/env_python_script_executable_(binary_data),"DROP:TYPE:ascii_text,_with_crlf_line_terminators","DROP:TYPE:ascii_text,_with_very_long_lines,_with_no_line_terminators",DROP:TYPE:b.out_overlay_separate_pure_segmented_executable_v2.3_v3.0_186_286_386_large_text_large_data_huge_objects_enabled,DROP:TYPE:data,DROP:TYPE:empty,"DROP:TYPE:html_document,_utf-8_unicode_text,_with_very_long_lines,_with_crlf_line_terminators","DROP:TYPE:pc_bitmap,_windows_3.x_format,_1024_x_768_x_32","DROP:TYPE:pe32_executable_(gui)_intel_80386,_for_ms_windows"
0,10001,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [57]:
# Call the function to find constant features
find_constant_features(df8_drop)

# Call the function to check if column names are unique
check_column_uniqueness(df8_drop)

list_and_count_duplicates(df8_drop)

Constant features found:
sample_id
DROP:EXTENSION:8158
DROP:EXTENSION:bmp
DROP:EXTENSION:exe
DROP:EXTENSION:hta
DROP:EXTENSION:tmp
DROP:EXTENSION:txt
DROP:TYPE:a_/usr/bin/env_python_script_executable_(binary_data)
DROP:TYPE:ascii_text,_with_crlf_line_terminators
DROP:TYPE:ascii_text,_with_very_long_lines,_with_no_line_terminators
DROP:TYPE:b.out_overlay_separate_pure_segmented_executable_v2.3_v3.0_186_286_386_large_text_large_data_huge_objects_enabled
DROP:TYPE:data
DROP:TYPE:empty
DROP:TYPE:html_document,_utf-8_unicode_text,_with_very_long_lines,_with_crlf_line_terminators
DROP:TYPE:pc_bitmap,_windows_3.x_format,_1024_x_768_x_32
DROP:TYPE:pe32_executable_(gui)_intel_80386,_for_ms_windows
All column names are unique.
No duplicate column names found.


In [58]:
# Rename columns starting
df8_drop2, column_map_drop = rename_columns_with_numbers(df8_drop, start_number=6416418)

print_first_and_last_10_items(column_map_drop)

display(df8_drop2.head())

First 10 items:
6416418: DROP:EXTENSION:8158
6416419: DROP:EXTENSION:bmp
6416420: DROP:EXTENSION:exe
6416421: DROP:EXTENSION:hta
6416422: DROP:EXTENSION:tmp
6416423: DROP:EXTENSION:txt
6416424: DROP:TYPE:a_/usr/bin/env_python_script_executable_(binary_data)
6416425: DROP:TYPE:ascii_text,_with_crlf_line_terminators
6416426: DROP:TYPE:ascii_text,_with_very_long_lines,_with_no_line_terminators
6416427: DROP:TYPE:b.out_overlay_separate_pure_segmented_executable_v2.3_v3.0_186_286_386_large_text_large_data_huge_objects_enabled

----------------------------------------

Last 10 items:
6416423: DROP:EXTENSION:txt
6416424: DROP:TYPE:a_/usr/bin/env_python_script_executable_(binary_data)
6416425: DROP:TYPE:ascii_text,_with_crlf_line_terminators
6416426: DROP:TYPE:ascii_text,_with_very_long_lines,_with_no_line_terminators
6416427: DROP:TYPE:b.out_overlay_separate_pure_segmented_executable_v2.3_v3.0_186_286_386_large_text_large_data_huge_objects_enabled
6416428: DROP:TYPE:data
6416429: DROP:TYPE:em

Unnamed: 0_level_0,6416418,6416419,6416420,6416421,6416422,6416423,6416424,6416425,6416426,6416427,6416428,6416429,6416430,6416431,6416432
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
10001,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [59]:
# saving the feature names dictionary
save_column_map(column_map_drop, '5_mlran_dataset/8_drop_feature_names_dic.json')

In [60]:
df8_drop2.to_parquet('5_mlran_dataset/8_drop_dataset.parquet', compression='snappy')

In [61]:
# Saving the data
df8_drop2.to_csv("5_mlran_dataset/8_drop_dataset.csv")

# 9. Signatures

This code extracts signature names from Cuckoo report JSON files and organizes them into a DataFrame. The function `extract_signature_operations` retrieves the signature names from each report and creates dictionary entries in the format `SIGNATURE:<signature_name>` with a value of `1` to indicate the presence of the signature. The `create_signature_operations_dataframe` function then compiles these dictionaries into a DataFrame, where each row represents a report and each column corresponds to a unique signature. Missing values are filled with `0`, and the DataFrame is sorted by `sample_id`.

In [62]:
def extract_signature_operations(report_path):
    """
    Extracts signature names from the Cuckoo report.
    Returns a dictionary with keys as signature names and values as 1 (since the signature is present).
    """
    with open(report_path, 'r') as f:
        report = json.load(f)

    signature_operations = {}

    # Get the relevant signatures from the report
    signatures = report.get('signatures', [])

    # Process each signature
    for signature in signatures:
        signature_name = f"SIGNATURE:{signature['name'].lower()}"
        signature_operations[signature_name] = 1  # Set to 1 since the signature is present

    return signature_operations


def create_signature_operations_dataframe(reports_folder):
    """
    Creates a DataFrame where each row is a report and each column is a unique signature name.
    The value is 1 if the signature is present in that report, otherwise 0.
    """
    # List to store data for each sample
    data = []

    # Set to collect all unique feature names (signature names)
    all_features = set()

    # Get the list of all JSON files in the reports folder
    report_files = [f for f in os.listdir(reports_folder) if f.endswith(".json")]

    # Loop through all JSON files with progress bar
    for report_file in tqdm(report_files, desc="Processing reports", unit="file"):
        sample_id = report_file.split(".")[0]  # Extract sample ID (e.g., 10001 from 10001.json)
        report_path = os.path.join(reports_folder, report_file)

        # Extract signature operations for the current report
        signature_operations = extract_signature_operations(report_path)

        # Add the current sample ID and signature operations to the data list
        sample_data = {"sample_id": sample_id}
        sample_data.update(signature_operations)
        data.append(sample_data)

        # Add the signature names to the all_features set
        all_features.update(signature_operations.keys())

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    # Ensure all features (columns) are in the DataFrame, fill missing values with 0
    df = df.reindex(columns=["sample_id"] + sorted(all_features), fill_value=0)

    # Replace NaN with 0 and convert the DataFrame to integers
    df.fillna(0, inplace=True)
    df = df.astype(int)  # Ensure all values are integers (1 or 0)

    # Sort rows by sample_id in ascending order
    df['sample_id'] = df['sample_id'].astype(int)  # Convert sample_id to int for sorting
    df.sort_values(by='sample_id', inplace=True)

    return df


# implementing the code
if __name__ == "__main__":
    reports_folder = "json_reports"  # Folder containing Cuckoo report JSON files
    df9_sig = create_signature_operations_dataframe(reports_folder)
    display(df9_sig)

Processing reports: 100%|██████████| 1/1 [00:00<00:00,  4.92file/s]


Unnamed: 0,sample_id,SIGNATURE:allocates_rwx,SIGNATURE:antisandbox_cuckoo_files,SIGNATURE:antivm_disk_size,SIGNATURE:antivm_memory_available,SIGNATURE:antivm_network_adapters,SIGNATURE:antivm_queries_computername,SIGNATURE:bypass_firewall,SIGNATURE:console_output,SIGNATURE:creates_doc,...,SIGNATURE:nolookup_communication,SIGNATURE:p2p_cnc,SIGNATURE:privilege_luid_check,SIGNATURE:raises_exception,SIGNATURE:ransomware_message,SIGNATURE:recon_fingerprint,SIGNATURE:stealth_window,SIGNATURE:suspicious_process,SIGNATURE:terminates_remote_process,SIGNATURE:uses_windows_utilities
0,10001,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [63]:
# Call the function to find constant features
find_constant_features(df9_sig)

# Call the function to check if column names are unique
check_column_uniqueness(df9_sig)

list_and_count_duplicates(df9_sig)

Constant features found:
sample_id
SIGNATURE:allocates_rwx
SIGNATURE:antisandbox_cuckoo_files
SIGNATURE:antivm_disk_size
SIGNATURE:antivm_memory_available
SIGNATURE:antivm_network_adapters
SIGNATURE:antivm_queries_computername
SIGNATURE:bypass_firewall
SIGNATURE:console_output
SIGNATURE:creates_doc
SIGNATURE:disables_proxy
SIGNATURE:dumped_buffer
SIGNATURE:exe_appdata
SIGNATURE:has_wmi
SIGNATURE:memdump_tor_urls
SIGNATURE:memdump_urls
SIGNATURE:modifies_proxy_wpad
SIGNATURE:moves_self
SIGNATURE:nolookup_communication
SIGNATURE:p2p_cnc
SIGNATURE:privilege_luid_check
SIGNATURE:raises_exception
SIGNATURE:ransomware_message
SIGNATURE:recon_fingerprint
SIGNATURE:stealth_window
SIGNATURE:suspicious_process
SIGNATURE:terminates_remote_process
SIGNATURE:uses_windows_utilities
All column names are unique.
No duplicate column names found.


In [64]:
# Rename columns starting
df9_sig2, column_map_sig = rename_columns_with_numbers(df9_sig, start_number=6468069)

print_first_and_last_10_items(column_map_sig)

display(df9_sig2.head())

First 10 items:
6468069: SIGNATURE:allocates_rwx
6468070: SIGNATURE:antisandbox_cuckoo_files
6468071: SIGNATURE:antivm_disk_size
6468072: SIGNATURE:antivm_memory_available
6468073: SIGNATURE:antivm_network_adapters
6468074: SIGNATURE:antivm_queries_computername
6468075: SIGNATURE:bypass_firewall
6468076: SIGNATURE:console_output
6468077: SIGNATURE:creates_doc
6468078: SIGNATURE:disables_proxy

----------------------------------------

Last 10 items:
6468086: SIGNATURE:nolookup_communication
6468087: SIGNATURE:p2p_cnc
6468088: SIGNATURE:privilege_luid_check
6468089: SIGNATURE:raises_exception
6468090: SIGNATURE:ransomware_message
6468091: SIGNATURE:recon_fingerprint
6468092: SIGNATURE:stealth_window
6468093: SIGNATURE:suspicious_process
6468094: SIGNATURE:terminates_remote_process
6468095: SIGNATURE:uses_windows_utilities


Unnamed: 0_level_0,6468069,6468070,6468071,6468072,6468073,6468074,6468075,6468076,6468077,6468078,...,6468086,6468087,6468088,6468089,6468090,6468091,6468092,6468093,6468094,6468095
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [65]:
# saving the feature names dictionary
save_column_map(column_map_sig, '5_mlran_dataset/9_sig_feature_names_dic.json')

In [66]:
df9_sig2.to_parquet('5_mlran_dataset/9_sig_dataset.parquet', compression='snappy')

In [67]:
# Saving the data
df9_sig2.to_csv("5_mlran_dataset/9_sig_dataset.csv")

# MLRan: Combined Dataset

In [69]:
import os
print(os.listdir("5_mlran_dataset"))

['5_str_dataset.parquet', '1_api_dataset.parquet', '4_dir_dataset.csv', '8_drop_dataset.csv', '9_sig_feature_names_dic.json', '9_sig_dataset.parquet', '7_sys_feature_names_dic.json', '8_drop_feature_names_dic.json', '3_file_dataset.csv', '7_sys_dataset.parquet', '9_sig_dataset.csv', '6_net_feature_names_dic.json', '1_api_feature_names_dic.json', '4_dir_feature_names_dic.json', '2_reg_feature_names_dic.json', '5_str_dataset.csv', '8_drop_dataset.parquet', '4_dir_dataset.parquet', '7_sys_dataset.csv', '2_reg_dataset.csv', '3_file_feature_names_dic.json', '3_file_dataset.parquet', '2_reg_dataset.parquet', '6_net_dataset.parquet', '6_net_dataset.csv', '5_str_feature_names_dic.json']


In [70]:
import pandas as pd

# Read parquet
df1_api2 = pd.read_parquet("5_mlran_dataset/1_api_dataset.parquet")

# Save as CSV
df1_api2.to_csv("5_mlran_dataset/1_api_dataset.csv", index=False)

# Now you can read it as CSV
df1_api2_csv = pd.read_csv("5_mlran_dataset/1_api_dataset.csv")
print(df1_api2_csv.shape)
display(df1_api2_csv.head())

(1, 157)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,148,149,150,151,152,153,154,155,156,157
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [71]:
df1_api2 = pd.read_csv("5_mlran_dataset/1_api_dataset.csv")
print("Shape of API:", df1_api2.shape)
display(df1_api2.head())

Shape of API: (1, 157)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,148,149,150,151,152,153,154,155,156,157
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [72]:
df1_api2 = pd.read_parquet('5_mlran_dataset/1_api_dataset.parquet', engine='pyarrow')  # or 'fastparquet'
df2_reg2 = pd.read_parquet('5_mlran_dataset/2_reg_dataset.parquet', engine='fastparquet')

In [73]:
df3_file2 = pd.read_parquet('5_mlran_dataset/3_file_dataset.parquet', engine='fastparquet')
df4_dir2 = pd.read_parquet('5_mlran_dataset/4_dir_dataset.parquet', engine='fastparquet')

In [None]:
#df5_str2 = pd.read_parquet('5_mlran_dataset/5_str_dataset.parquet', engine='fastparquet')
#df6_net2 = pd.read_parquet('5_mlran_dataset/6_net_dataset.parquet', engine='fastparquet')

In [None]:
#df7_sys2 = pd.read_parquet('5_mlran_dataset/7_sys_dataset.parquet', engine='fastparquet')
#df8_drop2 = pd.read_parquet('5_mlran_dataset/8_drop_dataset.parquet', engine='fastparquet')
#df9_sig2 = pd.read_parquet('5_mlran_dataset/9_sig_dataset.parquet', engine='fastparquet')

In [74]:
display('API:', df1_api2.head())
display('REG:', df2_reg2.head())
display('FILE',df3_file2.head())
display('DIR',df4_dir2.head())
display('STR',df5_str2.head())
display('NET',df6_net2.head())
display('SYS',df7_sys2.head())
display('DROP',df8_drop2.head())
display('SIG',df9_sig2.head())

'API:'

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,148,149,150,151,152,153,154,155,156,157
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


'REG:'

Unnamed: 0_level_0,314,315,316,317,318,319,320,321,322,323,...,1871,1872,1873,1874,1875,1876,1877,1878,1879,1880
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


'FILE'

Unnamed: 0_level_0,525819,525820,525821,525822,525823,525824,525825,525826,525827,525828,...,526325,526326,526327,526328,526329,526330,526331,526332,526333,526334
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


'DIR'

Unnamed: 0_level_0,2604137,2604138,2604139,2604140,2604141,2604142,2604143,2604144,2604145,2604146,...,2604363,2604364,2604365,2604366,2604367,2604368,2604369,2604370,2604371,2604372
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


'STR'

Unnamed: 0_level_0,2762260,2762261,2762262,2762263,2762264,2762265,2762266,2762267,2762268,2762269,...,2763333,2763334,2763335,2763336,2763337,2763338,2763339,2763340,2763341,2763342
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


'NET'

Unnamed: 0_level_0,6394693,6394694,6394695,6394696,6394697,6394698,6394699,6394700,6394701,6394702,6394703,6394704
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10001,1,1,1,1,1,1,1,1,1,1,1,1


'SYS'

Unnamed: 0_level_0,6399507,6399508,6399509,6399510,6399511,6399512,6399513,6399514,6399515,6399516,...,6399650,6399651,6399652,6399653,6399654,6399655,6399656,6399657,6399658,6399659
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


'DROP'

Unnamed: 0_level_0,6416418,6416419,6416420,6416421,6416422,6416423,6416424,6416425,6416426,6416427,6416428,6416429,6416430,6416431,6416432
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
10001,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


'SIG'

Unnamed: 0_level_0,6468069,6468070,6468071,6468072,6468073,6468074,6468075,6468076,6468077,6468078,...,6468086,6468087,6468088,6468089,6468090,6468091,6468092,6468093,6468094,6468095
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [77]:
dfs = [df1_api2, df2_reg2, df3_file2, df4_dir2, df5_str2, df6_net2, df7_sys2, df8_drop2, df9_sig2]

for i, df in enumerate(dfs, start=1):
    print(f"Columns in df{i}: {df.columns.tolist()}")


Columns in df1: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156',

In [78]:
for i, df in enumerate([df1_api2, df2_reg2, df3_file2, df4_dir2, df5_str2, df6_net2, df7_sys2, df8_drop2, df9_sig2], start=1):
    print(f"df{i} columns: {df.columns}")


df1 columns: Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       ...
       '148', '149', '150', '151', '152', '153', '154', '155', '156', '157'],
      dtype='object', length=157)
df2 columns: Index(['314', '315', '316', '317', '318', '319', '320', '321', '322', '323',
       ...
       '1871', '1872', '1873', '1874', '1875', '1876', '1877', '1878', '1879',
       '1880'],
      dtype='object', length=1567)
df3 columns: Index(['525819', '525820', '525821', '525822', '525823', '525824', '525825',
       '525826', '525827', '525828',
       ...
       '526325', '526326', '526327', '526328', '526329', '526330', '526331',
       '526332', '526333', '526334'],
      dtype='object', length=516)
df4 columns: Index(['2604137', '2604138', '2604139', '2604140', '2604141', '2604142',
       '2604143', '2604144', '2604145', '2604146',
       ...
       '2604363', '2604364', '2604365', '2604366', '2604367', '2604368',
       '2604369', '2604370', '2604371', '2604372'],
      dtype='ob

this is not working

In [76]:
# Convert sample_id to integer for all dataframes
dfs = [df1_api2, df2_reg2, df3_file2, df4_dir2, df5_str2, df6_net2, df7_sys2, df8_drop2, df9_sig2]

for df in dfs:
    df['sample_id'] = df['sample_id'].astype(int)  # or str if you prefer

# Now merge
df_combined = df1_api2.merge(df2_reg2, on="sample_id", how="outer") \
                     .merge(df3_file2, on="sample_id", how="outer") \
                     .merge(df4_dir2, on="sample_id", how="outer") \
                     .merge(df5_str2, on="sample_id", how="outer") \
                     .merge(df6_net2, on="sample_id", how="outer") \
                     .merge(df7_sys2, on="sample_id", how="outer") \
                     .merge(df8_drop2, on="sample_id", how="outer") \
                     .merge(df9_sig2, on="sample_id", how="outer")

# Fill missing values and convert to int if needed
df_combined.fillna(0, inplace=True)
df_combined = df_combined.astype(int)

display(df_combined.head())


KeyError: 'sample_id'

In [80]:
dfs = [df1_api2, df2_reg2, df3_file2, df4_dir2, df5_str2, df6_net2, df7_sys2, df8_drop2, df9_sig2]

for df in dfs:
    df.reset_index(drop=True, inplace=True)   # remove existing index
    df.rename(columns={df.columns[0]: "sample_id"}, inplace=True)

df_combined = dfs[0]
for df in dfs[1:]:
    df_combined = df_combined.merge(df, on="sample_id", how="outer")

df_combined.fillna(0, inplace=True)


numeric_cols = df_combined.select_dtypes(include='number').columns
df_combined[numeric_cols] = df_combined[numeric_cols].astype(int)


display(df_combined.head())



Unnamed: 0,sample_id,2,3,4,5,6,7,8,9,10,...,6468086,6468087,6468088,6468089,6468090,6468091,6468092,6468093,6468094,6468095
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [81]:
# Merge the feature names
# List of input JSON file paths
input_files = [
    '5_mlran_dataset/1_api_feature_names_dic.json',
    '5_mlran_dataset/2_reg_feature_names_dic.json',
    '5_mlran_dataset/3_file_feature_names_dic.json',
    '5_mlran_dataset/4_dir_feature_names_dic.json',
    '5_mlran_dataset/5_str_feature_names_dic.json',
    '5_mlran_dataset/6_net_feature_names_dic.json',
    '5_mlran_dataset/7_sys_feature_names_dic.json',
    '5_mlran_dataset/8_drop_feature_names_dic.json',
    '5_mlran_dataset/9_sig_feature_names_dic.json'
]

# Initialize an empty dictionary to hold the merged data
combined_data = {}

# Read each JSON file and merge its contents into the combined_data dictionary
for file in input_files:
    with open(file, 'r') as f:
        data = json.load(f)
        combined_data.update(data)  # Merging dictionaries

# Write the merged data to a new JSON file
with open('5_mlran_dataset/MLRan_feature_names_dic.json', 'w') as outfile:
    json.dump(combined_data, outfile, indent=4)

print("JSON files have been merged into 'combined.json'.")


JSON files have been merged into 'combined.json'.


In [82]:
df_combined.to_parquet('5_mlran_dataset/MLRan_dataset.parquet', compression='snappy')

In [83]:
# Saving the data
df_combined.to_csv("5_mlran_dataset/MLRan_dataset.csv", index=False)

## MLRan Labels

In [87]:
df_combined = pd.read_csv("5_mlran_dataset/MLRan_dataset.csv")
df_combined.head()

Unnamed: 0,sample_id,2,3,4,5,6,7,8,9,10,...,6468086,6468087,6468088,6468089,6468090,6468091,6468092,6468093,6468094,6468095
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [84]:
df_label = pd.read_csv("mlran_dataset_metadata.csv")
df_label.head()

FileNotFoundError: [Errno 2] No such file or directory: 'mlran_dataset_metadata.csv'

In [91]:
import pandas as pd

# Load your features
df_combined = pd.read_csv("5_mlran_dataset/MLRan_dataset.csv")

# Create dummy labels
df_labels = pd.DataFrame({
    'sample_id': df_combined['sample_id'],
    'sample_type': ['unknown'] * len(df_combined),
    'family_label': [0] * len(df_combined),
    'type_label': [0] * len(df_combined)
})

df_labels.head()



Unnamed: 0,sample_id,sample_type,family_label,type_label
0,1,unknown,0,0


In [88]:
df_labels = df_label[['sample_id', 'sample_type', 'family_label', 'type_label']]
df_labels.head()

NameError: name 'df_label' is not defined

In [92]:
df_labels.shape

(1, 4)

In [93]:
# Saving the data
df_labels.to_csv("5_mlran_dataset/MLRan_labels.csv", index=False)