In [1]:
import os

import numpy as np
import pandas as pd
import r2pipe as r2
from tqdm import tqdm

# ---- Experiment configuration ----
# Number of bytes requested from radare2 per sample (`px* {BYTE_LENGTH}`);
# also embedded in the intermediate CSV file names.
BYTE_LENGTH = 2000
# Lower / upper bound of the n-gram range given to TfidfVectorizer.
N_GRAM_1 = 2
N_GRAM_2 = 4
# Number of most frequent families kept for the experiment.
N_WAYS = 10
# random_state used for every per-family sampling step.
SEED = 7
# Samples drawn per family for the train / test / validation splits.
# A validation size of 0 disables the validation split entirely.
NUM_EXAMPLES = 100
NUM_EXAMPLES_TEST = 30
NUM_EXAMPLES_VAL = 0

# ---- Paths ----
# CSV with one row per sample (read below).
DATASET_PATH = "./dataset/malware_original_x86_64.csv"
# Root folder of the sample files; files are looked up as
# <DATASET_FOLDER>/<first two characters of file_name>/<file_name>.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
DATASET_FOLDER = "/home/mandy900619/data/Malware202403/"
# Output folder for the per-family text files used for clustering.
CLUSTER_PATH = "./cluster_data/"
# Architecture tag used in log messages and output file names.
CPU_ARCH = "x86_64"  
# Output folder for the pickled feature matrices and labels.
EMBEDDING_PATH = "./dataset_embedding/" 
# Suffix appended to output file names. The identifier is misspelled
# ("ADDITONAL") but is referenced under this name by later cells.
ADDITONAL_INFO = "_rm_dup"

# Load the sample metadata table.
print(f"Loading {CPU_ARCH} dataset from {DATASET_PATH}...")
dataset = pd.read_csv(DATASET_PATH)


Loading x86_64 dataset from ./dataset/malware_original_x86_64.csv...


In [5]:
# extract byte sequences from ELF files
from elftools.elf.elffile import ELFFile
import concurrent.futures

# True  -> run the radare2 extraction below and write the byte-sequence CSV.
# False -> skip both the extraction and its CSV export.
notHaveByteSequence = False
# True -> re-read the byte-sequence CSV, drop rows whose byte_sequence is a
# duplicate, and write the de-duplicated CSV.
removeDup = True

def split_hex_string(hex_string):
    """Return ``hex_string`` with a space inserted after every two characters.

    A trailing odd character is kept as its own final chunk, and an empty
    input yields an empty string.
    """
    pairs = []
    for start in range(0, len(hex_string), 2):
        pairs.append(hex_string[start:start + 2])
    return " ".join(pairs)

if notHaveByteSequence:
    # Dump BYTE_LENGTH bytes of every sample with radare2 and store them in the
    # "byte_sequence" column as a space-separated string of two-character chunks.
    print(f"Extract byte sequences from {CPU_ARCH} dataset...")
    print(f"Extracting byte sequences of length {BYTE_LENGTH}...")

    for row in tqdm(dataset.itertuples(), total=len(dataset)):
        # Samples live in <DATASET_FOLDER>/<first two chars of name>/<name>.
        byteAnalysis = r2.open(DATASET_FOLDER + row.file_name[:2] + "/" + row.file_name, flags=["-2"])
        try:
            out = byteAnalysis.cmd(f"px* {BYTE_LENGTH}")
        finally:
            # Always shut the r2 session down, even if the command fails, so
            # a long run does not accumulate orphaned radare2 processes.
            byteAnalysis.quit()

        # Keep every output line except those starting with "s-", dropping the
        # first three characters and the last character of each.
        # NOTE(review): presumably this strips a "wx " prefix and a trailing
        # delimiter — confirm against the installed radare2 version.
        lines = out.strip().split("\n")
        byteSeqence = [line[3:-1] for line in lines if not line.startswith("s-")]
        byteSeqence = "".join(byteSeqence)
        byteSeqence = split_hex_string(byteSeqence)
        dataset.at[row.Index, "byte_sequence"] = byteSeqence

In [6]:
# Persist the dataset together with its freshly extracted byte_sequence column.
# Guarded by the same flag as the extraction, so nothing is written when the
# extraction was skipped.
if notHaveByteSequence:  
    OUTPUT_PATH = f"./dataset/malware_original_{CPU_ARCH}_byte_sequence{BYTE_LENGTH}_split.csv"
    dataset.to_csv(OUTPUT_PATH, index=False)


In [7]:
if removeDup:
    # Work from the byte-sequence CSV on disk rather than the in-memory frame.
    dataset = pd.read_csv(f"./dataset/malware_original_{CPU_ARCH}_byte_sequence{BYTE_LENGTH}_split.csv")
    print("Original dataset shape:", dataset.shape)

    # Keep the first row of each distinct byte_sequence and renumber the index.
    print("Removing duplicate rows based on byte_sequence...")
    dataset_rm_dup = (
        dataset
        .drop_duplicates(subset="byte_sequence")
        .reset_index(drop=True)
    )
    print("Dataset shape after removing duplicates:", dataset_rm_dup.shape)

    # Per-family sample counts after de-duplication.
    family_counts = dataset_rm_dup["family"].value_counts()
    print(family_counts[:])

    # Write the de-duplicated dataset next to the original CSV.
    OUTPUT_PATH = f"./dataset/malware_original_{CPU_ARCH}_byte_sequence{BYTE_LENGTH}_split{ADDITONAL_INFO}.csv"
    dataset_rm_dup.to_csv(OUTPUT_PATH, index=False)

                                           file_name  \
0  00001167300f0d583aff72a78a99a84a0729f3d159e03f...   
1  000071dec3aaadf9759438c65b514c5797a51943ca450e...   
2  0000e2fed3bad7d994fd0a25003269ce4531d636a21cec...   
3  00011a42ef9e77b80ed66e4f5451726a31314c48ca835f...   
4  0001cb07e51b157c2e14d8b2bfdc2e1876b4d91ba519df...   

                                md5    label                            CPU  \
0  9961981cacc112f39c0115042582f949  Malware  Advanced Micro Devices X86-64   
1  f5475dbe501bf005948235620239b543  Malware  Advanced Micro Devices X86-64   
2  d6b2df486692cbf7e088a245e5bb7aea  Malware  Advanced Micro Devices X86-64   
3  4f8080a5d1925c43aff40169b123ddb7  Malware  Advanced Micro Devices X86-64   
4  661278e864815fa95a2d81b9c7847b98  Malware  Advanced Micro Devices X86-64   

    family           first_seen      size  is_packed  packer_info  \
0  ngioweb  2024-01-31 17:32:58   96710.0      False          NaN   
1  ngioweb  2023-12-31 17:43:13  185729.0      Fal

In [8]:
# Reload the de-duplicated dataset from disk so the following cells always
# start from the persisted CSV (file name carries the ADDITONAL_INFO suffix).
dataset = pd.read_csv(f"./dataset/malware_original_{CPU_ARCH}_byte_sequence{BYTE_LENGTH}_split{ADDITONAL_INFO}.csv")

In [9]:
# Keep only the N_WAYS most frequent families.
family = dataset['family'].value_counts()[:(N_WAYS)].index
dataset_exp = dataset[dataset['family'].isin(family)]

print(dataset_exp.shape)


# Draw NUM_EXAMPLES rows per family for training. groupby(...).sample raises
# if a family has fewer rows than requested.
dataset_train = dataset_exp.groupby('family').sample(n=NUM_EXAMPLES, random_state=SEED)

# The test (and optional validation) rows are drawn only from rows that were
# not already selected, so the splits never overlap.
dataset_test = dataset_exp[~dataset_exp.index.isin(dataset_train.index)].groupby('family').sample(n=NUM_EXAMPLES_TEST, random_state=SEED)
if NUM_EXAMPLES_VAL > 0:
    dataset_val = dataset_exp[~dataset_exp.index.isin(dataset_train.index) & ~dataset_exp.index.isin(dataset_test.index)].groupby('family').sample(n=NUM_EXAMPLES_VAL, random_state=SEED)

print(dataset_train.shape)
print(dataset_test.shape)
if NUM_EXAMPLES_VAL > 0:
    print(dataset_val.shape)


# Raw inputs (byte-sequence strings) and raw targets (family names) per split.
byteSeqenceTrain = dataset_train['byte_sequence'].values
byteSeqenceTest = dataset_test['byte_sequence'].values
if NUM_EXAMPLES_VAL > 0:
    byteSeqenceVal = dataset_val['byte_sequence'].values
y_train = dataset_train['family'].values
y_test = dataset_test['family'].values
if NUM_EXAMPLES_VAL > 0:
    y_val = dataset_val['family'].values


# convert y family to number
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
if NUM_EXAMPLES_VAL > 0:
    list_ = list(y_train) + list(y_test) + list(y_val)
else:
    list_ = list(y_train) + list(y_test)
# Fit ONCE on the union of all labels, then only transform each split.
# Calling fit_transform per split would re-fit the encoder every time, so a
# split missing a family would get a different name -> integer mapping and
# le.classes_ would reflect only the last split that was encoded.
le.fit(list_)
y_train = le.transform(y_train)
y_test = le.transform(y_test)
if NUM_EXAMPLES_VAL > 0:
    y_val = le.transform(y_val)


(33994, 14)
(1000, 14)
(300, 14)


In [11]:
# extract tf-idf features
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level n-grams spanning N_GRAM_1 to N_GRAM_2 tokens, limited to the
# 1000 features selected by max_features.
tfidf_vec = TfidfVectorizer(
    analyzer='word',
    ngram_range=(N_GRAM_1, N_GRAM_2),
    max_features=1000,
)

# The vocabulary is learned from the training split only; the other splits
# are projected onto it. Results are converted to dense arrays.
tfidf_matrix_train = tfidf_vec.fit_transform(byteSeqenceTrain).toarray()
tfidf_matrix_test = tfidf_vec.transform(byteSeqenceTest).toarray()
if NUM_EXAMPLES_VAL > 0:
    tfidf_matrix_val = tfidf_vec.transform(byteSeqenceVal).toarray()

# Integer class id -> family name, in the encoder's class order.
label_mapping = dict(enumerate(le.classes_))

In [13]:
# Sanity check: report the shape of each feature matrix and the
# integer id -> family name mapping used for the labels.
print(f"Training set shape: {tfidf_matrix_train.shape}")
print(f"Testing set shape: {tfidf_matrix_test.shape}")
if NUM_EXAMPLES_VAL > 0:
    # The validation matrix only exists when a validation split was requested.
    print(f"Validation set shape: {tfidf_matrix_val.shape}")
print(f"Label mapping: {label_mapping}")

Training set shape: (1000, 1000)
Testing set shape: (300, 1000)
Label mapping: {0: 'camelot', 1: 'dropperl', 2: 'gafgyt', 3: 'mirai', 4: 'rekoobe', 5: 'sliver', 6: 'sshdoor', 7: 'tsunami', 8: 'vtflooder', 9: 'xmrig'}


In [14]:
import pickle


def _dump_pickle(name, obj):
    """Pickle ``obj`` to ``{EMBEDDING_PATH}{CPU_ARCH}_{name}{ADDITONAL_INFO}.pickle``.

    The ``with`` block closes the file, so no explicit ``close()`` is needed.
    """
    with open(f"{EMBEDDING_PATH}{CPU_ARCH}_{name}{ADDITONAL_INFO}.pickle", 'wb') as f:
        pickle.dump(obj, f)


# Make sure the output folder exists so a fresh checkout does not fail on open().
os.makedirs(EMBEDDING_PATH, exist_ok=True)

# Label mapping, feature matrices and encoded labels; the validation artefacts
# are written only when a validation split was requested.
_dump_pickle("label_mapping", label_mapping)
_dump_pickle("tfidf_vec_train", tfidf_matrix_train)
_dump_pickle("tfidf_vec_test", tfidf_matrix_test)
if NUM_EXAMPLES_VAL > 0:
    _dump_pickle("tfidf_vec_val", tfidf_matrix_val)
_dump_pickle("y_train", y_train)
_dump_pickle("y_test", y_test)
if NUM_EXAMPLES_VAL > 0:
    _dump_pickle("y_val", y_val)

In [15]:
# Cluster preprocessing: write one text file per family that contains the
# family's training vectors, one vector per line, values separated by TABs.
# Only the training split is exported; test vectors are not written here.
os.makedirs(CLUSTER_PATH, exist_ok=True)  # fresh checkouts may lack the folder
labels_train = np.asarray(y_train)
for key in label_mapping:
    outputPathTrain = f"{CLUSTER_PATH}{CPU_ARCH}_{label_mapping[key]}_train{ADDITONAL_INFO}.txt"
    # The boolean mask picks this family's rows (in their original order)
    # instead of rescanning every training row for every label.
    with open(outputPathTrain, 'w') as f:
        for vector in tfidf_matrix_train[labels_train == key]:
            f.write('\t'.join(map(str, vector)) + "\n")