### Data Loading


In [1]:
import pandas as pd
import numpy as np
import r2pipe as r2
from tqdm import tqdm

# --- Experiment configuration -------------------------------------------
BYTE_LENGTH = 2000        # interpolated into the dataset file name below
N_GRAM_1 = 2              # lower bound of the TF-IDF n-gram range
N_GRAM_2 = 4              # upper bound of the TF-IDF n-gram range
N_WAYS = 10               # number of most frequent families kept
SEED = 7                  # random_state used for every sampling call
NUM_EXAMPLES = 200        # training rows sampled per family
NUM_EXAMPLES_TEST = 30    # test rows sampled per family
CPU_ARCH = "x86_64"       # used in the dataset name, log message and output file names

# --- Paths ---------------------------------------------------------------
# The dataset file name is derived from CPU_ARCH so that the data actually
# loaded always matches the architecture announced in the log line and
# embedded in the output file names.
DATASET_PATH = f"./dataset/malware_original_{CPU_ARCH}_byte_sequence{BYTE_LENGTH}_split.csv"
# NOTE(review): absolute, machine-specific path; not referenced by the cells
# visible here. Consider making it relative/configurable.
DATASET_FOLDER = "/home/mandy900619/data/Malware202403/"
CLUSTER_PATH = "./cluster_data/"
TSNE_PATH = "./pic/"
MODEL_PATH = "./models/"
LOG_PATH = "./logs/"

# --- Load dataset --------------------------------------------------------
print(f"Loading {CPU_ARCH} dataset from {DATASET_PATH}...")
dataset = pd.read_csv(DATASET_PATH)


Loading x86_64 dataset from ./dataset/malware_original_x86_64_byte_sequence2000_split.csv...


In [2]:
# Restrict the experiment to the N_WAYS most frequent families.
family = dataset['family'].value_counts().head(N_WAYS).index
dataset_exp = dataset[dataset['family'].isin(family)]

print(dataset_exp.shape)

# Draw a fixed number of training rows per family.
# (pandas raises if a family has fewer rows than requested, since sampling
# is without replacement.)
dataset_train = dataset_exp.groupby('family').sample(n=NUM_EXAMPLES, random_state=SEED)

# Test rows come only from rows that were NOT selected for training,
# so the two splits are disjoint by index.
remaining = dataset_exp[~dataset_exp.index.isin(dataset_train.index)]
dataset_test = remaining.groupby('family').sample(n=NUM_EXAMPLES_TEST, random_state=SEED)
print(dataset_train.shape)
print(dataset_test.shape)


byteSeqenceTrain = dataset_train['byte_sequence'].values
byteSeqenceTest = dataset_test['byte_sequence'].values
y_train = dataset_train['family'].values
y_test = dataset_test['family'].values


# Convert family names to integer labels.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Fit ONCE on the union of both splits, then only transform. Calling
# fit_transform per split would refit the encoder each time, so the same
# integer could denote different families in train and test whenever the
# splits do not contain exactly the same set of classes.
le.fit(list(y_train) + list(y_test))
y_train = le.transform(y_train)
y_test = le.transform(y_test)

(119454, 14)
(2000, 14)
(300, 14)


In [3]:
# Build TF-IDF features from the byte-sequence strings.
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level n-grams spanning N_GRAM_1..N_GRAM_2 tokens, vocabulary capped
# at 1000 features.
tfidf_vec = TfidfVectorizer(
    analyzer='word',
    ngram_range=(N_GRAM_1, N_GRAM_2),
    max_features=1000,
)

# The vectorizer is fitted on the training split only; the test split is
# projected with the already-fitted vectorizer. Both results are
# densified into plain arrays.
tfidf_matrix_train = tfidf_vec.fit_transform(byteSeqenceTrain).toarray()
tfidf_matrix_test = tfidf_vec.transform(byteSeqenceTest).toarray()

# Integer label -> family name, in the encoder's class order.
label_mapping = dict(enumerate(le.classes_))

In [4]:
# Summarise the feature matrices and the integer -> family lookup table.
print("Training set shape:", tfidf_matrix_train.shape)
print("Testing set shape:", tfidf_matrix_test.shape)
print("Label mapping:", label_mapping)

Training set shape: (2000, 1000)
Testing set shape: (300, 1000)
Label mapping: {0: 'camelot', 1: 'gafgyt', 2: 'meterpreter', 3: 'mirai', 4: 'ngioweb', 5: 'rekoobe', 6: 'sliver', 7: 'sshdoor', 8: 'tsunami', 9: 'xmrig'}


In [5]:
# Dump the TF-IDF vectors per family into CLUSTER_PATH.
# File format: one sample (vector) per line, feature values separated by
# a TAB character.
import os


def _write_class_vectors(path, matrix, labels, key):
    """Write every row of `matrix` whose label equals `key` to `path`.

    Each row becomes one line of tab-separated `str()`-formatted values.
    The file is created (and truncated) even if no row matches.
    """
    selected_rows = matrix[np.asarray(labels) == key]
    with open(path, 'w') as f:
        for row in selected_rows:
            f.write('\t'.join(map(str, row)) + "\n")


# Make sure the output directory exists so a fresh checkout does not fail
# with FileNotFoundError on the first open().
os.makedirs(CLUSTER_PATH, exist_ok=True)

for key, family_name in label_mapping.items():
    outputPathTrain = f"{CLUSTER_PATH}{CPU_ARCH}_{family_name}_train.txt"
    outputPathTest = f"{CLUSTER_PATH}{CPU_ARCH}_{family_name}_test.txt"
    _write_class_vectors(outputPathTrain, tfidf_matrix_train, y_train, key)
    _write_class_vectors(outputPathTest, tfidf_matrix_test, y_test, key)
        

In [6]:
import pickle

# Persist the label lookup, the dense TF-IDF matrices and the encoded
# labels as "<CPU_ARCH>_<name>.pickle" in the current working directory.
# NOTE(review): the "tfidf_vec_*" files contain the feature matrices, not
# the fitted TfidfVectorizer object; the file names are kept unchanged
# because other code may load them by name.
artifacts = {
    "label_mapping": label_mapping,
    "tfidf_vec_train": tfidf_matrix_train,
    "tfidf_vec_test": tfidf_matrix_test,
    "y_train": y_train,
    "y_test": y_test,
}

for artifact_name, artifact in artifacts.items():
    # The `with` block closes the file on exit; no explicit close() needed.
    with open(f"{CPU_ARCH}_{artifact_name}.pickle", 'wb') as f:
        pickle.dump(artifact, f)