In [2]:
import pandas as pd
import numpy as np
import r2pipe as r2
from tqdm import tqdm

BYTE_LENGTH = 2000
N_GRAM_1 = 2
N_GRAM_2 = 4
SEED = 7
NUM_EXAMPLES = 50

# load dataset
DATASET_FOLDER = "/home/mandy900619/data/Malware202403/"
CLUSTER_PATH = "./cluster_data/"
CPU_ARCH = ["x86_64", "i386", "arm", "mips"]
DATASET_PATH = f"./dataset/"  
EMBEDDING_PATH = "./dataset_embedding/" 
ADDITONAL_INFO = "_rm_dup"

# load dataset
dataset = pd.DataFrame()
for arch in CPU_ARCH:
    print(f"Loading {arch} dataset")
    temp = pd.read_csv(f"{DATASET_PATH}malware_original_{arch}_byte_sequence{BYTE_LENGTH}_split{ADDITONAL_INFO}.csv")
    dataset = pd.concat([dataset, temp], axis=0)
dataset = dataset.reset_index(drop=True)




Loading x86_64 dataset
Loading i386 dataset
Loading arm dataset
Loading mips dataset


In [5]:
print(dataset.value_counts("CPU")) 
i386 = dataset[dataset["CPU"] == "Intel 80386"]
x86_64 = dataset[dataset["CPU"] == "Advanced Micro Devices X86-64"]
arm = dataset[dataset["CPU"] == "ARM"]
mips = dataset[dataset["CPU"] == "MIPS R3000"]  

print(i386.value_counts("family")[i386.value_counts("family")>25])
print(x86_64.value_counts("family"))
print(arm.value_counts("family"))
print(mips.value_counts("family"))


CPU
ARM                              60283
Advanced Micro Devices X86-64    34930
MIPS R3000                       31311
Intel 80386                      23293
Name: count, dtype: int64
family
mirai          14098
gafgyt          7927
tsunami          679
race              51
exploitscan       47
xorddos           41
sshdoor           33
local             32
rkit              31
sshbrute          30
kaiji             28
sliver            27
backegmm          27
dnsamp            27
prochider         26
pnscan            26
meterpreter       26
Name: count, dtype: int64
family
gafgyt          22740
mirai            4815
tsunami          4635
sliver            408
camelot           372
sshdoor           302
dropperl          247
xmrig             180
rekoobe           165
vtflooder         130
drtycow           123
pnscan             87
horsepill          78
merlin             77
prochider          56
revproxy           53
aenjaris           49
exploitscan        46
malsource          42

In [68]:
select_family = ["gafgyt", "mirai", "tsunami", "sliver", "camelot", "sshdoor", "xmrig", "rekoobe",
    "race", "exploitscan", "xorddos", "kaiji", "meterpreter", "triada"]
datasetFamily = dataset[dataset["family"].isin(select_family)]
print(datasetFamily.value_counts("family"))

i386Family = datasetFamily[datasetFamily["CPU"] == "Intel 80386"]
x86_64Family = datasetFamily[datasetFamily["CPU"] == "Advanced Micro Devices X86-64"]
armFamily = datasetFamily[datasetFamily["CPU"] == "ARM"]
mipsFamily = datasetFamily[datasetFamily["CPU"] == "MIPS R3000"]


def sample_or_all(group, n, seed):
    if len(group) <= n:
        return group
    return group.sample(n=n, random_state=seed)


i386FamilyGroup = i386Family.groupby("family", group_keys=False).apply(sample_or_all, n=NUM_EXAMPLES, seed=SEED)
x86_64FamilyGroup = x86_64Family.groupby("family", group_keys=False).apply(sample_or_all, n=NUM_EXAMPLES, seed=SEED)
armFamilyGroup = armFamily.groupby("family", group_keys=False).apply(sample_or_all, n=NUM_EXAMPLES, seed=SEED)
mipsFamilyGroup = mipsFamily.groupby("family", group_keys=False).apply(sample_or_all, n=NUM_EXAMPLES, seed=SEED)

family
mirai          87521
gafgyt         51794
tsunami         6986
sliver           435
camelot          372
sshdoor          335
xmrig            180
rekoobe          165
kaiji            120
exploitscan       93
meterpreter       66
race              51
xorddos           41
triada            28
Name: count, dtype: int64


  i386FamilyGroup = i386Family.groupby("family", group_keys=False).apply(sample_or_all, n=NUM_EXAMPLES, seed=SEED)
  x86_64FamilyGroup = x86_64Family.groupby("family", group_keys=False).apply(sample_or_all, n=NUM_EXAMPLES, seed=SEED)
  armFamilyGroup = armFamily.groupby("family", group_keys=False).apply(sample_or_all, n=NUM_EXAMPLES, seed=SEED)
  mipsFamilyGroup = mipsFamily.groupby("family", group_keys=False).apply(sample_or_all, n=NUM_EXAMPLES, seed=SEED)


In [69]:
print(i386FamilyGroup.value_counts("family"))
print(x86_64FamilyGroup.value_counts("family"))
print(armFamilyGroup.value_counts("family"))
print(mipsFamilyGroup.value_counts("family"))

family
gafgyt         50
race           50
tsunami        50
mirai          50
exploitscan    47
xorddos        41
sshdoor        33
kaiji          28
sliver         27
meterpreter    26
Name: count, dtype: int64
family
camelot        50
gafgyt         50
rekoobe        50
tsunami        50
mirai          50
sshdoor        50
sliver         50
xmrig          50
exploitscan    46
meterpreter    40
kaiji          25
Name: count, dtype: int64
family
gafgyt     50
mirai      50
tsunami    50
kaiji      34
triada     28
Name: count, dtype: int64
family
gafgyt     50
mirai      50
tsunami    50
kaiji      33
Name: count, dtype: int64


In [70]:
print(len(i386FamilyGroup))
print(len(x86_64FamilyGroup))
print(len(armFamilyGroup))
print(len(mipsFamilyGroup))

402
511
212
183


In [71]:
i386Train = i386FamilyGroup.groupby('family').sample(frac=0.6, random_state=SEED)
i386Test = i386FamilyGroup.drop(i386Train.index)
x86_64Train = x86_64FamilyGroup.groupby('family').sample(frac=0.6, random_state=SEED)
x86_64Test = x86_64FamilyGroup.drop(x86_64Train.index)
armTrain = armFamilyGroup.groupby('family').sample(frac=0.6, random_state=SEED)
armTest = armFamilyGroup.drop(armTrain.index)
mipsTrain = mipsFamilyGroup.groupby('family').sample(frac=0.6, random_state=SEED)
mipsTest = mipsFamilyGroup.drop(mipsTrain.index)

In [74]:
datasetTrain = pd.concat([i386Train, x86_64Train, armTrain, mipsTrain], axis=0)
datasetTrain = datasetTrain.sort_values(by=['family'])
datasetTest = pd.concat([i386Test, x86_64Test, armTest, mipsTest], axis=0)
datasetTest = datasetTest.sort_values(by=['family'])
byteSeqenceTrain = datasetTrain['byte_sequence'].values
byteSeqenceTest = datasetTest['byte_sequence'].values
y_train = datasetTrain['family'].values
y_test = datasetTest['family'].values

In [75]:
# add column train/test to dataset
datasetTrain["train_test"] = "train"
datasetTest["train_test"] = "test"

In [76]:
datasetOut = pd.concat([datasetTrain, datasetTest], axis=0)
# datasetOut.to_csv(f"{DATASET_PATH}malware_original_crossArch_byte_sequence{BYTE_LENGTH}_split{ADDITONAL_INFO}_train_test.csv", index=False)
print(datasetTrain.value_counts("family"))
print(datasetTest.value_counts("family"))

family
mirai          120
gafgyt         120
tsunami        120
kaiji           72
exploitscan     56
sshdoor         50
sliver          46
meterpreter     40
camelot         30
rekoobe         30
race            30
xmrig           30
xorddos         25
triada          17
Name: count, dtype: int64
family
mirai          80
gafgyt         80
tsunami        80
kaiji          48
exploitscan    37
sshdoor        33
sliver         31
meterpreter    26
camelot        20
rekoobe        20
race           20
xmrig          20
xorddos        16
triada         11
Name: count, dtype: int64


In [77]:
print(byteSeqenceTrain.shape)
print(byteSeqenceTest.shape)
print(y_train.shape)
print(y_test.shape)

(786,)
(522,)
(786,)
(522,)


In [78]:


# convert y family to number
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
list_ = list(y_train) + list(y_test)
le.fit(list_)
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)



In [79]:
# extract tf-idf features
from sklearn.feature_extraction.text import TfidfVectorizer
# 4-grams
tfidf_vec = TfidfVectorizer(analyzer='word', ngram_range=(N_GRAM_1, N_GRAM_2), max_features=500) # , max_features=1000
tfidf_matrix_train = tfidf_vec.fit_transform(byteSeqenceTrain)
tfidf_matrix_test = tfidf_vec.transform(byteSeqenceTest)

tfidf_matrix_train = tfidf_matrix_train.toarray()
tfidf_matrix_test = tfidf_matrix_test.toarray()


label_mapping = {index: label for index, label in enumerate(le.classes_)}

In [80]:
print(f"Training set shape: {tfidf_matrix_train.shape}")
print(f"Testing set shape: {tfidf_matrix_test.shape}")
print(f"Label mapping: {label_mapping}")

Training set shape: (786, 1000)
Testing set shape: (522, 1000)
Label mapping: {0: 'camelot', 1: 'exploitscan', 2: 'gafgyt', 3: 'kaiji', 4: 'meterpreter', 5: 'mirai', 6: 'race', 7: 'rekoobe', 8: 'sliver', 9: 'sshdoor', 10: 'triada', 11: 'tsunami', 12: 'xmrig', 13: 'xorddos'}


In [81]:
import pickle

with open(f"{EMBEDDING_PATH}crossArch_label_mapping{ADDITONAL_INFO}.pickle", 'wb') as f:
    pickle.dump(label_mapping, f)
    f.close()
with open(f"{EMBEDDING_PATH}crossArch_tfidf_vec_train{ADDITONAL_INFO}.pickle", 'wb') as f:
    pickle.dump(tfidf_matrix_train, f)
    f.close()
with open(f"{EMBEDDING_PATH}crossArch_tfidf_vec_test{ADDITONAL_INFO}.pickle", 'wb') as f:
    pickle.dump(tfidf_matrix_test, f)
    f.close()
with open(f"{EMBEDDING_PATH}crossArch_y_train{ADDITONAL_INFO}.pickle", 'wb') as f:
    pickle.dump(y_train, f)
    f.close()
with open(f"{EMBEDDING_PATH}crossArch_y_test{ADDITONAL_INFO}.pickle", 'wb') as f:
    pickle.dump(y_test, f)
    f.close()

In [82]:
# Format is a vector one line, each dimension value split by blank space
# cluster preprocessing
for key in label_mapping:
    outputPathTrain = f"{CLUSTER_PATH}crossArch_{label_mapping[key]}_train{ADDITONAL_INFO}.txt"
    # outputPathTest = f"{CLUSTER_PATH}{CPU_ARCH}_{label_mapping[key]}_test.txt"
    with open(outputPathTrain, 'w') as f:
        for i in range(len(tfidf_matrix_train)):
            if y_train[i] == key:
                f.write('\t'.join(map(str, tfidf_matrix_train[i])) + "\n")
    f.close()
    # with open(outputPathTest, 'w') as f:
    #     for i in range(len(tfidf_matrix_test)):
    #         if y_test[i] == key:
    #             f.write('\t'.join(map(str, tfidf_matrix_test[i])) + "\n")