In [37]:
import pandas as pd
import argparse
import os
import math
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

#python3 dataframe_creator.py --benignware ../Labeled-Elfs-main/benignware/ --malware ../Labeled-Elfs-main/malware

In [38]:
# Directory prefix for every CSV written by this notebook. It is joined to file
# names by plain string concatenation, so the trailing slash is required.
output_path = "../MultiEvasion/data/"

In [39]:
def read_directory(directory_path, is_malware):
    """Build a metadata DataFrame with one row per regular file in a directory.

    Each row is produced by ``create_file_row`` and holds the file name, the
    supplied label, the file size, and the ``__``-separated fields of the file
    name. Sub-directories and other non-regular entries are skipped.

    Args:
        directory_path: Folder to scan (not recursive).
        is_malware: Label stored in the ``is_malware`` column of every row.

    Returns:
        A DataFrame with the 13 columns listed below, in sorted file-name order.
    """
    columns = ['name','is_malware','size','architecture','address_size','endianness','ABI','compiler_used','optimization_level','obfuscation','stripped','package','program']
    rows = []
    # os.listdir returns entries in arbitrary, platform-dependent order. Sorting
    # makes the row order stable, so the seeded shuffle and the "first N benign
    # rows" down-sampling applied later give the same result on every machine.
    for file_name in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, file_name)
        if os.path.isfile(file_path):
            rows.append(create_file_row(file_path, is_malware))
    return pd.DataFrame(data=rows, columns=columns)


In [40]:
def create_file_row(file_path, is_malware):
    """Return the metadata row for one file as a flat list.

    The row is ``[file name, is_malware, size in bytes, *name fields]`` where
    the name fields are the pieces of the file name split on ``"__"``.
    """
    file_name = os.path.basename(file_path)
    byte_count = os.path.getsize(file_path)
    return [file_name, is_malware, byte_count, *file_name.split("__")]

In [41]:
def reduce_to_80_20_benignware_malware(dataframe, random_state=None):
    """Down-sample benign rows to at most 4x the malware count, then shuffle.

    Rows with ``is_malware == 0`` are treated as benign. The first
    ``4 * malware_total`` benign rows (in the frame's current row order) are
    kept, later benign rows are dropped, and every other row is kept.

    Args:
        dataframe: Frame with an ``is_malware`` column (0 = benign).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            final shuffle can be reproduced. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        A new, shuffled DataFrame; the input frame is not modified.
    """
    malware_total = dataframe['is_malware'].sum()
    is_benign = dataframe['is_malware'] == 0
    # 0-based running position of each benign row among the benign rows.
    benign_rank = is_benign.cumsum() - 1
    keep = ~is_benign | (benign_rank < malware_total * 4)
    # Select by position rather than by label: dropping by index label would
    # also remove kept rows that happen to share a label with a dropped row.
    return dataframe.loc[keep.to_numpy()].sample(frac=1, random_state=random_state)

In [42]:
def reduce_non_malware_to_same_amount(dataframe, random_state=None):
    """Down-sample benign rows to at most the malware count, then shuffle.

    Rows with ``is_malware == 0`` are treated as benign. The first
    ``malware_total`` benign rows (in the frame's current row order) are kept,
    later benign rows are dropped, and every other row is kept.

    Args:
        dataframe: Frame with an ``is_malware`` column (0 = benign).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            final shuffle can be reproduced. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        A new, shuffled DataFrame; the input frame is not modified.
    """
    malware_total = dataframe['is_malware'].sum()
    is_benign = dataframe['is_malware'] == 0
    # 0-based running position of each benign row among the benign rows.
    benign_rank = is_benign.cumsum() - 1
    keep = ~is_benign | (benign_rank < malware_total)
    # Select by position rather than by label: dropping by index label would
    # also remove kept rows that happen to share a label with a dropped row.
    return dataframe.loc[keep.to_numpy()].sample(frac=1, random_state=random_state)

In [43]:
# Command-line style options. Inside a notebook none of these are normally
# passed, so the defaults below are what the following cells use.
parser = argparse.ArgumentParser(prog="dataframe_creator")

parser.add_argument(
    "--benignware",
    default="../MultiEvasion/data/benignware/",
    help="The path to a folder containing only benign ELF executables",
)
parser.add_argument(
    "--malware",
    default="../MultiEvasion/data/malware/",
    help="The path to a folder containing only malware ELF executables",
)
parser.add_argument(
    "--num_to_use",
    help="Number of entries from files to use, leave empty for whole dataset",
)

# parse_known_args ignores arguments it does not recognise (for example the
# ones the Jupyter kernel is launched with) instead of exiting with an error.
args = parser.parse_known_args()[0]

In [44]:
# Scan both folders; the second argument becomes the `is_malware` label
# (0 for the benignware folder, 1 for the malware folder).
df1 = read_directory(args.benignware,0)
df2 = read_directory(args.malware,1)

In [45]:
# Merge both labelled frames, keep only files whose name contains the
# x86 / 64-bit / lsb / unix-system-v tag, and shuffle with a fixed seed.
combined_df = (
    pd.concat([df1, df2], ignore_index=True)
    .loc[lambda frame: frame['name'].str.contains('x86__64__lsb__unix-system-v')]
    .sample(frac=1, random_state=1382, ignore_index=True)
)
print(combined_df)

                                                    name  is_malware   size  \
0      x86__64__lsb__unix-system-v__gcc-5.5.0__O1__no...           0  38256   
1      x86__64__lsb__unix-system-v__gcc-10.1.0__Os__n...           0  28960   
2      x86__64__lsb__unix-system-v__llvm-obfuscator-3...           0  20504   
3      x86__64__lsb__unix-system-v__clang-3.9.0__O0__...           0  15032   
4      x86__64__lsb__unix-system-v__gcc-8.4.0__O0__no...           0  41792   
...                                                  ...         ...    ...   
39964  x86__64__lsb__unix-system-v__llvm-obfuscator-3...           0  34000   
39965  x86__64__lsb__unix-system-v__gcc-10.1.0__Os__n...           0  42896   
39966  x86__64__lsb__unix-system-v__llvm-obfuscator-3...           0  17328   
39967  x86__64__lsb__unix-system-v__clang-6.0.1__O0__...           0  66920   
39968  x86__64__lsb__unix-system-v__gcc-5.1.0__O1__no...           0  15200   

      architecture address_size endianness         

In [46]:
# Save the combined table (no header row, no index column) before filtering.
combined_df.to_csv(output_path + "Combined_ELF_Dataset.csv", header=False, index=False)

# Exclude rows whose `program` field ends in ".o".
combined_df = combined_df.loc[
    lambda frame: ~frame['program'].astype('str').str.endswith(".o")
]

# Use every remaining row unless --num_to_use caps the count.
num_rows = combined_df.shape[0] if args.num_to_use is None else int(args.num_to_use)
df_to_split = combined_df if args.num_to_use is None else combined_df.iloc[:num_rows]

df_to_split

Unnamed: 0,name,is_malware,size,architecture,address_size,endianness,ABI,compiler_used,optimization_level,obfuscation,stripped,package,program
0,x86__64__lsb__unix-system-v__gcc-5.5.0__O1__no...,0,38256,x86,64,lsb,unix-system-v,gcc-5.5.0,O1,no-obf,unstripped,coreutils-8.30,nice
1,x86__64__lsb__unix-system-v__gcc-10.1.0__Os__n...,0,28960,x86,64,lsb,unix-system-v,gcc-10.1.0,Os,no-obf,unstripped,util-linux-2.33,namei
2,x86__64__lsb__unix-system-v__llvm-obfuscator-3...,0,20504,x86,64,lsb,unix-system-v,llvm-obfuscator-3.3.0,O0,mllvmsub_mllvmbcf,unstripped,util-linux-2.33,mesg
4,x86__64__lsb__unix-system-v__gcc-8.4.0__O0__no...,0,41792,x86,64,lsb,unix-system-v,gcc-8.4.0,O0,no-obf,unstripped,coreutils-8.30,pathchk
5,x86__64__lsb__unix-system-v__llvm-obfuscator-4...,0,55088,x86,64,lsb,unix-system-v,llvm-obfuscator-4.0.1,O3,mllvmbcf,unstripped,util-linux-2.33,uuidparse
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39964,x86__64__lsb__unix-system-v__llvm-obfuscator-3...,0,34000,x86,64,lsb,unix-system-v,llvm-obfuscator-3.4.0,O3,mllvmsub_mllvmbcf,unstripped,util-linux-2.33,fincore
39965,x86__64__lsb__unix-system-v__gcc-10.1.0__Os__n...,0,42896,x86,64,lsb,unix-system-v,gcc-10.1.0,Os,no-obf,unstripped,util-linux-2.33,last
39966,x86__64__lsb__unix-system-v__llvm-obfuscator-3...,0,17328,x86,64,lsb,unix-system-v,llvm-obfuscator-3.4.0,O2,mllvmsub_mllvmbcf,unstripped,util-linux-2.33,setarch
39967,x86__64__lsb__unix-system-v__clang-6.0.1__O0__...,0,66920,x86,64,lsb,unix-system-v,clang-6.0.1,O0,no-obf,unstripped,coreutils-8.30,md5sum


In [47]:
# Benign count = total rows minus the sum of the `is_malware` column.
print("Benignware amount: ", df_to_split.shape[0] - df_to_split['is_malware'].sum())

Benignware amount:  38810


In [48]:
# Malware count = sum of the `is_malware` column.
print("Malware amount: ", df_to_split['is_malware'].sum())

Malware amount:  711


#### Originally done this way (unstratified); superseded by the stratified split below

train, test, valid = np.split(df_to_split, [int(args.train_amount*num_rows),
                                            int((args.train_amount+args.test_amount)*num_rows)])

# Stratify and Split

In [49]:
# First split: 80% train, 20% held out, stratified on `is_malware`.
train, valid_test = train_test_split(df_to_split, test_size=0.2, random_state=0, 
                               stratify=df_to_split[['is_malware']])
# Second split: the held-out 20% is halved into validation and test,
# again stratified on `is_malware`.
valid, test = train_test_split(valid_test, test_size=0.5, random_state=0, 
                               stratify=valid_test[['is_malware']])

# Save

In [28]:
# Write the full training split (all columns), then report its class counts.
train.to_csv(output_path + "Train_ELF_Dataset.csv", index=False)
malware_amount = train['is_malware'].sum()
print(f"Train Benignware amount: {len(train) - malware_amount}\t\t Malware amount: {malware_amount}")


Train Benignware amount: 31047		 Malware amount: 569


In [29]:
# Write the full test split (all columns), then report its class counts.
test.to_csv(output_path + "Test_ELF_Dataset.csv", index=False)
malware_amount = test['is_malware'].sum()
print(f"Test Benignware amount: {len(test) - malware_amount}\t\t Malware amount: {malware_amount}")


Test Benignware amount: 3882		 Malware amount: 71


In [30]:
# Write the full validation split (all columns), then report its class counts.
valid.to_csv(output_path + "Valid_ELF_Dataset.csv", index=False)
malware_amount = valid['is_malware'].sum()
print(f"Validation Benignware amount: {len(valid) - malware_amount}\t\t Malware amount: {malware_amount}")

Validation Benignware amount: 3881		 Malware amount: 71


In [31]:
# Balanced variants: cap the benign rows of each split at its malware count.
# The splits are processed in the order train, test, valid.
train_equal, test_equal, valid_equal = (
    reduce_non_malware_to_same_amount(split) for split in (train, test, valid)
)

In [32]:
# Export only the first two columns (name, is_malware) of each balanced split,
# without a header row or index column.
train_equal.iloc[:, :2].to_csv(
    output_path + "TrainEven_ELF_Dataset.csv", header=False, index=False
)
test_equal.iloc[:, :2].to_csv(
    output_path + "TestEven_ELF_Dataset.csv", header=False, index=False
)
valid_equal.iloc[:, :2].to_csv(
    output_path + "ValidEven_ELF_Dataset.csv", header=False, index=False
)

In [50]:
# 80/20 variants: cap the benign rows of each split at four times its malware
# count. The splits are processed in the order train, test, valid.
train_unequal, test_unequal, valid_unequal = (
    reduce_to_80_20_benignware_malware(split) for split in (train, test, valid)
)

In [51]:
# Export only the first two columns (name, is_malware) of each 80/20 split,
# without a header row or index column.
train_unequal.iloc[:, :2].to_csv(
    output_path + "TrainUneven_ELF_Dataset.csv", header=False, index=False
)
test_unequal.iloc[:, :2].to_csv(
    output_path + "TestUneven_ELF_Dataset.csv", header=False, index=False
)
valid_unequal.iloc[:, :2].to_csv(
    output_path + "ValidUneven_ELF_Dataset.csv", header=False, index=False
)

# TODO: The cells below need to be updated to remove the 3 malware samples from the new dataset

In [None]:
# Display the balanced-training rows whose name matches this one Mirai build.
train_equal[train_equal['name'].str.contains("x86__64__lsb__unix-system-v__gcc-9.3.0__O1__no-obf__unstripped__Mirai-vanilla__bot")]


In [None]:
# Balanced training set without the single named Mirai build.
train_equal_minus_one = train_equal.loc[
    lambda frame: ~frame['name'].str.contains(
        "x86__64__lsb__unix-system-v__gcc-9.3.0__O1__no-obf__unstripped__Mirai-vanilla__bot"
    )
]
train_equal_minus_one

In [None]:
# Balanced training set without any row whose name contains "Mirai-vanilla__bot".
train_equal_minus_mirai_bot = train_equal.loc[
    lambda frame: ~frame['name'].str.contains("Mirai-vanilla__bot")
]
train_equal_minus_mirai_bot

# Export the first two columns of both reduced training sets (no header, no index).
train_equal_minus_one.iloc[:, :2].to_csv(
    output_path + "TrainEven_Minus-One_ELF_Dataset.csv", header=False, index=False
)
train_equal_minus_mirai_bot.iloc[:, :2].to_csv(
    output_path + "TrainEven_Minus-Mirai-Bot_ELF_Dataset.csv", header=False, index=False
)

In [None]:
# File sizes divided by 1024 (a new Series; `combined_df` is left unchanged).
size_in_kb = combined_df['size'] / 1024

In [None]:
# Summary statistics of the sizes, including the percentiles drawn on the
# histogram in the next cell.
percentiles = size_in_kb.describe(
    percentiles=[0.5, 0.7, 0.9, 0.95, 0.98, 0.99]
)
percentiles

In [None]:
# Histogram of all file sizes with the upper percentiles marked.
# Every artist carries its own label and plt.legend() collects them. The
# positional form plt.legend([...]) pairs labels with artists only by handle
# order, which matplotlib discourages; lines are listed ahead of the bar
# container, so 'File Counts' could end up attached to a percentile line.
size_in_kb.plot(kind='hist',bins=100,label='File Counts')
plt.ylabel("Occurrences")
plt.xlabel("Size of ELF Executable in Dataset (KiloBytes)")
plt.title("Histogram of size of ELF Executables in Labeled Elfs Dataset")
plt.axvline(percentiles['90%'],color = 'red',linestyle='dashed',linewidth=0.8,
            label='90th percentile %.0f KB'%(percentiles['90%']))
plt.axvline(percentiles['95%'],color = 'blue',linestyle='dashed',linewidth=0.8,
            label='95th percentile %.0f KB'%(percentiles['95%']))
plt.axvline(percentiles['98%'],color = 'green',linestyle='dashed',linewidth=0.8,
            label='98th percentile %.0f KB'%(percentiles['98%']))
plt.axvline(percentiles['99%'],color = 'orange',linestyle='dashed',linewidth=0.8,
            label='99th percentile %.0f KB'%(percentiles['99%']))
plt.legend()
plt.show()

In [None]:
# Largest file size in the dataset, in the same units as `size_in_kb`.
size_in_kb.max()

In [None]:
# Build a separate frame whose `size` column is divided by 1024. Plain
# assignment (`combined_df_kb = combined_df`) only aliases the frame, so the
# in-place division also rescaled `combined_df` and divided again on every
# re-run of this cell. `.assign` returns a new frame and leaves the input alone.
combined_df_kb = combined_df.assign(size=combined_df['size'].div(1024))
malware_groups = combined_df_kb.groupby('is_malware')
malware_groups['size'].plot(kind='hist',bins=100)

In [None]:
# Summary statistics of the sizes for the malware group only (is_malware == 1).
percentiles = (
    malware_groups.get_group(1)['size']
    .describe(percentiles=[0.5, 0.7, 0.9, 0.95, 0.98, 0.99])
)
percentiles

In [None]:
# Histogram of malware file sizes with the upper percentiles marked.
# Every artist carries its own label and plt.legend() collects them. The
# positional form plt.legend([...]) pairs labels with artists only by handle
# order, which matplotlib discourages; lines are listed ahead of the bar
# container, so 'File Counts' could end up attached to a percentile line.
malware_groups.get_group(1)['size'].plot(kind='hist',bins=100,label='File Counts')
plt.ylabel("Occurrences")
plt.xlabel("Size of ELF Malware Executable in Dataset (KiloBytes)")
plt.title("Histogram of size of ELF Malware Executables in Labeled Elfs Dataset")
plt.axvline(percentiles['90%'],color = 'red',linestyle='dashed',linewidth=0.8,
            label='90th percentile %.0f KB'%(percentiles['90%']))
plt.axvline(percentiles['95%'],color = 'blue',linestyle='dashed',linewidth=0.8,
            label='95th percentile %.0f KB'%(percentiles['95%']))
plt.axvline(percentiles['98%'],color = 'green',linestyle='dashed',linewidth=0.8,
            label='98th percentile %.0f KB'%(percentiles['98%']))
plt.axvline(percentiles['99%'],color = 'orange',linestyle='dashed',linewidth=0.8,
            label='99th percentile %.0f KB'%(percentiles['99%']))
plt.legend()
plt.show()

In [None]:
# (rows, columns) of the balanced training set after removing the Mirai bot rows.
train_equal_minus_mirai_bot.shape

In [None]:
# Additionally remove every row whose name contains "lightaidra-1.0__lightaidra".
train_equal_minus_all_three = train_equal_minus_mirai_bot.loc[
    lambda frame: ~frame['name'].str.contains("lightaidra-1.0__lightaidra")
]

In [None]:
# Additionally remove every row whose name contains "BASHLITE-lizkebab__client".
train_equal_minus_all_three = train_equal_minus_all_three.loc[
    lambda frame: ~frame['name'].str.contains("BASHLITE-lizkebab__client")
]

In [None]:
# Starting from the set that already lacks the one Mirai build, remove one
# specific lightaidra build and one specific BASHLITE build as well.
train_equal_minus_three = (
    train_equal_minus_one
    .loc[lambda frame: ~frame['name'].str.contains(
        "x86__64__lsb__unix-system-v__gcc-4.9.0__O2__no-obf__unstripped__lightaidra-1.0__lightaidra"
    )]
    .loc[lambda frame: ~frame['name'].str.contains(
        "x86__64__lsb__unix-system-v__clang-5.0.1__Os__no-obf__unstripped__BASHLITE-lizkebab__client"
    )]
)
train_equal_minus_three

In [None]:
# Export the first two columns of both "minus three" training sets
# (no header row, no index column).
train_equal_minus_three.iloc[:, :2].to_csv(
    output_path + "TrainEven_Minus-Three_ELF_Dataset.csv", header=False, index=False
)
train_equal_minus_all_three.iloc[:, :2].to_csv(
    output_path + "TrainEven_Minus-All-Three_ELF_Dataset.csv", header=False, index=False
)