Let's combine all features.

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from IPython.display import display

In [3]:
from scipy import sparse

In [4]:
from matplotlib import pyplot as plt
from matplotlib import style
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer accept the old names, so use whichever spelling
# this installation provides (preferring the current one). If neither is
# available the default matplotlib style simply stays active.
_deep_styles = [name for name in ('seaborn-v0_8-deep', 'seaborn-deep')
                if name in style.available]
if _deep_styles:
    style.use(_deep_styles[0])

In [5]:
import glob
import numpy as np
import os
import pandas as pd
import pickle
import shutil
import random as r

In [6]:
# Directory holding the feature CSVs; later cells both read the inputs from
# here and write the merged CSV back into it.
src_path = 'data/main_data/'

In [7]:
# Name the two feature tables explicitly instead of globbing '*.csv':
# glob returns paths in arbitrary, filesystem-dependent order, while the
# cells below rely on index 0 being the bytes table and index 1 the asm
# table. A wildcard would also pick up 'all_bytes_asm_data.csv', which this
# notebook writes into the same directory, on any later re-run.
feature_csv_names = (
    'bytes_unigrams_best_bigrams_sizes.csv',  # byte features -> all_data_files[0]
    'asm_pixels_out_sizes.csv',               # asm features  -> all_data_files[1]
)
all_data_files = [os.path.join(src_path, name) for name in feature_csv_names]
print(all_data_files)

['data/main_data/bytes_unigrams_best_bigrams_sizes.csv', 'data/main_data/asm_pixels_out_sizes.csv']


In [8]:
# Load the byte-level feature table. Index 0 is taken to be the bytes CSV,
# matching the file listing printed by the previous cell.
bytes_df = pd.read_csv(all_data_files[0])
bytes_df.head(5)

Unnamed: 0,ID,0,1,2,3,4,5,6,7,8,...,f8,f9,fa,fb,fc,fd,fe,ff,??,Byte_Size
0,01kcPWA9K2BOxQeS5Rju,3.0,0.0,1.0,3.0,1.0,0.0,4.0,1.0,3.0,...,873,485,462,516,1133,471,761,7998,13940,0.574219
1,04EjIdbPV5e1XroFOpiN,2.0,2.0,2.0,2.0,2.0,0.0,3.0,0.0,2.0,...,806,479,357,394,1114,507,1080,10826,340,0.717773
2,05EeG39MTRrI6VY21DPd,0.0,0.0,0.0,2.0,0.0,0.0,1.0,6.0,1.0,...,463,354,472,393,384,303,390,2104,8512,0.430664
3,05rJTUWYAKNegBk2wE8X,0.0,2.0,1.0,4.0,9.0,1.0,10.0,4.0,4.0,...,4012,1511,1604,2407,5196,4614,4448,47484,44280,2.356689
4,0AnoOZDNbPXIr2MRBSCJ,2.0,1.0,1.0,1.0,0.0,3.0,1.0,1.0,4.0,...,500,316,297,343,452,283,401,4532,27384,0.478516


In [9]:
# Load the asm-level feature table. Index 1 is taken to be the asm CSV,
# matching the file listing printed earlier.
asm_df = pd.read_csv(all_data_files[1])
asm_df.head(5)

Unnamed: 0,ID,0,1,2,3,4,5,6,7,8,...,edx,esi,eax,ebx,ecx,edi,ebp,esp,eip,ASM_Size
0,01kcPWA9K2BOxQeS5Rju,72.0,69.0,65.0,68.0,69.0,82.0,58.0,49.0,48.0,...,18,66,15,43,83,0,17,48,29,0.07819
1,04EjIdbPV5e1XroFOpiN,72.0,69.0,65.0,68.0,69.0,82.0,58.0,52.0,68.0,...,2496,5013,3960,6382,717,0,27,0,88,4.243397
2,05EeG39MTRrI6VY21DPd,72.0,69.0,65.0,68.0,69.0,82.0,58.0,49.0,48.0,...,444,1378,610,446,732,0,20,45,85,0.935457
3,05rJTUWYAKNegBk2wE8X,72.0,69.0,65.0,68.0,69.0,82.0,58.0,55.0,68.0,...,11358,15495,16570,28503,2825,0,87,0,491,12.703107
4,0AnoOZDNbPXIr2MRBSCJ,72.0,69.0,65.0,68.0,69.0,82.0,58.0,49.0,48.0,...,930,1558,1250,930,851,0,14,0,66,0.891982


In [10]:
# Read the class labels, then force the two column names to 'ID' and 'Class';
# 'ID' is the key column name used by the feature tables.
label_df = pd.read_csv('data/trainLabels.csv')
label_df.columns = ['ID', 'Class']
label_df.head(5)

Unnamed: 0,ID,Class
0,01kcPWA9K2BOxQeS5Rju,1
1,04EjIdbPV5e1XroFOpiN,1
2,05EeG39MTRrI6VY21DPd,1
3,05rJTUWYAKNegBk2wE8X,1
4,0AnoOZDNbPXIr2MRBSCJ,1


__Great Validation__

In [11]:
# Check that the three tables describe exactly the same files.
# Series.all() only reduces a column's truthiness to a single bool, so
# comparing .all() results never compares the IDs themselves. Compare the
# ID sets instead, and require each ID to occur once per table so the
# merges on 'ID' below keep one row per file.
id_frames = (label_df, bytes_df, asm_df)
ids_unique = all(frame['ID'].is_unique for frame in id_frames)
label_ids, bytes_ids, asm_ids = (set(frame['ID']) for frame in id_frames)
print(ids_unique and label_ids == bytes_ids == asm_ids)

True


Now that we know the file ID values are consistent across `label_df`, `bytes_df` and `asm_df`, let's merge all of them into the final dataset and export it to `csv` format.

In [12]:
# Join the byte and asm feature tables on the file ID (pandas' default inner
# join). Column names present in both inputs come out with the default
# '_x' / '_y' suffixes, as seen in the output below.
bytes_asm_df = bytes_df.merge(asm_df, on='ID')
bytes_asm_df.head(5)

Unnamed: 0,ID,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,...,edx,esi,eax,ebx,ecx,edi,ebp,esp,eip,ASM_Size
0,01kcPWA9K2BOxQeS5Rju,3.0,0.0,1.0,3.0,1.0,0.0,4.0,1.0,3.0,...,18,66,15,43,83,0,17,48,29,0.07819
1,04EjIdbPV5e1XroFOpiN,2.0,2.0,2.0,2.0,2.0,0.0,3.0,0.0,2.0,...,2496,5013,3960,6382,717,0,27,0,88,4.243397
2,05EeG39MTRrI6VY21DPd,0.0,0.0,0.0,2.0,0.0,0.0,1.0,6.0,1.0,...,444,1378,610,446,732,0,20,45,85,0.935457
3,05rJTUWYAKNegBk2wE8X,0.0,2.0,1.0,4.0,9.0,1.0,10.0,4.0,4.0,...,11358,15495,16570,28503,2825,0,87,0,491,12.703107
4,0AnoOZDNbPXIr2MRBSCJ,2.0,1.0,1.0,1.0,0.0,3.0,1.0,1.0,4.0,...,930,1558,1250,930,851,0,14,0,66,0.891982


In [13]:
# Attach the class label to every feature row, again joining on the file ID.
bytes_asm_label_df = bytes_asm_df.merge(label_df, on='ID')
bytes_asm_label_df.head(5)

Unnamed: 0,ID,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,...,esi,eax,ebx,ecx,edi,ebp,esp,eip,ASM_Size,Class
0,01kcPWA9K2BOxQeS5Rju,3.0,0.0,1.0,3.0,1.0,0.0,4.0,1.0,3.0,...,66,15,43,83,0,17,48,29,0.07819,1
1,04EjIdbPV5e1XroFOpiN,2.0,2.0,2.0,2.0,2.0,0.0,3.0,0.0,2.0,...,5013,3960,6382,717,0,27,0,88,4.243397,1
2,05EeG39MTRrI6VY21DPd,0.0,0.0,0.0,2.0,0.0,0.0,1.0,6.0,1.0,...,1378,610,446,732,0,20,45,85,0.935457,1
3,05rJTUWYAKNegBk2wE8X,0.0,2.0,1.0,4.0,9.0,1.0,10.0,4.0,4.0,...,15495,16570,28503,2825,0,87,0,491,12.703107,1
4,0AnoOZDNbPXIr2MRBSCJ,2.0,1.0,1.0,1.0,0.0,3.0,1.0,1.0,4.0,...,1558,1250,930,851,0,14,0,66,0.891982,1


In [14]:
# Persist the merged table without the row index. The file is written into
# src_path, i.e. next to the input feature CSVs.
bytes_asm_label_df.to_csv(
    os.path.join(src_path, 'all_bytes_asm_data.csv'),
    index=False,
)

Testing

In [15]:
# Read the exported file back and show its first rows as a round-trip check.
test_df = pd.read_csv(os.path.join(src_path, 'all_bytes_asm_data.csv'))
test_df.head(5)

Unnamed: 0,ID,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,...,esi,eax,ebx,ecx,edi,ebp,esp,eip,ASM_Size,Class
0,01kcPWA9K2BOxQeS5Rju,3.0,0.0,1.0,3.0,1.0,0.0,4.0,1.0,3.0,...,66,15,43,83,0,17,48,29,0.07819,1
1,04EjIdbPV5e1XroFOpiN,2.0,2.0,2.0,2.0,2.0,0.0,3.0,0.0,2.0,...,5013,3960,6382,717,0,27,0,88,4.243397,1
2,05EeG39MTRrI6VY21DPd,0.0,0.0,0.0,2.0,0.0,0.0,1.0,6.0,1.0,...,1378,610,446,732,0,20,45,85,0.935457,1
3,05rJTUWYAKNegBk2wE8X,0.0,2.0,1.0,4.0,9.0,1.0,10.0,4.0,4.0,...,15495,16570,28503,2825,0,87,0,491,12.703107,1
4,0AnoOZDNbPXIr2MRBSCJ,2.0,1.0,1.0,1.0,0.0,3.0,1.0,1.0,4.0,...,1558,1250,930,851,0,14,0,66,0.891982,1
