#### Watch this [video](https://www.youtube.com/watch?v=VLQTRlLGz5Y#t=13m11s) and include pixel intensity features to improve the log-loss.

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import array
import codecs
import glob
import numpy as np
import os
import pandas as pd
import pickle
import shutil
import random as r

In [3]:
from tqdm import tqdm

Below is the code snippet that the competition winners used.

```python
with codecs.open(filename=os.path.join(src_path, 'xx.asm'), mode='rb') as asm_f:
    ln = os.path.getsize(os.path.join(src_path, 'xx.asm'))
    width = int(ln ** 0.5)
    rem = ln % width
    a = array.array("B")
    a.fromfile(asm_f, ln-rem)
g = np.reshape(a, new_shape=(len(a)//width, width))
g = np.uint8(g)
```

I have modified it slightly: I use the `array.frombytes(s)` method instead of the `array.fromfile(f, n)` method.

In [4]:
# Input directory: the training .asm files, one '<Id>.asm' per sample.
src_path = '../../Modules/M06-ML-Case-Studies/06-Microsoft-Malware-Detection/data/train/train_asm_files/'
# Output directory for the generated CSV files, resolved against the current
# working directory of the kernel.
dest_path = os.path.join(os.getcwd(), 'data/csv_asms/')

In [5]:
# Total number of entries in src_path; used below to split the work into segments.
# NOTE(review): os.listdir counts every directory entry, not only '*.asm' files —
# confirm the directory contains nothing else.
no_of_files = len(os.listdir(path=src_path))
print(no_of_files)

10868


In [6]:
def get_segments_and_filenames(no_of_files, step=500):
    """
    Split the index range [0, no_of_files) into consecutive segments.

    Parameters
    ----------
    no_of_files : int
        Total number of files to cover.
    step : int, default 500
        Maximum number of files per segment. Must be positive.

    Returns
    -------
    (segments, filenames) : tuple of lists
        ``segments`` is a list of ``[start, stop]`` pairs (stop exclusive), and
        ``filenames`` holds the matching ``'{start:06d}_{stop:06d}.csv'`` names.
        Both lists are empty when ``no_of_files`` is zero or negative.

    Raises
    ------
    ValueError
        If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer, got {!r}".format(step))

    segments = list()
    filenames = list()
    for start in range(0, no_of_files, step):
        # The final segment is clipped so that it never runs past no_of_files.
        stop = min(start + step, no_of_files)
        segments.append([start, stop])
        filenames.append('{:06d}_{:06d}.csv'.format(start, stop))
    return segments, filenames

In [7]:
# Index ranges (default step of 500) and the CSV file name used for each range.
segments, seg_filenames = get_segments_and_filenames(no_of_files=no_of_files)

In [8]:
# Sample ids from the 'Id' column of the labels file, in file order; each id plus
# '.asm' is later resolved against src_path.
# NOTE(review): the segments are sized from the directory listing, so this assumes
# len(file_ids) equals no_of_files — TODO confirm.
file_ids = pd.read_csv(filepath_or_buffer='data/trainLabels.csv')['Id'].to_list()

In [9]:
def asm_to_pixel_flatten(filename, how_many_pixels=800):
    """
    Return the first ``how_many_pixels`` bytes of an asm file as pixel values.

    The winners used 800 pixels, chosen by cross validation.

    Parameters
    ----------
    filename : str
        Path of the file to read (opened in binary mode).
    how_many_pixels : int, default 800
        Number of leading bytes to return. Must not be negative.

    Returns
    -------
    numpy.ndarray
        1-D ``uint8`` array of length ``how_many_pixels``. When the file is
        shorter than that, the tail is zero-padded so every file yields a row
        of the same length and the rows can be stacked into a matrix.

    Raises
    ------
    ValueError
        If ``how_many_pixels`` is negative.
    """
    if how_many_pixels < 0:
        raise ValueError(
            "how_many_pixels must not be negative, got {!r}".format(how_many_pixels))

    # Only the leading bytes are used, so read just those instead of loading
    # the whole (potentially very large) file into memory.
    with open(filename, mode='rb') as asm_f:
        head = asm_f.read(how_many_pixels)

    p_arr = np.zeros(how_many_pixels, dtype=np.uint8)
    if head:
        p_arr[:len(head)] = np.frombuffer(head, dtype=np.uint8)
    return p_arr

In [10]:
def asm_image_featurization(src_path, dest_path, file_ids, segments, seg_filenames):
    """
    Write one CSV of pixel features per segment of asm files.

    For every ``(segment, seg_filename)`` pair, the files
    ``file_ids[segment[0]:segment[1]]`` are read from ``src_path`` (as
    ``<id>.asm``) with ``asm_to_pixel_flatten`` and stored, one row per file,
    in ``dest_path/seg_filename``. The CSV has no header and no index, and
    values are written as floats with two decimals.

    A segment whose CSV already exists is skipped, so an interrupted run can
    be resumed by calling the function again.

    Parameters
    ----------
    src_path : str
        Directory containing the ``<id>.asm`` files.
    dest_path : str
        Directory that receives the segment CSVs; created if missing.
    file_ids : list of str
        File ids (without extension), in the order rows should be written.
    segments : list of [start, stop]
        Index ranges into ``file_ids`` (stop exclusive).
    seg_filenames : list of str
        CSV file name for each entry of ``segments``.
    """
    # Make sure the output directory exists; otherwise the first write fails.
    if dest_path:
        os.makedirs(dest_path, exist_ok=True)

    for (segment, seg_filename) in zip(segments, seg_filenames):
        seg_csv_path = os.path.join(dest_path, seg_filename)
        if os.path.isfile(path=seg_csv_path):
            print("This file '{}' already exists in the destination, hence skipping.".format(seg_filename))
            continue

        print("The script is working on the segment: {}.".format(segment))
        seg_p_list = [
            asm_to_pixel_flatten(filename=os.path.join(src_path, asm_f_id + '.asm'))
            for asm_f_id in tqdm(file_ids[segment[0]:segment[1]])
        ]
        # A plain 2-D ndarray is used; NumPy no longer recommends np.matrix.
        seg_df = pd.DataFrame(data=np.asarray(seg_p_list, dtype=float))

        # Write to a temporary name and rename on success, so that an
        # interrupted run never leaves a truncated CSV that the existence
        # check above would treat as finished.
        tmp_csv_path = seg_csv_path + '.tmp'
        seg_df.to_csv(path_or_buf=tmp_csv_path,
                      sep=',',
                      header=False,
                      float_format='%.2f',
                      index=False)
        os.replace(tmp_csv_path, seg_csv_path)
        print("This segment file: '{}' is completed.\n".format(seg_filename))
    print("All segments are now done successfully.")

In [11]:
# Generate the per-segment pixel CSVs; segments whose CSV already exists in
# dest_path are skipped, so this cell can be re-run safely.
asm_image_featurization(src_path=src_path,
                        dest_path=dest_path,
                        file_ids=file_ids,
                        segments=segments,
                        seg_filenames=seg_filenames)

The script is working on the segment: [0, 500].


100%|████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 282.58it/s]


This segment file: '000000_000500.csv' is completed.

The script is working on the segment: [500, 1000].


100%|████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 291.40it/s]


This segment file: '000500_001000.csv' is completed.

The script is working on the segment: [1000, 1500].


100%|████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 278.95it/s]


This segment file: '001000_001500.csv' is completed.

The script is working on the segment: [1500, 2000].


100%|█████████████████████████████████████████████████████████| 500/500 [00:14<00:00, 34.12it/s]


This segment file: '001500_002000.csv' is completed.

The script is working on the segment: [2000, 2500].


100%|█████████████████████████████████████████████████████████| 500/500 [00:15<00:00, 31.37it/s]


This segment file: '002000_002500.csv' is completed.

The script is working on the segment: [2500, 3000].


100%|█████████████████████████████████████████████████████████| 500/500 [00:16<00:00, 30.51it/s]


This segment file: '002500_003000.csv' is completed.

The script is working on the segment: [3000, 3500].


100%|█████████████████████████████████████████████████████████| 500/500 [00:17<00:00, 29.14it/s]


This segment file: '003000_003500.csv' is completed.

The script is working on the segment: [3500, 4000].


100%|█████████████████████████████████████████████████████████| 500/500 [00:14<00:00, 34.87it/s]


This segment file: '003500_004000.csv' is completed.

The script is working on the segment: [4000, 4500].


100%|████████████████████████████████████████████████████████| 500/500 [00:02<00:00, 179.93it/s]


This segment file: '004000_004500.csv' is completed.

The script is working on the segment: [4500, 5000].


100%|████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 258.06it/s]


This segment file: '004500_005000.csv' is completed.

The script is working on the segment: [5000, 5500].


100%|████████████████████████████████████████████████████████| 500/500 [00:02<00:00, 208.59it/s]


This segment file: '005000_005500.csv' is completed.

The script is working on the segment: [5500, 6000].


100%|████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 279.59it/s]


This segment file: '005500_006000.csv' is completed.

The script is working on the segment: [6000, 6500].


100%|████████████████████████████████████████████████████████| 500/500 [00:02<00:00, 234.71it/s]


This segment file: '006000_006500.csv' is completed.

The script is working on the segment: [6500, 7000].


100%|████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 272.83it/s]


This segment file: '006500_007000.csv' is completed.

The script is working on the segment: [7000, 7500].


100%|████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 286.01it/s]


This segment file: '007000_007500.csv' is completed.

The script is working on the segment: [7500, 8000].


100%|████████████████████████████████████████████████████████| 500/500 [00:02<00:00, 195.63it/s]


This segment file: '007500_008000.csv' is completed.

The script is working on the segment: [8000, 8500].


100%|████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 276.37it/s]


This segment file: '008000_008500.csv' is completed.

The script is working on the segment: [8500, 9000].


100%|████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 569.51it/s]


This segment file: '008500_009000.csv' is completed.

The script is working on the segment: [9000, 9500].


100%|████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 685.32it/s]


This segment file: '009000_009500.csv' is completed.

The script is working on the segment: [9500, 10000].


100%|████████████████████████████████████████████████████████| 500/500 [00:02<00:00, 195.43it/s]


This segment file: '009500_010000.csv' is completed.

The script is working on the segment: [10000, 10500].


100%|█████████████████████████████████████████████████████████| 500/500 [00:05<00:00, 92.30it/s]


This segment file: '010000_010500.csv' is completed.

The script is working on the segment: [10500, 10868].


100%|█████████████████████████████████████████████████████████| 368/368 [00:03<00:00, 96.65it/s]


This segment file: '010500_010868.csv' is completed.

All segments are now done successfully.


In [12]:
# Build the list from seg_filenames instead of globbing '*.csv':
#   * glob returns files in arbitrary order, which could misalign the
#     concatenated rows with the order of file_ids;
#   * the combined 'all_asm_pixels.csv' is written to the same directory and
#     would be picked up as a segment on a re-run.
csv_segments = [os.path.join(dest_path, seg_filename) for seg_filename in seg_filenames]

In [13]:
# Load each segment CSV (written without a header row) and stack them into a
# single frame with a fresh 0..n-1 index.
segment_frames = []
for seg_csv in tqdm(csv_segments):
    segment_frames.append(pd.read_csv(filepath_or_buffer=seg_csv, header=None))
all_asm_pixels = pd.concat(objs=segment_frames, ignore_index=True)

100%|███████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 33.11it/s]


In [14]:
# Persist the combined frame once; an existing file is left untouched.
final_file_name = 'all_asm_pixels.csv'
final_file_path = os.path.join(dest_path, final_file_name)
if os.path.isfile(path=final_file_path):
    print("Data already exists!")
else:
    all_asm_pixels.to_csv(path_or_buf=final_file_path)

In [15]:
# Read the combined CSV once and reuse the frame for both previews instead of
# parsing the same file twice.
all_asm_pixels_saved = pd.read_csv(filepath_or_buffer='data/csv_asms/all_asm_pixels.csv', index_col=0)
display(all_asm_pixels_saved.head())
display(all_asm_pixels_saved.tail())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,790,791,792,793,794,795,796,797,798,799
0,72.0,69.0,65.0,68.0,69.0,82.0,58.0,49.0,48.0,48.0,...,109.0,111.0,100.0,101.0,108.0,32.0,102.0,108.0,97.0,116.0
1,72.0,69.0,65.0,68.0,69.0,82.0,58.0,52.0,68.0,70.0,...,109.0,111.0,100.0,101.0,108.0,32.0,102.0,108.0,97.0,116.0
2,72.0,69.0,65.0,68.0,69.0,82.0,58.0,49.0,48.0,48.0,...,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
3,72.0,69.0,65.0,68.0,69.0,82.0,58.0,55.0,68.0,70.0,...,109.0,111.0,100.0,101.0,108.0,32.0,102.0,108.0,97.0,116.0
4,72.0,69.0,65.0,68.0,69.0,82.0,58.0,49.0,48.0,48.0,...,109.0,111.0,100.0,101.0,108.0,32.0,102.0,108.0,97.0,116.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,790,791,792,793,794,795,796,797,798,799
10863,72.0,69.0,65.0,68.0,69.0,82.0,58.0,48.0,48.0,52.0,...,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
10864,72.0,69.0,65.0,68.0,69.0,82.0,58.0,48.0,48.0,52.0,...,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
10865,72.0,69.0,65.0,68.0,69.0,82.0,58.0,48.0,48.0,52.0,...,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
10866,72.0,69.0,65.0,68.0,69.0,82.0,58.0,48.0,48.0,52.0,...,10.0,72.0,69.0,65.0,68.0,69.0,82.0,58.0,48.0,48.0
10867,72.0,69.0,65.0,68.0,69.0,82.0,58.0,48.0,48.0,52.0,...,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
