#### Add bi-grams and n-gram features on byte files and improve the log-loss.

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import glob
import numpy as np
import os
import pandas as pd
import pickle
import shutil
import random as r

In [3]:
from scipy import sparse
from scipy.sparse import csr_matrix

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
from tqdm import tqdm

In [6]:
# Comma-separated unigram vocabulary: the 256 two-digit lowercase hex values
# '00'..'ff' in ascending order, followed by the '??' token.
# Generated instead of hand-typed so that no entry can be mistyped or omitted;
# the resulting string contains no newlines and no trailing comma.
features_string = ','.join(['{:02x}'.format(value) for value in range(256)] + ['??'])

In [7]:
# One vocabulary entry per comma-separated token of features_string.
unigram_list = features_string.split(sep=',')

In [8]:
# Every ordered pair of unigrams, joined by a single space. The first element
# of the pair varies slowest, so the list has len(unigram_list) ** 2 entries.
bigram_list = [
    '{} {}'.format(left, right)
    for left in unigram_list
    for right in unigram_list
]

In [9]:
# Bi-gram counter restricted to the fixed vocabulary in bigram_list.
#
# token_pattern is overridden on purpose: CountVectorizer's default pattern
# (r"(?u)\b\w\w+\b") only matches word characters, so the '??' token would be
# dropped during tokenisation. Every bigram containing '??' would then stay at
# zero, and the two tokens on either side of a dropped '??' would be paired
# into a bigram that does not occur in the file. Treating every
# whitespace-separated chunk as a token keeps '??' in the sequence.
#
# NOTE: matrices cached with the previous tokenisation must be regenerated to
# pick up this change.
bigram_vect = CountVectorizer(ngram_range=(2, 2),
                              vocabulary=bigram_list,
                              lowercase=False,
                              token_pattern=r'\S+')

In [10]:
# Directory the serialized sparse matrices are written to, anchored at the
# current working directory.
dest_path = os.path.join(os.getcwd(), 'data/npz_bytes/')

# Directory the '<Id>.txt' input files are read from.
src_path = '../../Modules/M06-ML-Case-Studies/06-Microsoft-Malware-Detection/data/train/train_bytes_files/'

In [11]:
# Number of columns of the bi-gram feature matrix: one per entry of bigram_list.
total_features = len(bigram_list)

In [12]:
# Values of the 'Id' column of the labels CSV, as a plain Python list.
file_ids = pd.read_csv('data/trainLabels.csv')['Id'].tolist()

Save / load a SciPy sparse `csr_matrix` in a portable data format (`.npz`).

Reference: https://stackoverflow.com/a/8980156/7579443

In [13]:
def bag_of_words(src_path, dest_path, file_ids, total_features, vect, final_file_name):
    """
    Build a bag-of-words count matrix for a set of text files and save it as .npz.

    Each ``<f_id>.txt`` file under ``src_path`` is read, its newlines are
    replaced by spaces, the text is lower-cased and handed to
    ``vect.transform``. The resulting rows are stacked in ``file_ids`` order
    into a single float64 CSR matrix, which is written with
    ``scipy.sparse.save_npz``.

    Parameters
    ----------
    src_path : str
        Directory containing the ``<f_id>.txt`` input files.
    dest_path : str
        Directory the matrix is written to; it is created if missing.
    file_ids : iterable of str
        File names without the ``.txt`` extension. Row ``k`` of the saved
        matrix corresponds to the ``k``-th id.
    total_features : int
        Expected number of columns of the matrix.
    vect : object
        Vectorizer exposing ``transform(raw_documents=...)`` that returns one
        row per document.
    final_file_name : str
        Name of the ``.npz`` file created inside ``dest_path``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a transformed document is not of shape ``(1, total_features)``.
    """
    # Make sure the output directory is usable before processing any file,
    # instead of discovering a missing directory only when saving at the end.
    os.makedirs(dest_path, exist_ok=True)

    # Rows are collected and stacked once. Assigning rows into an existing
    # csr_matrix changes its sparsity structure on every assignment, which
    # SciPy flags as expensive (SparseEfficiencyWarning).
    rows = []
    for f_id in tqdm(file_ids):
        with open(file=os.path.join(src_path, f_id + '.txt'), mode='r') as b_t_f:
            document = b_t_f.read().replace('\n', ' ').lower()
        # Cast to float so the saved matrix keeps the float64 dtype that
        # csr_matrix((rows, cols)) produces by default.
        row = csr_matrix(vect.transform(raw_documents=[document]), dtype=float)
        if row.shape != (1, total_features):
            raise ValueError(
                "Expected a (1, {}) row for '{}', got {}".format(total_features, f_id, row.shape))
        rows.append(row)

    if rows:
        feature_matrix = sparse.vstack(rows, format='csr')
    else:
        # No input files: keep an empty matrix with the right number of columns.
        feature_matrix = csr_matrix((0, total_features), dtype=float)

    sparse.save_npz(file=os.path.join(dest_path, final_file_name), matrix=feature_matrix)
    print("Done!")

The code below ran for __11+__ hours to create a unified `csr_matrix` of bi-gram features.

In [14]:
# Build the bi-gram matrix only when its .npz file is not already on disk.
final_file_name = 'all_bytes_bigram.npz'
if os.path.isfile(os.path.join(dest_path, final_file_name)):
    print("Data already exists!")
else:
    bag_of_words(
        src_path=src_path,
        dest_path=dest_path,
        file_ids=file_ids,
        total_features=total_features,
        vect=bigram_vect,
        final_file_name=final_file_name,
    )

Data already exists!
