In [1]:
import numpy as np
import pandas as pd
import os
from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec
import itertools
from sklearn.cluster import KMeans
import matplotlib
import matplotlib.pyplot as plt

n = 2
w = 6

In [2]:
path = r"data\\"

In [3]:
abspath, folders, files = next(os.walk(path))
topFolders = [os.path.join(abspath, f) for f in folders]
topFiles = [f for f in files if not f.startswith('.')]

In [4]:
abspath, folders, files

('data\\\\', ['CeeInject', 'Challenge', 'Renos'], ['.DS_Store', 'top20.txt'])

In [5]:
exec(f'top20 = {open(os.path.join(abspath, topFiles[0])).read()}')

In [6]:
top20 = [x.lower() for x in top20]

In [7]:
data = dict(
    (folders[f], os.listdir(g)) for f, g in enumerate(topFolders))

In [8]:
def parse_files(folder, file):
    arr = np.array(open(os.path.join(path, folder, file)).read().split())
    return arr

class DataIterator():
    
    def __init__(self, malwareType):
        self.malwareType = malwareType
        self.filelist = data[malwareType]
        self.fileiter = iter(self.filelist)
        self.nextdata = parse_files(self.malwareType, next(self.fileiter))
        self.dataiter = iter(self.nextdata)
    
    def __iter__(self):
        return self
    
    def __next__(self):
        try:
            code = next(self.dataiter)
            return [code]
        except StopIteration:
            self.nextdata = parse_files(self.malwareType, next(self.fileiter))
            self.dataiter = iter(self.nextdata)
            return next(self)


In [9]:
combos = list(itertools.combinations(top20, 2))

In [10]:
def modelGen(malwareType):
    # Data Iterator for Word2Vec
    d = DataIterator(malwareType)
    
    # Word2Vec
    fname = f'{malwareType}_w2v.model'
    file = get_tmpfile(fname)
    model = Word2Vec(d, size=n, window=w, workers=8)
    model.save(fname)
    
    # DataFrame of cosine similarities
    model_sim = pd.DataFrame(columns=top20, index=top20)
    for i, j in combos:
        model_sim[i][j] = model_sim[j][i] = model.wv.n_similarity([i], [j])
    model_sim = model_sim.fillna(1.0)
    
    return {"file": file, "diter": d, "model": model, "cos_sim": model_sim}

In [11]:
folders[:1]

['CeeInject']

In [12]:
model_gen = dict((f, modelGen(f)) for f in folders)

In [23]:
word_vectors = model_gen['CeeInject']['model'].wv
ls = []
for i in list(word_vectors.vocab.keys()):
    ls.append((i, word_vectors[i]))

In [22]:
word_vectors2 = model_gen['Renos']['model'].wv
ls2 = []
for i in list(word_vectors2.vocab.keys()):
    ls2.append((i, word_vectors2[i]))

In [18]:
def color(x):
    if x == '-':
        return ""
    if x >= 0.95:
        return "background: red"
    if 0.75 <= x and x < 0.95:
        return "background: orange"
    if 0.50 <= x and x < 0.75:
        return "background: yellow"
    return ""

In [19]:
test_df = model_gen['CeeInject']['cos_sim']
test_df.style.apply(lambda x: [color(v) for v in x], axis = 1)

Unnamed: 0,push,mov,inc,dec,pop,add,xor,xchg,or,adc,sub,and,sbb,cmp,imul,out,outsl,jo,js,insl
push,1.0,-0.883398,-0.86965,-0.308219,0.224593,-0.999879,-0.872325,-0.190921,0.0493878,-0.697329,0.733672,-0.969203,0.721847,0.755711,-0.896467,-0.281806,-0.25529,-0.204908,0.234358,0.95
mov,-0.883398,1.0,0.536902,-0.173529,0.258248,0.89057,0.999733,0.628663,-0.511681,0.280132,-0.329692,0.740785,-0.313366,-0.360689,0.999589,-0.200685,0.678618,-0.277666,-0.662605,-0.692899
inc,-0.86965,0.536902,1.0,0.737677,-0.676373,0.861877,0.517251,-0.318552,0.450115,0.96027,-0.973487,0.964441,-0.969399,-0.98051,0.560863,0.718733,-0.255297,0.661391,0.27611,-0.980316
dec,-0.308219,-0.173529,0.737677,1.0,-0.996236,0.293405,-0.196255,-0.874971,0.934932,0.896786,-0.872554,0.533003,-0.880847,-0.855946,-0.145231,0.999618,-0.841108,0.994286,0.852588,-0.589857
pop,0.224593,0.258248,-0.676373,-0.996236,1.0,-0.209429,0.280518,0.913649,-0.962171,-0.855055,0.826921,-0.457649,0.836494,0.807901,0.230452,-0.998251,0.884828,-0.999797,-0.894679,0.517636
add,-0.999879,0.89057,0.861877,0.293405,-0.209429,1.0,0.879815,0.206146,-0.0648961,0.686112,-0.723029,0.96526,-0.71101,-0.745447,0.903241,0.266868,0.270277,0.189679,-0.249431,-0.945035
xor,-0.872325,0.999733,0.517251,-0.196255,0.280518,0.879815,1.0,0.646478,-0.531411,0.25786,-0.307774,0.725054,-0.291323,-0.339025,0.998659,-0.223284,0.695421,-0.299805,-0.679746,-0.676041
xchg,-0.190921,0.628663,-0.318552,-0.874971,0.913649,0.206146,0.646478,1.0,-0.989837,-0.570432,0.526931,-0.0566941,0.541507,0.498578,0.606115,-0.88802,0.99782,-0.921656,-0.999012,0.125132
or,0.0493878,-0.511681,0.450115,0.934932,-0.962171,-0.0648961,-0.531411,-0.989837,1.0,0.681437,-0.64244,0.198098,-0.655558,-0.616783,-0.486845,0.944383,-0.978293,0.967467,0.982538,-0.264951
adc,-0.697329,0.280132,0.96027,0.896786,-0.855055,0.686112,0.25786,-0.570432,0.681437,1.0,-0.998646,0.852364,-0.999394,-0.996383,0.307532,0.884214,-0.51498,0.844431,0.533365,-0.886269


In [20]:
test_df2 = model_gen['Renos']['cos_sim']
test_df2.style.apply(lambda x: [color(v) for v in x], axis = 1)

Unnamed: 0,push,mov,inc,dec,pop,add,xor,xchg,or,adc,sub,and,sbb,cmp,imul,out,outsl,jo,js,insl
push,1.0,-0.883398,-0.86965,-0.308219,0.224593,-0.999879,-0.872325,-0.190921,0.0493878,-0.697329,0.733672,-0.969203,0.721847,0.755711,-0.896467,-0.281806,-0.25529,-0.204908,0.234358,0.95
mov,-0.883398,1.0,0.536902,-0.173529,0.258248,0.89057,0.999733,0.628663,-0.511681,0.280132,-0.329692,0.740785,-0.313366,-0.360689,0.999589,-0.200685,0.678618,-0.277666,-0.662605,-0.692899
inc,-0.86965,0.536902,1.0,0.737677,-0.676373,0.861877,0.517251,-0.318552,0.450115,0.96027,-0.973487,0.964441,-0.969399,-0.98051,0.560863,0.718733,-0.255297,0.661391,0.27611,-0.980316
dec,-0.308219,-0.173529,0.737677,1.0,-0.996236,0.293405,-0.196255,-0.874971,0.934932,0.896786,-0.872554,0.533003,-0.880847,-0.855946,-0.145231,0.999618,-0.841108,0.994286,0.852588,-0.589857
pop,0.224593,0.258248,-0.676373,-0.996236,1.0,-0.209429,0.280518,0.913649,-0.962171,-0.855055,0.826921,-0.457649,0.836494,0.807901,0.230452,-0.998251,0.884828,-0.999797,-0.894679,0.517636
add,-0.999879,0.89057,0.861877,0.293405,-0.209429,1.0,0.879815,0.206146,-0.0648961,0.686112,-0.723029,0.96526,-0.71101,-0.745447,0.903241,0.266868,0.270277,0.189679,-0.249431,-0.945035
xor,-0.872325,0.999733,0.517251,-0.196255,0.280518,0.879815,1.0,0.646478,-0.531411,0.25786,-0.307774,0.725054,-0.291323,-0.339025,0.998659,-0.223284,0.695421,-0.299805,-0.679746,-0.676041
xchg,-0.190921,0.628663,-0.318552,-0.874971,0.913649,0.206146,0.646478,1.0,-0.989837,-0.570432,0.526931,-0.0566941,0.541507,0.498578,0.606115,-0.88802,0.99782,-0.921656,-0.999012,0.125132
or,0.0493878,-0.511681,0.450115,0.934932,-0.962171,-0.0648961,-0.531411,-0.989837,1.0,0.681437,-0.64244,0.198098,-0.655558,-0.616783,-0.486845,0.944383,-0.978293,0.967467,0.982538,-0.264951
adc,-0.697329,0.280132,0.96027,0.896786,-0.855055,0.686112,0.25786,-0.570432,0.681437,1.0,-0.998646,0.852364,-0.999394,-0.996383,0.307532,0.884214,-0.51498,0.844431,0.533365,-0.886269
