In [1]:
# Install some stuff
%pip install --no-cache numpy tqdm pandas gensim sklearn matplotlib pytorch_lightning torchvision

Defaulting to user installation because normal site-packages is not writeable


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import some things
import pandas as pd
from tqdm import tqdm
import numpy as np

In [3]:
vulns = pd.read_csv('assets/big-vul.csv')
print(vulns.columns)
print(vulns['vulnerability_classification'].unique)

Index(['Unnamed: 0', 'authentication_required', 'availability_impact',
       'cve_id', 'cve_page', 'cwe_id', 'access_complexity',
       'confidentiality_impact', 'integrity_impact', 'publish_date', 'score',
       'summary', 'update_date', 'vulnerability_classification', 'ref_link',
       'commit_id', 'commit_message', 'files_changed', 'lang', 'project',
       'version_after_fix', 'version_before_fix'],
      dtype='object')
<bound method Series.unique of 0       DoS Exec Code Overflow 
1                    Exec Code 
2                    Exec Code 
3                           NaN
4                          DoS 
                 ...           
4427              DoS Overflow 
4428                    Bypass 
4429                        NaN
4430                        NaN
4431                       DoS 
Name: vulnerability_classification, Length: 4432, dtype: object>


In [4]:
# Lets get some assembly data
import os
binary_files = os.listdir('/bin/')
binary_files = binary_files + os.listdir('/usr/bin/')
# All the files in /bin/*, output to ./assembly_files in x86 intel flavor syntax
for binary in tqdm(['ssh', 'sftp', 'ls', 'cp', 'du', 'cat', 
'pwd', 'vi', 'sh', 'print', 'awk', 'ping', 'kill', 'df', 'rm', 'time', 'top', 'unzip', 'wget', 'ps',
'gzip', 'dd', 'more', 'mkdir', 'telnet', 'su', 'uname', 'umount', 'stat', 'strings', 'echo', 'grep']):
    os.system(f'objdump -M intel -D /bin/{binary} > assembly_files/{binary}.asm')

 25%|█████████████                                       | 8/32 [00:01<00:04,  5.13it/s]objdump: /bin/print: file format not recognized
100%|███████████████████████████████████████████████████| 32/32 [00:02<00:00, 14.76it/s]


In [5]:
 # 'Normalize' all the assembly data information
 #  Example:

 # Raw assembly
 #  32a:	36 2d 36 34 2e 73    	ss sub eax,0x732e3436
 #  330:	6f                   	outs   dx,DWORD PTR ds:[rsi]
 #  331:	2e 32 00             	cs xor al,BYTE PTR [rax]

 # Normalized assembly
 #             ss sub eax,0x732e3436
 #             outs   dx,DWORD PTR ds:[rsi]
 #             cs xor al,BYTE PTR [rax]

assembly_files = os.listdir('assembly_files')
for file in tqdm(assembly_files):
    normalized_file = open(f"assembly_files_norm/{file}", "w")
    lines = open(f"assembly_files/{file}", "r").readlines()
    for line in lines:
        try:
            word = line.split('\t')[2]
            word = ' '.join(word.split())
            word = word.split('#', 1)[0]
            normalized_file.write(word + '\n')
        except IndexError:
            pass
    normalized_file.close()

100%|███████████████████████████████████████████████| 1492/1492 [01:27<00:00, 17.05it/s]


In [7]:
histogram = {}
for file in os.listdir('assembly_files_norm'):
    lines = open(f"assembly_files_norm/{file}").readlines()
    for line in lines:
        size = len(line.split(' '))
        if histogram.get(size):
            if size == 17:
                print(line)
            histogram[size] = histogram[size] + 1
        else:
            histogram[size] = 1
print(histogram)

KeyboardInterrupt: 

In [None]:
# how large is corupus?
os.system('du -h ./assembly_files/')
os.system('du -h ./assembly_files_norm/')

In [11]:
# Generate a word2vec model
from gensim.models import Word2Vec
from gensim.models.word2vec import PathLineSentences
from gensim.models import KeyedVectors

if not os.path.exists('x862vec.model'):
    model = Word2Vec(sentences=PathLineSentences('assembly_files_norm'), vector_size=5, window=128, min_count=1, workers=4)
    model.save("x862vec.model")
else:
    model = KeyedVectors.load("x862vec.model", mmap="r")

In [12]:
import numpy as np

# Function in assembly
some_function = """
0000000000000368 <.note.gnu.build-id>:
 36a:	00 00                	add    BYTE PTR [rax],al
 36c:	14 00                	adc    al,0x0
 36e:	00 00                	add    BYTE PTR [rax],al
 370:	03 00                	add    eax,DWORD PTR [rax]
 372:	00 00                	add    BYTE PTR [rax],al
 374:	47                   	rex.RXB
 375:	4e 55                	rex.WRX push rbp
 377:	00 d2                	add    dl,dl
 379:	6e                   	outs   dx,BYTE PTR ds:[rsi]
 37a:	36 82                	ss (bad) 
 37c:	89 ac 83 53 2c 84 86 	mov    DWORD PTR [rbx+rax*4-0x797bd3ad],ebp
 383:	f2 d9 01             	repnz fld DWORD PTR [rcx]
 386:	35 08 af 80 2e       	xor    eax,0x2e80af08
 38b:	8f                   	.byte 0x8f
"""
from normalizer import normalize_assembly
data = list()
lines = some_function.split('\n')
for line in lines:
    try:
        word = line.split('\t')[2]
        word = ' '.join(word.split())
        word = word.split('#', 1)[0]
        data.append(word)
    except IndexError:
        pass

vectors = list()
sentence_vectors = list()
for sentence in data:
    # Get each raw token
    words = sentence.strip().split(' ')
    # How long is instruction? 
    size = len(words)
    # Concat zero arrays for null instruction placeholders as padding
    padding = 17 - size
    
    for word in words:
        vector = model.wv[word]
        vectors.append(vector)
    for pad in range(padding):
        vectors.append(np.zeros(16))
    # Concat all together
    sentence_vector = np.concatenate(vectors)
    sentence_vectors.append(sentence_vector)
    vectors = list()

# Create a matrix from each 'sentence' vector
matrix = np.stack(sentence_vectors)
print(matrix.shape)
print(matrix)

KeyError: "Key 'ds:rsi' not present"

In [None]:
# Train a model
import pytorch_lightning as pl
from assembly_dataset import AssemblyDataset, AssemblyLightDataset
from myconvmodel import ConvNDModel
from gensim.models import KeyedVectors
import pandas as pd

checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor='valid_loss',
    dirpath='./',
    filename='models-{epoch:02d}-{valid_loss:.2f}',
    save_top_k=3,
    mode='min') 

w2v_model = KeyedVectors.load("x862vec.model", mmap="r")
mod = ConvNDModel()
df = pd.read_csv("assets/train.csv")
dx = AssemblyLightDataset(
    w2v_model,
    16,
    path='assets/data',
    matrix_size=272,
)
trainer = pl.Trainer(max_epochs=6,callbacks=[checkpoint_callback], accelerator='cpu', devices=2)
trainer.fit(model=mod,datamodule=dx)
trainer.validate()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
----------------------------------------------------------------------------------------------------
distributed_backend=gloo
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name    | Type        | Params
----------------------------------------
0 | cnv     | Conv2d      | 26    
1 | rel     | ReLU        | 0     
2 | bn      | BatchNorm2d | 2     
3 | mxpool  | MaxPool2d   | 0     
4 | flat    | Flatten     | 0     
5 | fc1     | Linear      | 127   
6 | fc2     | Linear      | 2     
7 | fc3     | Linear      | 4     
8 | softmax | Softm

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]

ProcessRaisedException: 

-- Process 1 terminated with the following error:
Traceback (most recent call last):
  File "/home/pico/.local/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 133, in _wrapping_function
    results = function(*args, **kwargs)
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 735, in _fit_impl
    results = self._run(model, ckpt_path=self.ckpt_path)
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1166, in _run
    results = self._run_stage()
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1252, in _run_stage
    return self._run_train()
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1274, in _run_train
    self._run_sanity_check()
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1343, in _run_sanity_check
    val_loop.run()
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 155, in advance
    dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 143, in advance
    output = self._evaluation_step(**kwargs)
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 240, in _evaluation_step
    output = self.trainer._call_strategy_hook(hook_name, *kwargs.values())
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1704, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp_spawn.py", line 291, in validation_step
    return self.model(*args, **kwargs)
  File "/home/pico/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/pico/.local/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward
    output = self._run_ddp_forward(*inputs, **kwargs)
  File "/home/pico/.local/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 971, in _run_ddp_forward
    return module_to_run(*inputs, **kwargs)
  File "/home/pico/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/pico/.local/lib/python3.10/site-packages/pytorch_lightning/overrides/base.py", line 90, in forward
    return self.module.validation_step(*inputs, **kwargs)
  File "/home/pico/Documents/x862vec/myconvmodel.py", line 55, in validation_step
    img = x.view(-1, 1, 256, 12)
RuntimeError: shape '[-1, 1, 256, 12]' is invalid for input of size 2560
