## Import Cleaned OPCODES

In [3]:
import pandas as pd

# Load the cleaned opcode dataset. 'swc_label' and 'opcode' are forced to
# object (string) dtype so the all-digit label strings keep their leading
# zeros instead of being parsed as integers.
opc = pd.read_csv('CDS_opcode_sv1.csv', dtype={'swc_label': object, 'opcode': object})

# Preview the loaded frame.
opc

Unnamed: 0,_id,address,opcode,swc_label
0,240000,0x8da4feb3dd3643943c614ca793e8f59baae50544,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0000000000000000000000000000000000000
1,240001,0xe1521029d2591ba2a0f92b6e744a825f665f748b,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0000000000000000000000000000000000000
2,240002,0xe378696ce7ad55cfacc40337f476faf8fba9aff2,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0000000000000000000000000000000000000
3,240003,0x1872b33bb33e372d6f039a8b6af2e1be228bda23,PUSH1 PUSH1 MSTORE CALLDATASIZE ISZERO PUSH2 J...,0000000000100000000000000000000000000
4,240004,0x6ef34258c0c8ae7acc688311189dfc783ebb5bfa,ORIGIN PUSH20 EQ ISZERO ADDRESS JUMPI PUSH20 S...,0000000000000001000000000000000000000
...,...,...,...,...
19995,230127,0xee623e3a60d4f67c6490749bc8714a4554caec2a,ORIGIN PUSH20 EQ ISZERO ADDRESS JUMPI PUSH20 S...,0000000000000001000000000000000000000
19996,230128,0x2835e490c49f715e0644db9f4682bdf66a1e0901,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0000000000000000000000000000000000000
19997,230129,0xef5173286ed4eb1be6a3b517629cd38b48e6ca4e,PUSH1 PUSH1 MSTORE CALLDATASIZE ISZERO PUSH1 J...,0000100000000100000000000000000000000
19998,230130,0xe7f74f8e45cbe8ba091b4bffca5cda624b505f1f,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0000000000000000000000000000000000000


## Tokenization

### 1. Import Required Libraries

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### 2. Define Hyper-Parameters

In [5]:
# Vocabulary cap passed to the Tokenizer as num_words.
OPCODE_SIZE = 150
# Fixed length that every tokenized opcode sequence is padded/truncated to.
OPCODE_SEQ_LEN = 1800
# 'post': sequences longer than OPCODE_SEQ_LEN are cut at the end.
TRUNC_TYPE = 'post'
# 'post': shorter sequences are filled with zeros at the end.
PADDING_TYPE = 'post'
# Placeholder token handed to the Tokenizer for out-of-vocabulary words.
OOV_TOKEN = '<OOV>'

### 3. Create Tokenizer Object

In [6]:
# Build the opcode vocabulary from every row's opcode string.
# NOTE(review): the tokenizer is fitted on the full dataset, before the
# train/test split performed later in this notebook — confirm that is intended.
tokenizer = Tokenizer(num_words = OPCODE_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(opc['opcode'])
# Mapping of token -> integer id learned by the fit.
word_index = tokenizer.word_index

In [7]:
# Inspect the learned token -> id mapping.
word_index.items()

dict_items([('<OOV>', 1), ('push1', 2), ('swap1', 3), ('dup1', 4), ('push2', 5), ('pop', 6), ('dup2', 7), ('jumpdest', 8), ('add', 9), ('mstore', 10), ('and', 11), ('iszero', 12), ('jumpi', 13), ('swap2', 14), ('mload', 15), ('dup3', 16), ('jump', 17), ('sub', 18), ('dup4', 19), ('sload', 20), ('push20', 21), ('revert', 22), ('exp', 23), ('swap3', 24), ('push4', 25), ('eq', 26), ('sha3', 27), ('dup5', 28), ('mul', 29), ('calldataload', 30), ('callvalue', 31), ('div', 32), ('lt', 33), ('sstore', 34), ('dup6', 35), ('caller', 36), ('swap4', 37), ('assert', 38), ('fail', 39), ('return', 40), ('not', 41), ('push32', 42), ('dup7', 43), ('gt', 44), ('dup8', 45), ('returndatasize', 46), ('swap5', 47), ('stop', 48), ('dup9', 49), ('calldatasize', 50), ('extcodesize', 51), ('call', 52), ('or', 53), ('gas', 54), ('push3', 55), ('swap6', 56), ('shl', 57), ('invalid', 58), ('returndatacopy', 59), ('log3', 60), ('dup10', 61), ('push29', 62), ('swap7', 63), ('dup11', 64), ('codecopy', 65), ('address

In [8]:
# Persist the fitted tokenizer to disk so the same vocabulary can be reloaded.
import pickle

with open('tokenizer.pickle', 'wb') as pickle_file:
    pickle.dump(tokenizer, pickle_file)

### 4. Tokenize and Pad all OPCODES

In [9]:
# Tokenize OPCODES: turn each opcode string into a list of integer token ids
tokenized_opcodes = tokenizer.texts_to_sequences(opc['opcode'])

In [10]:
# Sanity check: number of tokenized sequences produced.
len(tokenized_opcodes)

20000

In [11]:
# Pad tokenized OPCODES: force every sequence to exactly OPCODE_SEQ_LEN ids,
# padding and truncating on the side chosen by PADDING_TYPE / TRUNC_TYPE.
padded_opcodes = pad_sequences(tokenized_opcodes, maxlen=OPCODE_SEQ_LEN, padding=PADDING_TYPE, truncating=TRUNC_TYPE)

In [12]:
# Compare the length of sample row 1 before and after padding/truncation.
print(len(tokenized_opcodes[1]), len(padded_opcodes[1]))

2404 1800


In [13]:
# Check the container type of a single padded row.
type(padded_opcodes[0])

numpy.ndarray

### 5. Save modified OPCODES as CSV

In [14]:
import numpy as np
# Convert the padded id matrix into nested Python lists so that each row can
# be stored as a single DataFrame cell.
# np.asarray is used instead of np.array: when the input is already an
# ndarray it is returned as-is, avoiding a redundant full copy of the data.
opcodes = np.asarray(padded_opcodes).tolist()

In [15]:
# Replace the raw opcode strings with the padded token-id lists (one list per row).
# NOTE(review): this overwrites opc['opcode'] in place, so the earlier
# tokenization cells presumably cannot be re-run without reloading the CSV first.
opc['opcode'] = opcodes

In [16]:
# Preview the frame now that 'opcode' holds padded token-id lists.
opc

Unnamed: 0,_id,address,opcode,swc_label
0,240000,0x8da4feb3dd3643943c614ca793e8f59baae50544,"[2, 2, 10, 2, 50, 33, 5, 13, 25, 62, 2, 30, 32...",0000000000000000000000000000000000000
1,240001,0xe1521029d2591ba2a0f92b6e744a825f665f748b,"[2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...",0000000000000000000000000000000000000
2,240002,0xe378696ce7ad55cfacc40337f476faf8fba9aff2,"[2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...",0000000000000000000000000000000000000
3,240003,0x1872b33bb33e372d6f039a8b6af2e1be228bda23,"[2, 2, 10, 50, 12, 5, 13, 25, 62, 2, 30, 32, 1...",0000000000100000000000000000000000000
4,240004,0x6ef34258c0c8ae7acc688311189dfc783ebb5bfa,"[95, 21, 26, 12, 66, 13, 21, 97, 0, 0, 0, 0, 0...",0000000000000001000000000000000000000
...,...,...,...,...
19995,230127,0xee623e3a60d4f67c6490749bc8714a4554caec2a,"[95, 21, 26, 12, 66, 13, 21, 97, 0, 0, 0, 0, 0...",0000000000000001000000000000000000000
19996,230128,0x2835e490c49f715e0644db9f4682bdf66a1e0901,"[2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...",0000000000000000000000000000000000000
19997,230129,0xef5173286ed4eb1be6a3b517629cd38b48e6ca4e,"[2, 2, 10, 50, 12, 2, 13, 8, 2, 2, 2, 2, 31, 4...",0000100000000100000000000000000000000
19998,230130,0xe7f74f8e45cbe8ba091b4bffca5cda624b505f1f,"[2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...",0000000000000000000000000000000000000


#### 5.1.A Merge Labels & Convert to Binary Labels

In [17]:
# Collapse the multi-digit SWC label string into one binary flag:
# 1 if any position of the label is '1', otherwise 0.
#
# Fail fast on missing labels rather than silently mapping them to 0.
assert opc['swc_label'].notna().all(), 'swc_label contains missing values'
# astype(str) keeps this cell safe to re-run: once the column holds ints, the
# bare .str accessor raises AttributeError, whereas '0'/'1' map back onto the
# same 0/1 values. regex=False matches the literal character '1' instead of
# compiling it as a regular expression.
opc['swc_label'] = opc['swc_label'].astype(str).str.contains('1', regex=False).astype(int)

In [18]:
# Class balance of the full dataset as (rows with label 1, rows with label 0).
tuple(len(opc[opc['swc_label'] == flag]) for flag in (1, 0))

(4331, 15669)

In [19]:
# Preview the frame after 'swc_label' was converted to a 0/1 flag.
opc

Unnamed: 0,_id,address,opcode,swc_label
0,240000,0x8da4feb3dd3643943c614ca793e8f59baae50544,"[2, 2, 10, 2, 50, 33, 5, 13, 25, 62, 2, 30, 32...",0
1,240001,0xe1521029d2591ba2a0f92b6e744a825f665f748b,"[2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...",0
2,240002,0xe378696ce7ad55cfacc40337f476faf8fba9aff2,"[2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...",0
3,240003,0x1872b33bb33e372d6f039a8b6af2e1be228bda23,"[2, 2, 10, 50, 12, 5, 13, 25, 62, 2, 30, 32, 1...",1
4,240004,0x6ef34258c0c8ae7acc688311189dfc783ebb5bfa,"[95, 21, 26, 12, 66, 13, 21, 97, 0, 0, 0, 0, 0...",1
...,...,...,...,...
19995,230127,0xee623e3a60d4f67c6490749bc8714a4554caec2a,"[95, 21, 26, 12, 66, 13, 21, 97, 0, 0, 0, 0, 0...",1
19996,230128,0x2835e490c49f715e0644db9f4682bdf66a1e0901,"[2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...",0
19997,230129,0xef5173286ed4eb1be6a3b517629cd38b48e6ca4e,"[2, 2, 10, 50, 12, 2, 13, 8, 2, 2, 2, 2, 31, 4...",1
19998,230130,0xe7f74f8e45cbe8ba091b4bffca5cda624b505f1f,"[2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...",0


#### 5.1.B Split the DataFrame into Training & Testing Sets

In [20]:
from sklearn.model_selection import train_test_split

# Keep only the model input ('opcode') and the target ('swc_label').
final_data = opc[['opcode', 'swc_label']]

# 70/30 shuffled split with a fixed seed; stratifying on the label keeps the
# class ratio the same in both partitions.
train, test = train_test_split(
    final_data,
    test_size=0.3,
    random_state=69,
    shuffle=True,
    stratify=final_data['swc_label'],
)

In [21]:
# Preview both partitions produced by the split.
train, test

(                                                  opcode  swc_label
 386    [2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...          0
 11807  [2, 2, 10, 31, 4, 12, 5, 13, 2, 4, 22, 8, 6, 2...          0
 1006   [2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...          0
 15101  [2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...          0
 2145   [2, 2, 10, 2, 30, 62, 3, 32, 25, 11, 4, 25, 26...          1
 ...                                                  ...        ...
 12618  [2, 2, 10, 2, 50, 33, 5, 13, 25, 62, 2, 30, 32...          0
 18541  [2, 2, 10, 2, 30, 62, 3, 32, 25, 11, 4, 25, 26...          1
 13532  [2, 2, 10, 50, 12, 5, 13, 25, 62, 2, 30, 32, 1...          0
 13055  [2, 2, 10, 2, 50, 33, 5, 13, 25, 62, 2, 30, 32...          0
 19226  [21, 51, 6, 21, 51, 6, 21, 51, 6, 21, 51, 6, 2...          0
 
 [14000 rows x 2 columns],
                                                   opcode  swc_label
 11795  [2, 2, 10, 2, 50, 33, 5, 13, 25, 62, 2, 30, 32...          0
 9017

In [22]:
# Class balance of the training split as (rows with label 1, rows with label 0).
tuple(len(train[train['swc_label'] == flag]) for flag in (1, 0))

(3032, 10968)

In [23]:
# Class balance of the test split as (rows with label 1, rows with label 0).
tuple(len(test[test['swc_label'] == flag]) for flag in (1, 0))

(1299, 4701)

#### 5.2 Save Train and Test Datasets as CSV

In [24]:
# Write the training split to disk (row index omitted).
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; exist_ok=True keeps this cell safe to re-run.
# NOTE(review): the 'opcode' cells hold Python lists, so they are presumably
# written as their text form and need parsing when the CSV is read back —
# confirm in the consumer.
import os

os.makedirs('./split_tt', exist_ok=True)
train.to_csv('./split_tt/train.csv', index=False)

In [25]:
# Write the test split to disk (row index omitted).
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; exist_ok=True keeps this cell safe to re-run.
# NOTE(review): the 'opcode' cells hold Python lists, so they are presumably
# written as their text form and need parsing when the CSV is read back —
# confirm in the consumer.
import os

os.makedirs('./split_tt', exist_ok=True)
test.to_csv('./split_tt/test.csv', index=False)