## Import Cleaned OPCODES

In [22]:
import pandas as pd

# Load the cleaned opcode dataset. Both text columns are read as `object`
# so the all-digit `swc_label` strings stay text instead of being parsed
# as numbers.
opc = pd.read_csv(
    'CDS_opcode_sv1.csv',
    dtype={'swc_label': object, 'opcode': object},
)

opc

Unnamed: 0,_id,address,opcode,swc_label
0,240000,0x8da4feb3dd3643943c614ca793e8f59baae50544,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0000000000000000000000000000000000000
1,240001,0xe1521029d2591ba2a0f92b6e744a825f665f748b,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0000000000000000000000000000000000000
2,240002,0xe378696ce7ad55cfacc40337f476faf8fba9aff2,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0000000000000000000000000000000000000
3,240003,0x1872b33bb33e372d6f039a8b6af2e1be228bda23,PUSH1 PUSH1 MSTORE CALLDATASIZE ISZERO PUSH2 J...,0000000000100000000000000000000000000
4,240004,0x6ef34258c0c8ae7acc688311189dfc783ebb5bfa,ORIGIN PUSH20 EQ ISZERO ADDRESS JUMPI PUSH20 S...,0000000000000001000000000000000000000
...,...,...,...,...
253827,29967,0xa8ba70d5c2fae0b92d5ef8607bf7de09916c3027,PUSH1 PUSH1 MSTORE PUSH1 CALLDATALOAD PUSH29 S...,0000001000000000000000000000000000000
253828,29968,0x69677e5c602c2e495ec059dbda9f46413a192cc0,PUSH1 PUSH1 MSTORE CALLDATASIZE ISZERO PUSH2 J...,0000000000000000000000000000000000000
253829,29969,0xf82b842bd96282ed2c50a2e64c8de1a9d6862825,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0000000000000000000000000000000000000
253830,29970,0x401071960e0201430b5a882ce47300b534bd78ff,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0000000000100000000000000000000000000


## Tokenization

### 1. Import Required Libraries

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### 2. Define Hyper-Parameters

In [24]:
# --- Tokenizer settings ---
OPCODE_SIZE = 150        # passed to the Keras Tokenizer as `num_words`
OOV_TOKEN = '<OOV>'      # token substituted for out-of-vocabulary words

# --- Padding settings (consumed by pad_sequences) ---
OPCODE_SEQ_LEN = 1800    # `maxlen`: every sequence ends up with this length
PADDING_TYPE = 'post'    # short sequences are zero-padded at the end
TRUNC_TYPE = 'post'      # long sequences are cut at the end

### 3. Create Tokenizer Object

In [25]:
# Fit the opcode vocabulary on the raw opcode strings.
# `num_words` only limits which ids texts_to_sequences emits later; the
# fitted `word_index` still lists every token seen during fitting.
tokenizer = Tokenizer(
    num_words=OPCODE_SIZE,
    oov_token=OOV_TOKEN,
)
tokenizer.fit_on_texts(opc['opcode'])
word_index = tokenizer.word_index

In [26]:
# Inspect the fitted token -> integer id mapping.
word_index.items()

dict_items([('<OOV>', 1), ('push1', 2), ('swap1', 3), ('dup1', 4), ('push2', 5), ('pop', 6), ('dup2', 7), ('jumpdest', 8), ('add', 9), ('mstore', 10), ('and', 11), ('iszero', 12), ('jumpi', 13), ('swap2', 14), ('mload', 15), ('dup3', 16), ('jump', 17), ('sub', 18), ('dup4', 19), ('sload', 20), ('push20', 21), ('revert', 22), ('exp', 23), ('swap3', 24), ('push4', 25), ('eq', 26), ('sha3', 27), ('dup5', 28), ('mul', 29), ('calldataload', 30), ('callvalue', 31), ('div', 32), ('lt', 33), ('sstore', 34), ('dup6', 35), ('caller', 36), ('swap4', 37), ('assert', 38), ('fail', 39), ('return', 40), ('push32', 41), ('not', 42), ('gt', 43), ('dup7', 44), ('dup8', 45), ('returndatasize', 46), ('swap5', 47), ('stop', 48), ('calldatasize', 49), ('dup9', 50), ('extcodesize', 51), ('call', 52), ('or', 53), ('gas', 54), ('push3', 55), ('swap6', 56), ('shl', 57), ('returndatacopy', 58), ('log3', 59), ('dup10', 60), ('invalid', 61), ('push29', 62), ('swap7', 63), ('dup11', 64), ('address', 65), ('codecopy

In [27]:
# Serialize the fitted tokenizer to 'tokenizer.pickle'.
# NOTE: pickle files can execute code when loaded; only unpickle this file
# from a trusted location.
import pickle

with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

### 4. Tokenize and Pad all OPCODES

In [28]:
# Map every opcode string to its list of integer token ids
# (one variable-length list per row of `opc`).
tokenized_opcodes = tokenizer.texts_to_sequences(
    opc['opcode'],
)

In [29]:
# Sanity check: number of tokenized sequences produced.
len(tokenized_opcodes)

253832

In [30]:
# Pad / truncate the tokenized opcodes so every sequence has
# exactly OPCODE_SEQ_LEN entries.
padded_opcodes = pad_sequences(
    tokenized_opcodes,
    maxlen=OPCODE_SEQ_LEN,
    padding=PADDING_TYPE,
    truncating=TRUNC_TYPE,
)

In [31]:
# Compare the raw vs. padded length of one example sequence (row 1).
print(f"{len(tokenized_opcodes[1])} {len(padded_opcodes[1])}")

2404 1800


In [32]:
# Check the container type of a single padded row.
type(padded_opcodes[0])

numpy.ndarray

### 5. Save modified OPCODES as CSV

In [33]:
import numpy as np

# Convert the padded matrix into a list of per-row Python lists so that each
# DataFrame cell can hold one full sequence.
# `pad_sequences` already returns a NumPy array, so `.tolist()` is called on it
# directly; wrapping it in `np.array(...)` first would duplicate the whole
# matrix in memory without changing the result.
opcodes = padded_opcodes.tolist()

In [34]:
# Overwrite the raw opcode strings with the padded token-id lists.
# NOTE: this mutates `opc` in place, so the cell is not idempotent with respect
# to the earlier tokenizer cells: they read `opc['opcode']` as text and must
# not be re-run after this point without reloading the CSV.
opc['opcode'] = opcodes

In [35]:
# Display the frame; `opcode` now holds the padded token-id lists.
opc

Unnamed: 0,_id,address,opcode,swc_label
0,240000,0x8da4feb3dd3643943c614ca793e8f59baae50544,"[2, 2, 10, 2, 49, 33, 5, 13, 25, 62, 2, 30, 32...",0000000000000000000000000000000000000
1,240001,0xe1521029d2591ba2a0f92b6e744a825f665f748b,"[2, 2, 10, 2, 49, 33, 5, 13, 2, 30, 62, 3, 32,...",0000000000000000000000000000000000000
2,240002,0xe378696ce7ad55cfacc40337f476faf8fba9aff2,"[2, 2, 10, 2, 49, 33, 5, 13, 2, 30, 62, 3, 32,...",0000000000000000000000000000000000000
3,240003,0x1872b33bb33e372d6f039a8b6af2e1be228bda23,"[2, 2, 10, 49, 12, 5, 13, 25, 62, 2, 30, 32, 1...",0000000000100000000000000000000000000
4,240004,0x6ef34258c0c8ae7acc688311189dfc783ebb5bfa,"[92, 21, 26, 12, 65, 13, 21, 91, 0, 0, 0, 0, 0...",0000000000000001000000000000000000000
...,...,...,...,...
253827,29967,0xa8ba70d5c2fae0b92d5ef8607bf7de09916c3027,"[2, 2, 10, 2, 30, 62, 3, 32, 4, 25, 26, 5, 13,...",0000001000000000000000000000000000000
253828,29968,0x69677e5c602c2e495ec059dbda9f46413a192cc0,"[2, 2, 10, 49, 12, 5, 13, 25, 62, 2, 30, 32, 1...",0000000000000000000000000000000000000
253829,29969,0xf82b842bd96282ed2c50a2e64c8de1a9d6862825,"[2, 2, 10, 2, 49, 33, 5, 13, 2, 30, 62, 3, 32,...",0000000000000000000000000000000000000
253830,29970,0x401071960e0201430b5a882ce47300b534bd78ff,"[2, 2, 10, 2, 49, 33, 5, 13, 2, 30, 62, 3, 32,...",0000000000100000000000000000000000000


#### 5.1.A Merge Labels & Convert to Binary Labels

In [36]:
# Collapse the multi-label bit string into one binary target:
# 1 if any position is set ('1' occurs anywhere in the string), otherwise 0.

# Fail loudly on missing labels instead of silently assigning them a class.
assert opc['swc_label'].notna().all(), 'swc_label contains missing values'

# `astype(str)` keeps this cell re-runnable: once the column has already been
# converted to 0/1 integers, the `.str` accessor would otherwise raise.
# `regex=False` makes the intent explicit: a plain substring test for '1'.
opc['swc_label'] = (
    opc['swc_label']
    .astype(str)
    .str.contains('1', regex=False)
    .astype(int)
)

In [37]:
# Class balance of the full dataset: (rows with label 1, rows with label 0).
(
    int(opc['swc_label'].eq(1).sum()),
    int(opc['swc_label'].eq(0).sum()),
)

(54449, 199383)

In [38]:
# Display the frame; `swc_label` is now a single 0/1 integer per row.
opc

Unnamed: 0,_id,address,opcode,swc_label
0,240000,0x8da4feb3dd3643943c614ca793e8f59baae50544,"[2, 2, 10, 2, 49, 33, 5, 13, 25, 62, 2, 30, 32...",0
1,240001,0xe1521029d2591ba2a0f92b6e744a825f665f748b,"[2, 2, 10, 2, 49, 33, 5, 13, 2, 30, 62, 3, 32,...",0
2,240002,0xe378696ce7ad55cfacc40337f476faf8fba9aff2,"[2, 2, 10, 2, 49, 33, 5, 13, 2, 30, 62, 3, 32,...",0
3,240003,0x1872b33bb33e372d6f039a8b6af2e1be228bda23,"[2, 2, 10, 49, 12, 5, 13, 25, 62, 2, 30, 32, 1...",1
4,240004,0x6ef34258c0c8ae7acc688311189dfc783ebb5bfa,"[92, 21, 26, 12, 65, 13, 21, 91, 0, 0, 0, 0, 0...",1
...,...,...,...,...
253827,29967,0xa8ba70d5c2fae0b92d5ef8607bf7de09916c3027,"[2, 2, 10, 2, 30, 62, 3, 32, 4, 25, 26, 5, 13,...",1
253828,29968,0x69677e5c602c2e495ec059dbda9f46413a192cc0,"[2, 2, 10, 49, 12, 5, 13, 25, 62, 2, 30, 32, 1...",0
253829,29969,0xf82b842bd96282ed2c50a2e64c8de1a9d6862825,"[2, 2, 10, 2, 49, 33, 5, 13, 2, 30, 62, 3, 32,...",0
253830,29970,0x401071960e0201430b5a882ce47300b534bd78ff,"[2, 2, 10, 2, 49, 33, 5, 13, 2, 30, 62, 3, 32,...",1


#### 5.1.B Split the DataFrame into Training & Testing Sets

In [40]:
from sklearn.model_selection import train_test_split

# Keep only the model input (`opcode`) and the binary target (`swc_label`).
final_data = opc[['opcode', 'swc_label']]

# 70/30 split, stratified on the label so both parts keep the same class
# ratio; the fixed seed makes the shuffle reproducible.
train, test = train_test_split(
    final_data,
    test_size=0.3,
    random_state=69,
    shuffle=True,
    stratify=final_data['swc_label'],
)

In [41]:
# Display both splits to eyeball their contents and sizes.
train, test

(                                                   opcode  swc_label
 158905  [2, 2, 10, 2, 49, 33, 5, 13, 2, 30, 62, 3, 32,...          0
 166834  [2, 2, 10, 2, 49, 33, 5, 13, 2, 30, 2, 102, 4,...          0
 14325   [2, 2, 10, 2, 49, 33, 5, 13, 2, 30, 62, 3, 32,...          0
 239108  [2, 2, 10, 2, 49, 33, 5, 13, 2, 30, 62, 3, 32,...          0
 221937  [2, 2, 10, 25, 62, 2, 30, 32, 11, 25, 7, 26, 5...          0
 ...                                                   ...        ...
 223912  [2, 2, 10, 49, 12, 5, 13, 25, 2, 2, 23, 2, 30,...          1
 4305    [92, 21, 26, 12, 65, 13, 21, 91, 0, 0, 0, 0, 0...          1
 240781  [2, 2, 10, 2, 49, 33, 5, 13, 25, 62, 2, 30, 32...          0
 122921  [2, 2, 10, 2, 49, 33, 5, 13, 25, 62, 2, 30, 32...          0
 245156  [2, 2, 10, 2, 49, 33, 5, 13, 25, 62, 2, 30, 32...          0
 
 [177682 rows x 2 columns],
                                                    opcode  swc_label
 64057   [2, 2, 10, 2, 49, 33, 5, 13, 2, 30, 62, 3, 32,...  

In [43]:
# Class balance of the training split: (rows with label 1, rows with label 0).
(
    int(train['swc_label'].eq(1).sum()),
    int(train['swc_label'].eq(0).sum()),
)

(38114, 139568)

In [45]:
# Class balance of the test split: (rows with label 1, rows with label 0).
(
    int(test['swc_label'].eq(1).sum()),
    int(test['swc_label'].eq(0).sum()),
)

(16335, 59815)

#### 5.2 Save Train and Test DataSet as CSV

In [46]:
# Write the training split to disk.
# `to_csv` does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails on this cell.
import os

os.makedirs('./split_tt', exist_ok=True)

# NOTE: the `opcode` column holds Python lists, which CSV stores as their
# string form ("[2, 2, 10, ...]"); a reader has to parse them back into lists
# (e.g. with ast.literal_eval) after loading.
train.to_csv('./split_tt/train.csv', index=False)

In [47]:
# Write the test split to disk.
# `to_csv` does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails on this cell.
import os

os.makedirs('./split_tt', exist_ok=True)

# NOTE: the `opcode` column holds Python lists, which CSV stores as their
# string form ("[2, 2, 10, ...]"); a reader has to parse them back into lists
# (e.g. with ast.literal_eval) after loading.
test.to_csv('./split_tt/test.csv', index=False)