# Preprocessing

In [1]:
import pygments.lexers
import numpy as np
import pandas as pd
from gensim.models import Word2Vec

import warnings
warnings.simplefilter('ignore')

import sys
sys.path.append('./reentrancyContracts')
sys.path.append('./reentrancyContractLabels')
import reentrancyContracts
import reentrancyContractLabels

### Extract Global Variables

In [2]:
# takes in full contract and pull out names of global variables (with reasonable accuracy)
def getGlobalVariables(contract):
    lexer = pygments.lexers.get_lexer_by_name('Solidity')
    tokens = list(pygments.lex(contract, lexer))
    globalVarList = []
    openBracketsCount = 0
    openParenthesisCount = 0
    i = 0
    while i < len(tokens):
        if str(tokens[i][0]) == 'Token.Text.Whitespace' or str(tokens[i][0]) == 'Token.Comment.Single': None
        elif(tokens[i][1] == '{'): openBracketsCount += 1
        elif(tokens[i][1] == '}'): openBracketsCount -= 1
        elif(tokens[i][1] == '('): openParenthesisCount += 1
        elif(tokens[i][1] == ')'): openParenthesisCount -= 1
        elif(str(tokens[i][0]) == 'Token.Keyword.Type' and isType(tokens[i][1]) and openBracketsCount == 1 and openParenthesisCount == 0):
            index = 0
            potentialGlobal = ""
            include = True
            while True:
                if(tokens[i+index][1] == "constant"): 
                    include = False
                if(tokens[i+index][1] == ';'):
                    if include:
                        globalVarList.append(potentialGlobal)
                    break
                if(tokens[i+index][0] == pygments.token.Name.Variable or tokens[i+index][0] == pygments.token.Text):
                    potentialGlobal = tokens[i+index][1]
                    
                index += 1
            i += (index-1)
        i += 1
        
    return globalVarList

### Extract Modifiers

In [3]:
# extracts names of modifiers (function guards) for tokenization process
def getModifiers(contract):
    lexer = pygments.lexers.get_lexer_by_name('Solidity')
    tokens = list(pygments.lex(contract, lexer))
    filteredTokens = []
    for token in tokens:
        if str(token[0]) != 'Token.Text.Whitespace':
            filteredTokens.append(token)
            
    modifierNameList = []
    openBracketsCount = 0
    openParenthesisCount = 0
    i = 0
    while i < len(filteredTokens):
        if(str(filteredTokens[i][0]) == 'Token.Keyword.Type' and str(filteredTokens[i][1]) == 'modifier'):
            modifierNameList.append(filteredTokens[i+1][1])
        
        i += 1
        
    return modifierNameList

In [4]:
getModifiers(reentrancyContracts.contracts[22])

['onlyOwner', 'isOpenToPublic', 'onlyRealPeople', 'onlyPlayers']

### Extract Function Source Code

In [5]:
# given contract source code, extracts lists of basic tokens for individual functions
def getFunctions(contract):
    lexer = pygments.lexers.get_lexer_by_name('Solidity')
    tokens = list(pygments.lex(contract, lexer))
    functionList = []
    i = 0
    while i < len(tokens):
        if str(tokens[i][0]) == "Token.Keyword.Type" and tokens[i][1] == 'function':
            currFunctionNum = len(functionList)
            functionList.append([tokens[i]])
            i += 1
            
            oneLiner = False
            while tokens[i][1] != '{':
                # the one-liner case
                if str(tokens[i][1]) == ';':
                    i += 1
                    oneLiner = True
                    break
                if str(tokens[i][0]) != 'Token.Text.Whitespace' and str(tokens[i][0]) != 'Token.Comment.Single':
                    functionList[currFunctionNum].append(tokens[i])
                i += 1
                
            functionList[currFunctionNum].append(tokens[i])
            openBracketsCount = 1
            i += 1
            
            while openBracketsCount > 0:
                if(tokens[i][1] == '{'): openBracketsCount += 1
                elif(tokens[i][1] == '}'): openBracketsCount -= 1
                
                if str(tokens[i][0]) != 'Token.Text.Whitespace' and str(tokens[i][0]) != 'Token.Comment.Single':
                    functionList[currFunctionNum].append(tokens[i])
                i += 1
        else:
            i += 1
    return functionList

### Tokenize Functions

In [6]:
# given a function's basic tokenization and a list of modifier and global variable names, fully tokenize function
def tokenize(parsedFunction, globalVariablesList, modifierNameList):
    txKeywords = ['call', 'value', 'transfer', 'send']
    tokenList = []
    for tup in parsedFunction:
        if tokenList and tokenList[-1] == "function":
            tokenList.append("Token.FunctionName")
        elif str(tup[1]) in txKeywords:
            tokenList.append(tup[1])
        elif 'Token.Literal' in str(tup[0]):
            tokenList.append("Token.Constant")
        elif tup[1] in globalVariablesList:
            tokenList.append("Token.GlobalVariable")
        elif tup[1] in modifierNameList:
            tokenList.append("Token.ModifierName")
        elif tup[1].startswith("\""):
            tokenList.append("Token.String")
        elif str(tup[0]) == 'Token.Name.Variable' or str(tup[0]) == 'Token.Text':
            tokenList.append("Token.LocalVariable")
        elif str(tup[0]) != 'Token.Text.Whitespace' and str(tup[0]) != 'Token.Comment.Single':
            tokenList.append(tup[1])
    return tokenList

In [7]:
# checks if keyword is for variable declaration (helper for global variable parser)
def isType(word):
    keywords = ['address', 'bool', 'byte', 'bytes', 'int', 'string', 'uint', 'mapping']
    for kw in keywords:
        if kw in word:
            return True
    return False

In [8]:
# processing pipeline:
# - receives contract
# - extracts list of global variable names
# - extracts list of modifier names
# - extracts list of function source code
# - creates tokenized list of functions

def tokenizeContractFunctions(contract):
    globalVariablesList = getGlobalVariables(contract)
    modifierNameList = getModifiers(contract)
    functionList = getFunctions(contract)
    
    tokenizedFunctionList = []
    for function in functionList:
        tokenizedFunction = tokenize(function, globalVariablesList, modifierNameList)
        tokenizedFunctionList.append(tokenizedFunction)
    
    return tokenizedFunctionList

### Create DataFrame (currently not in use)

In [9]:
def createTuples(tokenizedFunctions, labels):
    tuples = []
    for i in range(len(tokenizedFunctions)):
        
        tuples.append([tokenizedFunctions[i], labels[i]])
    return tuples

In [10]:
def createDataFrame(contracts, contractLabels):
    df = pd.DataFrame(columns=['tokenized_function', 'label'])
    for i in range(len(contracts)):
        tokenizedFunctions = tokenizeContractFunctions(contracts[i])
        tuples = createTuples(tokenizedFunctions, contractLabels[i])
        appendable = pd.DataFrame(tuples, columns=['tokenized_function', 'label'])
        df = df.append(appendable, ignore_index=True)
    
    return df

### Create Matching Lists of Functions and Corresponding Labels

In [11]:
def returnData(contracts, contractLabels):
    functions = []
    labels = []
    for i in range(len(contracts)):
        tokenizedFunctions = tokenizeContractFunctions(contracts[i])
        functions.extend(tokenizedFunctions)
        labels.extend(contractLabels[i])
    
    return [functions, labels]

### Create Embeddings for Tokens

In [12]:
# creates Word2Vec mapping from token to vector
def get_w2v_mapping(tokenizedFunctionsDf, token_dim):
    w2v = Word2Vec(tokenizedFunctionsDf, min_count=1, size=token_dim, workers=3, window=3, sg=1)
    return w2v

In [13]:
# given list of tokenized functions, convert into list of list of embeddings
def vectorize_functions(tokenized_functions, w2v_mapping):
    embedding_list = []
    for i in range(len(tokenized_functions)):
        embedding = []
        for token in tokenized_functions[i]:
            embedding.append(w2v_mapping[token])
        
        embedding_list.append(embedding)
    return embedding_list

In [14]:
# given list of embeddings, add padding
def pad_embeddings(embeddings, max_length, token_dim):
    padded_embeddings = []
    for embedding in embeddings:
        zero_padding_cnt = max_length - len(embedding)
        pad = np.zeros((1, token_dim))
        for i in range(zero_padding_cnt):
            embedding = np.concatenate((embedding, pad), axis=0)
        padded_embeddings.append(embedding)
    return padded_embeddings

In [15]:
# compiles Word2Vec mapping, creates embedded function representations, and applies padding
def getFunctionEmbeddings(tokenizedFunctions, max_length, token_dim):
    w2v_mapping = get_w2v_mapping(tokenizedFunctions, token_dim)
    vectorized_fns = vectorize_functions(tokenizedFunctions, w2v_mapping)
    padded_embeddings = pad_embeddings(vectorized_fns, max_length, token_dim)
    return [w2v_mapping, padded_embeddings]

In [16]:
length = 500
token_dim = 15

[functions, labels] = returnData(reentrancyContracts.contracts, reentrancyContractLabels.labels)
[w2v_mapping, fn_embeddings] = getFunctionEmbeddings(functions, length, token_dim)
fn_embeddings = np.array(fn_embeddings)
print("X shape: ", fn_embeddings.shape)

X shape:  (432, 500, 15)


# Models

In [23]:
import warnings
warnings.simplefilter('ignore')

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import classification_report

import tensorflow as tf
from keras import layers, regularizers, Input
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, LSTM, Flatten, Embedding, Dropout
from keras.metrics import Accuracy, Precision, Recall

In [24]:
# compile formatted data and create splits
[functions, labels] = returnData(reentrancyContracts.contracts, reentrancyContractLabels.labels)
[w2v_mapping, X_train] = getFunctionEmbeddings(functions, 500, 15)

X = np.array(X_train)
y = to_categorical(np.array(labels))
X_train = X[30:]
y_train = y[30:]
X_test = X[:30]
y_test = y[:30]

In [25]:
X_train.shape

(402, 500, 15)

In [26]:
y_train.shape

(402, 2)

### BLSTM

In [39]:
model_1 = Sequential()
model_1.add(Bidirectional(LSTM(300)))
model_1.add(Dense(2, activation='softmax'))

model_1.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

print("Training...")
history = model_1.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=15, verbose=0)

Training...


In [40]:
print("Evaluate on train data")
results = model_1.evaluate(X_train, y_train)
print("train loss, train acc:", results)

Evaluate on train data
train loss, train acc: [0.47703068603330584, 0.7786069512367249]


In [41]:
print("Evaluate on test data")
results = model_1.evaluate(X_test, y_test, batch_size=128)
print("test loss, test acc:", results)

Evaluate on test data
test loss, test acc: [0.8837546110153198, 0.46666666865348816]


### CNN

In [42]:
vocab_size = len(w2v_mapping.wv.vocab)
embedding_dim = 15
embeddings_per_example = 500

metrics = [
    Precision(name='precision'),
    Recall(name='recall')
]

model_2 = Sequential()
model_2.add(layers.Conv1D(128, 5, activation='relu'))
model_2.add(layers.GlobalMaxPooling1D())
model_2.add(layers.Dense(10, activation='relu'))
model_2.add(layers.Dense(2, activation='sigmoid'))

model_2.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

print("Training...")
history = model_2.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=30, epochs=300, verbose=0)

Training...


In [43]:
print("Evaluate on train data")
results = model_2.evaluate(X_train, y_train)
print("train loss, train acc:", results)

Evaluate on train data
train loss, train acc: [0.16532784878781334, 0.9402984976768494]


In [44]:
print("Evaluate on test data")
results = model_2.evaluate(X_test, y_test, batch_size=128)
print("test loss, test acc:", results)

Evaluate on test data
test loss, test acc: [1.1498416662216187, 0.800000011920929]
