# Preprocessing

In [2]:
import pygments.lexers
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

import warnings
warnings.simplefilter('ignore')

import sys
sys.path.append('./reentrancyContracts')
sys.path.append('./reentrancyContractLabels')
import reentrancyContracts
import reentrancyContractLabels

### Extract Global Variables

In [3]:
# takes in full contract and pull out names of global variables (with reasonable accuracy)
def getGlobalVariables(contract):
    lexer = pygments.lexers.get_lexer_by_name('Solidity')
    tokens = list(pygments.lex(contract, lexer))
    globalVarList = []
    openBracketsCount = 0
    openParenthesisCount = 0
    i = 0
    while i < len(tokens):
        if str(tokens[i][0]) == 'Token.Text.Whitespace' or str(tokens[i][0]) == 'Token.Comment.Single': None
        elif(tokens[i][1] == '{'): openBracketsCount += 1
        elif(tokens[i][1] == '}'): openBracketsCount -= 1
        elif(tokens[i][1] == '('): openParenthesisCount += 1
        elif(tokens[i][1] == ')'): openParenthesisCount -= 1
        elif(str(tokens[i][0]) == 'Token.Keyword.Type' and isType(tokens[i][1]) and openBracketsCount == 1 and openParenthesisCount == 0):
            index = 0
            potentialGlobal = ""
            include = True
            while True:
                if(tokens[i+index][1] == "constant"): 
                    include = False
                if(tokens[i+index][1] == ';'):
                    if include:
                        globalVarList.append(potentialGlobal)
                    break
                if(tokens[i+index][0] == pygments.token.Name.Variable or tokens[i+index][0] == pygments.token.Text):
                    potentialGlobal = tokens[i+index][1]
                    
                index += 1
            i += (index-1)
        i += 1
        
    return globalVarList

### Extract Modifiers

In [4]:
def getModifiers(contract):
    lexer = pygments.lexers.get_lexer_by_name('Solidity')
    tokens = list(pygments.lex(contract, lexer))
    filteredTokens = []
    for token in tokens:
        if str(token[0]) != 'Token.Text.Whitespace':
            filteredTokens.append(token)
            
    modifierNameList = []
    openBracketsCount = 0
    openParenthesisCount = 0
    i = 0
    while i < len(filteredTokens):
        if(str(filteredTokens[i][0]) == 'Token.Keyword.Type' and str(filteredTokens[i][1]) == 'modifier'):
            modifierNameList.append(filteredTokens[i+1][1])
        
        i += 1
        
    return modifierNameList

In [5]:
getModifiers(reentrancyContracts.contracts[22])

['onlyOwner', 'isOpenToPublic', 'onlyRealPeople', 'onlyPlayers']

### Tokenize Functions

In [6]:
# tokenize a given function
def tokenize(parsedFunction, globalVariablesList, modifierNameList):
    txKeywords = ['call', 'value', 'transfer', 'send']
    tokenList = []
    for tup in parsedFunction:
        if tokenList and tokenList[-1] == "function":
            tokenList.append("Token.FunctionName")
        elif str(tup[1]) in txKeywords:
            tokenList.append(tup[1])
        elif 'Token.Literal' in str(tup[0]):
            tokenList.append("Token.Constant")
        elif tup[1] in globalVariablesList:
            tokenList.append("Token.GlobalVariable")
        elif tup[1] in modifierNameList:
            tokenList.append("Token.ModifierName")
        elif tup[1].startswith("\""):
            tokenList.append("Token.String")
        elif str(tup[0]) == 'Token.Name.Variable' or str(tup[0]) == 'Token.Text':
            tokenList.append("Token.LocalVariable")
        elif str(tup[0]) != 'Token.Text.Whitespace' and str(tup[0]) != 'Token.Comment.Single':
            tokenList.append(tup[1])
    return tokenList

In [7]:
# checks if keyword is for variable declaration (helper for global variable parser)
def isType(word):
    keywords = ['address', 'bool', 'byte', 'bytes', 'int', 'string', 'uint', 'mapping']
    for kw in keywords:
        if kw in word:
            return True
    return False

In [8]:
def getFunctions(contract):
    lexer = pygments.lexers.get_lexer_by_name('Solidity')
    tokens = list(pygments.lex(contract, lexer))
    functionList = []
    i = 0
    while i < len(tokens):
        if str(tokens[i][0]) == "Token.Keyword.Type" and tokens[i][1] == 'function':
            currFunctionNum = len(functionList)
            functionList.append([tokens[i]])
            i += 1
            
            oneLiner = False
            while tokens[i][1] != '{':
                # the one-liner case
                if str(tokens[i][1]) == ';':
                    i += 1
                    oneLiner = True
                    break
                if str(tokens[i][0]) != 'Token.Text.Whitespace' and str(tokens[i][0]) != 'Token.Comment.Single':
                    functionList[currFunctionNum].append(tokens[i])
                i += 1
                
            functionList[currFunctionNum].append(tokens[i])
            openBracketsCount = 1
            i += 1
            
            while openBracketsCount > 0:
                if(tokens[i][1] == '{'): openBracketsCount += 1
                elif(tokens[i][1] == '}'): openBracketsCount -= 1
                
                if str(tokens[i][0]) != 'Token.Text.Whitespace' and str(tokens[i][0]) != 'Token.Comment.Single':
                    functionList[currFunctionNum].append(tokens[i])
                i += 1
        else:
            i += 1
    return functionList

In [10]:
# processing pipeline:
# - receives contract
# - extracts list of global variable names
# - extracts list of modifier names
# - extracts list of function source code
# - creates tokenized list of functions
# - vectorizes tokenized function list by word2vec vectors

def tokenizeContractFunctions(contract):
    globalVariablesList = getGlobalVariables(contract)
    modifierNameList = getModifiers(contract)
    functionList = getFunctions(contract)
    
    tokenizedFunctionList = []
    for function in functionList:
        tokenizedFunction = tokenize(function, globalVariablesList, modifierNameList)
        tokenizedFunctionList.append(tokenizedFunction)
    
    return tokenizedFunctionList

### Create DataFrame

In [11]:
def createTuples(tokenizedFunctions, labels):
    tuples = []
    for i in range(len(tokenizedFunctions)):
        
        tuples.append([tokenizedFunctions[i], labels[i]])
    return tuples

In [12]:
def createDataFrame(contracts, contractLabels):
    df = pd.DataFrame(columns=['tokenized_function', 'label'])
    for i in range(len(contracts)):
        tokenizedFunctions = tokenizeContractFunctions(contracts[i])
        tuples = createTuples(tokenizedFunctions, contractLabels[i])
        appendable = pd.DataFrame(tuples, columns=['tokenized_function', 'label'])
        df = df.append(appendable, ignore_index=True)
    
    return df

In [14]:
df = createDataFrame(reentrancyContracts.contracts, reentrancyContractLabels.labels)
df

Unnamed: 0,tokenized_function,label
0,"[function, Token.FunctionName, (, uint, Token....",0
1,"[function, Token.FunctionName, (, address, Tok...",0
2,"[function, Token.FunctionName, (, ), public, {...",0
3,"[function, Token.FunctionName, (, ), public, p...",0
4,"[function, Token.FunctionName, (, uint, Token....",1
...,...,...
204,"[function, Token.FunctionName, (, ), public, {...",1
205,"[function, Token.FunctionName, (, address, Tok...",0
206,"[function, Token.FunctionName, (, ), public, {...",1
207,"[function, Token.FunctionName, (, address, Tok...",0


### Create Embeddings

In [15]:
def get_w2v_mapping(tokenizedFunctionsDf, token_dim):
#     tokenPool = []
#     for tokenList in tokenizedFunctionsDf:
#         tokenPool.append(tokenList)
    w2v = Word2Vec(tokenizedFunctionsDf, min_count=1, size=token_dim, workers=3, window=3, sg=1)
    return w2v

In [16]:
# create set of vectorized representations of functions, where each row is a w2v token vector
def vectorize_functions(tokenizedFunctionsDf, w2v_mapping):
    embedding_list = []
    for i in range(len(tokenizedFunctionsDf)):
        embedding = []
        for token in tokenizedFunctionsDf[i]:
            embedding.append(w2v_mapping[token].tolist())
        
        embedding_list.append(embedding)
    return embedding_list

In [17]:
# given list of vectorized functions, return padded version
def pad_embeddings(vectorizedFunctions, max_length, token_dim):
    paddedVectorizedFunctions = []
    for vectorizedFunction in vectorizedFunctions:
        zero_padding_cnt = max_length - len(vectorizedFunction)
        pad = [0] * token_dim
        for i in range(zero_padding_cnt):
            vectorizedFunction.append(pad)
        paddedVectorizedFunctions.append(vectorizedFunction)
    return np.asarray(paddedVectorizedFunctions)

In [18]:
def getFunctionEmbeddings(tokenizedFunctionsDf, max_length, token_dim):
    w2v_mapping = get_w2v_mapping(tokenizedFunctionsDf, token_dim)
    vectorized_fns = vectorize_functions(tokenizedFunctionsDf, w2v_mapping)
    padded_embeddings = pad_embeddings(vectorized_fns, max_length, token_dim)
    return vectorized_fns

In [21]:
fn_embeddings = getFunctionEmbeddings(df['tokenized_function'], 100, 15)
w2v = get_w2v_mapping(df['tokenized_function'], 15)
fn_embeddings[0:5]

[[[-0.4206872880458832,
   0.444353848695755,
   -0.331988662481308,
   0.11048563569784164,
   -0.04064146429300308,
   -0.13379834592342377,
   -0.29912593960762024,
   -0.5149583220481873,
   -0.17833474278450012,
   -0.6847097277641296,
   -0.026206066831946373,
   0.1972750872373581,
   0.13808809220790863,
   0.027201293036341667,
   0.30962157249450684],
  [-0.43766894936561584,
   0.42388445138931274,
   -0.3312615752220154,
   0.17895236611366272,
   0.03743946552276611,
   -0.14181773364543915,
   -0.3394396901130676,
   -0.5583913326263428,
   -0.08960331231355667,
   -0.6934710144996643,
   -0.03964458033442497,
   0.25104957818984985,
   0.13548673689365387,
   -0.04740312695503235,
   0.356416791677475],
  [-0.3248598277568817,
   0.3399144411087036,
   -0.23301918804645538,
   0.049657803028821945,
   -0.09947321563959122,
   -0.16118863224983215,
   -0.24740120768547058,
   -0.5462103486061096,
   -0.24856729805469513,
   -0.5821080207824707,
   0.08783027529716492,
   

# Model

In [46]:
import warnings
warnings.simplefilter('ignore')

from keras.models import Sequential
from keras import layers

In [22]:
X_train = np.array(getFunctionEmbeddings(df['tokenized_function'], 100, 15))
y_train = df['label'].values

In [23]:
X_train.shape

(209,)

In [24]:
y_train.shape

(209,)

In [54]:
vocab_size = len(w2v.wv.vocab)
embedding_dim = 15
embeddings_per_example = 100

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=embeddings_per_example))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(X_train, y_train,
                    epochs=10)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


ValueError: Error when checking input: expected embedding_1_input to have shape (100,) but got array with shape (1,)

In [217]:
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [218]:
print("Training...")
model.fit(train_X, train_Y, epochs=50)

Training...


ValueError: Error when checking input: expected embedding_8_input to have shape (100,) but got array with shape (1,)