# Preprocessing

In [316]:
import pygments.lexers
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

import warnings
warnings.simplefilter('ignore')

import sys
sys.path.append('./reentrancyContracts')
sys.path.append('./reentrancyContractLabels')
import reentrancyContracts
import reentrancyContractLabels

### Extract Global Variables

In [317]:
# takes in full contract and pull out names of global variables (with reasonable accuracy)
def getGlobalVariables(contract):
    lexer = pygments.lexers.get_lexer_by_name('Solidity')
    tokens = list(pygments.lex(contract, lexer))
    globalVarList = []
    openBracketsCount = 0
    openParenthesisCount = 0
    i = 0
    while i < len(tokens):
        if str(tokens[i][0]) == 'Token.Text.Whitespace' or str(tokens[i][0]) == 'Token.Comment.Single': None
        elif(tokens[i][1] == '{'): openBracketsCount += 1
        elif(tokens[i][1] == '}'): openBracketsCount -= 1
        elif(tokens[i][1] == '('): openParenthesisCount += 1
        elif(tokens[i][1] == ')'): openParenthesisCount -= 1
        elif(str(tokens[i][0]) == 'Token.Keyword.Type' and isType(tokens[i][1]) and openBracketsCount == 1 and openParenthesisCount == 0):
            index = 0
            potentialGlobal = ""
            include = True
            while True:
                if(tokens[i+index][1] == "constant"): 
                    include = False
                if(tokens[i+index][1] == ';'):
                    if include:
                        globalVarList.append(potentialGlobal)
                    break
                if(tokens[i+index][0] == pygments.token.Name.Variable or tokens[i+index][0] == pygments.token.Text):
                    potentialGlobal = tokens[i+index][1]
                    
                index += 1
            i += (index-1)
        i += 1
        
    return globalVarList

### Extract Modifiers

In [318]:
def getModifiers(contract):
    lexer = pygments.lexers.get_lexer_by_name('Solidity')
    tokens = list(pygments.lex(contract, lexer))
    filteredTokens = []
    for token in tokens:
        if str(token[0]) != 'Token.Text.Whitespace':
            filteredTokens.append(token)
            
    modifierNameList = []
    openBracketsCount = 0
    openParenthesisCount = 0
    i = 0
    while i < len(filteredTokens):
        if(str(filteredTokens[i][0]) == 'Token.Keyword.Type' and str(filteredTokens[i][1]) == 'modifier'):
            modifierNameList.append(filteredTokens[i+1][1])
        
        i += 1
        
    return modifierNameList

In [319]:
getModifiers(reentrancyContracts.contracts[22])

['onlyOwner', 'isOpenToPublic', 'onlyRealPeople', 'onlyPlayers']

### Tokenize Functions

In [320]:
# tokenize a given function
def tokenize(parsedFunction, globalVariablesList, modifierNameList):
    txKeywords = ['call', 'value', 'transfer', 'send']
    tokenList = []
    for tup in parsedFunction:
        if tokenList and tokenList[-1] == "function":
            tokenList.append("Token.FunctionName")
        elif str(tup[1]) in txKeywords:
            tokenList.append(tup[1])
        elif 'Token.Literal' in str(tup[0]):
            tokenList.append("Token.Constant")
        elif tup[1] in globalVariablesList:
            tokenList.append("Token.GlobalVariable")
        elif tup[1] in modifierNameList:
            tokenList.append("Token.ModifierName")
        elif tup[1].startswith("\""):
            tokenList.append("Token.String")
        elif str(tup[0]) == 'Token.Name.Variable' or str(tup[0]) == 'Token.Text':
            tokenList.append("Token.LocalVariable")
        elif str(tup[0]) != 'Token.Text.Whitespace' and str(tup[0]) != 'Token.Comment.Single':
            tokenList.append(tup[1])
    return tokenList

In [321]:
# checks if keyword is for variable declaration (helper for global variable parser)
def isType(word):
    keywords = ['address', 'bool', 'byte', 'bytes', 'int', 'string', 'uint', 'mapping']
    for kw in keywords:
        if kw in word:
            return True
    return False

In [322]:
def getFunctions(contract):
    lexer = pygments.lexers.get_lexer_by_name('Solidity')
    tokens = list(pygments.lex(contract, lexer))
    functionList = []
    i = 0
    while i < len(tokens):
        if str(tokens[i][0]) == "Token.Keyword.Type" and tokens[i][1] == 'function':
            currFunctionNum = len(functionList)
            functionList.append([tokens[i]])
            i += 1
            
            oneLiner = False
            while tokens[i][1] != '{':
                # the one-liner case
                if str(tokens[i][1]) == ';':
                    i += 1
                    oneLiner = True
                    break
                if str(tokens[i][0]) != 'Token.Text.Whitespace' and str(tokens[i][0]) != 'Token.Comment.Single':
                    functionList[currFunctionNum].append(tokens[i])
                i += 1
                
            functionList[currFunctionNum].append(tokens[i])
            openBracketsCount = 1
            i += 1
            
            while openBracketsCount > 0:
                if(tokens[i][1] == '{'): openBracketsCount += 1
                elif(tokens[i][1] == '}'): openBracketsCount -= 1
                
                if str(tokens[i][0]) != 'Token.Text.Whitespace' and str(tokens[i][0]) != 'Token.Comment.Single':
                    functionList[currFunctionNum].append(tokens[i])
                i += 1
        else:
            i += 1
    return functionList

In [323]:
# processing pipeline:
# - receives contract
# - extracts list of global variable names
# - extracts list of modifier names
# - extracts list of function source code
# - creates tokenized list of functions
# - vectorizes tokenized function list by word2vec vectors

def tokenizeContractFunctions(contract):
    globalVariablesList = getGlobalVariables(contract)
    modifierNameList = getModifiers(contract)
    functionList = getFunctions(contract)
    
    tokenizedFunctionList = []
    for function in functionList:
        tokenizedFunction = tokenize(function, globalVariablesList, modifierNameList)
        tokenizedFunctionList.append(tokenizedFunction)
    
    return tokenizedFunctionList

### Create DataFrame

In [324]:
def createTuples(tokenizedFunctions, labels):
    tuples = []
    for i in range(len(tokenizedFunctions)):
        
        tuples.append([tokenizedFunctions[i], labels[i]])
    return tuples

In [325]:
def returnData(contracts, contractLabels):
    functions = []
    labels = []
    for i in range(len(contracts)):
        tokenizedFunctions = tokenizeContractFunctions(contracts[i])
        functions.extend(tokenizedFunctions)
        labels.extend(contractLabels[i])
    
    return [functions, labels]

In [326]:
data = returnData(reentrancyContracts.contracts, reentrancyContractLabels.labels)

In [327]:
def createDataFrame(contracts, contractLabels):
    df = pd.DataFrame(columns=['tokenized_function', 'label'])
    for i in range(len(contracts)):
        tokenizedFunctions = tokenizeContractFunctions(contracts[i])
        tuples = createTuples(tokenizedFunctions, contractLabels[i])
        appendable = pd.DataFrame(tuples, columns=['tokenized_function', 'label'])
        df = df.append(appendable, ignore_index=True)
    
    return df

In [328]:
df = createDataFrame(reentrancyContracts.contracts, reentrancyContractLabels.labels)

### Create Embeddings

In [329]:
def get_w2v_mapping(tokenizedFunctionsDf, token_dim):
    w2v = Word2Vec(tokenizedFunctionsDf, min_count=1, size=token_dim, workers=3, window=3, sg=1)
    return w2v

In [330]:
# create set of vectorized representations of functions, where each row is a w2v token vector
def vectorize_functions(tokenizedFunctionsDf, w2v_mapping):
    embedding_list = []
    for i in range(len(tokenizedFunctionsDf)):
        embedding = []
        for token in tokenizedFunctionsDf[i]:
            embedding.append(w2v_mapping[token])
        
        embedding_list.append(embedding)
    return embedding_list

In [346]:
# given list of vectorized functions, return padded version
def pad_embeddings(vectorizedFunctions, max_length, token_dim):
    paddedVectorizedFunctions = []
    for vectorizedFunction in vectorizedFunctions:
        vectorizedFunction = np.array(vectorizedFunction, dtype=np.float64)
        zero_padding_cnt = max_length - len(vectorizedFunction)
        pad = np.zeros((1, token_dim), dtype=np.float64)
        for i in range(zero_padding_cnt):
            vectorizedFunction = np.concatenate((vectorizedFunction, pad), axis=0)
        
        paddedVectorizedFunctions.append(vectorizedFunction)
    return paddedVectorizedFunctions

In [347]:
def getFunctionEmbeddings(tokenizedFunctionsDf, max_length, token_dim):
    w2v_mapping = get_w2v_mapping(tokenizedFunctionsDf, token_dim)
    vectorized_fns = vectorize_functions(tokenizedFunctionsDf, w2v_mapping)
#     print("vect fns: ", vectorized_fns.dtype)
    padded_embeddings = pad_embeddings(vectorized_fns, max_length, token_dim)
#     print("padded fns: ", padded_embeddings.dtype)
    return [w2v_mapping, padded_embeddings]

In [354]:
length = 100
token_dim = 15

[functions, labels] = returnData(reentrancyContracts.contracts, reentrancyContractLabels.labels)
[w2v_mapping, fn_embeddings] = getFunctionEmbeddings(functions, length, token_dim)
fn_embeddings = np.array(fn_embeddings)
fn_embeddings[0].shape

(100, 15)

# Model

In [164]:
import warnings
warnings.simplefilter('ignore')

import tensorflow as tf
from keras.models import Sequential
from keras import layers, Input
from keras.layers import Dense, LSTM

In [155]:
[w2v_mapping, X_train] = getFunctionEmbeddings(df['tokenized_function'], 100, 15)
X_train = np.array(X_train)
y_train = df['label'].values

vect fns:  object
padded fns:  object


In [156]:
X_train.shape

(209,)

In [166]:
X_train[0].shape

(100, 15)

In [158]:
y_train.shape

(209,)

In [162]:
model = Sequential()
model.add(LSTM(128))
model.add(Dense(2, activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

print("Training...")
model.fit(X_train, y_train, epochs=50)

Training...


ValueError: Input 0 is incompatible with layer lstm_4: expected ndim=3, found ndim=1

In [163]:
vocab_size = len(w2v_mapping.wv.vocab)
embedding_dim = 15
embeddings_per_example = 100

inp = Input(shape=(100,15))

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=embeddings_per_example))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(X_train, y_train,
                    epochs=10)

ValueError: Error when checking input: expected embedding_9_input to have shape (100,) but got array with shape (1,)

In [39]:
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [218]:
print("Training...")
model.fit(train_X, train_Y, epochs=50)

Training...


ValueError: Error when checking input: expected embedding_8_input to have shape (100,) but got array with shape (1,)