## Preprocessing

In [1]:
import pygments.lexers
import numpy as np
from gensim.models import Word2Vec

import sys
sys.path.append('./reentrancy')
import reentrancy

In [2]:
# tokenize a given function
def tokenize(lexer, fn):
    tokens = list(pygments.lex(fn, lexer))
    tokenList = []
    for token in tokens:
        if tokenList and tokenList[-1] == "function":
            tokenList.append("Token.FunctionName")
        elif token[0] == pygments.token.Name.Variable or token[1].startswith("_"):
            tokenList.append("Token.Variable")
        elif token[1].startswith("\""):
            tokenList.append("Token.String")
        elif str(token[0]) != 'Token.Text.Whitespace' and str(token[0]) != 'Token.Comment.Single':
            tokenList.append(token[1])
    return tokenList

In [3]:
# return list of token lists given list of functions
def getTokenList(functionList):
    lexer = pygments.lexers.get_lexer_by_name('Solidity')
    globalList = []
    for fn in functionList:
        globalList.append(tokenize(lexer, fn))
    
    return globalList

In [4]:
# given tokenized functions (or any tokenized dataset of Solidity code), return w2v mapping
def get_w2v_mapping(tokenized_fns, token_dim):
    w2v = Word2Vec(tokenized_fns, min_count=1, size=token_dim, workers=3, window=3, sg=1)
    return w2v

In [5]:
# create set of vectorized representations of functions, where each row is a w2v token vector
def vectorize_functions(tokenized_fns, w2v_mapping):
    embedding_list = []
    for i in range(len(tokenized_fns)):
        embedding = []
        for token in tokenized_fns[i]:
            embedding.append(w2v_mapping[token])

        embedding_list.append(embedding)
    
    return embedding_list

In [6]:
# given list of vectorized functions, return padded version
def pad_embeddings(embeddings, max_length, token_dim):
    padded_embeddings = []
    for embedding in embeddings:
        zero_padding_cnt = max_length - len(embedding)
        pad = np.zeros((1, token_dim))
        for i in range(zero_padding_cnt):
            embedding = np.concatenate((embedding, pad), axis=0)
        padded_embeddings.append(embedding)
    return padded_embeddings

In [12]:
def preprocess(function_list, max_length, token_dim):
    tokenized_fns = getTokenList(function_list)
    print("INPUT: ", tokenized_fns)
    print("Tokenized Functions: ", tokenized_fns)
    w2v_mapping = get_w2v_mapping(tokenized_fns, token_dim)
    vectorized_fns = vectorize_functions(tokenized_fns, w2v_mapping)
    print("Vectorized Functions: ", vectorized_fns)
    padded_embeddings = pad_embeddings(vectorized_fns, max_length, token_dim)
    print("Padded Vectorizations: ", padded_embeddings)
    return padded_embeddings

## Model

In [8]:
import warnings
warnings.simplefilter('ignore')

import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential

In [9]:
train_X_list = [
    # 0x01f8c4e3fa3edeb29e514cba738d87ce8c091d3f
    """
        function Collect(uint _am)
        public
        payable
        {
            if(balances[msg.sender]>=MinSum && balances[msg.sender]>=_am)
            {
                // <yes> <report> REENTRANCY
                if(msg.sender.call.value(_am)())
                {
                    balances[msg.sender]-=_am;
                    Log.AddMessage(msg.sender,_am,"Collect");
                }
            }
        }
    """,
    # 0x23a91059fdc9579a9fbd0edc5f2ea0bfdb70deb4
    """
        function CashOut(uint _am)
        {
            if(_am<=balances[msg.sender])
            {            
                // <yes> <report> REENTRANCY
                if(msg.sender.call.value(_am)())
                {
                    balances[msg.sender]-=_am;
                    TransferLog.AddMessage(msg.sender,_am,"CashOut");
                }
            }
        }
    """,
    # 0x627fa62ccbb1c1b04ffaecd72a53e37fc0e17839
    """
        function WithdrawToHolder(address _addr, uint _wei) 
        public
        onlyOwner
        payable
        {
            if(Holders[_addr]>0)
            {
                // <yes> <report> REENTRANCY
                if(_addr.call.value(_wei)())
                {
                    Holders[_addr]-=_wei;
                }
            }
        }
    """,
    """
        function payCharity() payable public {
          uint256 ethToPay = SafeMath.sub(totalEthCharityCollected, totalEthCharityRecieved);
          require(ethToPay > 1);
          totalEthCharityRecieved = SafeMath.add(totalEthCharityRecieved, ethToPay);
          if(!giveEthCharityAddress.call.value(ethToPay).gas(400000)()) {
             totalEthCharityRecieved = SafeMath.sub(totalEthCharityRecieved, ethToPay);
          }
        }
    """,
    """
        function withdraw()
            onlyStronghands()
            public
        {

            address _customerAddress = msg.sender;
            uint256 _dividends = myDividends(false);  


            payoutsTo_[_customerAddress] +=  (int256) (_dividends * magnitude);


            _dividends += referralBalance_[_customerAddress];
            referralBalance_[_customerAddress] = 0;


            _customerAddress.transfer(_dividends);


            onWithdraw(_customerAddress, _dividends);
        }
    """
]

train_Y = np.array([
    [1,0],
    [1,0],
    [1,0],
    [0,1],
    [0,1]
])

In [10]:
test = ["""
    function payCharity() payable public {
          uint256 ethToPay = SafeMath.sub(totalEthCharityCollected, totalEthCharityRecieved);
          require(ethToPay > 1);
          totalEthCharityRecieved = SafeMath.add(totalEthCharityRecieved, ethToPay);
          if(!giveEthCharityAddress.call.value(ethToPay).gas(400000)()) {
             totalEthCharityRecieved = SafeMath.sub(totalEthCharityRecieved, ethToPay);
          }
        }
"""]

In [13]:
max_length = 100
token_dim = 15

train_X = preprocess(train_X_list, max_length, token_dim)
train_X = np.array(train_X)

INPUT:  [['function', 'Token.FunctionName', 'Token.Variable', '(', 'uint', 'Token.Variable', ')', 'public', 'payable', '{', 'if', '(', 'balances', '[', 'msg.sender', ']', '>', '=', 'MinSum', '&', '&', 'balances', '[', 'msg.sender', ']', '>', '=', 'Token.Variable', ')', '{', 'if', '(', 'msg.sender', '.', 'call', '.', 'value', '(', 'Token.Variable', ')', '(', ')', ')', '{', 'balances', '[', 'msg.sender', ']', '-', '=', 'Token.Variable', ';', 'Log', '.', 'AddMessage', '(', 'msg.sender', ',', 'Token.Variable', ',', 'Token.String', ')', ';', '}', '}', '}'], ['function', 'Token.FunctionName', 'Token.Variable', '(', 'uint', 'Token.Variable', ')', '{', 'if', '(', 'Token.Variable', '<', '=', 'balances', '[', 'msg.sender', ']', ')', '{', 'if', '(', 'msg.sender', '.', 'call', '.', 'value', '(', 'Token.Variable', ')', '(', ')', ')', '{', 'balances', '[', 'msg.sender', ']', '-', '=', 'Token.Variable', ';', 'TransferLog', '.', 'AddMessage', '(', 'msg.sender', ',', 'Token.Variable', ',', 'Token.Strin

(5, 100, 15)

In [161]:
train_X.shape

(5, 100, 15)

In [165]:
model = Sequential()
model.add(LSTM(128))
model.add(Dense(2, activation='softmax'))

In [166]:
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [167]:
print("Training...")
model.fit(train_X, train_Y, epochs=50)

Training...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fa1a8f25ed0>