In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz
import string
from sklearn.model_selection import KFold
from decimal import *
from sklearn.metrics import roc_auc_score


## Cleaning the Data

In [6]:
# transactions.csv and transactions2.csv were gathered from the same experiment;
# transactions2.csv comes from the run whose generated transactions had more randomness.

df = pd.read_csv('data/transactions2.csv')



In [7]:
# The following helper functions convert numeric columns into categorical ones, should such a transformation be needed.

def gas_bining(org_df):
    """Return a copy of ``org_df`` whose 'gas_used' column is bucketed into letters.

    The column is cut into 19 right-closed intervals of width 100000 that
    span (0, 1900000]; the intervals are labelled 'A' through 'S' in
    ascending order.  Values that fall outside that range become NaN
    (standard ``pd.cut`` behaviour).  The input frame is not modified.
    """
    bin_width = 100000
    edges = [step * bin_width for step in range(20)]
    letters = list(string.ascii_uppercase[:19])

    binned = org_df.copy()
    binned['gas_used'] = pd.cut(binned['gas_used'], bins=edges, labels=letters)
    return binned


def _balance_sign_label(value):
    """Map a balance delta to 'zero', 'positive' or 'negative' by the sign of ``int(value)``."""
    number = int(value)
    if number == 0:
        return 'zero'
    return 'positive' if number > 0 else 'negative'


def transform_balance_delta(org_df):
    """Return a copy of ``org_df`` with the balance-delta columns turned into sign labels.

    Both 'victim_balance_delta' and 'attacker_balance_delta' are replaced by
    one of the strings 'zero', 'positive' or 'negative', decided by the sign
    of ``int(value)`` (so fractional values are truncated towards zero
    before the comparison).

    The conversion is applied per column and does not depend on the frame's
    index, so frames with a non-default index (for example a slice of a
    larger frame) are handled correctly.  Missing values are left untouched
    instead of raising.  The input frame is not modified.

    Raises:
        KeyError: if either balance-delta column is absent.
        ValueError: if a non-missing value cannot be converted with ``int``.
    """
    df = org_df.copy()

    for column in ('victim_balance_delta', 'attacker_balance_delta'):
        # Replace the whole column at once: this avoids writing strings
        # cell-by-cell into a numeric column.
        df[column] = df[column].map(_balance_sign_label, na_action='ignore')

    return df


def call_stack_depth_bining(org_df):
    """Return a copy of ``org_df`` whose 'call_stack_depth' column is bucketed into letters.

    The column is cut into 19 right-closed unit intervals (0, 1], (1, 2],
    ..., (18, 19], labelled 'A' through 'S' in ascending order.  Values
    outside (0, 19] become NaN (standard ``pd.cut`` behaviour).  The input
    frame is not modified.
    """
    edges = list(range(20))
    letters = list(string.ascii_uppercase[:19])

    binned = org_df.copy()
    binned['call_stack_depth'] = pd.cut(binned['call_stack_depth'], bins=edges, labels=letters)
    return binned



# Run the three categorical transformations one after another on the loaded data.
new_df = (
    df
    .pipe(gas_bining)
    .pipe(transform_balance_delta)
    .pipe(call_stack_depth_bining)
)




In [8]:

class RandomForest:
    """Bagged ensemble of scikit-learn decision trees.

    ``fit`` trains ``no_trees`` ``DecisionTreeClassifier(max_features='log2')``
    instances, each on a bootstrap sample (rows drawn with replacement) of
    the training frame.  ``predict`` averages the per-tree class
    probabilities.

    The output of ``predict`` assumes a two-class problem: the first class
    in sorted order is reported in the 'safe' column and the second in the
    'vul' column.
    """

    def __init__(self):
        # Placeholders for preprocessing state; nothing in this class
        # populates them at the moment.
        self.column_filter = None
        self.imputation = None
        self.one_hot = None

        # Sorted array of the class labels seen by fit().
        self.labels = None
        # List of fitted DecisionTreeClassifier objects, one per tree.
        self.model = None

        # HELPER
        self.no_trees = None

        # DEBUG
        self.tree_predictions_list = None

    def fit(self, df, no_trees=100):
        """Train the forest on ``df``.

        Args:
            df: training DataFrame; must contain a 'label' column, every
                other column is used as a feature.  It is not modified.
            no_trees: number of trees to grow (default 100).

        Raises:
            KeyError: if ``df`` has no 'label' column.
        """
        self.no_trees = no_trees

        # initializing our random forest as a list of all our generated trees:
        self.model = list()

        features = df.drop(columns=['label'])
        x_all = features.values
        y_all = df['label'].values
        n_rows = x_all.shape[0]

        # Remember the full class set: an individual bootstrap sample may
        # miss a class, and predict() needs to line the trees up again.
        self.labels = np.unique(y_all)

        for trees_i in range(0, no_trees):
            # Bootstrap sample: n_rows row positions drawn with replacement.
            rows = np.random.choice(n_rows, n_rows, replace=True)

            dt = DecisionTreeClassifier(max_features='log2')
            dt.fit(x_all[rows], y_all[rows])
            self.model.append(dt)

            print('tree # {} is built,'.format(trees_i))

            # To inspect an individual tree, export it with
            # sklearn.tree.export_graphviz(dt, feature_names=list(features.columns)).

    def predict(self, df):
        """Return the averaged class probabilities for every row of ``df``.

        Args:
            df: DataFrame holding the feature columns only (no 'label'),
                in the same column order that was used for ``fit``.

        Returns:
            DataFrame with a default integer index and the columns 'safe'
            and 'vul'.  Each cell is a ``Decimal``: the sum of the
            per-tree probabilities divided by the number of trees.

        Raises:
            RuntimeError: if the forest holds no trees.
            IndexError: if fewer than two classes are known.
        """
        if not self.model:
            raise RuntimeError('RandomForest.predict() called before fit().')

        test_x = df.values

        if self.labels is not None:
            labels = np.asarray(self.labels)
        else:
            # Trees were supplied without fit(): use every class any tree knows.
            labels = np.unique(np.concatenate([np.asarray(dt.classes_) for dt in self.model]))

        # let's add all predictions from all trees to a single list so that we can use it later!
        # each index in this list denotes the prediction of a tree
        tree_predictions_list = list()

        # Running per-class probability totals, one row per test instance.
        totals = np.zeros((len(test_x), len(labels)))

        for dt in self.model:
            tree_predictions = dt.predict_proba(test_x)
            tree_predictions_list.append(tree_predictions)

            # A tree only reports the classes present in its own bootstrap
            # sample; place its columns at the matching global positions.
            columns = np.searchsorted(labels, dt.classes_)
            totals[:, columns] += tree_predictions

        self.tree_predictions_list = tree_predictions_list

        # averaging for each instance:
        n_trees = Decimal(len(self.model))
        return pd.DataFrame({
            'safe': [Decimal(total) / n_trees for total in totals[:, 0].tolist()],
            'vul': [Decimal(total) / n_trees for total in totals[:, 1].tolist()],
        }, columns=['safe', 'vul'])






def accuracy(original_df, correctlabels):
    """Return the fraction of rows whose most probable label is the correct one.

    Accuracy is ``number of correctly classified instances / total number
    of instances``.  The predicted label of a row is the column holding the
    row's highest probability; on ties the first such column (in column
    order) wins.  A row without any comparable maximum (for example all
    NaN) has no predicted label and counts as incorrect.

    Rows and labels are matched by position, so the index of
    ``original_df`` is irrelevant.

    Args:
        original_df: DataFrame of per-class probabilities, one column per
            label and one row per instance.  It is not modified.
        correctlabels: sequence of true labels, in the same order as the
            rows of ``original_df``.

    Returns:
        The accuracy as a float; 0.0 when ``correctlabels`` is empty.

    Raises:
        IndexError: if ``original_df`` has more rows than there are labels.
    """
    labels = list(correctlabels)
    if len(labels) == 0:
        return 0.0

    columns = list(original_df.columns)
    correct_count = 0

    for position, (_, row) in enumerate(original_df.iterrows()):
        max_prob = row.max()
        # first column (in order) that reaches the row maximum, if any
        predicted = next((col for col in columns if row[col] == max_prob), None)
        if labels[position] == predicted:
            correct_count += 1

    return correct_count / len(labels)

    

In [16]:
# 5-fold cross-validation of the random forest on the loaded transactions.

# Fixed seed so that the fold assignment and every random draw made through
# NumPy's global generator are repeatable from a fresh kernel.
SEED = 42
np.random.seed(SEED)

# Columns removed before training: the transaction hash and the unnamed
# column pandas created while reading the CSV (presumably a saved index).
NON_FEATURE_COLUMNS = ['tx_hash', 'Unnamed: 0']

rf = RandomForest()

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
accuracies = list()

for train_indices, test_indices in kf.split(df):
    print("TRAIN:", train_indices, "\nTEST:", test_indices)

    # KFold yields row positions, so select with iloc (position-based)
    # rather than loc (label-based).
    train_df = df.iloc[train_indices].drop(columns=NON_FEATURE_COLUMNS)
    test_df = df.iloc[test_indices].drop(columns=NON_FEATURE_COLUMNS)

    # Keep the true labels aside; the model only sees the feature columns.
    labels = list(test_df['label'])
    test_df = test_df.drop(columns=['label'])

    rf.fit(train_df, no_trees=10)
    predictions = rf.predict(test_df)

    ac = accuracy(predictions, labels)
    accuracies.append(ac)

    print("\nAccuracy of prediction is: {}".format(ac))
    print("========================================================")
    print("========================================================")



print("*********************************************************")
print("\nAverage accuracy: {}".format(sum(accuracies) / len(accuracies)))


TRAIN: [  1   2   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19
  20  21  22  23  24  25  26  28  29  30  31  32  33  34  36  37  38  39
  41  42  43  44  46  47  48  49  50  51  52  53  55  56  57  60  61  62
  64  65  67  68  69  70  72  73  74  76  77  78  79  80  81  84  85  88
  89  90  91  92  95  96  97  98  99 101 102 103 105 106 107 109 110 111
 112 113 114 116 118 121 122 124 125 126 127 129 130 132 133 134 135 137
 140 141 143 144 145 147 149 151 154 155 157 159 161 162 164 165 166 167
 168 169 170 171 172 173 174 175 176 177 178 181 182 183 185 186 187 188
 189 190 191 192 193 194 195 196 197 198 199 201 204 205 206 208 209 210
 211 212 213 214 215 216 217 218 221 222 223 224 225 226 228 229 230 233
 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 251 252
 254 255 257 258 261 263 264 266 268 269 270 271 272 273 274 277 279 280
 281 282 283 284 285 286 287 288 289 290 293 295 296 297 298 300 301 302
 303 304 305 306 307 308 309 310 311 312 313