#### Set up the environment - packages, models and dataset

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV#, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, fbeta_score, make_scorer, confusion_matrix

import xgboost as xgb
import pickle

from time import time

In [34]:
pd.set_option("display.max_columns", None)
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.utils import warnings as skwarn
skwarn.filterwarnings('ignore')

In [2]:
df = pd.read_csv("features_clean.csv").iloc[:,1:]

In [3]:
df.head()

Unnamed: 0,flag,avg min between sent tnx,avg min between received tnx,time diff between first and last (mins),sent tnx,received tnx,number of created contracts,unique received from addresses,unique sent to addresses,min value received,max value received,avg val received,min val sent,max val sent,avg val sent,total tnx,total ether sent,total ether received,total ether balance,total erc20 tnxs,erc20 total ether received,erc20 total ether sent,erc20 total ether sent contract,erc20 uniq sent addr,erc20 uniq rec addr,erc20 uniq sent contract addr,erc20 uniq rec contract addr,erc20 min val rec,erc20 max val rec,erc20 avg val rec,erc20 min val sent,erc20 max val sent,erc20 avg val sent,erc20 uniq sent token name,erc20 uniq rec token name,erc20 most sent token type,erc20 most rec token type,sent rec balance
0,0,844.26,1093.71,704785.63,721,89,0,40,118,0.0,45.806785,6.589513,0.0,31.22,1.200681,810,865.691093,586.466675,-279.224419,265.0,35588540.0,35603170.0,0.0,30.0,54.0,0.0,58.0,0.0,15000000.0,265586.1476,0.0,16831000.0,271779.92,39.0,57.0,1,1,8.101124
1,0,12709.07,2958.44,1218216.73,94,8,0,5,14,0.0,2.613269,0.385685,0.0,1.8,0.032844,102,3.087297,3.085478,-0.001819,8.0,403.4283,2.260809,0.0,1.0,5.0,0.0,7.0,0.0,365.0,57.632615,2.260809,2.260809,2.260809,1.0,7.0,1,1,11.75
2,0,246194.54,2434.02,516729.3,2,10,0,10,2,0.113119,1.165453,0.358906,0.05,3.538616,1.794308,12,3.588616,3.589057,0.000441,8.0,521.5121,0.0,0.0,0.0,7.0,0.0,8.0,0.0,442.8198,65.189009,0.0,0.0,0.0,0.0,8.0,0,1,0.2
3,0,10219.6,15785.09,397555.9,25,9,0,7,13,0.0,500.0,99.48884,0.0,450.0,70.001834,34,1750.045862,895.399559,-854.646303,14.0,17111.05,11412.23,0.0,2.0,11.0,0.0,11.0,0.0,11412.23,1555.550174,100.0,9029.231,3804.076893,1.0,11.0,1,1,2.777778
4,0,36.61,10707.77,382472.42,4598,20,1,7,19,0.0,12.802411,2.671095,0.0,9.0,0.022688,4619,104.318883,53.421897,-50.896986,42.0,162829.7,123539.9,0.0,4.0,23.0,0.0,27.0,0.0,90000.0,4934.232147,0.0,45000.0,13726.65922,6.0,27.0,1,1,229.9


In [7]:
x, xho, y, yho = train_test_split(df.iloc[:,1:], df.flag, test_size=0.2, random_state=2018)
xtr, xte, ytr, yte = train_test_split(x, y, test_size=0.25, random_state=2019)

In [8]:
dflr = df.fillna(df.median())

In [9]:
xl, xlho, yl, ylho = train_test_split(dflr.iloc[:,1:], dflr.flag, test_size=0.2, random_state=2018)
xltr, xlte, yltr, ylte = train_test_split(xl, yl, test_size=0.25, random_state=2019)

In [10]:
scorers = {
            'fbeta_score':make_scorer(fbeta_score),
            'precision_score': make_scorer(precision_score),
            'recall_score': make_scorer(recall_score),
            'accuracy_score': make_scorer(accuracy_score)
          }

# skf = StratifiedKFold(n_splits=5, shuffle = True)
kf = KFold(n_splits=5, shuffle=True, random_state = 2020)

#### Model 1. kNN

In [14]:
knn = KNeighborsClassifier()

In [19]:
t0 = time()

# k_range = np.arange(3,100,10) # narrowed down from here 
k_range = np.arange(3,24,5)

weight_options = ['uniform', 'distance']
param_grid = dict(n_neighbors=k_range, weights=weight_options)

gridknn = GridSearchCV(knn, param_grid, cv=10, scoring='precision')
gridknn.fit(xl, yl)

knntime = time() - t0
print(time() - t0)

9.654690980911255


In [20]:
print("Best params: ", gridknn.best_params_)
print("Best estimator: ", gridknn.best_estimator_)
print("Best score: ", gridknn.best_score_)

Best params:  {'n_neighbors': 8, 'weights': 'uniform'}
Best estimator:  KNeighborsClassifier(n_neighbors=8)
Best score:  0.8135550961520336


In [21]:
knn_preds = gridknn.predict(xlte)

#### Model 2. Random Forest

In [36]:
rf = RandomForestClassifier(bootstrap=True)
# rfxbs = RandomForestClassifier(bootstrap=False)

In [23]:
t0 = time()

# nest = np.arange(1,302,50)
# nest = np.arange(150,250,25)
nest = np.arange(160,170,2)

param_grid = dict(n_estimators=nest)

gridrf = GridSearchCV(rf, param_grid, cv=10, scoring='precision')
gridrf.fit(xl, yl)

rftime = time() - t0
print(time() - t0)

78.55853199958801


In [24]:
print("Best params: ", gridrf.best_params_)
print("Best estimator: ", gridrf.best_estimator_)
print("Best score: ", gridrf.best_score_)

Best params:  {'n_estimators': 164}
Best estimator:  RandomForestClassifier(n_estimators=164)
Best score:  0.9982523746746441


#### Model 3. Extra Trees

In [81]:
et = ExtraTreesClassifier()

In [26]:
t0 = time()

# nest_et = np.arange(1,302,50) #51
# nest_et = np.arange(1,102,25) $51
nest_et = np.arange(1,26,2)

param_grid = dict(n_estimators=nest_et)

gridet = GridSearchCV(et, param_grid, cv=10, scoring='precision')
gridet.fit(xl, yl)

ettime = time() - t0
print(time() - t0)

10.10012698173523


In [27]:
print("Best params: ", gridet.best_params_)
print("Best estimator: ", gridet.best_estimator_)
print("Best score: ", gridet.best_score_)

Best params:  {'n_estimators': 23}
Best estimator:  ExtraTreesClassifier(n_estimators=23)
Best score:  0.9959434413905367


#### Model 4. XGBoost

In [42]:
gbm = xgb.XGBClassifier(early_stopping_rounds = 10, eval_metric = "logloss", n_estimator = 300, verbosity = 0)

# eval_set=[(xtr, ytr),(xte, yte)]

params = {
    "max_depth" : np.arange(3,8,2),
    "learning_rate" : np.logspace(.001,1),
    "subsample" : np.arange(.4,1,.1),
    "min_child_weight" : np.linspace(3,13,3),
    "colsample_bytree" : np.arange(.7,1,.1),
    "scale_pos_weight" : np.arange(1,2.1,.3),
}

In [54]:
t0 = time()

gridxgb = RandomizedSearchCV(gbm, param_distributions=params, cv=10, scoring='precision')
gridxgb.fit(xl, yl)

xgbtime = time() - t0
print(time() - t0)

47.5305061340332


In [63]:
print("Best params: ", gridxgb.best_params_)
print("Best estimator: ", gridxgb.best_estimator_)
print("Best score: ", gridxgb.best_score_)

Best params:  {'subsample': 0.6, 'scale_pos_weight': 1.3, 'min_child_weight': 13.0, 'max_depth': 3, 'learning_rate': 1.2093472472994864, 'colsample_bytree': 0.7}
Best estimator:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=1.2093472472994864, max_delta_step=0, max_depth=3,
              min_child_weight=13.0, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1.3,
              subsample=0.6, tree_method='exact', validate_parameters=1,
              verbosity=None)
Best score:  0.9793183134435098


In [67]:
print(knntime)
print(rftime)
print(ettime)
print(xgbtime)

9.654668092727661
78.55849385261536
10.100103855133057
47.53047704696655


In [66]:
pickle.dump(gridknn.best_estimator_, open("knn.pickle", "wb"))
pickle.dump(gridrf.best_estimator_, open("rf.pickle", "wb"))
pickle.dump(gridet.best_estimator_, open("et.pickle", "wb"))
pickle.dump(gridxgb.best_estimator_, open("xgb.pickle", "wb"))