In [1]:
import numpy as np 
import pandas as pd 
import sklearn
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [2]:
%reload_ext watermark
%watermark -a "Pedro Marcello"

Author: Pedro Marcello



### Carregando os Dados

In [3]:
df = pd.read_csv('transaction_dataset.csv')

In [4]:
df.shape 

(9841, 51)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Index,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,...,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name,ERC20 most sent token type,ERC20_most_rec_token_type
0,0,1,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,844.26,1093.71,704785.63,721,89,0,...,0.0,16831000.0,271779.92,0.0,0.0,0.0,39.0,57.0,Cofoundit,Numeraire
1,1,2,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,...,2.260809,2.260809,2.260809,0.0,0.0,0.0,1.0,7.0,Livepeer Token,Livepeer Token
2,2,3,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,,XENON
3,3,4,0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e,0,10219.6,15785.09,397555.9,25,9,0,...,100.0,9029.231,3804.076893,0.0,0.0,0.0,1.0,11.0,Raiden,XENON
4,4,5,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,...,0.0,45000.0,13726.65922,0.0,0.0,0.0,6.0,27.0,StatusNetwork,EOS


In [6]:
df.FLAG.value_counts()

FLAG
0    7662
1    2179
Name: count, dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 51 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Unnamed: 0                                            9841 non-null   int64  
 1   Index                                                 9841 non-null   int64  
 2   Address                                               9841 non-null   object 
 3   FLAG                                                  9841 non-null   int64  
 4   Avg min between sent tnx                              9841 non-null   float64
 5   Avg min between received tnx                          9841 non-null   float64
 6   Time Diff between first and last (Mins)               9841 non-null   float64
 7   Sent tnx                                              9841 non-null   int64  
 8   Received Tnx                                          9841

### Engenharia de Atributos

In [8]:
df.columns

Index(['Unnamed: 0', 'Index', 'Address', 'FLAG', 'Avg min between sent tnx',
       'Avg min between received tnx',
       'Time Diff between first and last (Mins)', 'Sent tnx', 'Received Tnx',
       'Number of Created Contracts', 'Unique Received From Addresses',
       'Unique Sent To Addresses', 'min value received', 'max value received ',
       'avg val received', 'min val sent', 'max val sent', 'avg val sent',
       'min value sent to contract', 'max val sent to contract',
       'avg value sent to contract',
       'total transactions (including tnx to create contract',
       'total Ether sent', 'total ether received',
       'total ether sent contracts', 'total ether balance',
       ' Total ERC20 tnxs', ' ERC20 total Ether received',
       ' ERC20 total ether sent', ' ERC20 total Ether sent contract',
       ' ERC20 uniq sent addr', ' ERC20 uniq rec addr',
       ' ERC20 uniq sent addr.1', ' ERC20 uniq rec contract addr',
       ' ERC20 avg time between sent tnx', ' ERC20 

In [9]:
df.columns = [x.lower() for x in df.columns]

In [10]:
df.columns

Index(['unnamed: 0', 'index', 'address', 'flag', 'avg min between sent tnx',
       'avg min between received tnx',
       'time diff between first and last (mins)', 'sent tnx', 'received tnx',
       'number of created contracts', 'unique received from addresses',
       'unique sent to addresses', 'min value received', 'max value received ',
       'avg val received', 'min val sent', 'max val sent', 'avg val sent',
       'min value sent to contract', 'max val sent to contract',
       'avg value sent to contract',
       'total transactions (including tnx to create contract',
       'total ether sent', 'total ether received',
       'total ether sent contracts', 'total ether balance',
       ' total erc20 tnxs', ' erc20 total ether received',
       ' erc20 total ether sent', ' erc20 total ether sent contract',
       ' erc20 uniq sent addr', ' erc20 uniq rec addr',
       ' erc20 uniq sent addr.1', ' erc20 uniq rec contract addr',
       ' erc20 avg time between sent tnx', ' erc20 

In [11]:
cols_to_drop = [' erc20 most sent token type',
                ' erc20_most_rec_token_type',
                'address',
                'index', 
                'unnamed: 0']

In [12]:
atributos = [x for x in df.columns if (x != 'flag' and x not in cols_to_drop)]

In [13]:
atributos

['avg min between sent tnx',
 'avg min between received tnx',
 'time diff between first and last (mins)',
 'sent tnx',
 'received tnx',
 'number of created contracts',
 'unique received from addresses',
 'unique sent to addresses',
 'min value received',
 'max value received ',
 'avg val received',
 'min val sent',
 'max val sent',
 'avg val sent',
 'min value sent to contract',
 'max val sent to contract',
 'avg value sent to contract',
 'total transactions (including tnx to create contract',
 'total ether sent',
 'total ether received',
 'total ether sent contracts',
 'total ether balance',
 ' total erc20 tnxs',
 ' erc20 total ether received',
 ' erc20 total ether sent',
 ' erc20 total ether sent contract',
 ' erc20 uniq sent addr',
 ' erc20 uniq rec addr',
 ' erc20 uniq sent addr.1',
 ' erc20 uniq rec contract addr',
 ' erc20 avg time between sent tnx',
 ' erc20 avg time between rec tnx',
 ' erc20 avg time between rec 2 tnx',
 ' erc20 avg time between contract tnx',
 ' erc20 min val

In [14]:
valores_unicos = df.nunique()

In [15]:
valores_unicos

unnamed: 0                                              9841
index                                                   4729
address                                                 9816
flag                                                       2
avg min between sent tnx                                5013
avg min between received tnx                            6223
time diff between first and last (mins)                 7810
sent tnx                                                 641
received tnx                                             727
number of created contracts                               20
unique received from addresses                           256
unique sent to addresses                                 258
min value received                                      4589
max value received                                      6302
avg val received                                        6767
min val sent                                            4719
max val sent            

In [16]:
atributos = [x for x in atributos if x in valores_unicos.loc[(valores_unicos > 1)]]

In [17]:
df[atributos].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 38 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   avg min between sent tnx                              9841 non-null   float64
 1   avg min between received tnx                          9841 non-null   float64
 2   time diff between first and last (mins)               9841 non-null   float64
 3   sent tnx                                              9841 non-null   int64  
 4   received tnx                                          9841 non-null   int64  
 5   number of created contracts                           9841 non-null   int64  
 6   unique received from addresses                        9841 non-null   int64  
 7   unique sent to addresses                              9841 non-null   int64  
 8   min value received                                    9841

### Criação dos Pipelines

In [18]:
class PipeSteps(BaseEstimator, TransformerMixin):
    def __init__(self, columns=[]):
        self.columns = columns

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        X = X.copy()
        return X

In [19]:
class SelecionaColuna(PipeSteps):
    def transform(self, X):
        X = X.copy()
        return X[self.columns]

In [20]:
class PrencherDados(PipeSteps):
    def fit(self, X, y = None):
        self.means = {col: X[col].mean() for col in self.columns}
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col].fillna(self.means[col])
        return X

In [21]:
class PadronizaDados(PipeSteps):
    def fit(self, X, y = None):
        self.scaler = StandardScaler()
        self.scaler.fit(X[self.columns])
        return self

    def transform(self, X):
        X = X.copy()
        X[self.columns] = self.scaler.transform(X[self.columns])
        return X

In [22]:
pipe_preprocessamento = Pipeline([('feature_selection', SelecionaColuna(atributos)),
                                 ('fill_missing', PrencherDados(atributos)),
                                 ('standard_scaling', PadronizaDados(atributos))])

In [23]:
pipe_final = Pipeline([
    ('preprocessing', pipe_preprocessamento),
    ('learning', XGBClassifier(random_state = 42, eval_metric = 'auc',
                               objective = 'binary:logistic'))])

### Preparação dos Dados

In [24]:
X = df[atributos]

In [25]:
y = df['flag']

In [26]:
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size= 0.30, random_state= 42)

In [27]:
X_treino

Unnamed: 0,avg min between sent tnx,avg min between received tnx,time diff between first and last (mins),sent tnx,received tnx,number of created contracts,unique received from addresses,unique sent to addresses,min value received,max value received,...,erc20 uniq sent addr.1,erc20 uniq rec contract addr,erc20 min val rec,erc20 max val rec,erc20 avg val rec,erc20 min val sent,erc20 max val sent,erc20 avg val sent,erc20 uniq sent token name,erc20 uniq rec token name
8460,0.00,0.00,0.00,0,0,0,0,0,0.000000,0.000000,...,0.0,1.0,1.337000,1.337000,1.337000,0.0,0.0,0.0,0.0,1.0
6081,0.00,8337.42,200098.17,0,24,1,3,0,0.000000,1.003651,...,0.0,2.0,0.301638,0.953298,0.627468,0.0,0.0,0.0,0.0,2.0
8966,0.00,318.17,2287.65,1,2,0,2,1,0.005000,0.300000,...,0.0,1.0,20000.000000,542000.000000,281000.000000,0.0,0.0,0.0,0.0,1.0
1535,28.33,3901.86,1021678.35,254,260,0,25,2,0.000047,110.736000,...,0.0,5.0,0.000000,19421.000000,3961.659270,0.0,0.0,0.0,0.0,5.0
7304,0.00,1947.10,319323.80,0,164,1,4,0,0.000000,1.903509,...,0.0,3.0,0.000000,2.335693,1.313279,0.0,0.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0.00,16197.17,226760.43,0,14,1,6,0,0.000000,14.419115,...,0.0,2.0,0.518689,0.705159,0.611924,0.0,0.0,0.0,0.0,2.0
5191,0.00,0.00,15369.12,1,1,0,1,1,2.000000,2.000000,...,0.0,7.0,0.000000,312.430205,50.497598,0.0,0.0,0.0,0.0,7.0
5390,0.00,0.00,1.77,1,1,0,1,1,1.990000,1.990000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
860,165.15,0.00,330.30,2,2,0,2,2,49.770407,51.229593,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0


In [28]:
X_teste

Unnamed: 0,avg min between sent tnx,avg min between received tnx,time diff between first and last (mins),sent tnx,received tnx,number of created contracts,unique received from addresses,unique sent to addresses,min value received,max value received,...,erc20 uniq sent addr.1,erc20 uniq rec contract addr,erc20 min val rec,erc20 max val rec,erc20 avg val rec,erc20 min val sent,erc20 max val sent,erc20 avg val sent,erc20 uniq sent token name,erc20 uniq rec token name
7763,1641.74,2103.12,327679.35,10,148,0,137,4,0.001000,14.341000,...,0.0,5.0,0.00,2082.268173,419.127635,0.0,0.0,0.0,0.0,5.0
8655,2811.51,837.98,9812.92,2,5,0,4,2,0.490000,0.500000,...,0.0,1.0,13.37,13.370000,13.370000,0.0,0.0,0.0,0.0,1.0
106,157.32,0.00,314.65,2,1,0,1,2,101.000000,101.000000,...,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
439,20.17,3.92,68.37,3,2,0,2,3,12.690172,88.309828,...,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
2420,4.38,24303.06,243074.38,10,10,0,2,1,0.574000,9.000000,...,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3574,162.63,0.00,325.27,2,1,0,1,2,101.000000,101.000000,...,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
4824,7192.69,58549.99,204420.75,4,3,0,2,1,5.013159,40.391000,...,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
1157,1.77,0.41,6.12,3,2,0,2,3,434.001122,1566.998878,...,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
7786,0.00,6.27,1310.22,0,209,1,6,0,0.000000,2.000000,...,0.0,1.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,1.0


### Treino e Avaliação do Modelo

In [29]:
pipe_final.fit(X_treino, y_treino)

In [30]:
previsoes_teste = pipe_final.predict(X_teste)

In [31]:
score_auc = metrics.roc_auc_score(y_teste, previsoes_teste)

In [32]:
print(f'AUC dados de teste: {score_auc:,.2%}')

AUC dados de teste: 97.34%


In [33]:
%watermark -v -m

Python implementation: CPython
Python version       : 3.13.1
IPython version      : 8.31.0

Compiler    : MSC v.1942 64 bit (AMD64)
OS          : Windows
Release     : 11
Machine     : AMD64
Processor   : Intel64 Family 6 Model 165 Stepping 3, GenuineIntel
CPU cores   : 8
Architecture: 64bit



In [34]:
%watermark --iversions

numpy  : 2.2.1
joblib : 1.4.2
xgboost: 2.0.3
pandas : 2.2.3
sklearn: 1.5.2

