In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import plotly.graph_objs as go 
import plotly.offline as py 
import math
from collections import Counter

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import VotingClassifier
from sklearn.base import clone 
import xgboost as xgb

In [None]:
# Path of the combined wallets CSV inside the Kaggle input mount.
WALLETS_CSV_PATH = '/kaggle/input/elliptic/wallets_features_classes_combined.csv'

# Load the whole file into memory; every later cell derives from this frame.
df_wallets_features_classes_combined = pd.read_csv(WALLETS_CSV_PATH)

In [None]:
# List the column labels of the table loaded from the CSV.
df_wallets_features_classes_combined.columns

In [None]:
# Display the loaded table as the cell output (pandas truncates long frames).
df_wallets_features_classes_combined

In [None]:
# Remove the 'Time step' column, then discard rows that are exact duplicates
# of each other once that column is gone. The source frame is left untouched.
df_wallets_classification = (
    df_wallets_features_classes_combined
    .drop(columns=['Time step'])
    .drop_duplicates()
)
df_wallets_classification

In [None]:
# Drop unknown wallets (class == 3): only classes 1 and 2 are classified.
#
# The mask is applied to the rows themselves rather than going through a list
# of addresses. Selecting by address membership would keep a class-3 row
# whenever the same address also appears with a labelled class, and the
# target-building cell maps every class other than 2 to 1, so such a row
# would be silently labelled as illicit.
is_labelled = df_wallets_classification['class'] != 3
df_wallets_feature_selected = df_wallets_classification.loc[is_labelled]

# Addresses of the labelled rows; kept under its original name in case a
# later cell reads `data`.
data = df_wallets_feature_selected['address']
df_wallets_feature_selected

In [None]:
# Print the per-column dtype / non-null summary of the labelled subset.
df_wallets_feature_selected.info()

In [None]:
# Goal: binary classification with labels 0 and 1
#   0: licit   (original class 2)
#   1: illicit (every other class value still present in the frame)
y = (df_wallets_feature_selected['class'] != 2).astype('int64')

# Ordered 70/30 split: shuffle=False keeps the row order, so the leading rows
# become the training set and the trailing rows the test set. random_state is
# only consulted when shuffling, so it does not influence this split.
X_train, X_test, y_train, y_test = train_test_split(
    df_wallets_feature_selected,
    y,
    test_size=0.30,
    random_state=15,
    shuffle=False,
)

# 'address' is an identifier and 'class' is the label itself; neither may be
# used as a model feature.
X_train, X_test = [
    part.drop(columns=['address', 'class']) for part in (X_train, X_test)
]

In [None]:
# Display the binary target built in the previous cell (class 2 -> 0, otherwise 1).
y

In [None]:
# Scaling: the MinMaxScaler is fitted on the training split only and the same
# fitted transform is then applied to the test split.
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Flatten the targets into 1-D arrays. np.asarray accepts both the pandas
# Series produced by the split and an array that was already converted, so
# re-running this cell no longer fails with AttributeError on `.values`.
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

# Test on different models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support, f1_score
from lightgbm import LGBMClassifier

In [None]:
def get_model_metrics(y_true, y_pred):
    """Summarise binary-classification quality for the positive class (label 1).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 / 1.
    y_pred : array-like
        Predicted labels, encoded as 0 / 1.

    Returns
    -------
    dict
        'Precision', 'Recall' and 'F1 Score' for label 1, plus the
        micro-averaged F1 ('Micro F1') and the plain 'Accuracy'.
    """
    # Pin the label order so that index 1 always refers to class 1. Without
    # `labels`, the per-class arrays only cover the labels that actually occur
    # in y_true / y_pred, so data containing a single class made `[1]` raise
    # IndexError. Undefined ratios are reported as 0 via zero_division.
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=[0, 1], zero_division=0
    )
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    return {
        'Precision': prec[1],
        'Recall': rec[1],
        'F1 Score': f1[1],
        'Micro F1': micro_f1,
        'Accuracy': accuracy
    }

In [None]:
# Collects one metrics dict per evaluated model; the model cells below append to it.
# Re-running a model cell without re-running this one adds a duplicate entry.
results = []

In [None]:
# Logistic regression on the scaled features, allowing up to 1000 solver iterations.
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Tag the metrics with the model name and record them for the comparison table.
metrics_lr = {**get_model_metrics(y_test, y_pred_lr), 'Model': 'Logistic Regression'}
results.append(metrics_lr)

In [None]:
# Random forest with 100 trees; the seed is fixed so repeated runs match.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)

# Tag the metrics with the model name and record them for the comparison table.
metrics_rf = {**get_model_metrics(y_test, y_pred_rf), 'Model': 'Random Forest'}
results.append(metrics_rf)

In [None]:
# Gradient-boosted trees (XGBoost) with log-loss as the evaluation metric.
# NOTE: this rebinds `xgb`, which the imports cell bound to the xgboost module
# (`import xgboost as xgb`). The ensembles cell reads `xgb` as this classifier,
# so renaming it here requires updating that cell as well.
# NOTE(review): `use_label_encoder` is reported as deprecated in newer XGBoost
# releases - confirm against the installed version before removing it.
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
).fit(X_train_scaled, y_train)
y_pred_xgb = xgb.predict(X_test_scaled)

# Tag the metrics with the model name and record them for the comparison table.
metrics_xgb = {**get_model_metrics(y_test, y_pred_xgb), 'Model': 'XGB'}
results.append(metrics_xgb)

In [None]:
# Multi-layer perceptron: one hidden layer of 100 units, at most 300 training
# iterations, fixed seed.
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=300,
    random_state=42,
).fit(X_train_scaled, y_train)
y_pred_mlp = mlp.predict(X_test_scaled)

# Tag the metrics with the model name and record them for the comparison table.
metrics_mlp = {**get_model_metrics(y_test, y_pred_mlp), 'Model': 'Multi-Layer Perceptron'}
results.append(metrics_mlp)

In [None]:
# LightGBM with 100 boosting rounds and a fixed seed.
lgbm = LGBMClassifier(random_state=42, n_estimators=100).fit(X_train_scaled, y_train)
y_pred_lgbm = lgbm.predict(X_test_scaled)

# Tag the metrics with the model name and record them for the comparison table.
metrics_lgbm = {**get_model_metrics(y_test, y_pred_lgbm), 'Model': 'LGBM'}
results.append(metrics_lgbm)

# Run Ensemble Model

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support, f1_score
from lightgbm import LGBMClassifier


In [None]:
# (name, estimator) pairs in the form VotingClassifier expects.
rf_member = ("rf", rf)
mlp_member = ("mlp", mlp)
xgb_member = ("xgb", xgb)

# Display name -> member list for every ensemble that is evaluated:
# each pair of the three models plus the full trio.
ensembles = {
    "RF + MLP": [rf_member, mlp_member],
    "RF + XGB": [rf_member, xgb_member],
    "MLP + XGB": [mlp_member, xgb_member],
    "RF + MLP + XGB": [rf_member, mlp_member, xgb_member],
}

In [None]:
# Fit and score one hard-voting ensemble per entry of `ensembles`, recording
# each under its display name.
# NOTE(review): with two members a hard-vote disagreement is a tie; per the
# scikit-learn documentation ties resolve to the lowest class label (here 0),
# so a two-model ensemble predicts 1 only when both members agree - confirm
# that this is intended.
for name, estimators in ensembles.items():
    clf = VotingClassifier(estimators=estimators, voting='hard').fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    metrics_ensemble = dict(get_model_metrics(y_test, y_pred), Model=name)
    results.append(metrics_ensemble)

In [None]:
# Assemble the collected metric dicts into one comparison table, with the
# model name first followed by the metrics.
column_order = ['Model', 'Precision', 'Recall', 'F1 Score', 'Micro F1', 'Accuracy']
df_results = pd.DataFrame(results)[column_order]

# End the cell on the bare expression so the notebook renders the table with
# its rich display instead of the plain-text dump that print() produces.
df_results
