In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py 
import plotly.graph_objs as go 
py.init_notebook_mode(connected=True)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(15)
from sklearn import metrics
import tensorflow as tf
from numpy import array
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from mlxtend.plotting import plot_confusion_matrix
import seaborn as sns
from time import time
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from keras.layers import Input, Dropout, TimeDistributed, RepeatVector
from keras.models import Model
from keras import regularizers
pip install pyod
from pyod.models.auto_encoder import AutoEncoder

In [None]:
# Mount Google Drive at /content/gdrive (requires the google.colab package);
# the dataset CSVs read below live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Raw feature table (no header row in the file) and the per-transaction class labels.
features = pd.read_csv(
    "/content/gdrive/My Drive/elliptic_bitcoin_dataset/elliptic_txs_features.csv",
    header=None,
)
classes = pd.read_csv(
    "/content/gdrive/My Drive/elliptic_bitcoin_dataset/elliptic_txs_classes.csv"
)

In [None]:
# Column layout: id, time step, 93 "txn" columns, 72 "agg" columns.
features.columns = (
    ['txId', 'time_step']
    + ['txn' + str(i) for i in range(93)]
    + ['agg' + str(i) for i in range(72)]
)
# Attach the class label to every feature row and expose it as "target".
features_class = (
    features
    .merge(classes, on="txId", how="left")
    .rename(columns={"class": "target"})
)
# Encode the string labels as integers: "1" -> 1, "2" -> 0, "unknown" -> -1.
features_class["target"] = features_class["target"].replace(
    {"1": 1, "2": 0, "unknown": -1}
)

In [None]:
# Number of transactions per encoded label, as a two-column frame (target, txId).
g = features_class.groupby(["target"])["txId"].count().reset_index()
sns.barplot(data=g, x="target", y="txId")
plt.title('Transaction label');
plt.show()

In [None]:
# Transactions per (time_step, target) pair, with the count column named 'count'.
grouped = (
    features_class
    .groupby(['time_step', 'target'])['txId']
    .count()
    .reset_index(name='count')
)
sns.lineplot(data=grouped, x='time_step', y='count', hue='target');
plt.legend(loc=(1.01, 0.78));
plt.title('Number of transactions in each time step by class');

In [None]:
# Per-time-step transaction counts for every class; the count column is named 0.
count_by_class = features_class[["time_step",'target']].groupby(['time_step','target']).size().to_frame().reset_index()
# Label encoding used throughout this notebook: 1 = illicit (raw class "1"),
# 0 = licit (raw class "2"), -1 = unknown. The licit/illicit selections were
# previously swapped, which mislabelled the stacked bar chart built from them.
illicit_count = count_by_class[count_by_class['target'] == 1]
licit_count = count_by_class[count_by_class['target'] == 0]
unknown_count = count_by_class[count_by_class['target'] == -1]

In [None]:
# One x position per time step 1..49.
x_list = list(range(1,50))
# One stacked bar trace per class; fill colour is the 0.6-alpha version of the outline colour.
fig = go.Figure(data = [
    go.Bar(
        name = trace_name,
        x = x_list,
        y = class_counts[0],
        marker = dict(
            color = 'rgba(' + rgb + ', 0.6)',
            line = dict(color = 'rgba(' + rgb + ', 1.0)', width = 1),
        ),
    )
    for trace_name, class_counts, rgb in (
        ("Unknown", unknown_count, '120, 100, 180'),
        ("Licit", licit_count, '246, 78, 139'),
        ("Illicit", illicit_count, '58, 190, 120'),
    )
])
fig.update_layout(
    barmode = 'stack',
    xaxis_title = "time_step",
    yaxis_title = "Transactions by class",
    title = "Number of transactions per timestep by class",
)
py.iplot(fig)

Logistic Regression

In [None]:
# Reload both tables from disk so this section starts from the raw, un-renamed data.
features = pd.read_csv(
    "/content/gdrive/My Drive/elliptic_bitcoin_dataset/elliptic_txs_features.csv",
    header=None,
)
classes = pd.read_csv(
    "/content/gdrive/My Drive/elliptic_bitcoin_dataset/elliptic_txs_classes.csv"
)

In [None]:
# 93 local feature names (suffix 2..94) and 72 non-local feature names (suffix 1..72).
local_features = [f"local_feat_{i}" for i in range(2, 95)]
nonlocal_features = [f"nonlocal_feat_{i}" for i in range(1, 73)]
features.columns = ["txId", "time_step", *local_features, *nonlocal_features]
# Attach the class column, then turn the "unknown" label into 0.
features = features.merge(classes, on="txId", how='left')
features['class'] = features['class'].replace("unknown", 0)

In [None]:
# Keep only the rows whose class is the string '1' or '2'.
data = features[features['class'].isin(['1', '2'])]

In [None]:
X = data[local_features + nonlocal_features]
# Encode both classes as integers ("1" -> 1, "2" -> 0). Replacing only "2" left a
# column mixing the string '1' with the integer 0, which is not a valid label vector.
y = data['class'].map({"1": 1, "2": 0}).astype(int)
# Rebind `data` with the encoded column instead of writing into a filtered slice
# (avoids pandas' SettingWithCopyWarning and leaves `features` untouched).
data = data.assign(**{'class': y})
# shuffle=False keeps the original row order, so the last 30% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 15, shuffle = False)

In [None]:
#Logistic Regression
# Fit on the training split. NOTE: `model` below holds the predicted labels for
# X_test; the fitted estimator itself is `reg`.
reg = LogisticRegression().fit(X_train, y_train)
model = reg.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
print(classification_report(y_true=y_test, y_pred=model))

In [None]:
# Micro-averaged F1 of the logistic-regression predictions (displayed as the cell output).
f1_score(y_true=y_test, y_pred=model, average='micro')

In [None]:
# Confusion matrix of the logistic-regression predictions: printed, then plotted.
cmatrix = metrics.confusion_matrix(y_true=y_test, y_pred=model)
print(cmatrix)
fig, ax = plot_confusion_matrix(conf_mat=cmatrix)
plt.show()

In [None]:
# Probability of the second class (column 1 of predict_proba) for every test row.
k = reg.predict_proba(X_test)[:, 1]
# Score the AUC with the same probabilities that draw the curve; the previous code
# referenced `preds`, which is not defined at this point of the notebook.
auc = roc_auc_score(y_test, k)
fped, tped, threshold = metrics.roc_curve(y_test, k)
plt.figure()
plt.plot(fped, tped, label = 'Logistic Regression (area = %0.2f)' % auc)
# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC_AUC CURVE')
plt.legend(loc = "lower right")
plt.show()

Random Forest

In [None]:
# Seed the forest (same seed as the train/test split) so the reported metrics are
# reproducible across kernel restarts.
model = RandomForestClassifier(random_state = 15).fit(X_train, y_train)
preds1 = model.predict(X_test)

In [None]:
# Text report and confusion matrix for the random-forest predictions.
print(classification_report(y_true=y_test, y_pred=preds1))
cm1 = metrics.confusion_matrix(y_true=y_test, y_pred=preds1)
print(cm1)

In [None]:
# Micro-averaged F1 of the random-forest predictions (displayed as the cell output).
f1_score(y_true=y_test, y_pred=preds1, average='micro')

In [None]:
# Draw the random-forest confusion matrix held in `cm1`.
fig, ax = plot_confusion_matrix(conf_mat = cm1)
plt.show()

In [None]:
# Probability of the second class (column 1 of predict_proba) for every test row.
k = model.predict_proba(X_test)[:,1]
# Compute the AUC from the probabilities so the legend value matches the plotted
# curve; hard 0/1 predictions collapse the ROC to a single operating point.
auc = roc_auc_score(y_test, k)
fped, tped, threshold = metrics.roc_curve(y_test, k)
plt.figure()
plt.plot(fped, tped, label = 'Random Forest (area = %0.2f)' % auc)
# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC_AUC CURVE')
plt.legend(loc = "lower right")
plt.show()

MLP

In [None]:
class LoadData(Dataset):
    """Map-style dataset pairing each feature row with its label.

    Args:
        X: pandas object holding the features; rows are read positionally via ``.iloc``.
        y: pandas object holding the labels, in the same row order as ``X``.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples (rows of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the sample at position ``idx``.

        ``features`` is a NumPy array of shape ``(1, n_features)``; ``label`` is
        the matching entry of ``self.y``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Wrapping the row in a list adds a leading axis of length 1.
        features = np.array([self.X.iloc[idx]])
        # Read the label from this instance, not from a module-level `y`:
        # the global may be missing or belong to a different split.
        label = self.y.iloc[idx]

        return features, label

In [None]:
# Wrap both splits as datasets first, then build their loaders.
train = LoadData(X_train, y_train)
test = LoadData(X_test, y_test)
# Training batches are shuffled; test batches keep the dataset order.
trloader = DataLoader(dataset=train, batch_size=128, shuffle=True)
tsloader = DataLoader(dataset=test, batch_size=128, shuffle=False)

In [None]:
class Network(nn.Module):
    """One-hidden-layer perceptron with a sigmoid output.

    Args:
        in_features: size of the last dimension of the input (default 165).
        hidden_units: width of the hidden layer (default 50).
    """

    def __init__(self, in_features=165, hidden_units=50):
        super().__init__()

        self.hidden = nn.Linear(in_features, hidden_units)

        self.output = nn.Linear(hidden_units, 1)
        self.out = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs in [0, 1]; the last input dimension is mapped to size 1."""
        x = F.relu(self.hidden(x))

        x = self.out(self.output(x))

        return x

# Instantiate with the default layer sizes.
model = Network()

In [None]:
# Training hyper-parameters: epoch count, binary cross-entropy loss, Adam optimiser.
n_epochs = 10
criterion = nn.BCELoss()
opt = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Use the GPU when one is present, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for i in range(n_epochs):
    model.train()
    losses = 0.
    for data in trloader:
        x, label = data
        x, label = x.to(device), label.to(device)
        # Clear gradients left over from the previous step; without this they
        # accumulate across batches and every update uses a stale sum.
        opt.zero_grad()
        # Flatten to one value per sample; unlike squeeze(), reshape(-1) also
        # keeps a 1-D tensor when the final batch holds a single sample.
        output = model(x.float()).reshape(-1)
        ls = criterion(output.float(), label.float().reshape(-1))
        ls.backward()
        opt.step()
        losses = losses + ls.item()
    # Mean training loss over the batches of this epoch.
    print(f"Loss: {losses/len(trloader)}")

In [None]:
# Run inference on whatever device the model currently lives on.
device = next(model.parameters()).device
model.eval()

pr = []
with torch.no_grad():
    # Unpack each *test* batch; the previous code unpacked the stale `data`
    # variable, so every iteration re-scored the same leftover batch.
    for x, labels in tsloader:
        preds = model(x.float().to(device))
        # reshape(-1) yields one score per sample even for a batch of size 1.
        pr.extend(preds.reshape(-1).cpu().numpy())

# Rounded scores, kept as a Series under the original name.
prs = pd.Series(pr).apply(lambda x: round(x))
# Integer 0/1 predictions under the name the metric cells below expect.
preds2 = prs.astype(int).to_numpy()

In [None]:
# Text report and confusion matrix for the MLP predictions.
print(classification_report(y_true=y_test, y_pred=preds2))
cm3 = metrics.confusion_matrix(y_true=y_test, y_pred=preds2)
print(cm3)

In [None]:
# Micro-averaged F1 of the MLP predictions (displayed as the cell output).
f1_score(y_true=y_test, y_pred=preds2, average='micro')

In [None]:
# Draw the MLP confusion matrix held in `cm3`.
fig, ax = plot_confusion_matrix(conf_mat = cm3)
plt.show()

In [None]:
# Use the raw sigmoid scores collected in `pr` (one per test row) rather than the
# thresholded labels: hard 0/1 predictions give a single-point ROC and a misleading AUC.
k = np.asarray(pr, dtype = float)
auc = roc_auc_score(y_test, k)
fped, tped, threshold = metrics.roc_curve(y_test, k)
plt.figure()
plt.plot(fped, tped, label = 'MLP (area = %0.2f)' % auc)
# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC_AUC CURVE')
plt.legend(loc = "lower right")
plt.show()

K-Means Clustering

In [None]:
# Reload the raw tables and name the 167 feature columns.
features = pd.read_csv(
    "/content/gdrive/My Drive/elliptic_bitcoin_dataset/elliptic_txs_features.csv",
    header=None,
)
classes = pd.read_csv(
    "/content/gdrive/My Drive/elliptic_bitcoin_dataset/elliptic_txs_classes.csv"
)
features.columns = (
    ['txId', 'time_step']
    + ['txn' + str(i) for i in range(93)]
    + ['agg' + str(i) for i in range(72)]
)

# Join labels onto features and encode them: "1" -> 1, "2" -> 0, "unknown" -> -1.
features_classes = (
    features
    .merge(classes, on="txId", how="left")
    .rename(columns={"class": "target"})
)
features_classes["target"] = features_classes["target"].replace(
    {"1": 1, "2": 0, "unknown": -1}
)

features_classes["target"].value_counts()

# Temporal split: time steps below 35 are training data, 35 and above are test data.
# Labelled rows (target != -1) lose the id, time step and target columns.
original_Xtrain = (
    features_classes
    .query('time_step < 35 and target != -1')
    .drop(columns=['target', "time_step", "txId"])
)
original_ytrain = features_classes.query('time_step < 35 and target != -1')['target']
# Unlabelled training-period rows keep every column except the target.
unlabeled_df_train1 = (
    features_classes
    .query('time_step < 35 and target == -1')
    .drop(columns=['target'])
)

original_Xtest = (
    features_classes
    .query('time_step >= 35 and target != -1')
    .drop(columns=['target', "time_step", "txId"])
)
original_ytest = features_classes.query('time_step >= 35 and target != -1')['target']

# Fitted KMeans model for each cluster count, and the matching inertia values.
models = {}
errors = []

for p in range(1, 11):
    # Fix the seed: cluster numbering depends on initialisation, and later cells
    # refer to specific cluster ids, so unseeded runs are not comparable.
    kmeans = KMeans(n_clusters = p, random_state = 15)
    kmeans.fit(original_Xtrain)
    models[p] = kmeans
    errors.append(kmeans.inertia_)
    print("\nk = "+str(p))

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and later
# releases dropped the old names; pick whichever one this installation provides.
elbow_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(elbow_style)
# Inertia (sum of squared distances) against the number of clusters.
plt.plot(range(1, 11), errors)
plt.xticks(range(1, 11))
plt.xlabel("Clusters")
plt.ylabel("SSD")
plt.title("Elbow Method")
plt.show()

# Training-set cluster assignments per model, plus class totals used for percentages.
tpr = {}
licit = original_ytrain.value_counts()[0]
illicit = original_ytrain.value_counts()[1]
# Positional mask of the illicit (target == 1) training rows.
illicit_mask = (original_ytrain == 1).to_numpy()
for p in models:
    # Print the current cluster count (this line previously always printed 1).
    print('p='+str(p)+'\n')
    tpr[p] = models.get(p).predict(original_Xtrain)
    print("For p =" + str(p))
    # For every cluster id, count its illicit members and the remaining (licit) ones.
    for key in np.unique(tpr[p]):
        in_cluster = tpr[p] == key
        value = int(illicit_mask[in_cluster].sum())
        good = int(in_cluster.sum()) - value
        print('Cluster '+str(key+1)+': \t Illicit: '+str(value)+ ' ('+str(round((value*100)/illicit,3))+'%)' +' \t Licit: '+str(good)+' ('+str(round((good*100)/licit,3))+'%)\n')
    print('\n')

# Test-set cluster assignments per model, plus class totals used for percentages.
pr = {}
test_class_totals = original_ytest.value_counts()
licit = test_class_totals[0]
illicit = test_class_totals[1]
# Positional mask of the illicit (target == 1) test rows.
test_illicit_mask = (original_ytest == 1).to_numpy()
for p in models:
    print('p ='+str(p)+'\n')
    pr[p] = models.get(p).predict(original_Xtest)
    print("For p ="+ str(p))
    # Cluster ids in ascending order, each with its illicit and licit member counts.
    for key in np.unique(pr[p]):
        members = pr[p] == key
        value = int(test_illicit_mask[members].sum())
        good = int(members.sum()) - value
        print('Cluster '+str(key+1)+': \t Illicit: '+str(value)+ ' ('+str(round((value*100)/illicit,3))+'%)' +' \t Licit: '+str(good)+' ('+str(round((good*100)/licit,3))+'%)\n')
    print('\n')

# Work with the 6-cluster model from here on.
p = 6
new_df = original_Xtest.copy()
# 1 for illicit test rows (target == 1), 0 otherwise. Computed in one vectorised
# pass; the earlier per-row `m in n` test against an array was quadratic.
anomaly = (original_ytest == 1).astype(int).tolist()

# Attach the cluster assignment and the ground-truth anomaly flag to the test features.
new_df['cluster'] = pr[p]
new_df['anomaly'] = anomaly
final = new_df.copy()


print('For p ='+str(p))
# 1-based ids of the clusters that are treated as illicit.
l = [1,6]

prs = pr[p].copy()
tprs = tpr[p].copy()

# Only relabel when both assignments actually use more than two clusters.
if len(np.unique(tprs)) > 2 and len(np.unique(prs)) > 2:
    # Convert to the 0-based ids stored in the assignment arrays.
    illicit_clusters = [cluster - 1 for cluster in l]
    # Members of the chosen clusters become 1 (illicit), everything else 0 (licit).
    prs = np.isin(prs, illicit_clusters).astype(prs.dtype)
    tprs = np.isin(tprs, illicit_clusters).astype(tprs.dtype)
    print(pd.Series(tprs).value_counts())
    print(pd.Series(prs).value_counts())

print(classification_report(original_ytest, prs))
# Print the score: as a bare mid-cell expression its value was silently discarded.
print(f1_score(original_ytest, prs, average='micro'))
cm = confusion_matrix(original_ytest, prs)
print(cm)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

Autoencoders

In [None]:
# Reload both tables from disk so this section starts from the raw, un-renamed data.
features = pd.read_csv(
    "/content/gdrive/My Drive/elliptic_bitcoin_dataset/elliptic_txs_features.csv",
    header=None,
)
classes = pd.read_csv(
    "/content/gdrive/My Drive/elliptic_bitcoin_dataset/elliptic_txs_classes.csv"
)

In [None]:
# 93 transaction feature names (suffix 2..94) and 72 aggregate feature names (suffix 1..72).
tx_features = [f"tx_feat_{i}" for i in range(2, 95)]
agg_features = [f"agg_feat_{i}" for i in range(1, 73)]
features.columns = ["txId", "time_step", *tx_features, *agg_features]
# Join the labels and expose them as "target".
features = (
    features
    .merge(classes, on="txId", how='left')
    .rename(columns={"class": "target"})
)
# Turn the "unknown" label into the string '0'; other values are left as they are.
features['target'] = features['target'].replace("unknown", '0')

In [None]:
# Keep only the rows whose target is the string '1' or '2'.
data = features[features['target'].isin(['1', '2'])]

In [None]:
X = data[tx_features + agg_features]
# Binary target: 0 where the raw label is '2', 1 otherwise.
y = (data['target'] != '2').astype('int64')
# shuffle=False keeps the original row order, so the last 30% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15, shuffle=False
)

In [None]:
# Settings read by fit_model() below.
epochs = 100                 # passed as AutoEncoder(epochs=...)
random_state = 42            # passed as AutoEncoder(random_state=...)
batch_size = 256             # passed as AutoEncoder(batch_size=...)
outliers_fraction = 0.15     # passed as AutoEncoder(contamination=...)

def fit_model(X_train):
    """Create a pyod ``AutoEncoder``, fit it on ``X_train`` and return it.

    Reads the module-level settings ``epochs``, ``batch_size``, ``random_state``
    and ``outliers_fraction``.
    """
    # Network shape and regularisation.
    architecture = dict(
        hidden_neurons=[6, 4, 4, 6],
        hidden_activation='tanh',
        output_activation='sigmoid',
        dropout_rate=0.1,
        l2_regularizer=0.000001,
    )
    # Optimisation and bookkeeping options.
    training = dict(
        loss='mean_squared_logarithmic_error',
        optimizer='adam',
        epochs=epochs,
        batch_size=batch_size,
        validation_size=0.20,
        preprocessing=False,
        verbose=1,
        random_state=random_state,
    )
    detector = AutoEncoder(contamination=outliers_fraction, **architecture, **training)
    detector.fit(X_train)
    return detector

In [None]:
# Empty placeholders; nothing in the visible notebook fills or reads the two
# *_evaluations dicts. NOTE(review): `model = {}` rebinds the name `model`
# to an empty dict, replacing whatever it referred to before this cell.
training_evaluations = {}
test_evaluations = {}
model = {}

# Train the autoencoder on the training split.
clf = fit_model(X_train)

In [None]:
# Predicted labels for the test split.
# NOTE(review): presumably 0 = inlier, 1 = outlier per pyod convention — confirm.
y_test_pred = clf.predict(X_test)

In [None]:
# Confusion matrix and text report for the autoencoder predictions.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_test_pred)
print(cm)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

In [None]:
# Micro-averaged F1 of the autoencoder predictions (displayed as the cell output).
f1_score(y_true=y_test, y_pred=y_test_pred, average='micro')

In [None]:
# Draw the autoencoder confusion matrix held in `cm`.
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

In [None]:
# Column 1 of predict_proba for every test row.
k = clf.predict_proba(X_test)[:,1]
# Compute the AUC from the same scores that draw the curve, not from the
# thresholded predictions, so the legend value matches the plot.
auc = roc_auc_score(y_test, k)
fped, tped, threshold = metrics.roc_curve(y_test, k)
plt.figure()
plt.plot(fped, tped, label = 'Autoencoder (area = %0.2f)' % auc)
# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC_AUC CURVE')
plt.legend(loc="lower right")
plt.show()

LSTM

In [None]:
# Reload both tables from disk so this section starts from the raw, un-renamed data.
features = pd.read_csv(
    "/content/gdrive/My Drive/elliptic_bitcoin_dataset/elliptic_txs_features.csv",
    header=None,
)
classes = pd.read_csv(
    "/content/gdrive/My Drive/elliptic_bitcoin_dataset/elliptic_txs_classes.csv"
)

In [None]:
# Column layout: id, time step, 93 "txn" columns, 72 "agg" columns.
features.columns = (
    ['txId', 'time_step']
    + ['txn' + str(i) for i in range(93)]
    + ['agg' + str(i) for i in range(72)]
)

In [None]:
# Join labels onto features and encode them: "1" -> 1, "2" -> 0, "unknown" -> -1.
features_classes = (
    features
    .merge(classes, on="txId", how="left")
    .rename(columns={"class": "target"})
)
features_classes["target"] = features_classes["target"].replace(
    {"1": 1, "2": 0, "unknown": -1}
)

In [None]:
# Train on the target == 0 rows only; the target == 1 rows form the test set.
X_train1 = (
    features_classes
    .query('target == 0')
    .drop(columns=['target', "time_step", "txId"])
)
y_train1 = features_classes.query('target == 0')['target']
X_test1 = (
    features_classes
    .query('target == 1')
    .drop(columns=['target', "time_step", "txId"])
)
y_test1 = features_classes.query('target == 1')['target']

In [None]:
# Learn the per-feature min/max on the training rows only and reuse them for the
# test rows. Re-fitting on the test set would put the two sets on different
# scales and make their reconstruction errors incomparable.
scaler = MinMaxScaler()
x_train1 = scaler.fit_transform(X_train1)
x_test1 = scaler.transform(X_test1)

In [None]:
# Reshape to (samples, 1, features), the 3-D layout the LSTM input expects.
# Sizes come from the arrays themselves instead of hard-coded row counts, so the
# cell keeps working if the labelled subset changes.
x_train = x_train1.reshape(x_train1.shape[0], 1, x_train1.shape[1])
x_test = x_test1.reshape(x_test1.shape[0], 1, x_test1.shape[1])
y_train1.shape, y_test1.shape

In [None]:
# Distribution of label values in the training target (displayed as the cell output).
y_train1.value_counts()

LSTM Auto Encoder - 128-unit LSTM encoder, 128-unit LSTM decoder

In [None]:
# LSTM autoencoder: encode each (1, 165) sequence to a 128-d vector, repeat it
# for one time step, decode with a second LSTM and project back to 165 features.
model10 = Sequential([
    LSTM(128, input_shape=(1, 165)),
    Dropout(0.2),
    RepeatVector(1),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    TimeDistributed(Dense(165))
])

# Bind the network to `model`, the name the following cells use; previously
# `model` still referred to an unrelated object and compile() was never run on this network.
model = model10
model.compile(loss = 'mae', optimizer = 'adam', metrics = ["acc"])
model.summary()

In [None]:
# Stop once the monitored metric has not improved for 5 epochs and restore the best weights.
stp = tf.keras.callbacks.EarlyStopping(restore_best_weights = True, patience = 5)
# An autoencoder learns to reproduce its input, so the target is x_train itself,
# not the class labels (whose shape does not match the (1, 165) output). The result
# is stored as `history` to avoid shadowing the imported Keras `Model` class.
history = model.fit(x_train, x_train, epochs = 100, batch_size = 256, validation_split = 0.05, callbacks = [stp], shuffle = False)

In [None]:
# Training and validation loss per epoch. The keyword is lowercase `label`;
# `Label` is not a valid Line2D property and makes plt.plot raise.
plt.plot(model.history.history['loss'], label = 'Loss')
plt.plot(model.history.history['val_loss'], label = 'Val_Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.grid()
plt.legend()

In [None]:
# Reconstruction metrics on the test rows: the autoencoder's target is its own
# input, not the class labels.
Result = model.evaluate(x_test, x_test, batch_size = 256)
print(Result)

In [None]:
# Reconstructions for both sets: drop the length-1 time axis and wrap the result
# in frames that share the columns and index of the original inputs.
Xtr = pd.DataFrame(
    np.squeeze(model.predict(x_train), axis=1),
    columns=X_train1.columns,
    index=X_train1.index,
)
Xts = pd.DataFrame(
    np.squeeze(model.predict(x_test), axis=1),
    columns=X_test1.columns,
    index=X_test1.index,
)

In [None]:
# Per-row mean absolute reconstruction error on the training set.
prtrain = pd.DataFrame(index = X_train1.index)
# Back to 2-D (samples, features) so it lines up with the reconstruction frame.
newtrain = x_train.reshape(x_train.shape[0], x_train.shape[2])
prtrain["Loss_mae"] = np.mean(np.abs(Xtr - newtrain), axis = 1)
# Title typo fixed ("Distribuion" -> "Distribution").
plt.title("Loss_mae Distribution")
sns.distplot(prtrain["Loss_mae"], bins = 20, kde = True, color = "blue");

In [None]:
# Per-row mean absolute reconstruction error on the test set.
prtest = pd.DataFrame(index = X_test1.index)
# Back to 2-D (samples, features) so it lines up with the reconstruction frame.
newtest = x_test.reshape(x_test.shape[0], x_test.shape[2])
# Compare the reconstruction `Xts` with the input; the previous code subtracted
# from the still-empty `prtest` frame, so no real error was computed.
prtest["Loss_mae"] = np.mean(np.abs(Xts - newtest), axis = 1)
# Rows whose error exceeds the fixed threshold are flagged as anomalies.
prtest["Threshold"] = 0.35
prtest["Anomaly"] = prtest["Loss_mae"] > prtest["Threshold"]

In [None]:
# Stack the train and test error frames. The original line was a syntax error
# (space inside the name and an unbalanced bracket).
newdata = pd.concat([prtrain, prtest])

In [None]:
# Plot the combined frame with a logarithmic y axis, using blue and red for the series.
newdata.plot(logy = True, color = ["blue", "red"]);