In [169]:
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from detect_outlier import *
from sklearn.model_selection import train_test_split


In [170]:
df = read_data()

In [171]:
df.loc[df['sub_label'].isna(), 'sub_label'] = df['main_label']
df = df.dropna()


In [172]:
df_normalized = df.drop(["address", "createdAt"], axis=1)
main_label_column = df['main_label']
sub_label_column = df['sub_label']

for column in df_normalized.columns:
    if column not in ["numberOfLiquidation", "borrow_per_deposit"]:
        min_val = df_normalized[column].min()
        max_val = df_normalized[column].max()
        df_normalized[column] = ((df_normalized[column] - min_val) /
                                         (max_val - min_val)) * 550 + 300

min_max_columns = ["numberOfLiquidation", "borrow_per_deposit"]

for column in min_max_columns:
    min_val = df_normalized[column].min()
    max_val = df_normalized[column].max()
    df_normalized[column] = ((max_val - df_normalized[column]) /
                                     (max_val - min_val)) * 550 + 300

df_normalized['main_label'] = main_label_column
df_normalized['sub_label'] = sub_label_column

In [173]:
def score(label):
    if label == 0:
        return np.random.randint(300, 580)
    if label == 1:
        return np.random.randint(580, 670)
    if label == 2:
        return np.random.randint(670, 740)
    if label == 3:
        return np.random.randint(740, 800)
    if label == 4:
        return np.random.randint(800, 850)


def labeling(scores):
    label = []
    for score in scores:
        if score < 580:
            label.append(0)
        elif score >= 580 and score < 670:
            label.append(1)
        elif score >= 670 and score < 740:
            label.append(2)
        elif score >= 740 and score < 800:
            label.append(3)
        elif score >= 800:
            label.append(4)
    return np.array(label)

In [174]:
X = np.array(
    df_normalized[
        [
            "balanceInUSD",
            # "borrowInUSD",
            # "depositInUSD",
            "averageTotalAsset",
            "frequencyMountOfTransaction",
            "borrow_per_balance",
            "deposit_per_balance",
            "borrow_per_deposit",
            "totalValueOfLiquidation",
            "numberOfLiquidation",
            "frequencyOfTransaction",
            "frequencyOfDappTransactions",
            "numberOfInteractedDapps",
            "typesOfInteractedDapps",
            "numberOfReputableDapps",
            "age",
        ]
    ].values
)
main_y = df_normalized["main_label"].values
sub_y = df_normalized["sub_label"].values

X_train, X_test, main_y_train, main_y_test, sub_y_train, sub_y_test = train_test_split(
    X, main_y, sub_y, test_size=0.2, random_state=42
)
# Chuẩn bị dữ liệu

In [175]:
X_train.shape[0]

97604

In [176]:
import numpy as np

# def new_error_func(y, sub_y, y_pred):
#     errors = [0] * len(y_pred)
#     # Lặp qua từng phần tử trong mảng
#     for i in range(len(y_pred)):
#         if y[i] > sub_y[i]:
#             errors[i] = y_pred[i] - y[i] 
#         else:
#             errors[i] = y_pred[i] - sub_y[i] 
#     return errors

def new_error_func(y, sub_y, y_pred):
    errors = [0] * len(y_pred)
    # Lặp qua từng phần tử trong mảng
    for i in range(len(y_pred)):
        if y[i] == sub_y[i]:
            errors[i] = y_pred[i] - y[i] 
        else :
            min_value = min(y[i], sub_y[i])
            max_value = max(y[i], sub_y[i])
            if y_pred[i] <= min_value:
                errors[i] = min_value - y_pred[i]
            if y_pred[i] >= max_value:
                errors[i] = y_pred[i] - max_value
    return errors

# Hàm tính gradient của hàm lỗi (MSE)
def gradient_mean_squared_error(X, y, sub_y, w):
    n = len(y)
    y_pred = labeling(np.dot(X, w))
    # error = y_pred - y
    error = new_error_func(y, sub_y, y_pred)
    gradient = 2 * np.dot(X.T, error) / n
    return gradient

# Gradient descent để tối ưu hóa hàm lỗi (MSE)
def adam(X, y, sub_y, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, num_iterations=1000):
    # Khởi tạo vector trọng số ngẫu nhiên
    w = np.random.uniform(0, 1, X.shape[1])
    m = np.zeros_like(w)
    v = np.zeros_like(w)
    t = 0
    
    for _ in range(num_iterations):
        # Chọn ngẫu nhiên một mẫu từ dữ liệu
        random_index = np.random.randint(0, len(X))
        X_sample = X[random_index:random_index+1]
        y_sample = y[random_index:random_index+1]
        sub_y_sample = sub_y[random_index:random_index+1]
        
        
        # Tính gradient của hàm lỗi cho mẫu đã chọn
        grad = gradient_mean_squared_error(X_sample, y_sample, sub_y_sample, w)
        
        # Tính trung bình độ lớn của gradient và gradient bình phương
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * (grad ** 2)
        
        # Bias correction
        m_hat = m / (1 - beta1**(t+1))
        v_hat = v / (1 - beta2**(t+1))
        
        # Cập nhật vector trọng số
        w -= learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
        
        t += 1
        
    
    return w

# Áp dụng gradient descent để tối ưu hóa hàm lỗi (MSE)
learned_weights = adam(X_train, main_y_train, sub_y_train)

print("Vector trọng số tối ưu:", learned_weights)


Vector trọng số tối ưu: [ 0.35275793  0.26863327 -0.10669593 -0.2026697   0.47079281  0.23282609
 -0.20826085  0.05674303 -0.24578297  0.20636567  0.33419137 -0.28034013
 -0.00806293  0.36499005]


In [177]:
def new_accuracy(y_train, sub_y_train, y_pred):
    y_train = np.array(y_train)
    sub_y_train = np.array(sub_y_train)
    y_pred = np.array(y_pred)
    condition = np.logical_or(y_pred == sub_y_train, y_pred == y_train)
    count = np.sum(condition)
    accuracy = count / len(y_train)
    return accuracy

In [178]:
max1 = 0
res = None
for i in range(1000):
    learned_weights = adam(X_train, main_y_train, sub_y_train)
    pred = labeling(X_test.dot(learned_weights))
    acc = new_accuracy(main_y_test, sub_y_test, pred)
    if acc > max1:
        max1 = acc
        res = learned_weights

In [181]:
print(max1, res)

0.8373084173428408 [ 0.27917011  0.25496641 -0.40879111 -0.10001058  0.32614511 -0.18935922
  0.0361325   0.38295403 -0.09942797  0.10893354  0.33322976  0.34286977
  0.29081977  0.02028977]


In [179]:
pred = labeling(X_test.dot(learned_weights))
print(new_accuracy(main_y_test, sub_y_test, pred))

0.29780345873289077


In [180]:
import numpy as np

unique_elements, counts = np.unique(pred, return_counts=True)
    
for element, count in zip(unique_elements, counts):
    print("Phần tử", element, "xuất hiện", count, "lần")


Phần tử 0 xuất hiện 16182 lần
Phần tử 1 xuất hiện 8219 lần
Phần tử 3 xuất hiện 1 lần
