In [154]:
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from detect_outlier import *
from sklearn.model_selection import train_test_split


In [155]:
df = read_data()

In [156]:
df.loc[df['sub_label'].isna(), 'sub_label'] = df['main_label']
df = df.dropna()

In [157]:
df_normalized = df.drop(["address", "createdAt"], axis=1)
main_label_column = df['main_label']
sub_label_column = df['sub_label']

for column in df_normalized.columns:
    if column not in ["numberOfLiquidation", "borrow_per_deposit"]:
        min_val = df_normalized[column].min()
        max_val = df_normalized[column].max()
        df_normalized[column] = ((df_normalized[column] - min_val) /
                                         (max_val - min_val)) * 550 + 300

min_max_columns = ["numberOfLiquidation", "borrow_per_deposit"]

for column in min_max_columns:
    min_val = df_normalized[column].min()
    max_val = df_normalized[column].max()
    df_normalized[column] = ((max_val - df_normalized[column]) /
                                     (max_val - min_val)) * 550 + 300

df_normalized['main_label'] = main_label_column
df_normalized['sub_label'] = sub_label_column

In [158]:
def score(label):
    if label == 0:
        return np.random.randint(300, 580)
    if label == 1:
        return np.random.randint(580, 670)
    if label == 2:
        return np.random.randint(670, 740)
    if label == 3:
        return np.random.randint(740, 800)
    if label == 4:
        return np.random.randint(800, 850)


def labeling(scores):
    label = []
    for score in scores:
        if score < 580:
            label.append(0)
        elif score >= 580 and score < 670:
            label.append(1)
        elif score >= 670 and score < 740:
            label.append(2)
        elif score >= 740 and score < 800:
            label.append(3)
        elif score >= 800:
            label.append(4)
    return np.array(label)

In [159]:
X = np.array(
    df_normalized[
        [
            "balanceInUSD",
            # "borrowInUSD",
            # "depositInUSD",
            "averageTotalAsset",
            "frequencyMountOfTransaction",
            "borrow_per_balance",
            "deposit_per_balance",
            "borrow_per_deposit",
            "totalValueOfLiquidation",
            "numberOfLiquidation",
            "frequencyOfTransaction",
            "frequencyOfDappTransactions",
            "numberOfInteractedDapps",
            "typesOfInteractedDapps",
            "numberOfReputableDapps",
            "age",
        ]
    ].values
)
main_y = df_normalized["main_label"].values
sub_y = df_normalized["sub_label"].values

X_train, X_test, main_y_train, main_y_test, sub_y_train, sub_y_test = train_test_split(
    X, main_y, sub_y, test_size=0.2, random_state=42
)
# Chuẩn bị dữ liệu

In [160]:
X_train.shape[0]

97604

In [161]:
# a= [-1.21308685  4.47542451 -5.51931091  3.71804369  2.0969602   4.80669811
#  -2.03392303  7.7603754   4.16328747 -4.93222656  4.23313938 -8.20674732
#  -6.96695354 -0.1264665 ]

In [162]:
import numpy as np

def new_error_func(y, sub_y, y_pred):
    errors = [0] * len(y_pred)
    # Lặp qua từng phần tử trong mảng
    for i in range(len(y_pred)):
        if y[i] == sub_y[i]:
            errors[i] = y_pred[i] - y[i] 
        else :
            min_value = min(y[i], sub_y[i])
            max_value = max(y[i], sub_y[i])
            if y_pred[i] <= min_value:
                errors[i] = min_value - y_pred[i]
            if y_pred[i] >= max_value:
                errors[i] = y_pred[i] - max_value
    return errors

# Hàm tính giá trị của hàm lỗi (MSE)
def mean_squared_error(X, y, w):
    n = len(y)
    y_pred = labeling(np.dot(X, w))
    error = y_pred - y
    mse = np.sum(error ** 2) / n
    return mse

# Hàm tính gradient của hàm lỗi (MSE)
def gradient_mean_squared_error(X, y, sub_y, w):
    n = len(y)
    y_pred = labeling(np.dot(X, w))
    # error = y_pred - y
    error = new_error_func(y, sub_y, y_pred)
    gradient = 2 * np.dot(X.T, error) / n
    return gradient

# Gradient descent để tối ưu hóa hàm lỗi (MSE)
def stochastic_gradient_descent(X, y, sub_y, learning_rate=0.001, num_iterations=350, decay_rate=0.95, tol=1e-5):
    w = np.random.uniform(0, 1, X.shape[1])
    prev_loss = float('inf')
    for i in range(num_iterations):
        random_index = np.random.randint(0, len(X))
        X_sample = X[random_index:random_index+1]
        y_sample = y[random_index:random_index+1]
        sub_y_sample = sub_y[random_index:random_index+1]
        
        grad = gradient_mean_squared_error(X_sample, y_sample, sub_y_sample, w)
        
        w -= learning_rate * grad
        learning_rate *= decay_rate  # Decay learning rate
        
        # if i % (num_iterations/10) == 0:
        #     loss = mean_squared_error(X, y, w)
        #     print(f"Iteration {i}: Loss = {loss}")
    return w

# Áp dụng gradient descent để tối ưu hóa hàm lỗi (MSE)
learned_weights = stochastic_gradient_descent(X_train, main_y_train, sub_y_train)

print("Vector trọng số tối ưu:", learned_weights)


Vector trọng số tối ưu: [-0.18204834  0.74520797  0.62702427  0.26669942  0.39286332 -0.29246011
  0.77407043 -0.30844323  0.53951053  0.26871575 -0.08264221  0.65244038
  0.58675265 -0.52372434]


In [163]:
def new_accuracy(y_train, sub_y_train, y_pred):
    y_train = np.array(y_train)
    sub_y_train = np.array(sub_y_train)
    y_pred = np.array(y_pred)
    condition = np.logical_or(y_pred == sub_y_train, y_pred == y_train)
    count = np.sum(condition)
    accuracy = count / len(y_train)
    return accuracy

In [164]:
max1 = 0
res = None
for i in range(1000):
    learned_weights = stochastic_gradient_descent(X_train, main_y_train, sub_y_train)
    pred = labeling(X_test.dot(learned_weights))
    acc = new_accuracy(main_y_test, sub_y_test, pred)
    if acc > max1:
        max1 = acc
        res = learned_weights

In [167]:
print(max1, res)

0.8356282271944923 [ 0.24493971  0.58347622 -0.12170834 -0.0315793   0.41891079 -0.34808733
  0.61980345  0.37808545 -0.15132842 -0.06983328  0.10666237  0.30315643
 -0.08807829  0.07299206]


In [165]:
pred = labeling(X_test.dot(learned_weights))
print(new_accuracy(main_y_test, sub_y_test, pred))

0.39566428981231044


In [166]:
unique_elements, counts = np.unique(pred, return_counts=True)

for element, count in zip(unique_elements, counts):
    print("Phần tử", element, "xuất hiện", count, "lần")


Phần tử 0 xuất hiện 12618 lần
Phần tử 1 xuất hiện 11781 lần
Phần tử 3 xuất hiện 2 lần
Phần tử 4 xuất hiện 1 lần
