In [1]:
import pandas as pd 

# Load the full dataset; the relative path is resolved against the current
# working directory of the kernel.
df = pd.read_csv('data.csv')

In [2]:
# Feature matrix: every column except 'index' and 'rdkit_smi', and except any
# column whose name contains 'pIC50' (those are the regression targets).
x = df.loc[:, [col for col in df.columns
               if col not in ['index', 'rdkit_smi'] and 'pIC50' not in col]]
# Target matrix: the five pIC50 columns.
y = df.loc[:, ['herg_pIC50', 'hepg2_pIC50', 'cyp1_pIC50', 'cyp2_pIC50', 'cyp3_pIC50']]

In [3]:
# Source of the LDS helpers: https://github.com/YyzHarry/imbalanced-regression
# Setup step 1: git clone the repository's sts-b-dir folder.
# Setup step 2: sts-b-dir contains a file named util.py; rename it to utils.py,
# otherwise the `from utils import ...` statement below cannot load the function.
import sys
from collections import Counter
from scipy.ndimage import convolve1d

# NOTE(review): hard-coded absolute Windows path. The raw-string prefix combined
# with doubled backslashes produces literal double separators in the path;
# confirm this is intended and consider a configurable, relative location.
sys.path.append(r'D:\\toxic_prediction\\sts-b-dir') # needed to import the LDS helper functions

from utils import get_lds_kernel_window

def get_bin_idx(label, num_bins, label_min, label_max):
    """Return the index of the equal-width bin that ``label`` falls into.

    The interval ``[label_min, label_max]`` is split into ``num_bins`` bins
    numbered from 0. The computed index is clamped to ``[0, num_bins - 1]``,
    so ``label_max`` itself and any out-of-range label land in the nearest
    edge bin. When the interval has zero width every label maps to bin 0.
    """
    if label_min != label_max:
        # Position of the label inside the interval, as a fraction of its width.
        fraction = (label - label_min) / (label_max - label_min)
        raw_idx = int(fraction * num_bins)

        # Clamp from above first, then from below.
        top = num_bins - 1
        capped = raw_idx if raw_idx < top else top
        return capped if capped > 0 else 0

    return 0

def lds(labels, num_bins=50, kernel='gaussian', ks=5, sigma=2):
    """Compute per-sample label-distribution-smoothing (LDS) weights.

    The labels are assigned to equal-width bins with ``get_bin_idx``, the
    per-bin sample counts are smoothed by convolving them with the window
    returned by ``get_lds_kernel_window``, and each sample receives the inverse
    of the smoothed count of its bin. The weights are then divided by their
    mean so that they average to 1.

    Parameters
    ----------
    labels : 1-D sequence of finite numbers, shape [Ns]
        Regression targets, one per sample.
    num_bins : int, default 50
        Number of equal-width bins between the smallest and largest label.
    kernel, ks, sigma : default 'gaussian', 5, 2
        Forwarded unchanged to ``get_lds_kernel_window``.

    Returns
    -------
    numpy.ndarray of float32, shape [Ns]
        Sample-wise weights in the order of ``labels``; empty when ``labels``
        is empty.

    Raises
    ------
    ValueError
        If ``labels`` is not one-dimensional, contains NaN/inf, or
        ``num_bins`` is smaller than 1.
    """
    # Local import: in this notebook numpy is only imported in a later cell,
    # so the function must not rely on a global `np` being present.
    import numpy as np

    labels = np.asarray(labels, dtype=float)
    if labels.ndim != 1:
        raise ValueError(
            "lds() expects a 1-D sequence of labels (one target at a time), "
            f"got an array with shape {labels.shape}"
        )
    if num_bins < 1:
        raise ValueError(f"num_bins must be >= 1, got {num_bins}")
    if labels.size == 0:
        return np.empty(0, dtype=np.float32)
    if not np.isfinite(labels).all():
        raise ValueError("lds() requires finite labels (found NaN or inf)")

    label_min = labels.min()
    label_max = labels.max()

    # Bin index of every label (bins start at 0): [Ns]
    bin_index_per_label = np.fromiter(
        (get_bin_idx(label, num_bins, label_min, label_max) for label in labels),
        dtype=np.intp,
        count=labels.size,
    )

    # Empirical label distribution: [Nb], Nb = highest occupied bin + 1.
    # Kept as float on purpose: convolve1d creates its output with the dtype of
    # its input, so integer counts would make the smoothed values truncate to
    # integers and distort the weights of sparsely populated bins.
    emp_label_dist = np.bincount(bin_index_per_label).astype(float)

    # Smoothing window: [ks]
    lds_kernel_window = get_lds_kernel_window(kernel=kernel, ks=ks, sigma=sigma)
    # Effective (smoothed) label distribution: [Nb]
    eff_label_dist = convolve1d(emp_label_dist, weights=lds_kernel_window, mode='constant')

    # Inverse-frequency re-weighting, sample-wise: [Ns]
    eff_num_per_label = eff_label_dist[bin_index_per_label]
    lds_pIC50 = (1.0 / eff_num_per_label).astype(np.float32)

    # Not part of the upstream reference code: rescale so the weights average
    # to 1. Without it the raw weights are very small (e.g. 0.62, 0.0054) and
    # the model did not seem to pick up the weighting properly.
    lds_pIC50 = lds_pIC50 / np.mean(lds_pIC50)

    return lds_pIC50

In [4]:
from sklearn.decomposition import PCA

# A float n_components asks PCA to keep as many components as are needed to
# reach that share of explained variance (here 99.9%).
# NOTE(review): the projection is fitted on every row of `x`, i.e. before any
# train/test or cross-validation split is made.
pca = PCA(n_components=0.999)
df_pca = (
    pd.DataFrame(pca.fit_transform(x))
    .reset_index(drop=True)
    .add_prefix('pca')  # columns 0, 1, ... become 'pca0', 'pca1', ...
)

In [5]:
# Names of the five pIC50 columns of `df` that are used as regression targets.
y_col = ['herg_pIC50', 'hepg2_pIC50', 'cyp1_pIC50','cyp2_pIC50', 'cyp3_pIC50']

In [None]:
import numpy as np
# NOTE(review): load_linnerud is not used in any cell visible here.
from sklearn.datasets import load_linnerud
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
# Baseline: fit a Ridge regressor per target column on the PCA features, using
# every row of the dataset (this cell does no hold-out split).
regr = MultiOutputRegressor(Ridge(random_state=123)).fit(df_pca, df[y_col])

array([[176.16484296,  35.0548407 ,  57.09000136]])

In [20]:
# Display the target columns for a quick visual check.
df[y_col]

Unnamed: 0,herg_pIC50,hepg2_pIC50,cyp1_pIC50,cyp2_pIC50,cyp3_pIC50
0,-3.158943,-4.082785,-3.805099,-3.095509,-3.688620
1,-3.227800,-4.190332,-3.925421,-3.002700,-3.768460
2,-3.124063,-4.910000,-3.928708,-3.966669,-4.088627
3,-3.273625,-4.480000,-3.801309,-3.237118,-4.538229
4,-3.891140,-4.640000,-3.682381,-3.217334,-3.889728
...,...,...,...,...,...
44414,-5.000000,-3.409001,-4.477121,-4.477121,-4.477121
44415,-3.805735,-3.980368,-4.823700,-3.570359,-4.146392
44416,-3.921612,-3.774654,-4.030989,-3.602060,-3.110931
44417,-4.367977,-3.723841,-4.053462,-3.379220,-4.137978


In [26]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
import deepchem as dc
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Base LightGBM regressor that kfold() wraps in MultiOutputRegressor.
model = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    min_child_weight=5,
    random_state=42,
    n_jobs=-1,
)

def kfold(model, X_resampled, y_resampled):
    """Evaluate ``model`` with shuffled 10-fold cross-validation and print the scores.

    For every fold a new ``MultiOutputRegressor(model)`` is fitted on the
    training rows and scored on the held-out rows with MSE, RMSE (square root
    of that fold's MSE) and R2. The per-fold values and their means are
    printed; nothing is returned.

    Args:
        model: Estimator to wrap in ``MultiOutputRegressor``.
        X_resampled: Feature table; must provide ``reset_index`` and ``iloc``.
        y_resampled: Target table whose rows correspond positionally to
            ``X_resampled``.
    """
    splitter = KFold(n_splits=10, shuffle=True, random_state=42)

    # Replace both indexes with a plain 0..n-1 range so positional fold
    # selection cannot run into ordering/alignment mismatches.
    features = X_resampled.reset_index(drop=True)
    targets = y_resampled.reset_index(drop=True)
    # NOTE: LDS sample weights (see lds()) are not applied in this evaluation.

    mse_list = []
    rmse_list = []
    r2_list = []

    for train_rows, test_rows in splitter.split(features):
        fold_model = MultiOutputRegressor(model)
        fold_model.fit(features.iloc[train_rows], targets.iloc[train_rows])

        actual = targets.iloc[test_rows]
        predicted = fold_model.predict(features.iloc[test_rows])

        fold_mse = mean_squared_error(actual, predicted)
        mse_list.append(fold_mse)
        rmse_list.append(np.sqrt(fold_mse))
        r2_list.append(r2_score(actual, predicted))

    # (per-fold label, mean label, values) in the order they are reported.
    report = (
        ("각 Fold의 MSE:", "평균 MSE:", mse_list),
        ("각 Fold의 RMSE:", "평균 RMSE:", rmse_list),
        ("각 Fold의 R2:", "평균 R2:", r2_list),
    )
    for per_fold_label, mean_label, values in report:
        print(per_fold_label, values)
        print(mean_label, np.mean(values))

# Run the 10-fold evaluation on the PCA features against all five targets.
kfold(model, df_pca, df[y_col])
# NOTE(review): kfold() fits MultiOutputRegressor wrappers rather than `model`
# itself, so `model` is presumably still unfitted at this point — fit it (and
# define ligand_df_pca) before enabling the prediction below.
# pred = model.predict(ligand_df_pca)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009144 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8925
[LightGBM] [Info] Number of data points in the train set: 39977, number of used features: 35
[LightGBM] [Info] Start training from score -3.764351
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8925
[LightGBM] [Info] Number of data points in the train set: 39977, number of used features: 35
[LightGBM] [Info] Start training from score -3.861830
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009843 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8925
[LightGBM] [Info] Number of data points in the train set: 39977, number of used features: 35
[LightGBM] [Info] Start t