In [1]:
# Import necessary libraries for data manipulation and machine learning
import pandas as pd
import numpy as np
import torch

In [2]:
# Load the BindDB dataset from a PyTorch file
bind_db = torch.load('D:\Drugllm\Drugtargetdata\BindDB.pt', weights_only=False)

  bind_db = torch.load('D:\Drugllm\Drugtargetdata\BindDB.pt', weights_only=False)


In [3]:
# Import additional libraries for machine learning and evaluation
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib
from xgboost import XGBRegressor

In [4]:
# Initialize a DataFrame to store evaluation metrics
metrics_df = pd.DataFrame(columns=['Model', 'Dataset', 'RMSE', 'MAE', 'MSE', 'R2'])

def calculate_metrics(y_true, y_pred):
    """Calculate regression metrics: RMSE, MAE, MSE, and R2."""
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, mse, r2


def train_model_predict(model_name, model, train_df, val_df, test_df, X_train, y_train, X_val, y_val, X_test, y_test, save_predictions=True, predictions_dir='D:/Drugllm/predictions',datasetname="bind"):
    """Train and evaluate a model, save metrics to the DataFrame, and optionally save predictions to CSV.

    Parameters
    ----------
    model_name : str
        Human-readable name of the model (used in metrics and filenames).
    model : object
        Fitted model object with a .predict method.
    X_train, X_val, X_test : array-like
        Feature matrices for each split.
    y_train, y_val, y_test : array-like or pandas.Series
        True target values for each split.
    save_predictions : bool, optional
        If True, save per-split prediction CSV files to `predictions_dir`.
    predictions_dir : str, optional
        Directory where prediction CSVs will be saved.

    Returns
    -------
    pandas.DataFrame
        Updated metrics DataFrame (appended to global `metrics_df`).
    """
    import os
    os.makedirs(predictions_dir, exist_ok=True)

    # Train predictions
    train_pred = model.predict(X_train)

    # Validation predictions
    val_pred = model.predict(X_val)

    # Test predictions
    test_pred = model.predict(X_test)

    # Optionally save predictions to CSV files (true vs predicted)
    if save_predictions:
        try:
            # Ensure arrays are 1-D
            y_train_arr = np.array(y_train).flatten()
            y_val_arr = np.array(y_val).flatten()
            y_test_arr = np.array(y_test).flatten()
            train_pred_arr = np.array(train_pred).flatten()
            val_pred_arr = np.array(val_pred).flatten()
            test_pred_arr = np.array(test_pred).flatten()

            train_df['Predicted'] = train_pred_arr
            train_df['True'] = y_train_arr
            train_df['Split'] = 'Train'
            val_df['Predicted'] = val_pred_arr
            val_df['True'] = y_val_arr
            val_df['Split'] = 'Validation'
            test_df['Predicted'] = test_pred_arr
            test_df['True'] = y_test_arr
            test_df['Split'] = 'Test'

            # Combine all splits
            predict_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
            predict_df_out = predict_df[['Index_ID', 'Drug_ID', 'Target_ID', 'Affinity', 'Predicted', 'True', 'Split']]
            # File paths
            safe_name = model_name.replace(' ', '_')
            output_path = os.path.join(predictions_dir, f'{safe_name}_predictions_{datasetname}.csv')
            # Save to CSV
            predict_df_out.to_csv(output_path, index=False)

        except Exception as e:
            print(f'Warning: could not save predictions for {model_name}: {e}')

    return predict_df_out


In [5]:
def prepare_for_psichic(dti_data):
    # Process the DTI dataset to create a simplified DataFrame
    bind_psichic = dti_data[['Index_ID', 'Target', 'Drug']]
    # Rename columns for consistency
    bind_psichic.rename(columns={'Index_ID': 'ID', 'Drug': 'Ligand', 'Target': 'Protein'}, inplace=True)
    # Reorder columns for better readability
    bind_psichic = bind_psichic[['ID', 'Protein', 'Ligand']]
    return bind_psichic

In [6]:
# Set a random seed for reproducibility
SEED = 42

# Create a copy of the BindDB dataset for training and testing
dti_data = bind_db.copy().reset_index(drop=True)

dti_data['Index_ID'] = 'Bind' + dti_data.index.astype(str)

# Prepare and save the dataset for Psichic
bind_psichic = prepare_for_psichic(dti_data)
bind_psichic.to_csv('./Psichic_data/bind_db_psichic_input.csv', index=False)

# Split the dataset into training, validation, and test sets
train_df, temp_df = train_test_split(dti_data, test_size=0.3, random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)

# Prepare feature matrices and target variables
X_train = np.hstack((np.vstack(train_df['Drug_Features']), np.vstack(train_df['Target_Features'])))
X_val = np.hstack((np.vstack(val_df['Drug_Features']), np.vstack(val_df['Target_Features'])))
X_test = np.hstack((np.vstack(test_df['Drug_Features']), np.vstack(test_df['Target_Features'])))
y_train, y_val, y_test = train_df['Affinity'], val_df['Affinity'], test_df['Affinity']

# Display the shapes of the feature matrices
print(X_train.shape, X_val.shape, X_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bind_psichic.rename(columns={'Index_ID': 'ID', 'Drug': 'Ligand', 'Target': 'Protein'}, inplace=True)


(29565, 1088) (6335, 1088) (6336, 1088)


In [10]:
dti_data.shape

(25772, 8)

In [13]:
dti_data.loc[1]['Drug_Features'].shape

(768,)

In [14]:
dti_data.loc[1]['Target_Features'].shape

(320,)

In [6]:
dti_data.head()

Unnamed: 0,Drug_ID,Drug,Target_ID,Target,Y,Affinity,Drug_Features,Target_Features
0,11314340,Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12,AAK1,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,43.0,7.366532,"[0.7605508, 0.46918112, 0.022348084, -0.290485...","[-0.009276379, -0.14671455, 0.065141745, 0.037..."
1,11314340,Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12,ABL1p,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,10000.0,5.0,"[0.7605508, 0.46918112, 0.022348084, -0.290485...","[-0.052989967, -0.19866328, 0.01483496, 0.0618..."
2,11314340,Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12,ABL2,MVLGTVLLPPNSYGRDQDTSLCCLCTEASESALPDLTDHFASCVED...,10000.0,5.0,"[0.7605508, 0.46918112, 0.022348084, -0.290485...","[-0.028185444, -0.17497379, 0.061383072, 0.063..."
3,11314340,Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12,ACVR1,MVDGVMILPVLIMIALPSPSMEDEKPKVNPKLYMCVCEGLSCGNED...,10000.0,5.0,"[0.7605508, 0.46918112, 0.022348084, -0.290485...","[0.060061518, -0.12135677, 0.12382316, 0.06401..."
4,11314340,Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12,ACVR2A,MGAAAKLAFAVFLISCSSGAILGRSETQECLFFNANWEKDRTNQTG...,10000.0,5.0,"[0.7605508, 0.46918112, 0.022348084, -0.290485...","[0.10380349, -0.11144593, 0.11350971, 0.110970..."


In [6]:
# Scale the feature matrices for models that require normalized input
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Train and evaluate a Random Forest model
print('Random Forest')
rf_model = RandomForestRegressor(n_estimators=100, random_state=SEED)
rf_model.fit(X_train_scaled, y_train)
predict_rf = train_model_predict("Random Forest", rf_model, train_df, val_df, test_df, X_train, y_train, X_val, y_val, X_test, y_test, save_predictions=True, predictions_dir='D:\Github\llm-drug-agent\Prediction_Results',datasetname="bind")

  predict_rf = train_model_predict("Random Forest", rf_model, train_df, val_df, test_df, X_train, y_train, X_val, y_val, X_test, y_test, save_predictions=True, predictions_dir='D:\Github\llm-drug-agent\Prediction_Results',datasetname="bind")


Random Forest


In [8]:
# Train and evaluate an SVR model
print('SVR')
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train_scaled, y_train)
predict_rf = train_model_predict("SVR", svr_model, train_df, val_df, test_df, X_train, y_train, X_val, y_val, X_test, y_test, save_predictions=True, predictions_dir='D:\Github\llm-drug-agent\Prediction_Results',datasetname="bind")

  predict_rf = train_model_predict("SVR", svr_model, train_df, val_df, test_df, X_train, y_train, X_val, y_val, X_test, y_test, save_predictions=True, predictions_dir='D:\Github\llm-drug-agent\Prediction_Results',datasetname="bind")


SVR


In [9]:
# Train and evaluate a Gradient Boosting Machine (GBM) model
print('GBM')
gbm_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=SEED)
gbm_model.fit(X_train, y_train)
predict_rf = train_model_predict("GBM", gbm_model, train_df, val_df, test_df, X_train, y_train, X_val, y_val, X_test, y_test, save_predictions=True, predictions_dir='D:\Github\llm-drug-agent\Prediction_Results',datasetname="bind")

  predict_rf = train_model_predict("GBM", gbm_model, train_df, val_df, test_df, X_train, y_train, X_val, y_val, X_test, y_test, save_predictions=True, predictions_dir='D:\Github\llm-drug-agent\Prediction_Results',datasetname="bind")


GBM


In [10]:
# Train and evaluate a Linear Regression model
print('Linear Regression')
lin_reg_model = LinearRegression()
lin_reg_model.fit(X_train_scaled, y_train)
predict_rf = train_model_predict("Linear Regression", lin_reg_model, train_df, val_df, test_df, X_train, y_train, X_val, y_val, X_test, y_test, save_predictions=True, predictions_dir='D:\Github\llm-drug-agent\Prediction_Results',datasetname="bind")

  predict_rf = train_model_predict("Linear Regression", lin_reg_model, train_df, val_df, test_df, X_train, y_train, X_val, y_val, X_test, y_test, save_predictions=True, predictions_dir='D:\Github\llm-drug-agent\Prediction_Results',datasetname="bind")


Linear Regression


In [11]:
# Train and evaluate a Multi-Layer Perceptron (MLP) model
print('MLP')
mlp_model = MLPRegressor(hidden_layer_sizes=(512, 256), activation='relu', max_iter=200, random_state=SEED)
mlp_model.fit(X_train_scaled, y_train)
predict_rf = train_model_predict("MLP", mlp_model, train_df, val_df, test_df, X_train, y_train, X_val, y_val, X_test, y_test, save_predictions=True, predictions_dir='D:\Github\llm-drug-agent\Prediction_Results',datasetname="bind")

  predict_rf = train_model_predict("MLP", mlp_model, train_df, val_df, test_df, X_train, y_train, X_val, y_val, X_test, y_test, save_predictions=True, predictions_dir='D:\Github\llm-drug-agent\Prediction_Results',datasetname="bind")


MLP


In [12]:
# Train and evaluate an XGBoost model
print('XGBoost')
xgb_model = XGBRegressor(random_state=SEED, eval_metric='rmse')
xgb_model.fit(X_train, y_train)
predict_rf = train_model_predict("XGBoost", xgb_model, train_df, val_df, test_df, X_train, y_train, X_val, y_val, X_test, y_test, save_predictions=True, predictions_dir='D:\Github\llm-drug-agent\Prediction_Results',datasetname="bind")

  predict_rf = train_model_predict("XGBoost", xgb_model, train_df, val_df, test_df, X_train, y_train, X_val, y_val, X_test, y_test, save_predictions=True, predictions_dir='D:\Github\llm-drug-agent\Prediction_Results',datasetname="bind")


XGBoost
