In [3]:
import rdkit.Chem as Chem
from rdkit.Chem import rdFingerprintGenerator
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import os
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

In [4]:
# Glycan table (tab-separated): one row per glycan with Name, IUPAC and SMILES columns.
glycans_df = pd.read_csv(
    filepath_or_buffer="data/model_training/Glycans-CFG611.txt",
    delimiter="\t",
)
glycans_df.head()

Unnamed: 0,Name,IUPAC,SMILES
0,CFG-7-Sp8,Gal(α-Sp8,OC[C@@H](O1)[C@H](O)[C@H](O)[C@@H](O)[C@H]1-OC...
1,CFG-8-Sp8,Glc(α-Sp8,OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H](O)[C@H]1-O...
2,CFG-9-Sp8,Man(α-Sp8,OC[C@@H](O1)[C@@H](O)[C@H](O)[C@H](O)[C@H]1-OC...
3,CFG-10-Sp8,GalNAc(α-Sp8,OC[C@@H](O1)[C@H](O)[C@H](O)[C@@H](NC(=O)C)[C@...
4,CFG-10-Sp15,GalNAc(α-Sp15,OC[C@@H](O1)[C@H](O)[C@H](O)[C@@H](NC(=O)C)[C@...


In [5]:
# Lectin feature table (tab-separated); the first column becomes the row index.
lectin_esm_df = pd.read_csv(
    filepath_or_buffer="data/model_training/Protein-Feature-Table-2025-07-23.txt",
    delimiter="\t",
    index_col=0,
)
lectin_esm_df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,1280
Lectin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.203471,0.187753,-0.026359,0.016505,-0.040728,-0.044468,-0.053233,0.068222,-0.236435,-0.040679,...,0.12068,-0.032572,-0.136013,0.013133,-0.630671,0.19518,0.17923,-0.054426,-0.042039,0.188738
2,0.120131,0.13517,-0.082275,0.115452,-0.109133,-0.208142,0.072962,-0.011479,-0.066582,-0.065606,...,0.171328,-0.018229,-0.09666,0.098843,-1.036492,-0.017578,0.013231,0.002841,-0.04752,0.227151
3,0.055755,0.187735,0.031612,0.053205,-0.107346,-0.035376,-0.165884,0.151764,-0.107026,0.022265,...,0.27115,-0.127574,-0.048594,-0.009104,-0.588789,-0.120662,-0.159802,0.09237,0.170785,0.165332
4,0.051498,0.204848,-0.073019,0.087792,-0.1161,-0.054806,-0.131069,0.102224,-0.053619,0.029918,...,0.268785,-0.112583,-0.064447,0.00459,-0.608291,-0.09302,-0.082011,0.045429,0.162284,0.119839
5,-0.047237,0.211114,0.141265,-0.02035,-0.106777,-0.230495,-0.077538,0.029638,-0.231603,-0.034559,...,0.351602,-0.005711,-0.153086,0.17541,-0.394899,0.075253,0.321678,-0.151691,-0.050353,0.19488


In [6]:
# Binding measurements (tab-separated): Glycan, Lectin, Concentration,
# Fraction_Bound and the cross-validation Fold of each row.
fbound_df = pd.read_csv(
    filepath_or_buffer="data/model_training/Fraction_Bound.txt",
    delimiter="\t",
)
fbound_df.head()

Unnamed: 0,Glycan,Lectin,Concentration,Fraction_Bound,Fold
0,CFG-7-Sp8,1,200.0,5.8e-05,7
1,CFG-8-Sp8,1,200.0,0.001217,1
2,CFG-9-Sp8,1,200.0,0.0,1
3,CFG-10-Sp15,1,200.0,0.000723,8
4,CFG-10-Sp8,1,200.0,0.0,8


In [7]:
def get_morgan_fingerprint(smiles, radius, fpSize):
    """Return the chirality-aware Morgan bit fingerprint of a SMILES string.

    Args:
        smiles: SMILES string to parse with RDKit.
        radius: Morgan radius handed to the fingerprint generator.
        fpSize: Number of bits in the fingerprint.

    Returns:
        A 1-D ``torch.long`` tensor of length ``fpSize`` holding the
        fingerprint bits. If RDKit cannot parse ``smiles``, an all-zero tensor
        of the same length and dtype is returned and a ``RuntimeWarning`` is
        emitted so the failure is visible.
    """
    import warnings  # local import: only needed on the parse-failure path

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Keep the best-effort zero vector, but do not hide the parse failure:
        # an all-zero fingerprint is otherwise indistinguishable from real data.
        warnings.warn(
            f"Could not parse SMILES {smiles!r}; using an all-zero fingerprint",
            RuntimeWarning,
            stacklevel=2,
        )
        # Same dtype as the success path so every fingerprint is uniform.
        return torch.zeros(fpSize, dtype=torch.long)

    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=fpSize, includeChirality=True)
    fp = mfpgen.GetFingerprint(mol)
    return torch.tensor(fp, dtype=torch.long)

In [8]:
# Morgan fingerprint (radius 3, 1024 bits) of every glycan, keyed by glycan name
glycan_reps = {
    name: get_morgan_fingerprint(smiles, 3, 1024)
    for name, smiles in zip(glycans_df["Name"], glycans_df["SMILES"])
}

In [9]:
# ESM representation of every lectin, keyed by the table's index value
lectin_reps = dict(
    (lectin_id, torch.tensor(embedding.values))
    for lectin_id, embedding in lectin_esm_df.iterrows()
)

In [10]:
# Feature layout of each row of X:
#   [glycan fingerprint | lectin embedding | concentration]
mf_len = len(next(iter(glycan_reps.values())))
esm_len = len(next(iter(lectin_reps.values())))

Y = torch.tensor(fbound_df["Fraction_Bound"], dtype=torch.float)
folds = fbound_df["Fold"]
X = torch.zeros((len(fbound_df), mf_len + esm_len + 1), dtype=torch.float)

for i, row in enumerate(fbound_df.itertuples(index=False)):
    # Look up the two precomputed representations and append the scalar
    # concentration as the last feature.
    X[i] = torch.cat([
        glycan_reps[row.Glycan].to(dtype=torch.float),
        lectin_reps[row.Lectin].to(dtype=torch.float),
        torch.tensor([row.Concentration], dtype=torch.float),
    ])

In [11]:
# Feedforward Network
class MLP(nn.Module):
    """Feed-forward regressor whose single output passes through a sigmoid.

    Every entry of ``hidden_dims`` adds one block of
    Linear -> ReLU -> BatchNorm1d -> Dropout(0.1). A final Linear layer maps
    to one value, which the sigmoid bounds between 0 and 1.

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Sequence of hidden-layer widths. May be empty, in which
            case the input feeds the output layer directly.
    """

    def __init__(self, input_dim, hidden_dims):
        super().__init__()
        # Width entering each Linear layer: the input first, then every hidden width.
        widths = [input_dim, *hidden_dims]
        # One nested Sequential per hidden layer keeps the parameter names
        # (network.<i>.<j>...) stable for previously saved models.
        blocks = [
            nn.Sequential(
                nn.Linear(in_dim, out_dim),
                nn.ReLU(),
                nn.BatchNorm1d(out_dim),
                nn.Dropout(0.1),
            )
            for in_dim, out_dim in zip(widths[:-1], widths[1:])
        ]
        self.network = nn.Sequential(
            *blocks,
            nn.Linear(widths[-1], 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return one prediction per row of ``x``, with the trailing unit axis removed."""
        # squeeze(-1) removes only the output axis of size 1. A bare squeeze()
        # would also collapse a batch of one sample into a 0-d scalar, which no
        # longer lines up with a target of shape (1,).
        return self.network(x).squeeze(-1)

In [12]:
def train_torch_model(model_name, params, save_folder, folds):
    """Cross-validate a PyTorch model on the notebook-level ``X`` / ``Y`` tensors.

    For every distinct value in ``folds`` a fresh model is trained on the rows
    that do not carry that value and evaluated on the rows that do. The test
    loss after the last epoch is recorded for each fold.

    Args:
        model_name: Architecture to build; only ``"mlp"`` is supported.
        params: Dict with the keys ``"batch_size"``, ``"epochs"``,
            ``"learning_rate"`` and ``"hidden_dims"``.
        save_folder: Sub-directory of ``models/`` (created if missing) that
            receives ``params.pt``, one ``fold-<id>.pt`` per fold and
            ``losses.pt``.
        folds: pandas Series with the fold id of every row of ``X``.

    Raises:
        ValueError: If ``model_name`` is not a supported architecture.
    """
    # Fail before touching the disk instead of hitting an unbound `model` later.
    if model_name != "mlp":
        raise ValueError(f"Invalid model name: {model_name}")

    out_dir = f"models/{save_folder}"
    os.makedirs(out_dir, exist_ok=True)
    torch.save(params, f"{out_dir}/params.pt")

    # Iterate over the fold ids that actually occur rather than assuming they
    # are exactly 0..n-1; a 1-based Fold column would otherwise yield an empty
    # test set for "fold 0" and never evaluate the last fold.
    fold_ids = sorted(folds.unique().tolist())
    num_folds = len(fold_ids)
    # Go through NumPy instead of handing the pandas Series to torch.tensor.
    fold_of_row = torch.as_tensor(folds.to_numpy(), device=X.device)

    batch_size = params["batch_size"]
    criterion = nn.MSELoss()
    losses = []

    # K-Fold Cross-Validation
    for k, fold in enumerate(fold_ids):
        print(f"--- Fold {k + 1}/{num_folds} ---")

        # Split data
        test_mask = fold_of_row == fold
        train_mask = ~test_mask
        train_set = TensorDataset(X[train_mask], Y[train_mask])
        test_set = TensorDataset(X[test_mask], Y[test_mask])

        # Create Dataloaders
        train_loader = DataLoader(
            train_set,
            batch_size=batch_size,
            shuffle=True,
            # BatchNorm1d cannot normalise a single sample in training mode, so
            # drop the trailing batch only when it would contain exactly one row.
            drop_last=len(train_set) % batch_size == 1,
        )
        test_loader = DataLoader(test_set, batch_size=batch_size)

        # Initialize model and optimizer (a fresh model for every fold)
        model = MLP(X.shape[1], params["hidden_dims"]).to(X.device)
        optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"])

        # Defined up front so the fold still records a value when epochs == 0.
        avg_test_loss = float("nan")

        # Training loop
        for epoch in range(params["epochs"]):
            model.train()
            total_loss, seen = 0.0, 0
            for xb, yb in train_loader:
                optimizer.zero_grad()
                preds = model(xb)
                loss = criterion(preds, yb)
                loss.backward()
                optimizer.step()
                # criterion returns the batch mean; weight it by the batch size
                # so the epoch average is per sample.
                total_loss += loss.item() * xb.size(0)
                seen += xb.size(0)
            avg_loss = total_loss / max(seen, 1)

            # Evaluation
            model.eval()
            with torch.no_grad():
                total_test_loss = 0.0
                for xb, yb in test_loader:
                    preds = model(xb)
                    loss = criterion(preds, yb)
                    total_test_loss += loss.item() * xb.size(0)
                avg_test_loss = total_test_loss / len(test_set)

            print(f"Epoch {epoch+1}/{params['epochs']} - Train Loss: {avg_loss:.4f}, Test Loss: {avg_test_loss:.4f}")

        # Only the final epoch's test loss is kept for the fold.
        losses.append(avg_test_loss)
        torch.save(model, f"{out_dir}/fold-{fold}.pt")

    torch.save(losses, f"{out_dir}/losses.pt")

In [13]:
def train_nontorch_model(model_name, params, save_folder, folds):
    """Cross-validate a scikit-learn / XGBoost regressor on the notebook-level ``X`` / ``Y``.

    For every distinct value in ``folds`` a fresh model is fitted on the rows
    that do not carry that value and scored (MSE) on the rows that do.

    Args:
        model_name: ``"random_forest"`` or ``"xgboost"``.
        params: For ``"random_forest"`` a dict with ``"n_estimators"``,
            ``"max_depth"`` and ``"random_state"``; for ``"xgboost"`` a dict
            forwarded unchanged as keyword arguments to ``xgb.XGBRegressor``.
        save_folder: Sub-directory of ``models/`` (created if missing) that
            receives ``params.pt``, one ``fold-<id>.joblib`` per fold and
            ``losses.pt``.
        folds: pandas Series with the fold id of every row of ``X``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Fail before touching the disk instead of hitting an unbound `model` later.
    if model_name not in ("random_forest", "xgboost"):
        raise ValueError(f"Invalid model name: {model_name}")

    out_dir = f"models/{save_folder}"
    os.makedirs(out_dir, exist_ok=True)
    torch.save(params, f"{out_dir}/params.pt")

    # Iterate over the fold ids that actually occur rather than assuming they
    # are exactly 0..n-1.
    fold_ids = sorted(folds.unique().tolist())
    num_folds = len(fold_ids)
    # Go through NumPy instead of handing the pandas Series to torch.tensor.
    fold_of_row = torch.as_tensor(folds.to_numpy(), device=X.device)
    losses = []

    # K-Fold Cross-Validation
    for k, fold in enumerate(fold_ids):
        print(f"\n--- Fold {k + 1}/{num_folds} ---")

        test_mask = fold_of_row == fold
        train_mask = ~test_mask

        # The estimators work on NumPy arrays, so leave torch here.
        X_train = X[train_mask].cpu().numpy()
        Y_train = Y[train_mask].cpu().numpy()
        X_test = X[test_mask].cpu().numpy()
        Y_test = Y[test_mask].cpu().numpy()

        # Train Model
        if model_name == "random_forest":
            model = RandomForestRegressor(
                n_estimators=params["n_estimators"],
                max_depth=params["max_depth"],
                random_state=params["random_state"],
            )
        else:  # "xgboost" (validated above)
            model = xgb.XGBRegressor(**params)
        model.fit(X_train, Y_train)

        # Evaluate
        preds = model.predict(X_test)
        mse = mean_squared_error(Y_test, preds)
        print(f"Test MSE: {mse:.4f}")
        # Store a plain Python float so losses.pt holds only built-in types.
        losses.append(float(mse))

        # Save model
        joblib.dump(model, f"{out_dir}/fold-{fold}.joblib")

    # Save losses
    torch.save(losses, f"{out_dir}/losses.pt")


In [14]:
def train_model(model_name, params, save_folder):
    """Dispatch to the trainer that matches ``model_name``.

    Args:
        model_name: ``"mlp"`` for the PyTorch trainer, ``"random_forest"`` or
            ``"xgboost"`` for the scikit-learn / XGBoost trainer.
        params: Hyper-parameter dict forwarded unchanged to the trainer.
        save_folder: Sub-directory of ``models/`` the trainer writes to.

    Raises:
        ValueError: If ``model_name`` is not one of the names above.
    """
    # NOTE: `folds` is the notebook-level Series and is passed to the trainers
    # explicitly. A function that assigns to a name anywhere in its body treats
    # that name as local, so a trainer that rebinds `folds` cannot also read
    # the global of the same name; this function only reads it, so the global
    # lookup works here.
    if model_name in ["mlp"]:
        train_torch_model(model_name, params, save_folder, folds)
    elif model_name in ["random_forest", "xgboost"]:
        train_nontorch_model(model_name, params, save_folder, folds)
    else:
        # ValueError is a subclass of Exception, so existing handlers still match.
        raise ValueError(f"Invalid model name: {model_name}")

In [None]:
# MLP with a single hidden layer of 256 units
save_folder = '256_nn'
params = dict(
    batch_size=64,
    epochs=5,
    learning_rate=1e-3,
    hidden_dims=[256],
)
train_model("mlp", params, save_folder)

In [None]:
# MLP with a single hidden layer of 128 units
save_folder = '128_nn'
params = dict(
    batch_size=64,
    epochs=5,
    learning_rate=1e-3,
    hidden_dims=[128],
)
train_model("mlp", params, save_folder)

In [None]:
# MLP with a single hidden layer of 512 units
save_folder = '512_nn'
params = dict(
    batch_size=64,
    epochs=5,
    learning_rate=1e-3,
    hidden_dims=[512],
)
train_model("mlp", params, save_folder)

  folds = torch.tensor(folds, device=X.device)


--- Fold 1/10 ---
Epoch 1/5 - Train Loss: 0.0042, Test Loss: 0.0030
Epoch 2/5 - Train Loss: 0.0029, Test Loss: 0.0027
Epoch 3/5 - Train Loss: 0.0027, Test Loss: 0.0029
Epoch 4/5 - Train Loss: 0.0025, Test Loss: 0.0024
Epoch 5/5 - Train Loss: 0.0024, Test Loss: 0.0022
--- Fold 2/10 ---
Epoch 1/5 - Train Loss: 0.0042, Test Loss: 0.0034
Epoch 2/5 - Train Loss: 0.0029, Test Loss: 0.0024
Epoch 3/5 - Train Loss: 0.0027, Test Loss: 0.0025
Epoch 4/5 - Train Loss: 0.0026, Test Loss: 0.0023
Epoch 5/5 - Train Loss: 0.0024, Test Loss: 0.0020
--- Fold 3/10 ---
Epoch 1/5 - Train Loss: 0.0041, Test Loss: 0.0032
Epoch 2/5 - Train Loss: 0.0028, Test Loss: 0.0029
Epoch 3/5 - Train Loss: 0.0026, Test Loss: 0.0028
Epoch 4/5 - Train Loss: 0.0025, Test Loss: 0.0025
Epoch 5/5 - Train Loss: 0.0024, Test Loss: 0.0024
--- Fold 4/10 ---
Epoch 1/5 - Train Loss: 0.0046, Test Loss: 0.0024
Epoch 2/5 - Train Loss: 0.0029, Test Loss: 0.0023
Epoch 3/5 - Train Loss: 0.0027, Test Loss: 0.0023
Epoch 4/5 - Train Loss: 0.00

In [None]:
# MLP with two hidden layers: 512 units, then 256 units
save_folder = '512_256_nn'
params = dict(
    batch_size=64,
    epochs=5,
    learning_rate=1e-3,
    hidden_dims=[512, 256],
)
train_model("mlp", params, save_folder)

In [None]:
# Random forest baseline
save_folder = 'rf_model'
params = dict(
    n_estimators=25,
    max_depth=6,
    random_state=42,
)
train_model("random_forest", params, save_folder)


--- Fold 1/10 ---


In [None]:
# XGBoost baseline; the whole dict is forwarded to the regressor's constructor
save_folder = 'xgb_model'

params = dict(
    n_estimators=50,
    max_depth=6,
    learning_rate=0.1,
    objective="reg:squarederror",
    verbosity=1,
)
train_model("xgboost", params, save_folder)

In [None]:
# Collect the per-fold test losses of every model folder under models/,
# keyed by folder name.
all_losses = {}

# os.listdir returns entries in arbitrary order; sorting keeps the summary
# order stable between runs and machines.
for save_folder in sorted(os.listdir("models")):
    folder_path = os.path.join("models", save_folder)
    if not os.path.isdir(folder_path):
        continue

    pt_path = os.path.join(folder_path, "losses.pt")
    npy_path = os.path.join(folder_path, "losses.npy")

    if os.path.isfile(pt_path):
        # losses.pt is a pickled Python list written by the training cells, not
        # a tensor checkpoint. weights_only is passed explicitly so loading does
        # not depend on the installed torch's default. This performs a full
        # unpickle, so only use it on files produced by this notebook.
        losses = torch.load(pt_path, weights_only=False)
    elif os.path.isfile(npy_path):
        losses = np.load(npy_path).tolist()
    else:
        print(f"No losses file found in {save_folder}")
        continue

    all_losses[save_folder] = losses

  losses = torch.load(pt_path)


In [19]:
# Mean test loss across folds for each saved model folder
for folder_name, fold_losses in all_losses.items():
    print(folder_name, np.mean(fold_losses))

128_nn 0.0024162003398329733
256_nn 0.0024528425573110833
512_256_nn 0.0023599095073404096
512_nn 0.0023166615587496475
rf_model 0.0026659936915438775
xgb_model 0.0020314463414251803
