## Alternative Models for O'Neil Dataset:

In [None]:
import pandas as pd 

# Load the combination-response spreadsheet.
response_table_path = "./O'neil files/data_combination_response.xls"
unprocessed = pd.read_excel(response_table_path)

# Flag (1/0) the rows whose X/X0 value lies in the closed interval [0.45, 0.55].
viability_ratio = unprocessed['X/X0']
unprocessed['ic50'] = viability_ratio.between(0.45, 0.55).astype('int64')

### ElasticNet

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import gc
from sklearn.linear_model import ElasticNet

# Feature files shared by the training and validation splits of every fold;
# validation files carry an extra "validation_data_" prefix.
ONE_HOT_FILES = [
    "drug1__one-hot_encoding.csv",
    "drug2__one-hot_encoding.csv",
    "cell_lines__one-hot_encoding.csv",
]
AUXILIARY_FILES = [
    "drug1__estate_fingerprints.csv",
    "drug2__estate_fingerprints.csv",
    "cell_lines__gene_expression.csv",
]
RESPONSE_COL = "X/X0"
TARGET_COLS = ["drugA Conc (µM)", "drugB Conc (µM)"]


def _load_split(data_dir, file_prefix, label_file):
    """Assemble the features and log2-concentration targets of one data split.

    Args:
        data_dir: Fold directory that holds the feature CSVs and the label CSV.
        file_prefix: Prefix put in front of every feature file name: "" for
            the training files, "validation_data_" for the validation files.
        label_file: CSV in ``data_dir`` providing ``RESPONSE_COL`` and
            ``TARGET_COLS``.

    Returns:
        ``(X, y)`` sharing one index: ``X`` holds the one-hot columns, the
        response column and the auxiliary columns; ``y`` holds log2 of the
        two concentration columns.
    """
    labels = pd.read_csv(data_dir + label_file)

    # Concatenate each group in one call instead of growing an empty frame.
    one_hot = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in ONE_HOT_FILES],
        axis=1,
    )
    auxiliary = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in AUXILIARY_FILES],
        axis=1,
    )

    # The measured response is an input feature here; the concentrations are
    # the prediction targets.
    one_hot[RESPONSE_COL] = labels[RESPONSE_COL]

    X = pd.concat([one_hot, auxiliary], axis=1).dropna()
    # Keep only the first occurrence of each column name.
    # NOTE(review): if both drugs' fingerprint files use identical column
    # names, this drops the second drug's fingerprints -- verify.
    X = X.loc[:, ~X.columns.duplicated()].copy()

    # Inner join on the row index so the targets follow the rows dropna() kept.
    merged = X.merge(labels[TARGET_COLS], left_index=True, right_index=True)
    # Transform a fresh frame rather than assigning into a slice of ``merged``.
    # NOTE(review): assumes strictly positive concentrations; log2(0) is -inf.
    y = np.log2(merged[TARGET_COLS])
    X = merged.drop(columns=TARGET_COLS)
    return X, y


for fold in range(5):
    data_dir = f"./O'neil files/ComboFM Real Data Fold{fold + 1}/"

    # Training split: features (including X/X0) -> log2 drug concentrations
    X, y = _load_split(data_dir, "", f"train_fold_{fold}.csv")

    # Train model
    print(f"Start Training for fold {fold}: ")
    model = ElasticNet()
    model.fit(X, y)
    # Release the training frames before the validation split is loaded.
    del X, y
    gc.collect()

    # Validation split
    X_test, y_test = _load_split(data_dir, "validation_data_", f"test_fold_{fold}.csv")

    # Predict and evaluate
    pred = model.predict(X_test)
    # scikit-learn's MSE averages the squared error over both targets.
    per_target_mse = mean_squared_error(y_test, pred)
    print(f"Mean squared error for fold {fold}: {per_target_mse}")

    # Treat each (drugA, drugB) pair as a 2-D point and measure the Euclidean
    # distance between prediction and truth.
    distance = np.linalg.norm(pred - y_test.to_numpy(), axis=1)
    mse = np.mean(distance ** 2)
    mae = np.mean(distance)
    print(f"Mean squared Euclidean error for fold {fold}: {mse}")
    print(f"Mean Euclidean (absolute) error for fold {fold}: {mae}")


### Random Forest

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import gc

# Feature files shared by the training and validation splits of every fold;
# validation files carry an extra "validation_data_" prefix.
ONE_HOT_FILES = [
    "drug1__one-hot_encoding.csv",
    "drug2__one-hot_encoding.csv",
    "cell_lines__one-hot_encoding.csv",
]
AUXILIARY_FILES = [
    "drug1__estate_fingerprints.csv",
    "drug2__estate_fingerprints.csv",
    "cell_lines__gene_expression.csv",
]
RESPONSE_COL = "X/X0"
TARGET_COLS = ["drugA Conc (µM)", "drugB Conc (µM)"]


def _load_split(data_dir, file_prefix, label_file):
    """Assemble the features and log2-concentration targets of one data split.

    Args:
        data_dir: Fold directory that holds the feature CSVs and the label CSV.
        file_prefix: Prefix put in front of every feature file name: "" for
            the training files, "validation_data_" for the validation files.
        label_file: CSV in ``data_dir`` providing ``RESPONSE_COL`` and
            ``TARGET_COLS``.

    Returns:
        ``(X, y)`` sharing one index: ``X`` holds the one-hot columns, the
        response column and the auxiliary columns; ``y`` holds log2 of the
        two concentration columns.
    """
    labels = pd.read_csv(data_dir + label_file)

    # Concatenate each group in one call instead of growing an empty frame.
    one_hot = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in ONE_HOT_FILES],
        axis=1,
    )
    auxiliary = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in AUXILIARY_FILES],
        axis=1,
    )

    # The measured response is an input feature here; the concentrations are
    # the prediction targets.
    one_hot[RESPONSE_COL] = labels[RESPONSE_COL]

    X = pd.concat([one_hot, auxiliary], axis=1).dropna()
    # Keep only the first occurrence of each column name.
    # NOTE(review): if both drugs' fingerprint files use identical column
    # names, this drops the second drug's fingerprints -- verify.
    X = X.loc[:, ~X.columns.duplicated()].copy()

    # Inner join on the row index so the targets follow the rows dropna() kept.
    merged = X.merge(labels[TARGET_COLS], left_index=True, right_index=True)
    # Transform a fresh frame rather than assigning into a slice of ``merged``.
    # NOTE(review): assumes strictly positive concentrations; log2(0) is -inf.
    y = np.log2(merged[TARGET_COLS])
    X = merged.drop(columns=TARGET_COLS)
    return X, y


for fold in range(5):
    data_dir = f"./O'neil files/ComboFM Real Data Fold{fold + 1}/"

    # Training split: features (including X/X0) -> log2 drug concentrations
    X, y = _load_split(data_dir, "", f"train_fold_{fold}.csv")

    # Train model
    print(f"Start Training for fold {fold}: ")
    model = RandomForestRegressor()
    model.fit(X, y)
    # Release the training frames before the validation split is loaded.
    del X, y
    gc.collect()

    # Validation split
    X_test, y_test = _load_split(data_dir, "validation_data_", f"test_fold_{fold}.csv")

    # Predict and evaluate
    pred = model.predict(X_test)
    # scikit-learn's MSE averages the squared error over both targets.
    per_target_mse = mean_squared_error(y_test, pred)
    print(f"Mean squared error for fold {fold}: {per_target_mse}")

    # Treat each (drugA, drugB) pair as a 2-D point and measure the Euclidean
    # distance between prediction and truth.
    distance = np.linalg.norm(pred - y_test.to_numpy(), axis=1)
    mse = np.mean(distance ** 2)
    mae = np.mean(distance)
    print(f"Mean squared Euclidean error for fold {fold}: {mse}")
    print(f"Mean Euclidean (absolute) error for fold {fold}: {mae}")


### MLP

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error
import gc

# Define a simple MLP with 4 layers
class MLP(nn.Module):
    """Feed-forward regressor: three ReLU-activated hidden layers and a linear output layer."""

    def __init__(self, input_dim, hidden_dim1=128, hidden_dim2=64, hidden_dim3=32, output_dim=2):
        super().__init__()
        # Layers are created in input-to-output order.
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the network output for input ``x``; the last layer has no activation."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        return self.fc4(hidden)

# Feature files shared by the training and validation splits of every fold;
# validation files carry an extra "validation_data_" prefix.
ONE_HOT_FILES = [
    "drug1__one-hot_encoding.csv",
    "drug2__one-hot_encoding.csv",
    "cell_lines__one-hot_encoding.csv",
]
AUXILIARY_FILES = [
    "drug1__estate_fingerprints.csv",
    "drug2__estate_fingerprints.csv",
    "cell_lines__gene_expression.csv",
]
RESPONSE_COL = "X/X0"
TARGET_COLS = ["drugA Conc (µM)", "drugB Conc (µM)"]


def _load_split(data_dir, file_prefix, label_file):
    """Assemble the features and log2-concentration targets of one data split.

    Args:
        data_dir: Fold directory that holds the feature CSVs and the label CSV.
        file_prefix: Prefix put in front of every feature file name: "" for
            the training files, "validation_data_" for the validation files.
        label_file: CSV in ``data_dir`` providing ``RESPONSE_COL`` and
            ``TARGET_COLS``.

    Returns:
        ``(X, y)`` sharing one index: ``X`` holds the one-hot columns, the
        response column and the auxiliary columns; ``y`` holds log2 of the
        two concentration columns.
    """
    labels = pd.read_csv(data_dir + label_file)

    # Concatenate each group in one call instead of growing an empty frame.
    one_hot = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in ONE_HOT_FILES],
        axis=1,
    )
    auxiliary = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in AUXILIARY_FILES],
        axis=1,
    )

    # The measured response is an input feature here; the concentrations are
    # the prediction targets.
    one_hot[RESPONSE_COL] = labels[RESPONSE_COL]

    X = pd.concat([one_hot, auxiliary], axis=1).dropna()
    # Keep only the first occurrence of each column name.
    # NOTE(review): if both drugs' fingerprint files use identical column
    # names, this drops the second drug's fingerprints -- verify.
    X = X.loc[:, ~X.columns.duplicated()].copy()

    # Inner join on the row index so the targets follow the rows dropna() kept.
    merged = X.merge(labels[TARGET_COLS], left_index=True, right_index=True)
    # Transform a fresh frame rather than assigning into a slice of ``merged``.
    # NOTE(review): assumes strictly positive concentrations; log2(0) is -inf.
    y = np.log2(merged[TARGET_COLS])
    X = merged.drop(columns=TARGET_COLS)
    return X, y


for fold in range(5):
    data_dir = f"./O'neil files/ComboFM Real Data Fold{fold + 1}/"

    # Training split: features (including X/X0) -> log2 drug concentrations
    X, y = _load_split(data_dir, "", f"train_fold_{fold}.csv")

    # Convert training data to torch tensors
    X_tensor = torch.tensor(X.values, dtype=torch.float32)
    y_tensor = torch.tensor(y.values, dtype=torch.float32)

    input_dim = X_tensor.shape[1]
    model = MLP(input_dim=input_dim, hidden_dim1=128, hidden_dim2=64, hidden_dim3=32, output_dim=2)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Full-batch training: every epoch is one optimizer step on the whole split.
    epochs = 100
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_tensor)
        loss = criterion(outputs, y_tensor)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Fold {fold}, Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}")

    # Free training memory
    del X, y, X_tensor, y_tensor
    gc.collect()

    # Validation split
    X_test, y_test = _load_split(data_dir, "validation_data_", f"test_fold_{fold}.csv")

    # Convert validation data to torch tensors
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

    model.eval()
    with torch.no_grad():
        predictions = model(X_test_tensor)

    # Convert predictions to NumPy arrays for evaluation
    pred_np = predictions.numpy()
    y_test_np = y_test_tensor.numpy()

    # Element-wise metrics, averaged over both targets.
    mse = mean_squared_error(y_test_np, pred_np)
    mae = np.mean(np.abs(y_test_np - pred_np))
    print(f"Fold {fold} - Validation MSE: {mse:.4f}")
    print(f"Fold {fold} - Validation MAE: {mae:.4f}")

    # Treat each (drugA, drugB) pair as a 2-D point and measure the Euclidean
    # distance between prediction and truth.
    distance = np.linalg.norm(pred_np.astype(np.float64) - y_test.to_numpy(), axis=1)
    mse = np.mean(distance ** 2)
    mae = np.mean(distance)
    print(f"Mean squared Euclidean error for fold {fold}: {mse}")
    print(f"Mean Euclidean (absolute) error for fold {fold}: {mae}")

    # Free validation memory and model for the next fold
    del X_test, y_test, X_test_tensor, y_test_tensor, model
    gc.collect()


## Alternative Models for NCI Dataset:

### ElasticNet

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import gc
from sklearn.linear_model import ElasticNet

# Feature files shared by the training and validation splits of every fold;
# validation files carry an extra "validation_data_" prefix.
ONE_HOT_FILES = [
    "drug1__one-hot_encoding.csv",
    "drug2__one-hot_encoding.csv",
    "cell_lines__one-hot_encoding.csv",
]
AUXILIARY_FILES = [
    "drug1__estate_fingerprints.csv",
    "drug2__estate_fingerprints.csv",
    "cell_lines__gene_expression.csv",
]
RESPONSE_COL = "PercentageGrowth"
TARGET_COLS = ["Conc1", "Conc2"]


def _load_split(data_dir, file_prefix, label_file):
    """Assemble the features and log2-concentration targets of one data split.

    Args:
        data_dir: Fold directory that holds the feature CSVs and the label CSV.
        file_prefix: Prefix put in front of every feature file name: "" for
            the training files, "validation_data_" for the validation files.
        label_file: CSV in ``data_dir`` providing ``RESPONSE_COL`` and
            ``TARGET_COLS``.

    Returns:
        ``(X, y)`` sharing one index: ``X`` holds the one-hot columns, the
        response column and the auxiliary columns; ``y`` holds log2 of the
        two concentration columns.
    """
    labels = pd.read_csv(data_dir + label_file)

    # Concatenate each group in one call instead of growing an empty frame.
    one_hot = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in ONE_HOT_FILES],
        axis=1,
    )
    auxiliary = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in AUXILIARY_FILES],
        axis=1,
    )

    # The measured response is an input feature here; the concentrations are
    # the prediction targets.
    one_hot[RESPONSE_COL] = labels[RESPONSE_COL]

    X = pd.concat([one_hot, auxiliary], axis=1).dropna()
    # Keep only the first occurrence of each column name.
    # NOTE(review): if both drugs' fingerprint files use identical column
    # names, this drops the second drug's fingerprints -- verify.
    X = X.loc[:, ~X.columns.duplicated()].copy()

    # Inner join on the row index so the targets follow the rows dropna() kept.
    merged = X.merge(labels[TARGET_COLS], left_index=True, right_index=True)
    # Transform a fresh frame rather than assigning into a slice of ``merged``.
    # NOTE(review): assumes strictly positive concentrations; log2(0) is -inf.
    y = np.log2(merged[TARGET_COLS])
    X = merged.drop(columns=TARGET_COLS)
    return X, y


for fold in range(5):
    data_dir = f"./NCI files/NCI Real Data Fold{fold + 1}/"

    # Training split: features (including PercentageGrowth) -> log2 concentrations
    X, y = _load_split(data_dir, "", f"NCI_train_fold_{fold}.csv")

    # Train model
    print(f"Start Training for fold {fold}: ")
    model = ElasticNet()
    model.fit(X, y)
    # Release the training frames before the validation split is loaded.
    del X, y
    gc.collect()

    # Validation split
    X_test, y_test = _load_split(data_dir, "validation_data_", f"NCI_test_fold_{fold}.csv")

    # Predict and evaluate
    pred = model.predict(X_test)
    # scikit-learn's MSE averages the squared error over both targets.
    per_target_mse = mean_squared_error(y_test, pred)
    print(f"Mean squared error for fold {fold}: {per_target_mse}")

    # Treat each (Conc1, Conc2) pair as a 2-D point and measure the Euclidean
    # distance between prediction and truth.
    distance = np.linalg.norm(pred - y_test.to_numpy(), axis=1)
    mse = np.mean(distance ** 2)
    mae = np.mean(distance)
    print(f"Mean squared Euclidean error for fold {fold}: {mse}")
    print(f"Mean Euclidean (absolute) error for fold {fold}: {mae}")


### Random Forest

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import gc

# Feature files shared by the training and validation splits of every fold;
# validation files carry an extra "validation_data_" prefix.
ONE_HOT_FILES = [
    "drug1__one-hot_encoding.csv",
    "drug2__one-hot_encoding.csv",
    "cell_lines__one-hot_encoding.csv",
]
AUXILIARY_FILES = [
    "drug1__estate_fingerprints.csv",
    "drug2__estate_fingerprints.csv",
    "cell_lines__gene_expression.csv",
]
RESPONSE_COL = "PercentageGrowth"
TARGET_COLS = ["Conc1", "Conc2"]


def _load_split(data_dir, file_prefix, label_file):
    """Assemble the features and log2-concentration targets of one data split.

    Args:
        data_dir: Fold directory that holds the feature CSVs and the label CSV.
        file_prefix: Prefix put in front of every feature file name: "" for
            the training files, "validation_data_" for the validation files.
        label_file: CSV in ``data_dir`` providing ``RESPONSE_COL`` and
            ``TARGET_COLS``.

    Returns:
        ``(X, y)`` sharing one index: ``X`` holds the one-hot columns, the
        response column and the auxiliary columns; ``y`` holds log2 of the
        two concentration columns.
    """
    labels = pd.read_csv(data_dir + label_file)

    # Concatenate each group in one call instead of growing an empty frame.
    one_hot = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in ONE_HOT_FILES],
        axis=1,
    )
    auxiliary = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in AUXILIARY_FILES],
        axis=1,
    )

    # The measured response is an input feature here; the concentrations are
    # the prediction targets.
    one_hot[RESPONSE_COL] = labels[RESPONSE_COL]

    X = pd.concat([one_hot, auxiliary], axis=1).dropna()
    # Keep only the first occurrence of each column name.
    # NOTE(review): if both drugs' fingerprint files use identical column
    # names, this drops the second drug's fingerprints -- verify.
    X = X.loc[:, ~X.columns.duplicated()].copy()

    # Inner join on the row index so the targets follow the rows dropna() kept.
    merged = X.merge(labels[TARGET_COLS], left_index=True, right_index=True)
    # Transform a fresh frame rather than assigning into a slice of ``merged``.
    # NOTE(review): assumes strictly positive concentrations; log2(0) is -inf.
    y = np.log2(merged[TARGET_COLS])
    X = merged.drop(columns=TARGET_COLS)
    return X, y


for fold in range(5):
    data_dir = f"./NCI files/NCI Real Data Fold{fold + 1}/"

    # Training split: features (including PercentageGrowth) -> log2 concentrations
    X, y = _load_split(data_dir, "", f"NCI_train_fold_{fold}.csv")

    # Train model
    print(f"Start Training for fold {fold}: ")
    model = RandomForestRegressor(
        n_estimators=50,       # Reduce the number of trees (default is 100)
        max_depth=10,          # Limit the depth of trees (default is None)
        min_samples_split=10,  # Increase minimum samples per split (default is 2)
        min_samples_leaf=4,    # Increase minimum samples per leaf (default is 1)
        max_features='sqrt',   # Use a subset of features per split (regressor default considers all features)
        n_jobs=-1,             # Use all CPU cores
        random_state=42,       # Ensure reproducibility
        verbose=0,             # Reduce logging overhead
    )
    model.fit(X, y)
    # Release the training frames before the validation split is loaded.
    del X, y
    gc.collect()

    # Validation split
    X_test, y_test = _load_split(data_dir, "validation_data_", f"NCI_test_fold_{fold}.csv")

    # Predict and evaluate
    pred = model.predict(X_test)
    # scikit-learn's MSE averages the squared error over both targets.
    per_target_mse = mean_squared_error(y_test, pred)
    print(f"Mean squared error for fold {fold}: {per_target_mse}")

    # Treat each (Conc1, Conc2) pair as a 2-D point and measure the Euclidean
    # distance between prediction and truth.
    distance = np.linalg.norm(pred - y_test.to_numpy(), axis=1)
    mse = np.mean(distance ** 2)
    mae = np.mean(distance)
    print(f"Mean squared Euclidean error for fold {fold}: {mse}")
    print(f"Mean Euclidean (absolute) error for fold {fold}: {mae}")


### MLP

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error
import gc

# Define a simple MLP with 4 layers
class MLP(nn.Module):
    """Four linear layers; ReLU follows each of the first three, the last is left linear."""

    def __init__(self, input_dim, hidden_dim1=128, hidden_dim2=64, hidden_dim3=32, output_dim=2):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim1)
        self.fc2 = nn.Linear(in_features=hidden_dim1, out_features=hidden_dim2)
        self.fc3 = nn.Linear(in_features=hidden_dim2, out_features=hidden_dim3)
        self.fc4 = nn.Linear(in_features=hidden_dim3, out_features=output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through the hidden layers and return the un-activated output."""
        first = self.relu(self.fc1(x))
        second = self.relu(self.fc2(first))
        third = self.relu(self.fc3(second))
        return self.fc4(third)

# Feature files shared by the training and validation splits of every fold;
# validation files carry an extra "validation_data_" prefix.
ONE_HOT_FILES = [
    "drug1__one-hot_encoding.csv",
    "drug2__one-hot_encoding.csv",
    "cell_lines__one-hot_encoding.csv",
]
AUXILIARY_FILES = [
    "drug1__estate_fingerprints.csv",
    "drug2__estate_fingerprints.csv",
    "cell_lines__gene_expression.csv",
]
RESPONSE_COL = "PercentageGrowth"
TARGET_COLS = ["Conc1", "Conc2"]


def _load_split(data_dir, file_prefix, label_file):
    """Assemble the features and log2-concentration targets of one data split.

    Args:
        data_dir: Fold directory that holds the feature CSVs and the label CSV.
        file_prefix: Prefix put in front of every feature file name: "" for
            the training files, "validation_data_" for the validation files.
        label_file: CSV in ``data_dir`` providing ``RESPONSE_COL`` and
            ``TARGET_COLS``.

    Returns:
        ``(X, y)`` sharing one index: ``X`` holds the one-hot columns, the
        response column and the auxiliary columns; ``y`` holds log2 of the
        two concentration columns.
    """
    labels = pd.read_csv(data_dir + label_file)

    # Concatenate each group in one call instead of growing an empty frame.
    one_hot = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in ONE_HOT_FILES],
        axis=1,
    )
    auxiliary = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in AUXILIARY_FILES],
        axis=1,
    )

    # The measured response is an input feature here; the concentrations are
    # the prediction targets.
    one_hot[RESPONSE_COL] = labels[RESPONSE_COL]

    X = pd.concat([one_hot, auxiliary], axis=1).dropna()
    # Keep only the first occurrence of each column name.
    # NOTE(review): if both drugs' fingerprint files use identical column
    # names, this drops the second drug's fingerprints -- verify.
    X = X.loc[:, ~X.columns.duplicated()].copy()

    # Inner join on the row index so the targets follow the rows dropna() kept.
    merged = X.merge(labels[TARGET_COLS], left_index=True, right_index=True)
    # Transform a fresh frame rather than assigning into a slice of ``merged``.
    # NOTE(review): assumes strictly positive concentrations; log2(0) is -inf.
    y = np.log2(merged[TARGET_COLS])
    X = merged.drop(columns=TARGET_COLS)
    return X, y


for fold in range(5):
    data_dir = f"./NCI files/NCI Real Data Fold{fold + 1}/"

    # Training split: features (including PercentageGrowth) -> log2 concentrations
    X, y = _load_split(data_dir, "", f"NCI_train_fold_{fold}.csv")

    # Convert training data to torch tensors
    X_tensor = torch.tensor(X.values, dtype=torch.float32)
    y_tensor = torch.tensor(y.values, dtype=torch.float32)

    input_dim = X_tensor.shape[1]
    model = MLP(input_dim=input_dim, hidden_dim1=128, hidden_dim2=64, hidden_dim3=32, output_dim=2)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Full-batch training: every epoch is one optimizer step on the whole split.
    epochs = 100
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_tensor)
        loss = criterion(outputs, y_tensor)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Fold {fold}, Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}")

    # Free training memory
    del X, y, X_tensor, y_tensor
    gc.collect()

    # Validation split
    X_test, y_test = _load_split(data_dir, "validation_data_", f"NCI_test_fold_{fold}.csv")

    # Convert validation data to torch tensors
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

    model.eval()
    with torch.no_grad():
        predictions = model(X_test_tensor)

    # Convert predictions to NumPy arrays for evaluation
    pred_np = predictions.numpy()
    y_test_np = y_test_tensor.numpy()

    # Element-wise metrics, averaged over both targets.
    mse = mean_squared_error(y_test_np, pred_np)
    mae = np.mean(np.abs(y_test_np - pred_np))
    print(f"Fold {fold} - Validation MSE: {mse:.4f}")
    print(f"Fold {fold} - Validation MAE: {mae:.4f}")

    # Treat each (Conc1, Conc2) pair as a 2-D point and measure the Euclidean
    # distance between prediction and truth.
    distance = np.linalg.norm(pred_np.astype(np.float64) - y_test.to_numpy(), axis=1)
    mse = np.mean(distance ** 2)
    mae = np.mean(distance)
    print(f"Mean squared Euclidean error for fold {fold}: {mse}")
    print(f"Mean Euclidean (absolute) error for fold {fold}: {mae}")

    # Free validation memory and model for the next fold
    del X_test, y_test, X_test_tensor, y_test_tensor, model
    gc.collect()


## Alternative Models for AZ-Dream Dataset:

### ElasticNet

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import gc
from sklearn.linear_model import ElasticNet
import re
# Matches the characters "[", "]" and "<", which are replaced by "_" in the
# feature names below (the IGNORECASE flag is inert: the pattern has no letters).
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

# Feature files shared by the training and validation splits of every fold;
# validation files carry an extra "validation_data_" prefix.
ONE_HOT_FILES = [
    "drug_row__one-hot_encoding.csv",
    "drug_col__one-hot_encoding.csv",
    "cell_lines__one-hot_encoding.csv",
]
AUXILIARY_FILES = [
    "drug_row__estate_fingerprints.csv",
    "drug_col__estate_fingerprints.csv",
    "cell_lines__gene_expression.csv",
]
RESPONSE_COL = "inhibition"
TARGET_COLS = ["conc_r", "conc_c"]


def _load_split(data_dir, file_prefix, label_file):
    """Assemble the features and log2-concentration targets of one data split.

    Args:
        data_dir: Fold directory that holds the feature CSVs and the label CSV.
        file_prefix: Prefix put in front of every feature file name: "" for
            the training files, "validation_data_" for the validation files.
        label_file: CSV in ``data_dir`` providing ``RESPONSE_COL`` and
            ``TARGET_COLS``.

    Returns:
        ``(X, y)`` sharing one index: ``X`` holds the one-hot columns, the
        response column and the auxiliary columns; ``y`` holds log2 of the
        two concentration columns.
    """
    labels = pd.read_csv(data_dir + label_file)

    one_hot = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in ONE_HOT_FILES],
        axis=1,
    )
    # All three auxiliary files are kept. The previous loop re-concatenated
    # the one-hot frame on every pass, so only the last auxiliary file
    # survived and both fingerprint files were silently discarded.
    auxiliary = pd.concat(
        [pd.read_csv(data_dir + file_prefix + name) for name in AUXILIARY_FILES],
        axis=1,
    )

    # The measured response is an input feature here; the concentrations are
    # the prediction targets.
    one_hot[RESPONSE_COL] = labels[RESPONSE_COL]

    X = pd.concat([one_hot, auxiliary], axis=1).dropna()
    # Keep only the first occurrence of each column name.
    # NOTE(review): if both drugs' fingerprint files use identical column
    # names, this drops the second drug's fingerprints -- verify.
    X = X.loc[:, ~X.columns.duplicated()].copy()

    # Inner join on the row index so the targets follow the rows dropna() kept.
    merged = X.merge(labels[TARGET_COLS], left_index=True, right_index=True)
    # Transform a fresh frame rather than assigning into a slice of ``merged``.
    # NOTE(review): assumes strictly positive concentrations; log2(0) is -inf.
    y = np.log2(merged[TARGET_COLS])
    X = merged.drop(columns=TARGET_COLS)
    return X, y


for fold in range(5):
    data_dir = f"./Astra files/Astra Real Data Fold{fold + 1}/"

    # Training split: features (including inhibition) -> log2 concentrations
    X, y = _load_split(data_dir, "", f"astra_train_fold_{fold}.csv")

    # Train model
    print(f"Start Training for fold {fold}: ")
    model = ElasticNet()
    # Remember the raw names so the validation frame can be aligned on them
    # before it receives the same sanitized names.
    raw_features = X.columns
    X.columns = [regex.sub("_", col) if isinstance(col, str) else col for col in raw_features]

    model.fit(X, y)
    features = X.columns
    # Release the training frames before the validation split is loaded.
    del X, y
    gc.collect()

    # Validation split
    X_test, y_test = _load_split(data_dir, "validation_data_", f"astra_test_fold_{fold}.csv")

    # Match the training layout: columns missing from validation become 0 and
    # extra columns are dropped. Aligning on the raw names keeps columns whose
    # names contain "[", "]" or "<" from being zero-filled by mistake.
    X_test = X_test.reindex(columns=raw_features, fill_value=0)
    X_test.columns = features

    # Predict and evaluate
    pred = model.predict(X_test)
    # scikit-learn's MSE averages the squared error over both targets.
    per_target_mse = mean_squared_error(y_test, pred)
    print(f"Mean squared error for fold {fold}: {per_target_mse}")

    # Treat each (conc_r, conc_c) pair as a 2-D point and measure the Euclidean
    # distance between prediction and truth.
    distance = np.linalg.norm(pred - y_test.to_numpy(), axis=1)
    mse = np.mean(distance ** 2)
    mae = np.mean(distance)
    print(f"Mean squared Euclidean error for fold {fold}: {mse}")
    print(f"Mean Euclidean (absolute) error for fold {fold}: {mae}")


### Random Forest

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import gc
import re
# Characters '[', ']' and '<' in feature names are replaced with '_' so that
# training and validation frames share identical, sanitised column labels.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

# Per-fold feature files. The validation files carry the same names prefixed
# with "validation_data_".
one_hot_files = (
    "drug_row__one-hot_encoding.csv",
    "drug_col__one-hot_encoding.csv",
    "cell_lines__one-hot_encoding.csv",
)
auxiliary_files = (
    "drug_row__estate_fingerprints.csv",
    "drug_col__estate_fingerprints.csv",
    "cell_lines__gene_expression.csv",
)
target_cols = ["conc_r", "conc_c"]

# 5-fold evaluation of a Random Forest that predicts the log2 concentrations
# of both drugs from one-hot, fingerprint, gene-expression and inhibition
# features.
for fold in range(5):
    data_dir = f"./Astra files/Astra Real Data Fold{fold + 1}/"

    # ---------------------------
    # Training data
    # ---------------------------
    features_tensor = pd.concat(
        [pd.read_csv(data_dir + name) for name in one_hot_files], axis=1
    )
    # Every auxiliary file is kept. Previously each iteration concatenated
    # onto `features_tensor`, so only the last file (gene expression) survived.
    features_auxiliary = pd.concat(
        [pd.read_csv(data_dir + name) for name in auxiliary_files], axis=1
    )

    # Add the measured inhibition as a feature
    y = pd.read_csv(data_dir + f"astra_train_fold_{fold}.csv")
    features_tensor["inhibition"] = y["inhibition"]

    # Combine all features, drop incomplete rows and repeated column names
    X = pd.concat([features_tensor, features_auxiliary], axis=1).dropna()
    X = X.loc[:, ~X.columns.duplicated()].copy()

    # Drug concentrations are the labels. The index merge keeps only the rows
    # that survived dropna(), so X and y stay aligned.
    X = pd.merge(X, y[target_cols], left_index=True, right_index=True)
    # np.log2 returns a new frame, which avoids assigning into a slice of X.
    # NOTE(review): a concentration of 0 would give -inf here; confirm the
    # data never contains zeros.
    y = np.log2(X[target_cols])
    X = X.drop(columns=target_cols)
    X.columns = [
        regex.sub("_", col) if isinstance(col, str) else col for col in X.columns
    ]

    # Train model
    print(f"Start Training for fold {fold}: ")
    model = RandomForestRegressor(
        n_estimators=50,       # fewer trees than the library default
        max_depth=10,          # cap tree depth
        min_samples_split=10,  # require more samples before splitting a node
        min_samples_leaf=4,    # require more samples in each leaf
        max_features="sqrt",   # consider sqrt(n_features) at each split
        n_jobs=-1,             # use all CPU cores
        random_state=42,       # reproducible forests
        verbose=0,             # no progress logging
    )
    model.fit(X, y)
    features = X.columns
    del X, y
    gc.collect()

    # ---------------------------
    # Validation data
    # ---------------------------
    features_tensor = pd.concat(
        [pd.read_csv(data_dir + "validation_data_" + name) for name in one_hot_files],
        axis=1,
    )
    features_auxiliary = pd.concat(
        [pd.read_csv(data_dir + "validation_data_" + name) for name in auxiliary_files],
        axis=1,
    )

    # Add the measured inhibition as a feature for validation
    y_val = pd.read_csv(data_dir + f"astra_test_fold_{fold}.csv")
    features_tensor["inhibition"] = y_val["inhibition"]

    # Combine all features for validation
    X_test = pd.concat([features_tensor, features_auxiliary], axis=1).dropna()
    X_test = X_test.loc[:, ~X_test.columns.duplicated()].copy()

    # Validation labels (log2 drug concentrations), aligned on surviving rows
    X_test = pd.merge(X_test, y_val[target_cols], left_index=True, right_index=True)
    y_test = np.log2(X_test[target_cols])
    X_test = X_test.drop(columns=target_cols)

    # Sanitise names exactly as for training BEFORE reindexing; otherwise any
    # column containing '[', ']' or '<' would not match `features` and would
    # be replaced by the fill value.
    X_test.columns = [
        regex.sub("_", col) if isinstance(col, str) else col for col in X_test.columns
    ]
    X_test = X_test.reindex(columns=features, fill_value=0)

    # Predict and evaluate
    pred = model.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    print(f"Mean squared error for fold {fold}: {mse}")

    # Per-sample Euclidean distance between the predicted and true
    # (conc_r, conc_c) pairs, followed by its mean square and its mean.
    diff = pred - y_test.to_numpy()
    dist = np.hypot(diff[:, 0], diff[:, 1])
    mse = np.mean(dist ** 2)
    mae = np.mean(dist)
    print(f"Mean squared error for fold {fold}: {mse}")
    print(f"Mean absolute error for fold {fold}: {mae}")


### MLP

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error
import gc

# Define a simple MLP with 4 layers
class MLP(nn.Module):
    """Feed-forward network: three ReLU-activated hidden layers and a linear head.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim1: Width of the first hidden layer.
        hidden_dim2: Width of the second hidden layer.
        hidden_dim3: Width of the third hidden layer.
        output_dim: Number of values produced per sample.
    """

    def __init__(self, input_dim, hidden_dim1=128, hidden_dim2=64, hidden_dim3=32, output_dim=2):
        super().__init__()
        # Layers are registered in the order they are applied in forward().
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim1)
        self.fc2 = nn.Linear(in_features=hidden_dim1, out_features=hidden_dim2)
        self.fc3 = nn.Linear(in_features=hidden_dim2, out_features=hidden_dim3)
        self.fc4 = nn.Linear(in_features=hidden_dim3, out_features=output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the hidden layers with ReLU, then the un-activated output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        return self.fc4(hidden)

import re

# Characters '[', ']' and '<' in feature names are replaced with '_' so that
# training and validation frames share identical, sanitised column labels.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

# Per-fold feature files. The validation files carry the same names prefixed
# with "validation_data_".
one_hot_files = (
    "drug_row__one-hot_encoding.csv",
    "drug_col__one-hot_encoding.csv",
    "cell_lines__one-hot_encoding.csv",
)
auxiliary_files = (
    "drug_row__estate_fingerprints.csv",
    "drug_col__estate_fingerprints.csv",
    "cell_lines__gene_expression.csv",
)
target_cols = ["conc_r", "conc_c"]

# 5-fold evaluation of the MLP that predicts the log2 concentrations of both
# drugs from one-hot, fingerprint, gene-expression and inhibition features.
for fold in range(5):
    data_dir = f"./Astra files/Astra Real Data Fold{fold + 1}/"

    # ---------------------------
    # Prepare Training Data
    # ---------------------------
    features_tensor = pd.concat(
        [pd.read_csv(data_dir + name) for name in one_hot_files], axis=1
    )
    # Every auxiliary file is kept. Previously each iteration concatenated
    # onto `features_tensor`, so only the last file (gene expression) survived.
    features_auxiliary = pd.concat(
        [pd.read_csv(data_dir + name) for name in auxiliary_files], axis=1
    )

    # Add the measured inhibition as a feature
    y = pd.read_csv(data_dir + f"astra_train_fold_{fold}.csv")
    features_tensor["inhibition"] = y["inhibition"]

    # Combine all features, drop incomplete rows and repeated column names
    X = pd.concat([features_tensor, features_auxiliary], axis=1).dropna()
    X = X.loc[:, ~X.columns.duplicated()].copy()

    # Drug concentrations are the labels. The index merge keeps only the rows
    # that survived dropna(), so X and y stay aligned.
    X = pd.merge(X, y[target_cols], left_index=True, right_index=True)
    # np.log2 returns a new frame, which avoids assigning into a slice of X.
    # NOTE(review): a concentration of 0 would give -inf here and turn the
    # loss into NaN/inf; confirm the data never contains zeros.
    y = np.log2(X[target_cols])
    X = X.drop(columns=target_cols)
    X.columns = [
        regex.sub("_", col) if isinstance(col, str) else col for col in X.columns
    ]
    features = X.columns

    # Convert training data to torch tensors
    X_tensor = torch.tensor(X.values, dtype=torch.float32)
    y_tensor = torch.tensor(y.values, dtype=torch.float32)

    # ---------------------------
    # Train the MLP Model
    # ---------------------------
    input_dim = X_tensor.shape[1]
    model = MLP(input_dim=input_dim, hidden_dim1=128, hidden_dim2=64, hidden_dim3=32, output_dim=2)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    epochs = 100
    model.train()
    # Full-batch training: each epoch is a single optimiser step on all rows.
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_tensor)
        loss = criterion(outputs, y_tensor)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Fold {fold}, Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}")

    # Free training memory
    del X, y, X_tensor, y_tensor
    gc.collect()

    # ---------------------------
    # Prepare Validation Data
    # ---------------------------
    features_tensor = pd.concat(
        [pd.read_csv(data_dir + "validation_data_" + name) for name in one_hot_files],
        axis=1,
    )
    features_auxiliary = pd.concat(
        [pd.read_csv(data_dir + "validation_data_" + name) for name in auxiliary_files],
        axis=1,
    )

    # Add the measured inhibition as a feature for validation
    y_val = pd.read_csv(data_dir + f"astra_test_fold_{fold}.csv")
    features_tensor["inhibition"] = y_val["inhibition"]

    # Combine all features for validation
    X_test = pd.concat([features_tensor, features_auxiliary], axis=1).dropna()
    X_test = X_test.loc[:, ~X_test.columns.duplicated()].copy()

    # Validation labels (log2 drug concentrations), aligned on surviving rows
    X_test = pd.merge(X_test, y_val[target_cols], left_index=True, right_index=True)
    y_test = np.log2(X_test[target_cols])
    X_test = X_test.drop(columns=target_cols)

    # Sanitise names exactly as for training BEFORE reindexing; otherwise any
    # column containing '[', ']' or '<' would not match `features` and would
    # be replaced by the fill value.
    X_test.columns = [
        regex.sub("_", col) if isinstance(col, str) else col for col in X_test.columns
    ]
    X_test = X_test.reindex(columns=features, fill_value=0)

    # Convert validation data to torch tensors
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

    # ---------------------------
    # Validate the Model
    # ---------------------------
    model.eval()
    with torch.no_grad():
        predictions = model(X_test_tensor)

    # Convert predictions to NumPy arrays for evaluation
    pred_np = predictions.numpy()
    y_test_np = y_test_tensor.numpy()

    mse = mean_squared_error(y_test_np, pred_np)
    mae = np.mean(np.abs(y_test_np - pred_np))
    print(f"Fold {fold} - Validation MSE: {mse:.4f}")
    print(f"Fold {fold} - Validation MAE: {mae:.4f}")

    # Per-sample Euclidean distance between the predicted and true
    # (conc_r, conc_c) pairs, followed by its mean square and its mean.
    diff = pred_np - y_test.to_numpy()
    dist = np.hypot(diff[:, 0], diff[:, 1])
    mse = np.mean(dist ** 2)
    mae = np.mean(dist)
    print(f"Mean squared error for fold {fold}: {mse}")
    print(f"Mean absolute error for fold {fold}: {mae}")

    # Free validation memory and model for the next fold
    del X_test, y_test, X_test_tensor, y_test_tensor, model
    gc.collect()
