In [23]:
import pandas as pd
# Response table; later cells read its "depmap_id" and "name" columns.
dataset_path='./data/ProcessedFile/Prism_secondary_auc.csv'
# Drug annotation table; later cells read its "name" and "canonical_smiles" columns.
drug_info_path = "./data/ProcessedFile/sec_auc_drug.csv"
# Per-cell-line feature table, loaded later with its first column as the index
# (presumably gene expression, judging by the file name - TODO confirm).
cell_info_path = "./data/ProcessedFile/22Q1expressions.csv"

# Random partitioning repeated 5 times for random testing, unseen drug testing, and unseen cell line testing

In [4]:
# Load the full response table; the split cell below aliases it as `dataset`.
response = pd.read_csv(dataset_path)

In [41]:
import os
import numpy as np
# Derive five split seeds from one fixed master seed so the splits are repeatable.
np.random.seed(42)
seed_list = np.random.randint(1, 100000, size=5)

save_path = f'./data/ProcessedFile/data_split'
os.makedirs(save_path, exist_ok=True)

dataset = response
# Change `split` to 'cell_blind' or 'drug_blind' and re-run to produce the other split types.
split = "random"
train_dict, val_dict, test_dict = {}, {}, {}

for seed in seed_list:
    np.random.seed(seed)

    # What gets shuffled: row positions for 'random', unique ids for the blind splits.
    if split == 'random':
        objs = np.random.permutation(len(dataset))
    elif split == 'cell_blind':
        col = "depmap_id"
        objs = np.random.permutation(dataset[col].unique())
    elif split == 'drug_blind':
        col = "name"
        objs = np.random.permutation(dataset[col].unique())

    # 80% train, ~10% (+1) validation, the remainder is the test portion.
    train_nums = int(0.8 * len(objs))
    val_nums = int(0.1 * len(objs)) + 1
    test_nums = len(objs) - train_nums - val_nums
    val_end = train_nums + val_nums

    if split in ('cell_blind', 'drug_blind'):
        # `targets` lists ids forced into the test set; it is empty here.
        targets = []
        other_objs = [obj for obj in objs if obj not in targets]
        train_objs = other_objs[:train_nums]
        valid_objs = other_objs[train_nums:val_end]
        test_objs = other_objs[val_end:] + targets
        train, val, test = (
            dataset.loc[dataset[col].isin(members)]
            for members in (train_objs, valid_objs, test_objs)
        )
    elif split == 'random':
        train_index = objs[:train_nums]
        val_index = objs[train_nums:val_end]
        test_index = objs[val_end:]
        train, val, test = (
            dataset.iloc[positions, :]
            for positions in (train_index, val_index, test_index)
        )

    # Keep the partitions in memory, keyed by seed, and write them to disk.
    train_dict[seed], val_dict[seed], test_dict[seed] = train, val, test

    train.to_csv(os.path.join(save_path, f'{seed}{split}_Training.csv'), index=False)
    val.to_csv(os.path.join(save_path, f'{seed}{split}_Validation.csv'), index=False)
    test.to_csv(os.path.join(save_path, f'{seed}{split}_Test.csv'), index=False)

# Grouping drugs/cell lines by similarity and partitioning data

In [9]:
from rdkit import Chem
import pandas as pd
from rdkit.ML.Cluster import Butina
from rdkit.Chem import AllChem, DataStructs
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
import random

import numpy as np
from rdkit.ML.Cluster import Butina
from scipy.spatial.distance import cosine
import warnings

## Partitioning based on drug similarity

### Clustering based on drug similarity

In [11]:
def get_morgan_fp(smiles):
    """Return a 2048-bit Morgan fingerprint (radius 2) for one SMILES string.

    Returns None when `Chem.MolFromSmiles` yields no molecule for `smiles`.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        return AllChem.GetMorganFingerprintAsBitVect(molecule, radius=2, nBits=2048)
    return None


def calculate_tanimoto_distances(fps):
    """Return pairwise Tanimoto distances (1 - similarity) as a flat list.

    The list is laid out in the order `Butina.ClusterData(..., isDistData=True)`
    consumes it: row by row over the lower triangle, i.e. pairs
    (1,0), (2,0), (2,1), (3,0), (3,1), (3,2), ...
    Emitting the upper triangle instead ((0,1), (0,2), ..., (1,2), ...) yields
    the same values in a different order, which makes Butina attach distances
    to the wrong pairs as soon as there are more than three items.

    Args:
        fps: sequence of RDKit bit-vector fingerprints (as produced by
            `get_morgan_fp`); must support slicing.

    Returns:
        A list of length N*(N-1)/2; empty for fewer than two fingerprints.

    Raises:
        ValueError: if any fingerprint is None (e.g. an unparsable SMILES).
    """
    missing = [idx for idx, fp in enumerate(fps) if fp is None]
    if missing:
        raise ValueError(
            f"Missing fingerprints at positions {missing}; "
            "remove or fix the corresponding SMILES before computing distances."
        )

    dist_matrix = []
    for i in range(1, len(fps)):
        # One bulk call per row: similarities of item i to every earlier item.
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dist_matrix.extend(1 - sim for sim in sims)
    return dist_matrix

def adjust_clusters(dist_matrix, num_samples, num_groups, threshold=0.3, random_seed=42):
    """Butina-cluster the samples, then pack whole clusters into `num_groups` groups.

    Clusters are shuffled with `random_seed` and appended to the current group
    until it holds at least `num_samples // num_groups` members; a cluster is
    never split across groups. Any surplus trailing group is merged into the
    one before it.

    Args:
        dist_matrix: flat distance list passed to `Butina.ClusterData` with
            `isDistData=True`.
        num_samples: number of items the distances were computed over.
        num_groups: number of groups to produce.
        threshold: Butina distance threshold.
        random_seed: seed for the global `random` module before shuffling.

    Returns:
        A list of `num_groups` lists of sample indices.

    Raises:
        ValueError: if fewer than `num_groups` groups could be formed.
    """
    butina_clusters = list(
        Butina.ClusterData(dist_matrix, num_samples, distThresh=threshold, isDistData=True)
    )
    target_size = num_samples // num_groups

    random.seed(random_seed)
    random.shuffle(butina_clusters)

    final_clusters = []
    bucket = []
    for _, members in tqdm(enumerate(butina_clusters)):
        bucket.extend(members)
        # Close the group once it has reached the target size.
        if len(bucket) >= target_size:
            final_clusters.append(bucket)
            bucket = []

    # Whatever is left over forms one last, undersized group.
    if bucket:
        final_clusters.append(bucket)

    # Fold surplus groups into their predecessor until the count fits.
    while len(final_clusters) > num_groups:
        overflow = final_clusters.pop()
        final_clusters[-1].extend(overflow)

    if len(final_clusters) >= num_groups:
        return final_clusters

    print(len(final_clusters), num_groups)
    raise ValueError(f"Cannot reach the target number of clusters ({num_groups}) with the current threshold. Please adjust the threshold or the target cluster count.")

In [12]:
# Reload the response table and read the drug annotations.
data_df = pd.read_csv(dataset_path)
drug_info = pd.read_csv(drug_info_path)

# Map drug name -> SMILES; a repeated name keeps the SMILES from its last row.
smiles_dict = dict(zip(drug_info["name"], drug_info["canonical_smiles"]))
# Parallel lists: position k in both refers to the same drug.
smiles_list = list(smiles_dict.values())
drug_ids = list(smiles_dict)

In [None]:
# Fingerprint every drug, in the same order as `drug_ids` / `smiles_list`.
drug_smiles = smiles_list
fps = list(map(get_morgan_fp, drug_smiles))

dist_matrix = calculate_tanimoto_distances(fps)

# Grouping parameters: number of groups, Butina distance threshold, shuffle seed.
num_groups = 10
threshold = 0.3
drug_random_seed = 42

clusters = adjust_clusters(
    dist_matrix,
    len(drug_smiles),
    num_groups,
    threshold,
    random_seed=drug_random_seed,
)

smiles_df = pd.DataFrame({'drug_id': drug_ids, 'smiles': smiles_list})

# Cluster members index rows of `smiles_df`; groups are numbered from 1.
for group_id, members in tqdm(enumerate(clusters, start=1)):
    smiles_df.loc[members, 'group'] = group_id

In [18]:
# Persist the drug -> group table (columns drug_id, smiles, group); the file name carries the grouping seed.
smiles_df.to_csv(f'./data/ProcessedFile/data_split/{drug_random_seed}_clustered_drug_groups.csv',index=False)

### Dataset partitioning based on drug grouping information

In [15]:
# Alias (not a copy) of the in-memory drug grouping table.
drug_group_info = smiles_df

In [16]:
# Group id (1..10) -> list of the drug ids assigned to that group.
drug_group_dict = {
    group_id: drug_group_info.loc[drug_group_info['group'] == group_id, 'drug_id'].tolist()
    for group_id in range(1, 11)
}

In [17]:
select_name = "name"
split_type = "drug_sim_blind"
clusterway = drug_random_seed

save_path = f'./data/ProcessedFile/data_split'

# Rotate over the 10 drug groups: group i is the test fold, the next group
# (wrapping from 10 back to 1) is the validation fold, the other 8 are training.
for i in range(1, 11):
    val_index = 1 if i >= 10 else i + 1
    train_index = [j for j in range(1, 11) if j != i and j != val_index]

    # Flatten the member lists of all training groups.
    train_objs_list = []
    for group_key, members in drug_group_dict.items():
        if group_key in train_index:
            train_objs_list.extend(members)

    testset = data_df[data_df[select_name].isin(drug_group_dict[i])].copy(deep=True)
    valset = data_df[data_df[select_name].isin(drug_group_dict[val_index])].copy(deep=True)
    trainset = data_df[data_df[select_name].isin(train_objs_list)].copy(deep=True)

    trainset.to_csv(os.path.join(save_path, f'{clusterway}{i}{split_type}_Training.csv'), index=False)
    valset.to_csv(os.path.join(save_path, f'{clusterway}{i}{split_type}_Validation.csv'), index=False)
    testset.to_csv(os.path.join(save_path, f'{clusterway}{i}{split_type}_Test.csv'), index=False)

## Partitioning based on cell line similarity

### Clustering based on cell line similarity

In [19]:
def calculate_cosine_distances(fps):
    """Return pairwise cosine distances as a flat list.

    The list is laid out in the order `Butina.ClusterData(..., isDistData=True)`
    consumes it: row by row over the lower triangle, i.e. pairs
    (1,0), (2,0), (2,1), (3,0), (3,1), (3,2), ...
    Emitting the upper triangle instead ((0,1), (0,2), ..., (1,2), ...) yields
    the same values in a different order, which makes Butina attach distances
    to the wrong pairs as soon as there are more than three items.

    Args:
        fps: sequence of equal-length numeric vectors.

    Returns:
        A list of length N*(N-1)/2; empty for fewer than two vectors.
    """
    dist_matrix = []
    for i in range(1, len(fps)):
        for j in range(i):
            dist_matrix.append(cosine(fps[i], fps[j]))
    return dist_matrix

In [26]:
# Reload the response table and read the cell-line feature table.
data_df = pd.read_csv(dataset_path)
# The first CSV column becomes the index; rows are later selected by "depmap_id" values.
cell_info = pd.read_csv(cell_info_path,index_col=0)

In [28]:
# Cell lines present in the response table, in order of first appearance.
# list(set(...)) is avoided on purpose: if the ids are strings, set iteration
# order changes between kernel sessions (str hashing is randomised), so the row
# order - and with it the seeded grouping computed below - would not be
# reproducible after a kernel restart.
use_cells = data_df['depmap_id'].unique().tolist()
use_cell_exp = cell_info.loc[use_cells].copy(deep=True)

# Map each cell-line id to its feature row as a NumPy array.
cell_dict = {index: use_cell_exp.loc[index].values for index in use_cell_exp.index}

# Parallel lists: position k in `cell_ids` and `cell_exps` refers to the same cell line.
cell_list = list(cell_dict.values())
cell_ids = list(cell_dict.keys())
cell_exps = list(cell_dict.values())

In [29]:
# Pairwise cosine distances between the cell-line feature vectors.
dist_matrix = calculate_cosine_distances(cell_exps)

# Number of groups to form later, and the Butina distance threshold.
num_groups = 10
threshold = 0.03

# With isDistData=True, ClusterData reads the flat list as a lower-triangle
# distance matrix over len(cell_list) items.
raw_clusters = list(
    Butina.ClusterData(
        dist_matrix,
        len(cell_list),
        distThresh=threshold,
        isDistData=True,
    )
)

In [45]:
import random
from tqdm import tqdm

def adjust_cell_clusters_balanced(clusters, num_samples, num_groups, random_seed=None):
    """Distribute whole clusters over `num_groups` buckets of roughly equal size.

    Clusters are placed smallest first. Each one goes into a bucket that can
    take it without exceeding `num_samples // num_groups` members: a random
    such bucket when `random_seed` is given, otherwise the emptiest one. If no
    bucket has room, the currently smallest bucket takes the cluster, so
    buckets may exceed the target size. A cluster is never split.

    The input `clusters` is not modified.

    Args:
        clusters: iterable of clusters, each a sized iterable of sample indices.
        num_samples: total number of samples across all clusters.
        num_groups: number of buckets to fill; must be positive.
        random_seed: seed for the global `random` module. Note that
            `random.seed` is called even when this is None.

    Returns:
        A list of `num_groups` lists of sample indices (some may be empty).

    Raises:
        ValueError: if `num_groups` is not positive.
    """
    if num_groups <= 0:
        raise ValueError(f"num_groups must be positive, got {num_groups}")

    random.seed(random_seed)
    target_size = num_samples // num_groups

    final_clusters = [[] for _ in range(num_groups)]
    cluster_sizes = [0] * num_groups

    # sorted() rather than clusters.sort(): leave the caller's list untouched.
    for cluster in sorted(clusters, key=len):
        available_buckets = [
            i for i in range(num_groups)
            if cluster_sizes[i] + len(cluster) <= target_size
        ]

        if available_buckets:
            if random_seed is not None:
                chosen_bucket = random.choice(available_buckets)
            else:
                chosen_bucket = min(available_buckets, key=lambda i: cluster_sizes[i])
        else:
            # Nothing has room: overflow into the currently smallest bucket.
            chosen_bucket = cluster_sizes.index(min(cluster_sizes))

        final_clusters[chosen_bucket].extend(cluster)
        cluster_sizes[chosen_bucket] += len(cluster)

    return final_clusters

In [46]:
# Seed that controls how raw clusters are distributed over the groups.
cell_random_seed = 42
clusters = adjust_cell_clusters_balanced(
    raw_clusters,
    len(cell_list),
    num_groups,
    random_seed=cell_random_seed,
)

cell_df = pd.DataFrame({'cell_id': cell_ids})
# Cluster members index rows of `cell_df`; groups are numbered from 1.
for group_id, members in enumerate(clusters, start=1):
    cell_df.loc[members, 'group'] = group_id

In [34]:
# Persist the cell line -> group table (columns cell_id, group); the file name carries the grouping seed.
cell_df.to_csv(f'./data/ProcessedFile/data_split/{cell_random_seed}_clustered_cell_groups.csv',index=False)

### Dataset partitioning based on cell line grouping information

In [35]:
# Alias (not a copy) of the in-memory cell-line grouping table.
cell_group_info = cell_df

# Group id (1..10) -> list of the cell-line ids assigned to that group.
cell_group_dict = {
    group_id: cell_group_info.loc[cell_group_info['group'] == group_id, 'cell_id'].tolist()
    for group_id in range(1, 11)
}

In [36]:
select_name = "depmap_id"
split_type = "cell_sim_blind"
clusterway = cell_random_seed

save_path = f'./data/ProcessedFile/data_split'

# Rotate over the 10 cell-line groups: group i is the test fold, the next group
# (wrapping from 10 back to 1) is the validation fold, the other 8 are training.
for i in range(1, 11):
    val_index = 1 if i >= 10 else i + 1
    train_index = [j for j in range(1, 11) if j != i and j != val_index]

    # Flatten the member lists of all training groups.
    train_objs_list = []
    for group_key, members in cell_group_dict.items():
        if group_key in train_index:
            train_objs_list.extend(members)

    testset = data_df[data_df[select_name].isin(cell_group_dict[i])].copy(deep=True)
    valset = data_df[data_df[select_name].isin(cell_group_dict[val_index])].copy(deep=True)
    trainset = data_df[data_df[select_name].isin(train_objs_list)].copy(deep=True)

    trainset.to_csv(os.path.join(save_path, f'{clusterway}{i}{split_type}_Training.csv'), index=False)
    valset.to_csv(os.path.join(save_path, f'{clusterway}{i}{split_type}_Validation.csv'), index=False)
    testset.to_csv(os.path.join(save_path, f'{clusterway}{i}{split_type}_Test.csv'), index=False)