# Predicting Drug Potency (IC50) Using Specific Drug and Mutation Combinations in Cancer Cell Lines

In [16]:
#import libraries
import pandas as pd
import numpy as np
import pubchempy as pcp
from rdkit.Chem import rdFingerprintGenerator
from sklearn.model_selection import train_test_split

In [17]:
#read CSV containing drug data
drug_data = pd.read_csv("data/raw/PANCANCER_IC_Wed Jun 18 15_22_15 2025.csv")
drug_data

Unnamed: 0,Drug Name,Drug ID,Cell Line Name,Cosmic ID,TCGA Classification,Tissue,Tissue Sub-type,IC50,AUC,Max Conc,RMSE,Z score,Dataset Version
0,Camptothecin,1003,PFSK-1,683667,MB,nervous_system,medulloblastoma,-1.463887,0.930220,0.1,0.089052,0.433123,GDSC2
1,Camptothecin,1003,A673,684052,UNCLASSIFIED,soft_tissue,rhabdomyosarcoma,-4.869455,0.614970,0.1,0.111351,-1.421100,GDSC2
2,Camptothecin,1003,ES5,684057,UNCLASSIFIED,bone,ewings_sarcoma,-3.360586,0.791072,0.1,0.142855,-0.599569,GDSC2
3,Camptothecin,1003,ES7,684059,UNCLASSIFIED,bone,ewings_sarcoma,-5.044940,0.592660,0.1,0.135539,-1.516647,GDSC2
4,Camptothecin,1003,EW-11,684062,UNCLASSIFIED,bone,ewings_sarcoma,-3.741991,0.734047,0.1,0.128059,-0.807232,GDSC2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
243461,N-acetyl cysteine,2499,SNU-175,1659928,COREAD,digestive_system,large_intestine,10.127082,0.976746,2000.0,0.074498,0.156872,GDSC2
243462,N-acetyl cysteine,2499,SNU-407,1660034,COREAD,digestive_system,large_intestine,8.576377,0.913378,2000.0,0.057821,-1.626959,GDSC2
243463,N-acetyl cysteine,2499,SNU-61,1660035,COREAD,digestive_system,large_intestine,10.519636,0.975001,2000.0,0.058090,0.608442,GDSC2
243464,N-acetyl cysteine,2499,SNU-C5,1674021,COREAD,digestive_system,large_intestine,10.694579,0.969969,2000.0,0.101013,0.809684,GDSC2


In [3]:
# read CSV containing genetic data
genetic_data = pd.read_csv("data/raw/PANCANCER_Genetic_features_Wed Jun 18 15_22_44 2025.csv")
genetic_data

  genetic_data = pd.read_csv("data/raw/PANCANCER_Genetic_features_Wed Jun 18 15_22_44 2025.csv")


Unnamed: 0,Cell Line Name,COSMIC ID,GDSC Desc1,GDSC Desc2,TCGA Desc,Genetic Feature,IS Mutated,Recurrent Gain Loss,Genes in Segment
0,CAL-29,1290730,urogenital_system,bladder,BLCA,CDC27_mut,0,,
1,CAL-29,1290730,urogenital_system,bladder,BLCA,CDC73_mut,0,,
2,CAL-29,1290730,urogenital_system,bladder,BLCA,CDH1_mut,0,,
3,CAL-29,1290730,urogenital_system,bladder,BLCA,CDK12_mut,0,,
4,CAL-29,1290730,urogenital_system,bladder,BLCA,CDKN1A_mut,0,,
...,...,...,...,...,...,...,...,...,...
697995,UWB1.289,1480374,urogenital_system,ovary,OV,HLA.B_mut,0,,
697996,UWB1.289,1480374,urogenital_system,ovary,OV,HNF1A_mut,0,,
697997,UWB1.289,1480374,urogenital_system,ovary,OV,HRAS_mut,0,,
697998,UWB1.289,1480374,urogenital_system,ovary,OV,HSPA8_mut,0,,


## Initial Data Exploration

In [4]:
# print shape of drug_data
drug_data.shape

(243466, 13)

In [5]:
# identify types within drug_data
drug_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243466 entries, 0 to 243465
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Drug Name            243466 non-null  object 
 1   Drug ID              243466 non-null  int64  
 2   Cell Line Name       243466 non-null  object 
 3   Cosmic ID            243466 non-null  int64  
 4   TCGA Classification  242399 non-null  object 
 5   Tissue               243107 non-null  object 
 6   Tissue Sub-type      243107 non-null  object 
 7   IC50                 243466 non-null  float64
 8   AUC                  243466 non-null  float64
 9   Max Conc             243466 non-null  float64
 10  RMSE                 243466 non-null  float64
 11  Z score              243466 non-null  float64
 12  Dataset Version      243466 non-null  object 
dtypes: float64(5), int64(2), object(6)
memory usage: 24.1+ MB


In [6]:
# basic statistics for drug_data
drug_data.describe()

Unnamed: 0,Drug ID,Cosmic ID,IC50,AUC,Max Conc,RMSE,Z score
count,243466.0,243466.0,243466.0,243466.0,243466.0,243466.0,243466.0
mean,1595.325466,992067.3,2.814039,0.881876,23.421608,0.082867,6.581099e-18
std,397.918457,220951.6,2.774684,0.148429,158.160555,0.042821,0.9993919
min,1003.0,683667.0,-8.747724,0.006282,0.01,0.003274,-8.254501
25%,1168.0,906805.0,1.497959,0.848868,3.0,0.05109,-0.6572931
50%,1632.0,909720.0,3.237408,0.944192,10.0,0.076114,0.01026455
75%,1912.0,1240144.0,4.707872,0.974946,10.0,0.106209,0.6560839
max,2499.0,1789883.0,13.820189,0.998904,2000.0,0.299984,7.978776


In [7]:
# print shape of genetic_data
genetic_data.shape

(698000, 9)

In [8]:
# identify types within genetic_data
genetic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698000 entries, 0 to 697999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Cell Line Name       698000 non-null  object
 1   COSMIC ID            698000 non-null  int64 
 2   GDSC Desc1           696530 non-null  object
 3   GDSC Desc2           696530 non-null  object
 4   TCGA Desc            694210 non-null  object
 5   Genetic Feature      698000 non-null  object
 6   IS Mutated           698000 non-null  int64 
 7   Recurrent Gain Loss  409700 non-null  object
 8   Genes in Segment     409700 non-null  object
dtypes: int64(2), object(7)
memory usage: 47.9+ MB


In [9]:
# basic statistics for genetic_data
genetic_data.describe()

Unnamed: 0,COSMIC ID,IS Mutated
count,698000.0,698000.0
mean,995177.2,0.040358
std,224235.8,0.196798
min,683667.0,0.0
25%,906807.0,0.0
50%,909727.0,0.0
75%,1240151.0,0.0
max,1789883.0,1.0


## Downsampling Drug Data

In [10]:
# prepare downsampling function
def downsample(group, max_samples=500):
    if len(group) > max_samples:
        return group.sample(n=max_samples, random_state=42)
    else:
        return group

In [11]:
# remove missing IC50s
drug_data_clean = drug_data.dropna(subset=['IC50'])

In [12]:
# downsample
drug_data_selected = drug_data_clean.groupby('Drug Name', group_keys=False).apply(downsample)

print(f"Original Drug Rows: {len(drug_data)}")
print(f"Downsampled Drug Rows: {len(drug_data_selected)}")

Original Drug Rows: 243466
Downsampled Drug Rows: 140107


  drug_data_selected = drug_data_clean.groupby('Drug Name', group_keys=False).apply(downsample)


## Filter Genes

In [13]:
# select only meaningful genes with > 1% frequency
mutation_counts = genetic_data.groupby('Genetic Feature')['IS Mutated'].sum()
total_cell_lines = genetic_data['COSMIC ID'].nunique()

mutation_freq = mutation_counts / total_cell_lines

meaningful_genes = mutation_freq[mutation_freq > 0.01].index

genetic_data_selected = genetic_data[genetic_data['Genetic Feature'].isin(meaningful_genes)]

print(f"Original Genes: {len(mutation_counts)}")
print(f"Meaningful Genes: {len(meaningful_genes)}")

Original Genes: 735
Meaningful Genes: 543


## SMILES

In [14]:
# obtain SMILES for selected drugs
unique_drugs = drug_data_selected['Drug Name'].unique()

drug_smiles = {}
for drug in unique_drugs:
    try:
        compound = pcp.get_compounds(drug, 'name')[0]
        drug_smiles[drug] = compound.smiles
    except (IndexError, pcp.PubChemHTTPError):
        print(f"Could not find SMILES for: {drug}")
        drug_smiles[drug] = None

drug_data_selected['SMILES'] = drug_data_selected['Drug Name'].map(drug_smiles)

drug_data_selected = drug_data_selected.dropna(subset=['SMILES'])
print(f"Drugs with SMILES: {drug_data_selected['Drug Name'].nunique()}")

Could not find SMILES for: 123138
Could not find SMILES for: 123829
Could not find SMILES for: 150412
Could not find SMILES for: 50869
Could not find SMILES for: 615590
Could not find SMILES for: 630600
Could not find SMILES for: 667880
Could not find SMILES for: 720427
Could not find SMILES for: 729189
Could not find SMILES for: 741909
Could not find SMILES for: 743380
Could not find SMILES for: 765771
Could not find SMILES for: 776928
Could not find SMILES for: ABT737
Could not find SMILES for: BDF00022089a
Could not find SMILES for: BDILV000379a
Could not find SMILES for: BDOCA000347a
Could not find SMILES for: BDP-00009066
Could not find SMILES for: BPD-00008900
Could not find SMILES for: Bleomycin (50 uM)
Could not find SMILES for: CDK9_5038
Could not find SMILES for: CDK9_5576
Could not find SMILES for: CT7033-2
Could not find SMILES for: ERK_2440
Could not find SMILES for: ERK_6604
Could not find SMILES for: Eg5_9814
Could not find SMILES for: GSK-LSD1-2HCl 
Could not find SMILE

In [15]:
# generate fingerprints
mfgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)

def get_fingerprint(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return np.zeros(1024, dtype=int)
        return mfgen.GetFingerprintAsNumPy(mol)
    except Exception as e:
        return np.zeros(1024, dtype=int)

print("Generator initialized.")

drug_data_selected['Fingerprint'] = drug_data_selected['SMILES'].apply(get_fingerprint)

fp_df = pd.DataFrame(drug_data_selected['Fingerprint'].tolist(), index=drug_data_selected.index)
fp_df.columns = [f'FP_{i}' for i in range(1024)]

drug_data_featurized = pd.concat([drug_data_selected, fp_df], axis=1)

print("Success!")
print(drug_data_featurized.head())

Generator initialized.


NameError: name 'np' is not defined

## Pivot & Merge

In [None]:
# pivot using COSMIC ID and Genetic Feature and reset index to remove extra row
genetic_data_pivoted = genetic_data_selected.pivot(index="COSMIC ID", columns="Genetic Feature", values="IS Mutated")
genetic_data_pivoted = genetic_data_pivoted.reset_index()
genetic_data_pivoted.head()

In [None]:
# merge
combined_data = pd.merge(drug_data_featurized,
                         genetic_data_pivoted,
                         on='COSMIC ID',
                         how='inner')

print(f"Merged Dataset Shape: {combined_data.shape}")

In [None]:
# check shapes of drug, genetic, and merged data to make sure they were properly joined
print("Shape of drug_data:", drug_data_featurized.shape)
print("Shape of genetic_data_pivot:", genetic_data_pivoted.shape)
print("Shape of merged:", combined_data.shape)

In [None]:
#
combined_data = pd.get_dummies(combined_data, columns=['Tissue'], prefix='Tissue')

In [None]:
#
unique_drugs = combined_data['Drug Name'].unique()

train_drugs, test_drugs = train_test_split(unique_drugs,
                                           test_size=0.2,
                                           random_state=42)

train_df = combined_data[combined_data['Drug Name'].isin(train_drugs)]

test_df = combined_data[combined_data['Drug Name'].isin(test_drugs)]

In [None]:
#
drop_cols = ['COSMIC ID', 'Drug Name', 'IC50', 'SMILES', 'Fingerprint',
             'TCGA Classification', 'Tissue Sub-type', 'Cell Line Name',
             'AUC', 'Max Conc', 'RMSE', 'Z score', 'Dataset Version']

X_train = train_df.drop(columns=drop_cols, errors='ignore')
y_train = train_df['IC50']

X_test = test_df.drop(columns=drop_cols, errors='ignore')
y_test = test_df['IC50']

print(f"Shape: {X_train.shape}")