In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Descriptors.csv")
print(df.head())

                   Name  nAcid   ALogP     ALogp2       AMR       apol  \
0  AUTOGEN_molecule_791      0  0.5938   0.352598   56.2416  80.451204   
1  AUTOGEN_molecule_736      1 -1.2467   1.554261   58.9439  48.185274   
2  AUTOGEN_molecule_987      1 -3.1752  10.081895  172.1119  94.983962   
3  AUTOGEN_molecule_602      0  0.0000   0.000000    0.0000   0.000000   
4  AUTOGEN_molecule_328      1 -0.0489   0.002391   66.8022  45.915067   

   naAromAtom  nAromBond  nAtom  nHeavyAtom  ...  APC2D10_I_I  APC2D10_I_B  \
0          23         25     66          38  ...          0.0          0.0   
1           6          6     42          24  ...          0.0          0.0   
2           0          0     82          48  ...          0.0          0.0   
3           0          0      0           0  ...          0.0          0.0   
4           5          5     40          21  ...          0.0          0.0   

   APC2D10_I_Si  APC2D10_I_X  APC2D10_B_B  APC2D10_B_Si  APC2D10_B_X  \
0           0.

In [3]:
# Extract molecule number from name
df["mol_id"] = df["Name"].str.extract(r'AUTOGEN_molecule_(\d+)').astype(int)
print(df.head())

                   Name  nAcid   ALogP     ALogp2       AMR       apol  \
0  AUTOGEN_molecule_791      0  0.5938   0.352598   56.2416  80.451204   
1  AUTOGEN_molecule_736      1 -1.2467   1.554261   58.9439  48.185274   
2  AUTOGEN_molecule_987      1 -3.1752  10.081895  172.1119  94.983962   
3  AUTOGEN_molecule_602      0  0.0000   0.000000    0.0000   0.000000   
4  AUTOGEN_molecule_328      1 -0.0489   0.002391   66.8022  45.915067   

   naAromAtom  nAromBond  nAtom  nHeavyAtom  ...  APC2D10_I_B  APC2D10_I_Si  \
0          23         25     66          38  ...          0.0           0.0   
1           6          6     42          24  ...          0.0           0.0   
2           0          0     82          48  ...          0.0           0.0   
3           0          0      0           0  ...          0.0           0.0   
4           5          5     40          21  ...          0.0           0.0   

   APC2D10_I_X  APC2D10_B_B  APC2D10_B_Si  APC2D10_B_X  APC2D10_Si_Si  \
0      

In [4]:
df2 = pd.read_csv("S.typhi_MIC_Dataset_Processed.csv") 

# Add index as a new column 'mol_id' (starting from 1)
df2 = df2.reset_index().rename(columns={'index': 'mol_id'})
df2["mol_id"] = df2["mol_id"] + 1  # Shift index by +1

# Ensure 'mol_id' is of integer type
df2["mol_id"] = df2["mol_id"].astype(int)

print(df2.head())


   mol_id  Unnamed: 0 Molecule ChEMBL ID Molecule Name  Molecule Max Phase  \
0       1           0       CHEMBL478315           NaN                 NaN   
1       2           1       CHEMBL521322           NaN                 NaN   
2       3           2       CHEMBL469088           NaN                 NaN   
3       4           3       CHEMBL514863           NaN                 NaN   
4       5           4       CHEMBL453967           NaN                 NaN   

   Molecular Weight  #RO5 Violations  AlogP Compound Key  \
0            433.60              0.0   3.79          15c   
1            476.92              0.0   4.60           5d   
2            474.43              0.0   4.46          17c   
3            419.57              0.0   3.54          14c   
4            584.59              2.0   3.06           6c   

                                              Smiles  ... Document Journal  \
0  CC(C)C1C(=O)CC(c2ccccc2)N(C(=O)CN2CCN(C)CC2)C1...  ...   Eur J Med Chem   
1  COC1(O)C(=O

In [5]:
# Merge on 'mol_id'
merged_df = pd.merge(df, df2, on="mol_id", how="inner")

# Save the merged dataset
merged_df.to_csv("merged_dataset.csv", index=False)

# Display the result
print(merged_df.head())

                   Name  nAcid   ALogP     ALogp2       AMR       apol  \
0  AUTOGEN_molecule_791      0  0.5938   0.352598   56.2416  80.451204   
1  AUTOGEN_molecule_736      1 -1.2467   1.554261   58.9439  48.185274   
2  AUTOGEN_molecule_987      1 -3.1752  10.081895  172.1119  94.983962   
3  AUTOGEN_molecule_602      0  0.0000   0.000000    0.0000   0.000000   
4  AUTOGEN_molecule_328      1 -0.0489   0.002391   66.8022  45.915067   

   naAromAtom  nAromBond  nAtom  nHeavyAtom  ...      Document Journal  \
0          23         25     66          38  ...          Med Chem Res   
1           6          6     42          24  ...  Bioorg Med Chem Lett   
2           0          0     82          48  ...  Bioorg Med Chem Lett   
3           0          0      0           0  ...            J Med Chem   
4           5          5     40          21  ...          Med Chem Res   

   Document Year  Cell ChEMBL ID  Properties  Action Type  \
0           2012             NaN         NaN     

In [6]:
# Check the column names
print("Columns in CSV:", merged_df.columns)

Columns in CSV: Index(['Name', 'nAcid', 'ALogP', 'ALogp2', 'AMR', 'apol', 'naAromAtom',
       'nAromBond', 'nAtom', 'nHeavyAtom',
       ...
       'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties',
       'Action Type', 'Standard Text Value', 'Value', 'MIC_µM',
       'Activity_Label', 'Processed_Molecule'],
      dtype='object', length=17590)


In [7]:
df = merged_df

df = df.loc[:, df.nunique() > 1]  # Keeps only columns with more than 1 unique value

df = df.dropna(axis=1)  # Drops any column with NaN values

# Check the column names
print("Columns in CSV:", merged_df.columns)

Columns in CSV: Index(['Name', 'nAcid', 'ALogP', 'ALogp2', 'AMR', 'apol', 'naAromAtom',
       'nAromBond', 'nAtom', 'nHeavyAtom',
       ...
       'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties',
       'Action Type', 'Standard Text Value', 'Value', 'MIC_µM',
       'Activity_Label', 'Processed_Molecule'],
      dtype='object', length=17590)


In [8]:
'''
chosen_columns = ['Smiles', 'Activity_Label', 'MIC_µM']  # Non-numeric columns to keep
non_numeric_cols = df.select_dtypes(exclude=["number"]).columns.difference(chosen_columns)
df = df.drop(columns=non_numeric_cols)
print(df.head())
'''

'\nchosen_columns = [\'Smiles\', \'Activity_Label\', \'MIC_µM\']  # Non-numeric columns to keep\nnon_numeric_cols = df.select_dtypes(exclude=["number"]).columns.difference(chosen_columns)\ndf = df.drop(columns=non_numeric_cols)\nprint(df.head())\n'

In [9]:
import re

# Columns to drop explicitly
columns_to_drop = [
    'Molecule ChEMBL ID', 'Molecule Name', 'Molecule Max Phase', 'Compound Key', 'Standard Type', 'Standard Relation', 'Standard Value', 'Standard Units',
    'pChEMBL Value', 'Data Validity Comment', 'Comment', 'Uo Units', 'Ligand Efficiency BEI', 'mol_id',
    'Ligand Efficiency LE', 'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Potential Duplicate',
    'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID', 'BAO Label', 'Assay Organism',
    'Assay Tissue ChEMBL ID', 'Assay Tissue Name', 'Assay Cell Type', 'Assay Subcellular Fraction',
    'Assay Parameters', 'Assay Variant Accession', 'Assay Variant Mutation', 'Target ChEMBL ID', 'Processed_Molecule',
    'Target Name', 'Target Organism', 'Target Type', 'Document ChEMBL ID', 'Source ID', 'Source Description', 'Name', 'Unnamed: 0',
    'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties', 'Action Type', 'Standard Text Value', 'Value'
]

# Drop explicitly listed columns
df = df.drop(columns=columns_to_drop, errors="ignore")

# Drop columns with regex patterns
#regex_patterns = ["FP\d+", "ExtFP\d+", "GraphFP\d+", "MACCSFP\d+", "PubchemFP\d+", "SubFP\d+", 
#                  "KRFP\d+", "AD2D\d+", "SubFPC\d+", "KRFPC\d+", "APC2D\d+", "EStateFP\d+"]

# Drop columns matching the patterns
#df = df.drop(columns=[col for col in df.columns if any(re.match(pattern, col) for pattern in regex_patterns)], errors="ignore")

# Print result
print(df.head())


   nAcid   ALogP     ALogp2       AMR       apol  naAromAtom  nAromBond  \
0      0  0.5938   0.352598   56.2416  80.451204          23         25   
1      1 -1.2467   1.554261   58.9439  48.185274           6          6   
2      1 -3.1752  10.081895  172.1119  94.983962           0          0   
3      0  0.0000   0.000000    0.0000   0.000000           0          0   
4      1 -0.0489   0.002391   66.8022  45.915067           5          5   

   nAtom  nHeavyAtom  nH  ...  APC2D10_S_X  APC2D10_F_F  APC2D10_F_Cl  \
0     66          38  28  ...          0.0          0.0           0.0   
1     42          24  18  ...          0.0          0.0           0.0   
2     82          48  34  ...          0.0          0.0           0.0   
3      0           0   0  ...          0.0          0.0           0.0   
4     40          21  19  ...          0.0          0.0           0.0   

   APC2D10_F_X  APC2D10_Cl_X  APC2D10_X_X  Molecular Weight  \
0          0.0           0.0          0.0      

In [10]:
# Save the merged dataset
df.to_csv("merged_dataset_cleaned.csv", index=False)

In [11]:
print(df["Activity_Label"].value_counts(dropna=False))

Activity_Label
0    1132
1     425
Name: count, dtype: int64


In [14]:
from sklearn.model_selection import train_test_split

# Step 1: Separate Actives and Inactives
actives = df[df['Activity_Label'] == 1]
inactives = df[df['Activity_Label'] == 0]

# Step 2: Randomly Undersample Inactives to match Actives (425 samples)
#inactives_undersampled = inactives.sample(n=len(actives), random_state=42)

# Step 3: Combine balanced dataset
#df_balanced = pd.concat([actives, inactives_undersampled])

# Step 4: Train-Test Split (80-20)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Print dataset sizes
print(f"Total dataset size after balancing: {len(df)}")
print(f"Training set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
print(train_df['Activity_Label'].value_counts())  # Check class distribution

Total dataset size after balancing: 1557
Training set size: 1245
Test set size: 312
Activity_Label
0    906
1    339
Name: count, dtype: int64


In [15]:
train_df.to_csv("train_dataset.csv", index=False)
test_df.to_csv("test_dataset.csv", index=False)

In [16]:
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
from io import BytesIO
import os

# Parameters
MIC_COLUMN = "MIC_µM"  
GRID_SIZE = (2, 2)
IMAGE_SIZE = (300, 300)
FONT_SIZE = 10
MARGIN = 40

def smiles_to_2d_mol(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        AllChem.Compute2DCoords(mol)
    return mol

def save_molecules_to_pdf_grid_labeled(smiles_list, mol_ids, mic_scores, output_path):
    mols = [(smiles_to_2d_mol(smi), mid, mic) for smi, mid, mic in zip(smiles_list, mol_ids, mic_scores) if Chem.MolFromSmiles(smi)]
    cols, rows = GRID_SIZE
    mols_per_page = cols * rows

    page_width, page_height = letter
    cell_width = (page_width - (cols + 1) * MARGIN) / cols
    cell_height = (page_height - (rows + 1) * MARGIN) / rows

    c = canvas.Canvas(output_path, pagesize=letter)
    c.setFont("Helvetica", FONT_SIZE)

    for i in range(0, len(mols), mols_per_page):
        chunk = mols[i:i + mols_per_page]
        for idx, (mol, mol_id, mic) in enumerate(chunk):
            row = idx // cols
            col = idx % cols

            img = Draw.MolToImage(mol, size=IMAGE_SIZE)
            buf = BytesIO()
            img.save(buf, format='PNG')
            buf.seek(0)

            x = MARGIN + col * (cell_width + MARGIN)
            y = page_height - MARGIN - (row + 1) * (cell_height + MARGIN) + MARGIN

            # Draw molecule
            c.drawImage(ImageReader(buf), x, y + FONT_SIZE * 2 + 4, width=cell_width, height=cell_height - FONT_SIZE * 2 - 10, preserveAspectRatio=True)

            # Draw label with mol_id and MIC
            label = f"mol_id: {mol_id}\nMIC: {mic}"
            c.drawCentredString(x + cell_width / 2, y + FONT_SIZE + 2, f"mol_id: {mol_id}")
            c.drawCentredString(x + cell_width / 2, y, f"MIC: {mic}")

        c.showPage()

    c.save()

# === Prepare data ===
active_df = train_df[train_df["Activity_Label"] == 1]
inactive_df = train_df[train_df["Activity_Label"] == 0]

active_smiles = active_df['Smiles'].tolist()
inactive_smiles = inactive_df['Smiles'].tolist()
active_ids = active_df['mol_id'].tolist()
inactive_ids = inactive_df['mol_id'].tolist()
active_mics = active_df[MIC_COLUMN].tolist()
inactive_mics = inactive_df[MIC_COLUMN].tolist()

# === Output paths ===
active_pdf_path = os.path.join("figures", "all_actives_structures_2x2_labeled_with_MIC.pdf")
inactive_pdf_path = os.path.join("figures", "all_inactives_structures_2x2_labeled_with_MIC.pdf")

# === Generate PDFs ===
save_molecules_to_pdf_grid_labeled(active_smiles, active_ids, active_mics, active_pdf_path)
save_molecules_to_pdf_grid_labeled(inactive_smiles, inactive_ids, inactive_mics, inactive_pdf_path)

print(f"✅ Actives PDF with MIC saved: {active_pdf_path}")
print(f"✅ Inactives PDF with MIC saved: {inactive_pdf_path}")


KeyError: 'mol_id'