<a href="https://colab.research.google.com/github/mozey256/OSCAAR/blob/main/Smiles_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rdkit


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


## PROCESSING THE DRUG STRUCTURAL DATA

In [2]:
import pandas as pd
import numpy as np
import seaborn as sn
#---------------------- RDKit packages
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit import Chem
from rdkit.Chem import MACCSkeys
#------------------- progress bar
from tqdm import tqdm
#------------------- hide warning
import warnings
# NOTE: 'ignore' with no category or module filter silences every warning raised
# after this point, including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

In [4]:
# Load the drug table; the cells below use its 'Drug_name' and 'SMILES' columns.
data = pd.read_csv('./Drug_smiles.csv')

In [5]:
# Preview the first rows to confirm the file parsed as expected.
data.head()

Unnamed: 0,Drug_name,SMILES
0,2aminobenzothiazole_conc3_T30,C1=CC=C2C(=C1)N=C(S2)N
1,2hydroxyethylhydrazine_conc3_T30,C(CO)NN
2,3aminotriazole_conc3_T30,C1=NNC(=N1)N
3,4hydroxytamoxifene_conc3_T30,CCC(=C(C1=CC=C(C=C1)O)C2=CC=C(C=C2)OCCN(C)C)C3...
4,A23187_conc3_T30,CC1CCC2(C(CC(C(O2)C(C)C(=O)C3=CC=CN3)C)C)OC1CC...


In [6]:
# Column dtypes and non-null counts; a non-null count lower than the number of
# entries for 'SMILES' means some drugs have no structure to fingerprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Drug_name  125 non-null    object
 1   SMILES     124 non-null    object
dtypes: object(2)
memory usage: 2.1+ KB


In [7]:
# Summary of the text columns (count / unique / top / freq); 'unique' equal to
# 'count' means no drug name or SMILES string is repeated.
data.describe()

Unnamed: 0,Drug_name,SMILES
count,125,124
unique,125,124
top,2aminobenzothiazole_conc3_T30,C1=CC=C2C(=C1)N=C(S2)N
freq,1,1


In [14]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys

# Define a class to compute MACCS keys
class MACCS:
    """Compute MACCS key fingerprints for a table of drugs and save them as CSV.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'Drug_name' column and a 'SMILES' column. Every SMILES
        entry is converted to an RDKit molecule when the instance is created.

    Attributes
    ----------
    names, smiles : the 'Drug_name' and 'SMILES' columns of ``data``.
    mols : list with one entry per row, an RDKit molecule or None.
    """

    # Length of the bit string written per drug (columns bit0 .. bit166).
    N_BITS = 167

    def __init__(self, data):
        self.data = data
        self.names = data['Drug_name']
        self.smiles = data['SMILES']
        self.mols = [self.get_mol_from_smiles(i) for i in self.smiles]

    def get_mol_from_smiles(self, smile):
        """Return the RDKit molecule for ``smile``, or None if it is not a string.

        Non-string entries (for example the NaN pandas uses for an empty CSV
        field) are mapped to None instead of being passed to RDKit. The result
        of ``Chem.MolFromSmiles`` is returned unchanged; ``compute_MACCS``
        treats a None result as "no fingerprint available".
        """
        if isinstance(smile, str):
            return Chem.MolFromSmiles(smile)
        return None

    def compute_MACCS(self, name):
        """Write one fingerprint row per drug to ``<name without extension>_MACCS.csv``.

        Parameters
        ----------
        name : str
            Base file name or path. Its extension, whatever its length, is
            replaced by the ``_MACCS.csv`` suffix ("output_file.csv" gives
            "output_file_MACCS.csv").

        Returns
        -------
        str
            Path of the CSV file that was written.

        The file has the columns 'Drug Name', 'SMILES', bit0 .. bit166. Drugs
        without a molecule get the text 'NaN' in every bit column, which
        ``pandas.read_csv`` parses back as a missing value by default.
        """
        import os  # local import: this cell does not import os itself

        header = ['bit' + str(i) for i in range(self.N_BITS)]

        MACCS_list = []
        for mol in self.mols:
            if mol is not None:
                MACCS_list.append(list(MACCSkeys.GenMACCSKeys(mol).ToBitString()))
            else:
                MACCS_list.append(['NaN'] * self.N_BITS)

        df = pd.DataFrame(MACCS_list, columns=header)
        # Insert plain lists, not the Series themselves: a Series is aligned on
        # its index, so an input frame with a non-default index (filtered or
        # re-indexed) would end up with blank or shifted names and SMILES.
        df.insert(loc=0, column='Drug Name', value=list(self.names))
        df.insert(loc=1, column='SMILES', value=list(self.smiles))

        # splitext removes the real extension rather than a fixed four characters.
        out_path = os.path.splitext(name)[0] + '_MACCS.csv'
        df.to_csv(out_path, index=False)
        return out_path

# Read the input file containing drug SMILES and drug names
# (same file as the load cell near the top, re-read so this cell runs on its own)
data = pd.read_csv('./Drug_smiles.csv')

# Create an instance of the MACCS class and compute MACCS keys.
# The result is written to 'output_file_MACCS.csv': compute_MACCS replaces the
# '.csv' extension of the given name with '_MACCS.csv'.
maccs_calculator = MACCS(data)
maccs_calculator.compute_MACCS("output_file.csv")

In [24]:
# Reload the fingerprint table written by compute_MACCS in the previous cell.
data_Macc = pd.read_csv("./output_file_MACCS.csv")

In [25]:
# Preview: 'Drug Name', 'SMILES', then one column per fingerprint bit (bit0 .. bit166).
data_Macc.head()

Unnamed: 0,Drug Name,SMILES,bit0,bit1,bit2,bit3,bit4,bit5,bit6,bit7,...,bit157,bit158,bit159,bit160,bit161,bit162,bit163,bit164,bit165,bit166
0,2aminobenzothiazole_conc3_T30,C1=CC=C2C(=C1)N=C(S2)N,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,0,1,0
1,2hydroxyethylhydrazine_conc3_T30,C(CO)NN,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,1,0,0
2,3aminotriazole_conc3_T30,C1=NNC(=N1)N,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,0,1,0
3,4hydroxytamoxifene_conc3_T30,CCC(=C(C1=CC=C(C=C1)O)C2=CC=C(C=C2)OCCN(C)C)C3...,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
4,A23187_conc3_T30,CC1CCC2(C(CC(C(O2)C(C)C(=O)C3=CC=CN3)C)C)OC1CC...,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0


In [26]:
# Specify the columns to be deleted
columns_to_delete = ['SMILES']

# Drop the specified columns. errors='ignore' keeps the cell re-runnable: on a
# second run 'SMILES' is already gone and a plain drop would raise KeyError.
data_Macc = data_Macc.drop(columns=columns_to_delete, errors='ignore')

In [31]:
import pandas as pd
import itertools

# Work on the MACCS table prepared above: one row per drug, a 'Drug Name'
# column followed by the fingerprint columns.
data3 = data_Macc

# Extract drug names and substructure data
drug_names = data3['Drug Name'].tolist()
substructure_data = data3.drop(columns=['Drug Name'])

# Cast the bits to float rather than int. The averaged profiles are floats
# either way, but an integer cast raises as soon as one drug has missing (NaN)
# bits, which is how the MACCS table records a drug without a usable SMILES.
# With float, that drug's pairs simply carry NaN profiles.
substructure_data = substructure_data.astype(float)

# Generate every unordered pair of drugs: n * (n - 1) / 2 combinations
combinations = list(itertools.combinations(drug_names, 2))

# One profile row per drug, in the same order as drug_names, so pairs can be
# looked up by position instead of re-scanning the name column for every pair.
# NOTE(review): 'Drug Name' has already been dropped, so the [1:] slice removes
# the first remaining column (bit0 when only bit columns are left). Kept as in
# the original; confirm that skipping it is intended.
profiles = substructure_data.to_numpy()[:, 1:]
row_pairs = itertools.combinations(range(len(drug_names)), 2)

# Calculate the joint substructure profile (element-wise mean) of each pair
joint_profiles_list = []
for (drug1, drug2), (row1, row2) in zip(combinations, row_pairs):
    joint_profile = (profiles[row1] + profiles[row2]) / 2

    # Append the combination names and joint profile to the list
    joint_profiles_list.append([drug1, drug2] + joint_profile.tolist())

# Create a DataFrame from the list. The width comes from the profile matrix, so
# this also works when there are fewer than two drugs and the loop never ran.
columns = ['Drug1', 'Drug2'] + [f'Joint_Substructure_{i}' for i in range(1, profiles.shape[1] + 1)]
joint_profiles_df = pd.DataFrame(joint_profiles_list, columns=columns)

# Save the DataFrame to a CSV file
joint_profiles_df.to_csv('joint_substructures.csv', index=False)

In [37]:
import pandas as pd
import itertools
from rdkit.Chem import MACCSkeys

# 'data_Macc' is the DataFrame of MACCS keys prepared above (one row per drug)
data3 = data_Macc

# Extract drug names and MACCS keys data
drug_names = data3['Drug Name'].tolist()
maccs_data = data3.drop(columns=['Drug Name'])

# Generate all possible combinations of 2 drugs
combinations = list(itertools.combinations(drug_names, 2))

# One profile row per drug, in the same order as drug_names, so each pair is
# resolved by row position rather than by two boolean-mask scans of the name column.
# NOTE(review): 'Drug Name' has already been dropped, so the [1:] slice removes
# the first remaining column (bit0 when only bit columns are left). Kept as in
# the original; confirm that skipping it is intended.
profiles = maccs_data.to_numpy()[:, 1:]
row_pairs = itertools.combinations(range(len(drug_names)), 2)

# Calculate the joint substructure profile (element-wise mean) of each pair
joint_profiles_list = []
for (drug1, drug2), (row1, row2) in zip(combinations, row_pairs):
    joint_profile = (profiles[row1] + profiles[row2]) / 2

    # Append the combination names, the 'drug1+drug2' label and the joint profile
    joint_profiles_list.append([drug1, drug2, '+'.join((drug1, drug2))] + joint_profile.tolist())

# Create a DataFrame from the list. The width comes from the profile matrix, so
# this also works when there are fewer than two drugs and the loop never ran.
columns = ['Drug1', 'Drug2', 'Combination'] + [f'Joint_Substructure_{i}' for i in range(1, profiles.shape[1] + 1)]
joint_profiles_df = pd.DataFrame(joint_profiles_list, columns=columns)

# Save the DataFrame to a CSV file
joint_profiles_df.to_csv('joint_substructures.csv', index=False)


In [38]:
# Reload the pairwise joint profiles saved by the previous cell
# (columns: Drug1, Drug2, Combination, Joint_Substructure_*).
data4 = pd.read_csv("./joint_substructures.csv")

In [39]:
# Keep every pair in which fluconazole (conc2, T30) is one of the two drugs.
# Comparing Drug1/Drug2 for equality, instead of a regex substring search on
# the joined 'Combination' label, cannot pick up another drug whose name merely
# contains this text.
is_fluco_pair = data4[['Drug1', 'Drug2']].eq('fluconazole_conc2_T30').any(axis=1)
df_fluco = data4[is_fluco_pair]

# Write the filtered dataset to a new file
df_fluco.to_csv('fluconazole_dataset.csv', index=False)


In [None]:
# Keep every pair in which geldanamycin (conc3, T30) is one of the two drugs.
# Comparing Drug1/Drug2 for equality, instead of a regex substring search on
# the joined 'Combination' label, cannot pick up another drug whose name merely
# contains this text.
is_geldanamycin_pair = data4[['Drug1', 'Drug2']].eq('geldanamycin_conc3_T30').any(axis=1)
df_geldanamycin = data4[is_geldanamycin_pair]

# Write the filtered dataset to a new file
df_geldanamycin.to_csv('geldanamycin.csv', index=False)