In [1]:
# Mount Google Drive (Colab only) so the /content/drive/MyDrive/... paths used
# by the cells below for datasets and saved models are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [3]:
import pandas as pd

# read the csv
# please change the path
data = pd.read_csv("/content/drive/MyDrive/1:1_Neel Banga/Datasets/smile_pic50_processed.csv")
# The file was written together with its row index, which is read back as an
# "Unnamed: 0" column. Drop it (when present) so it is not treated as a real
# column by the cells that follow.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
data.head()

Unnamed: 0,canonical_smiles,pIC50
0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,6.38405
1,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,8.69897
2,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,6.337242
3,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,5.045757
4,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,5.251812


In [4]:
# Remember the columns the csv came with; they are excluded from the
# descriptor columns later in preprocessing.
MAIN_COLS = data.columns.tolist()
print("Main columns", MAIN_COLS)

Main columns ['canonical_smiles', 'pIC50']


In [5]:
def extract_lipinski_and_other_descriptors(raw):
    """
    Compute every RDKit descriptor for one row and append the values to it.

    The SMILES string is read from raw["canonical_smiles"] and parsed once.
    Each (name, function) pair in Descriptors._descList is then evaluated on
    the parsed molecule and stored under its name.

    @param raw
    pandas Series (one dataframe row) with a "canonical_smiles" entry

    @return
    copy of raw with one additional entry per descriptor; a descriptor that
    fails is set to NaN, and all descriptors are set to NaN when the SMILES
    cannot be parsed
    """
    from rdkit import Chem
    from rdkit.Chem import Descriptors
    import numpy as np

    # work on a copy so the row handed in by DataFrame.apply is never mutated
    raw = raw.copy()

    # get the canonical smiles formula
    smile = raw["canonical_smiles"]
    # Only strings are parsed (a missing value would not be a valid SMILES);
    # MolFromSmiles itself returns None when the string cannot be parsed.
    processed_smile = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None

    if processed_smile is None:
        # Report the bad row once instead of letting every single descriptor
        # fail and print its own error message.
        print("Could not parse SMILES: {!r}".format(smile))
        for descriptor_name, _ in Descriptors._descList:
            raw[descriptor_name] = np.nan
        return raw

    # loop through descriptor functions of rdkit descriptors
    for descriptor_name, descriptor_function in Descriptors._descList:
        try:
            raw[descriptor_name] = descriptor_function(processed_smile)
        except Exception as error:
            # best effort: keep the row and mark only this descriptor as missing
            print(str(error))
            raw[descriptor_name] = np.nan

    return raw

# Compute the descriptors row by row: each row is passed to the extractor and
# the returned rows (with the descriptor entries added) form the new dataframe.
data = data.apply(extract_lipinski_and_other_descriptors, axis = "columns")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [6]:
import numpy as np

# Check for infinite values in every column except the SMILES strings.
infinite_values = np.isinf(data[[col for col in data.columns if col not in ["canonical_smiles"]]]).sum()
infinite_cols = list(infinite_values[infinite_values > 0].index)
print("Columns with infinite Values", infinite_cols)

if len(infinite_cols) > 0:
    # replace infinities with NaNs so they can be removed by dropna
    data = data.replace([np.inf, -np.inf], np.nan)
    print("Shape before dropping Nans: {}".format(data.shape))
    # NOTE: how="any" removes every row that has a NaN in any column, not only
    # the rows that contained an infinite value.
    data = data.dropna(how = "any")
    print("Shape after dropping Nans: {}".format(data.shape))

Columns with infinite Values []


In [7]:
# Count the missing entries per column and keep the names of the columns
# that have at least one.
missing_values = data.isna().sum()
cols_missing = missing_values[missing_values.gt(0)].index.tolist()
cols_missing

['MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW']

In [8]:
from sklearn.model_selection import train_test_split

# split the dataset: X is every column except the target, Y is the pIC50 target
X = data.loc[:, [cols for cols in list(data.columns) if cols != "pIC50"]]
Y = data.loc[:, "pIC50"]

# Fixed seed so the same rows land in train/test on every run; the imputer,
# the scaler and the csv files saved further down all depend on this split.
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = RANDOM_STATE)
# Take independent copies: later cells assign whole columns back into
# x_train / x_test, which should not happen on a slice of X.
x_train, x_test = x_train.copy(), x_test.copy()

print(x_train.shape)
print(y_train.shape)

(5882, 211)
(5882,)


### Filling missing values

In [9]:
# fill missing values
from sklearn.impute import SimpleImputer
import pickle

if cols_missing:
    print("Filling missing values")
    # learn the per-column medians from the training split only
    imputer = SimpleImputer(strategy='median')
    imputer.fit(x_train[cols_missing])
    # fill the train split first, then the test split, with those medians
    for split_frame in (x_train, x_test):
        split_frame[cols_missing] = imputer.transform(split_frame[cols_missing])

    # save the impute model for later use
    with open("/content/drive/MyDrive/1:1_Neel Banga/Models/imputation_model", "wb") as impute_model:
        pickle.dump(imputer, impute_model)
        print("model saved")

Filling missing values
model saved


### Scaling
- Scale the feature values as a preprocessing step; the same scaling will be needed later when predicting on pubchem10M.
- Please create a folder called **Models** inside your drive and save the scaling model there. We will need this model later on.
- **THIS IS COMPULSORY**.

In [10]:
from sklearn.preprocessing import StandardScaler

# Columns to scale: everything in x_train that is not one of the original
# csv columns (canonical smile and pIC50).
REQUIRED_COLUMNS = [name for name in x_train.columns if name not in MAIN_COLS]
print(REQUIRED_COLUMNS)

print("Scaling")
# fit the scaler on the training split only
stc = StandardScaler()
stc.fit(x_train[REQUIRED_COLUMNS])
# apply the fitted scaler to the train split, then to the test split
for split_frame in (x_train, x_test):
    split_frame[REQUIRED_COLUMNS] = stc.transform(split_frame[REQUIRED_COLUMNS])
# save the scaling model for later use
with open("/content/drive/MyDrive/1:1_Neel Banga/Models/scaling_model", "wb") as scaling_file:
    pickle.dump(stc, scaling_file)
    print("model saved")

['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'S

In [11]:
selected_columns = ["canonical_smiles", "pIC50"]

# train: features and target side by side, renumbered from 0
train_csv = pd.concat([x_train, y_train], axis = 1).reset_index(drop = True)
# smile + pIC50 only, for chemberta and pubchem
train_csv_smile = train_csv[selected_columns]
# the descriptor table keeps pIC50 but not the smile string
train_csv = train_csv.drop(columns = selected_columns[0])

# test: same three steps
test_csv = pd.concat([x_test, y_test], axis = 1).reset_index(drop = True)
# smile + pIC50 only, for chemberta and pubchem
test_csv_smile = test_csv[selected_columns]
# the descriptor table keeps pIC50 but not the smile string
test_csv = test_csv.drop(columns = selected_columns[0])

In [12]:
# Write the train and test tables to Drive, leaving out the row index.
for frame, destination in (
    (train_csv, "/content/drive/MyDrive/1:1_Neel Banga/Datasets/train.csv"),
    (test_csv, "/content/drive/MyDrive/1:1_Neel Banga/Datasets/test.csv"),
):
    frame.to_csv(destination, index = False)

In [13]:
# Write the smile + pIC50 tables to Drive, leaving out the row index.
for frame, destination in (
    (train_csv_smile, "/content/drive/MyDrive/1:1_Neel Banga/Datasets/smile_train.csv"),
    (test_csv_smile, "/content/drive/MyDrive/1:1_Neel Banga/Datasets/smile_test.csv"),
):
    frame.to_csv(destination, index = False)