# Machine learning-aided algorithm to predict absorption maximum wavelength and cytotoxicity of platinum-containing BODIPYs

In [3]:
# Importing libraries.
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.compose import ColumnTransformer
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import pickle
from model_files.classes_imported import GetFgp, GetSolventParams, get_weights
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder

# Declaring constant values.
# Seed handed to the CatBoost regressor below (its `random_state` argument).
RANDOM_STATE = 12345
# Relative path of the directory that holds the train/valid/test CSV files
# read by the data-loading cells.
TRAINING_DATA_DIRECTORY = 'train_valid_test_sets'

## Absorption wavelength prediction.

In [4]:
# Feature pipeline: the 'smiles' column goes through GetFgp (molecular
# fingerprints) and the 'solvent' column through GetSolventParams
# (solvent parameters).
column_transformer_fgp = ColumnTransformer(
    transformers=[
        ('smile_transformer', GetFgp(), 'smiles'),
        ('solvent_transformer', GetSolventParams(), 'solvent'),
    ]
)

# CatBoost regressor configured with the already-optimized hyperparameters.
cat = CatBoostRegressor(
    loss_function='RMSE',
    iterations=75,
    learning_rate=0.27,
    depth=4,
    l2_leaf_reg=5.5,
    random_state=RANDOM_STATE,
    logging_level='Silent',
)

In [5]:
# Load the three absorption subsets (train / valid / test) from CSV.
data_train, data_valid, data_test = (
    pd.read_csv(f'{TRAINING_DATA_DIRECTORY}/absorption/data_train.csv'),
    pd.read_csv(f'{TRAINING_DATA_DIRECTORY}/absorption/data_valid.csv'),
    pd.read_csv(f'{TRAINING_DATA_DIRECTORY}/absorption/data_test.csv'),
)

In [6]:
# Getting features and target values for model.
def get_features_abd_targets(df:pd.DataFrame, fit:bool=True) -> tuple:
    """Split a data frame into transformed features and absorption targets.

    Parameters
    ----------
    df : pd.DataFrame
        Data subset containing an 'absorption' column (the target) plus the
        columns consumed by ``column_transformer_fgp``.
    fit : bool, default True
        When True the column transformer is fitted on ``df`` before
        transforming it (the previous behaviour). Pass False to reuse the
        already fitted transformer, which is what validation and test data
        need so that nothing is learned from held-out samples.

    Returns
    -------
    tuple
        ``(features, targets)`` where ``features`` is the output of the
        column transformer and ``targets`` is the 'absorption' column.
    """
    raw_features = df.drop('absorption', axis=1)
    if fit:
        features = column_transformer_fgp.fit_transform(raw_features)
    else:
        features = column_transformer_fgp.transform(raw_features)
    targets = df['absorption']

    return features, targets


# The transformer is fitted on the training subset only; validation and test
# subsets are passed through the fitted transformer unchanged.
features_train, target_train = get_features_abd_targets(data_train, fit=True)
features_valid, target_valid = get_features_abd_targets(data_valid, fit=False)
features_test, target_test = get_features_abd_targets(data_test, fit=False)

In [7]:
# Fit the CatBoost regressor on the training features and targets.
cat.fit(X=features_train, y=target_train)

<catboost.core.CatBoostRegressor at 0x28ea8bc8910>

In [8]:
# Results on training, validation and test datasets.
def get_predictions(model, features, target, df_subset:str, unit:str='nm') -> None:
    """Print the RMSE of ``model`` on one data subset.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict(features)``.
    features
        Feature matrix passed to ``model.predict``.
    target
        True target values the predictions are compared against.
    df_subset : str
        Subset label shown in the printed line (e.g. 'train').
    unit : str, default 'nm'
        Unit label appended after the value. Pass '' to print no unit,
        e.g. when the target is not a wavelength.

    Returns
    -------
    None
        The RMSE (rounded to 4 decimals) is printed, not returned.
    """
    prediction = model.predict(features)
    # The `squared=False` argument of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6. Taking the square root of the
    # per-output MSE and averaging reproduces what `squared=False` returned.
    per_output_mse = mean_squared_error(target, prediction, multioutput='raw_values')
    rmse = float(np.mean(np.sqrt(per_output_mse)))

    message_parts = [f'RMSE ({df_subset}) ->', round(rmse, 4)]
    if unit:
        message_parts.append(unit)
    print(*message_parts)


# Report the CatBoost RMSE on each subset, in train / valid / test order.
for subset_label, subset_features, subset_target in (
    ('train', features_train, target_train),
    ('valid', features_valid, target_valid),
    ('test', features_test, target_test),
):
    get_predictions(cat, subset_features, subset_target, subset_label)

RMSE (train) -> 29.9508 nm
RMSE (valid) -> 28.9123 nm
RMSE (test) -> 38.3005 nm


In [9]:
# Persist the trained CatBoost regressor as a pickle in the working directory.
# NOTE(review): only the regressor is pickled here, not column_transformer_fgp;
# confirm that the feature transformer is available wherever this file is loaded.
with open('cat_fgp.sav', 'wb') as model_file:
    pickle.dump(cat, model_file)

## $\lg(\mathrm{IC}_{50})$ prediction.

In [10]:
# Load the pre-split IC50 features and targets, one CSV per subset.
features_train_knn = pd.read_csv(f'{TRAINING_DATA_DIRECTORY}/ic_50/features_train.csv')
target_train_knn = pd.read_csv(f'{TRAINING_DATA_DIRECTORY}/ic_50/target_train.csv')

features_valid_knn = pd.read_csv(f'{TRAINING_DATA_DIRECTORY}/ic_50/features_valid.csv')
target_valid_knn = pd.read_csv(f'{TRAINING_DATA_DIRECTORY}/ic_50/target_valid.csv')

features_test_knn = pd.read_csv(f'{TRAINING_DATA_DIRECTORY}/ic_50/features_test.csv')
target_test_knn = pd.read_csv(f'{TRAINING_DATA_DIRECTORY}/ic_50/target_test.csv')

In [11]:
# One Hot Encoding (the first category of each column is dropped).
ohe = OneHotEncoder(drop='first')

# Text vectorization: character 5-grams of the SMILES string hashed into
# 32 features.
hv = HashingVectorizer(ngram_range=(5,5),
                       analyzer='char',
                       n_features=32)

column_transformer = ColumnTransformer([('smile_transformer', hv, 'smiles'),
                                        ('ohe', ohe, ['further_details', 'cell_line'])])

# Transforming the data: the transformer is fitted on the training features
# only and then applied to every subset.
column_transformer.fit(features_train_knn)

# Explicit rebinding instead of writing through globals(): the dataflow stays
# visible and no loop variable is left behind in the namespace.
# NOTE: the raw frames are replaced by their transformed versions, so re-run
# the data-loading cell before re-running this one.
features_train_knn = column_transformer.transform(features_train_knn)
features_valid_knn = column_transformer.transform(features_valid_knn)
features_test_knn = column_transformer.transform(features_test_knn)

In [12]:
# kNN regressor with the already-optimized hyperparameters; neighbours are
# weighted by the project-provided `get_weights` callable.
knn = KNeighborsRegressor(
    n_neighbors=4,
    weights=get_weights,
    p=2,
)

# Fit on the transformed training features and their targets.
knn.fit(features_train_knn, target_train_knn)

In [13]:
# Report the kNN RMSE on each subset, in train / valid / test order.
# NOTE: get_predictions labels the value with 'nm', but the target in this
# section is lg(IC50), so the printed unit does not apply to these numbers.
for subset_label, subset_features, subset_target in (
    ('train', features_train_knn, target_train_knn),
    ('valid', features_valid_knn, target_valid_knn),
    ('test', features_test_knn, target_test_knn),
):
    get_predictions(knn, subset_features, subset_target, subset_label)

RMSE (train) -> 0.3682 nm
RMSE (valid) -> 0.3909 nm
RMSE (test) -> 0.6149 nm


In [14]:
# Persist the trained kNN regressor as a pickle in the working directory.
# NOTE(review): the fitted column_transformer is not saved alongside it;
# confirm it is available wherever this model is loaded for inference.
with open('knn_model.sav', 'wb') as model_file:
    pickle.dump(knn, model_file)