# Applying the classifiers that can predict the aerobic biodegradability of organic chemicals

To run this file, you should clone the repository and use the main_venv.

In [1]:
import pandas as pd
import pickle
import numpy as np

from code_files.processing_functions import convert_to_maccs_fingerprints
from code_files.processing_functions import bit_vec_to_lst_of_lst
from code_files.processing_functions import check_substances_in_ad
from code_files.processing_functions import get_datasets_for_ad

### Loading the models

In [2]:
# Paths of the pickled classifiers in the models/ folder, keyed by the short
# name that is later used for the prediction columns.
# NOTE: pickle.load can execute arbitrary code, so only load model files that
# come from a source you trust.
model_paths = {
    "scs": 'models/xgbc_df_curated_scs.pkl',
    "biowin": 'models/xgbc_df_curated_biowin.pkl',
    "final": 'models/xgbc_df_curated_final.pkl',
}

classifier_names = list(model_paths)
classifiers = []
for model_path in model_paths.values():
    with open(model_path, 'rb') as model_file:
        classifiers.append(pickle.load(model_file))

# Keep the individual models available under their own names as well.
classifier_scs, classifier_biowin, classifier_final = classifiers


### Input data

To run the model on new data, you should create an Excel file in the following format: the first column should contain some kind of identifier for the substances, and the second column should contain the SMILES. The SMILES column has to be called "smiles". Make sure that you have added the correct SMILES for each substance and that, for all ionizable substances, you have ideally added the SMILES of the substance’s dominant species at pH 7.4 and 298 K.

In [3]:
file_name = "substance_file" # Change this to the name of your Excel file (without the ".xlsx" extension)
df_substances = pd.read_excel(f"{file_name}.xlsx")

# Fail early with a clear message if the required SMILES column is missing,
# instead of failing later inside the fingerprint conversion.
if "smiles" not in df_substances.columns:
    raise ValueError(
        f'The file "{file_name}.xlsx" has no column called "smiles"; '
        f"found columns: {list(df_substances.columns)}"
    )

# Convert the substances to MACCS fingerprints and then to the feature array
# that is passed to the classifiers' predict method.
df = convert_to_maccs_fingerprints(df_substances)
x_class = bit_vec_to_lst_of_lst(df, False)
x = np.array(x_class, dtype=object)

### Predicting the ready biodegradability

Meaning of the labels:
- 0: Not readily biodegradable (NRB)
- 1: Readily biodegradable (RB)

In [4]:
# Add one prediction column per classifier (0 = NRB, 1 = RB).
for model_label, model in zip(classifier_names, classifiers):
    prediction_column = f"prediction_{model_label}"
    df_substances[prediction_column] = model.predict(x)

# Save the substances together with their predictions, reusing the input file
# name with a "_predicted" suffix.
# NOTE(review): to_excel also writes the DataFrame index as an extra column.
output_path = f"{file_name}_predicted.xlsx"
df_substances.to_excel(output_path)

df_substances.head()

Unnamed: 0,CAS,smiles,prediction_scs,prediction_biowin,prediction_final
0,6372-81-2,[O-]C(=O)C1=CC=CC=C1NN=C1C(=O)C=CC2=CC=CC=C12,0,0,0
1,93940-93-3,CN(CCO)CC([O-])=O,1,1,1
2,178452-71-6,CCCCN1C(=O)C(=NNC2=CC=C(C=C2)S(=O)(=O)NC2=CC3=...,0,0,0
3,75214-69-6,CN(C)S(=O)(=O)C1=CC(O)=C([N-]N=C2C(C)=NN(C2=O)...,0,0,0
4,102-54-5,[CH-]1C=CC=C1.[CH-]1C=CC=C1.[Fe+2],1,1,1


### Check if substances are in the applicability domain (AD)

Meaning of the labels:
- 0: Not in AD
- 1: In AD

In [5]:
# Datasets used for the AD check, keyed by the name of the corresponding model.
df_curated_scs, df_curated_biowin, df_curated_final = get_datasets_for_ad()
ad_training_sets = {
    "scs": df_curated_scs,
    "biowin": df_curated_biowin,
    "final": df_curated_final,
}

# The loop variable is deliberately not called `df`, so that the fingerprint
# DataFrame `df` created in the input-data cell is not overwritten.
for name, df_train in ad_training_sets.items():
    df_substances = check_substances_in_ad(df_train=df_train, df_train_name=name, df_test=df_substances)

# NOTE: the Excel file with the predictions was written before this cell ran,
# so the AD columns shown here are not part of that file.
df_substances.head()

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,CAS,smiles,prediction_scs,prediction_biowin,prediction_final,in_ad_of_scs,in_ad_of_biowin,in_ad_of_final
0,6372-81-2,[O-]C(=O)C1=CC=CC=C1NN=C1C(=O)C=CC2=CC=CC=C12,0,0,0,1,1,1
1,93940-93-3,CN(CCO)CC([O-])=O,1,1,1,1,1,1
2,178452-71-6,CCCCN1C(=O)C(=NNC2=CC=C(C=C2)S(=O)(=O)NC2=CC3=...,0,0,0,1,1,1
3,75214-69-6,CN(C)S(=O)(=O)C1=CC(O)=C([N-]N=C2C(C)=NN(C2=O)...,0,0,0,1,1,1
4,102-54-5,[CH-]1C=CC=C1.[CH-]1C=CC=C1.[Fe+2],1,1,1,0,0,0
