In [1]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error
import xgboost as xgb
import numpy as np
import pandas as pd

#  1. Load Data and Delete duplicates

In [2]:


def delete_rows_with_same_values(df, columns_to_check):
    X = df.copy()  # Make a copy of the DataFrame to avoid modifying the original
    X.sort_values(by="ddG", ascending=False)
    # Identify rows where specified columns have the same value
    rows_to_delete = X.duplicated(subset=columns_to_check, keep='first') 

    # Invert the boolean mask to keep rows that don't have the same values in specified columns
    X_filtered = X[~rows_to_delete]

    return X_filtered

columns_to_check_for_duplicates = ['starting electrophile SMILES', 'nucleophile SMILES',
        '3,3 Catalyst Substituent ', 'Temperature (Kelvin)', 
        'N Catalyst Substituent']


In [3]:

result = pd.read_csv("../Data/merged_max.csv")
result = delete_rows_with_same_values(result, columns_to_check_for_duplicates)
result.to_csv("../Data/clean_merged_max.csv")
result= pd.read_csv("../Data/clean_merged_max.csv")#Relod to recount rows


In [4]:
#print unique molecules
print(len(np.unique(result['input electrophile SMILES'])),len(np.unique(result['nucleophile SMILES'])),len(np.unique(result['N Catalyst Substituent'])),len(np.unique(result['3,3 Catalyst Substituent '])))

155 49 5 25


# 2. Load RDKit Descrciptors

In [5]:
from functions.common_functions import *

electrophileSMILES = result['input electrophile SMILES'].values
electrophiledescriptors = smi2RDKIT(electrophileSMILES)
nucleophileSMILES = result['nucleophile SMILES'].values
nucleophiledescriptors = smi2RDKIT(nucleophileSMILES)
threethreesubSMILES = result['3,3 Catalyst Substituent '].values
threethreesubdescriptors = smi2RDKIT(threethreesubSMILES)


In [6]:
alldescriptors = electrophiledescriptors.join(nucleophiledescriptors, lsuffix='electrophile', rsuffix='nucleophile').join(threethreesubdescriptors, rsuffix='threethreesub')#.join(nsubdescriptors, rsuffix='nsubdescriptors')

In [7]:
# choose usefull molecular descriptors
chosen_descriptors  = ['Kappa1','MaxPartialCharge',   'MinEStateIndex', 'MinPartialCharge', 'MolLogP',  'ExactMolWt','qed']
chosen_descriptors2 = ['Kappa1','ExactMolWt', 'MaxPartialCharge','MinEStateIndex', 'MinPartialCharge',  'MolLogP']


In [8]:
def add_suffixes(input_list,input_list2):
    result_list = []
    for entry in input_list:
        result_list.append(entry + 'electrophile')
        result_list.append(entry + 'nucleophile')
    for entry in input_list2:
        result_list.append(entry + '')
    return result_list

# Example list


# Add suffixes '_e' and '_n' to each entry
output_list = add_suffixes(chosen_descriptors,chosen_descriptors2)

# Display the result
print(output_list)

['Kappa1electrophile', 'Kappa1nucleophile', 'MaxPartialChargeelectrophile', 'MaxPartialChargenucleophile', 'MinEStateIndexelectrophile', 'MinEStateIndexnucleophile', 'MinPartialChargeelectrophile', 'MinPartialChargenucleophile', 'MolLogPelectrophile', 'MolLogPnucleophile', 'ExactMolWtelectrophile', 'ExactMolWtnucleophile', 'qedelectrophile', 'qednucleophile', 'Kappa1', 'ExactMolWt', 'MaxPartialCharge', 'MinEStateIndex', 'MinPartialCharge', 'MolLogP']


In [9]:
# Create a new DataFrame with only the selected columns
result_df = alldescriptors[output_list]
result_df

Unnamed: 0,Kappa1electrophile,Kappa1nucleophile,MaxPartialChargeelectrophile,MaxPartialChargenucleophile,MinEStateIndexelectrophile,MinEStateIndexnucleophile,MinPartialChargeelectrophile,MinPartialChargenucleophile,MolLogPelectrophile,MolLogPnucleophile,ExactMolWtelectrophile,ExactMolWtnucleophile,qedelectrophile,qednucleophile,Kappa1,ExactMolWt,MaxPartialCharge,MinEStateIndex,MinPartialCharge,MolLogP
0,7.042665,6.956883,0.149570,0.047963,0.730278,-0.775270,-0.297896,-0.103369,2.6523,2.5106,156.057515,114.086477,0.579913,0.382049,5.482230,128.06260,-0.018404,1.310185,-0.061630,2.8398
1,8.754380,6.956883,0.149570,0.047963,0.696914,-0.775270,-0.496745,-0.103369,2.6609,2.5106,186.068080,114.086477,0.673793,0.382049,5.482230,128.06260,-0.018404,1.310185,-0.061630,2.8398
2,8.448344,6.956883,0.149570,0.047963,0.717428,-0.775270,-0.297896,-0.103369,3.4148,2.5106,233.968027,114.086477,0.691883,0.382049,5.482230,128.06260,-0.018404,1.310185,-0.061630,2.8398
3,5.035138,6.956883,0.149548,0.047963,0.729167,-0.775270,-0.297896,-0.103369,1.4991,2.5106,106.041865,114.086477,0.495636,0.382049,5.482230,128.06260,-0.018404,1.310185,-0.061630,2.8398
4,5.947877,6.956883,0.152494,0.047963,-0.465278,-0.775270,-0.297781,-0.103369,1.6382,2.5106,124.032443,114.086477,0.519899,0.382049,7.807968,180.09390,-0.014904,1.187685,-0.061951,3.4522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,6.410000,2.703214,0.144920,-0.016293,0.821759,1.138889,-0.298281,-0.080750,1.5416,1.5025,98.073165,66.046950,0.377957,0.401713,7.571439,178.07825,-0.010542,1.311296,-0.061629,3.9930
319,5.544953,2.703214,0.145202,-0.016293,0.971389,1.138889,-0.298269,-0.080750,1.6857,1.5025,110.073165,66.046950,0.469249,0.401713,7.571439,178.07825,-0.010542,1.311296,-0.061629,3.9930
320,5.410000,2.703214,0.144910,-0.016293,0.787037,1.138889,-0.298281,-0.080750,1.1515,1.5025,84.057515,66.046950,0.343827,0.401713,7.571439,178.07825,-0.010542,1.311296,-0.061629,3.9930
321,4.566006,2.703214,0.145202,-0.016293,0.951389,1.138889,-0.298269,-0.080750,1.2956,1.5025,96.057515,66.046950,0.449462,0.401713,7.571439,178.07825,-0.010542,1.311296,-0.061629,3.9930


# 3. Data Engineering

In [10]:
result.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'reaction',
       'starting electrophile SMILES', 'input electrophile SMILES',
       'nucleophile SMILES', '3,3 Catalyst Substituent ',
       'N Catalyst Substituent', 'solvent', 'Temperature (Celsius)',
       'Temperature (Kelvin)', 'yield (%)', 'ee (%)', 'e.r. 1', 'e.r. 2',
       'ddG', 'reference ', 'Link', 'ster1', 'ster2', 'ster3', 'polarxx',
       'polarxy', 'polaryy', 'polarzx', 'polaryz', 'polarzz', 'mean_polar',
       'HOMO', 'LUMO', 'biggest distance', 'major_axis', 'Axis_2', 'Axis_3',
       'Vbur75', 'old_name', 'smiles_e', 'L_e', 'Bmin_e', 'Bmax_e',
       'bur_shell_e', 'bur_vol_e', 'max_distance_e', 'tot_V_e', 'disp_e',
       'max_axis_e', 'A2_e', 'min_axis_e', 'smiles_n', 'L_n', 'Bmin_n',
       'Bmax_n', 'bur_shell_n', 'bur_vol_n', 'max_distance_n', 'tot_V_n',
       'disp_n', 'max_axis_n', 'A2_n', 'min_axis_n'],
      dtype='object')

In [11]:
col_names = ['ddG','ster1', 'ster2', 
       'ster3',  
       'input electrophile SMILES', 'nucleophile SMILES', 'starting electrophile SMILES',
       'HOMO','LUMO',
       'N Catalyst Substituent', 'Temperature (Kelvin)',
      'mean_polar', 
       'biggest distance',
       'major_axis', 'Axis_2', 'Axis_3' ,
       'L_e', 'Bmin_e', 'Bmax_e', 'bur_vol_e', 
       'L_n', 'Bmin_n', 'Bmax_n', 'bur_vol_n', 
        '3,3 Catalyst Substituent ',
       'tot_V_e', 'disp_e',  
       'Vbur75',
        'tot_V_n', 'disp_n',  
       'max_distance_n','max_distance_e'
       
      ]
DFT_data = result.loc[:,col_names]

DFT_data["Box_vol"] = DFT_data['major_axis']*DFT_data['Axis_2']*DFT_data['Axis_3']
DFT_data["RAxis_2"] = DFT_data['Axis_2']/DFT_data['major_axis']
DFT_data["RAxis_3"] = DFT_data['Axis_3']/DFT_data['major_axis']
DFT_data["disp_n"] = DFT_data['disp_n']/DFT_data['max_distance_n']
DFT_data["disp_e"] = DFT_data['disp_e']/DFT_data['max_distance_e']


DFT_data = DFT_data.drop(['Axis_2','Axis_3','max_distance_n','max_distance_e'],axis=1)

# Featurize Nsubs

DFT_data['Ncat_alif'] = DFT_data['N Catalyst Substituent'].apply(lambda x: 1 if x == 'NS(=O)(=O)C(F)(F)F' else (2 if x == 'NS(=O)(=O)C(F)(F)C(F)(F)F' else (4 if x == 'NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F' else ( 6 if x == 'NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F' else 0))))
DFT_data['Ncat_arom'] = DFT_data['N Catalyst Substituent'].apply(lambda x: 1 if x == 'NS(=O)(=O)c1c(F)c(F)c(F)c(F)c1F' else 0)




ddg= result.loc[:,'ddG']

ddg


0      1.231712
1      1.066407
2      1.002518
3      0.896691
4      1.033884
         ...   
318    1.338303
319    1.338303
320    1.092856
321    1.195347
322    0.537292
Name: ddG, Length: 323, dtype: float64

In [12]:
DFT_data

Unnamed: 0,ddG,ster1,ster2,ster3,input electrophile SMILES,nucleophile SMILES,starting electrophile SMILES,HOMO,LUMO,N Catalyst Substituent,...,tot_V_e,disp_e,Vbur75,tot_V_n,disp_n,Box_vol,RAxis_2,RAxis_3,Ncat_alif,Ncat_arom
0,1.231712,8.98,1.70,4.42,O=Cc2ccc1ccccc1c2,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,-0.26066,-0.02697,NS(=O)(=O)C(F)(F)F,...,0.001207,0.340037,10.694121,120.049756,0.414050,130.584488,0.473305,0.185862,1,0
1,1.066407,8.98,1.70,4.42,COc2ccc1cc(C=O)ccc1c2,C=CC[Si](C)(C)C,COc2ccc1cc(C=O)ccc1c2,-0.26066,-0.02697,NS(=O)(=O)C(F)(F)F,...,102.106155,0.933660,10.694121,120.049756,0.414050,130.584488,0.473305,0.185862,1,0
2,1.002518,8.98,1.70,4.42,O=Cc2ccc1cc(Br)ccc1c2,C=CC[Si](C)(C)C,O=Cc2ccc1cc(Br)ccc1c2,-0.26066,-0.02697,NS(=O)(=O)C(F)(F)F,...,0.001092,0.958748,10.694121,120.049756,0.414050,130.584488,0.473305,0.185862,1,0
3,0.896691,8.98,1.70,4.42,O=Cc1ccccc1,C=CC[Si](C)(C)C,O=Cc1ccccc1,-0.26066,-0.02697,NS(=O)(=O)C(F)(F)F,...,0.001296,0.497507,10.694121,120.049756,0.414050,130.584488,0.473305,0.185862,1,0
4,1.033884,11.14,2.13,4.49,O=Cc1ccccc1F,C=CC[Si](C)(C)C,O=Cc1ccccc1F,-0.26081,-0.02053,NS(=O)(=O)C(F)(F)F,...,0.001254,0.310178,7.767809,120.049756,0.414050,238.869273,0.404818,0.238722,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,1.338303,11.15,1.70,4.49,CC/C=C(C)/C=O,C1=CCC=C1,CC/C=C(C)/C=O,-0.26240,-0.02488,NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,...,80.426018,0.434489,7.767166,29.406022,0.724685,170.347416,0.409886,0.167639,4,0
319,1.338303,11.15,1.70,4.49,O=CC1=CCCCC1,C1=CCC=C1,O=CC1=CCCCC1,-0.26240,-0.02488,NS(=O)(=O)C(F)(F)C(F)(F)F,...,80.781766,0.410741,7.767166,29.406022,0.724685,170.347416,0.409886,0.167639,2,0
320,1.092856,11.15,1.70,4.49,C/C=C(C)/C=O,C1=CCC=C1,C/C=C(C)/C=O,-0.26240,-0.02488,NS(=O)(=O)C(F)(F)C(F)(F)F,...,37.591729,0.382726,7.767166,29.406022,0.724685,170.347416,0.409886,0.167639,2,0
321,1.195347,11.15,1.70,4.49,O=CC1=CCCC1,C1=CCC=C1,O=CC1=CCCC1,-0.26240,-0.02488,NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,...,50.989746,0.383794,7.767166,29.406022,0.724685,170.347416,0.409886,0.167639,4,0


In [13]:
alldesc = pd.concat([result_df ,DFT_data], axis=1)

alldesc

Unnamed: 0,Kappa1electrophile,Kappa1nucleophile,MaxPartialChargeelectrophile,MaxPartialChargenucleophile,MinEStateIndexelectrophile,MinEStateIndexnucleophile,MinPartialChargeelectrophile,MinPartialChargenucleophile,MolLogPelectrophile,MolLogPnucleophile,...,tot_V_e,disp_e,Vbur75,tot_V_n,disp_n,Box_vol,RAxis_2,RAxis_3,Ncat_alif,Ncat_arom
0,7.042665,6.956883,0.149570,0.047963,0.730278,-0.775270,-0.297896,-0.103369,2.6523,2.5106,...,0.001207,0.340037,10.694121,120.049756,0.414050,130.584488,0.473305,0.185862,1,0
1,8.754380,6.956883,0.149570,0.047963,0.696914,-0.775270,-0.496745,-0.103369,2.6609,2.5106,...,102.106155,0.933660,10.694121,120.049756,0.414050,130.584488,0.473305,0.185862,1,0
2,8.448344,6.956883,0.149570,0.047963,0.717428,-0.775270,-0.297896,-0.103369,3.4148,2.5106,...,0.001092,0.958748,10.694121,120.049756,0.414050,130.584488,0.473305,0.185862,1,0
3,5.035138,6.956883,0.149548,0.047963,0.729167,-0.775270,-0.297896,-0.103369,1.4991,2.5106,...,0.001296,0.497507,10.694121,120.049756,0.414050,130.584488,0.473305,0.185862,1,0
4,5.947877,6.956883,0.152494,0.047963,-0.465278,-0.775270,-0.297781,-0.103369,1.6382,2.5106,...,0.001254,0.310178,7.767809,120.049756,0.414050,238.869273,0.404818,0.238722,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,6.410000,2.703214,0.144920,-0.016293,0.821759,1.138889,-0.298281,-0.080750,1.5416,1.5025,...,80.426018,0.434489,7.767166,29.406022,0.724685,170.347416,0.409886,0.167639,4,0
319,5.544953,2.703214,0.145202,-0.016293,0.971389,1.138889,-0.298269,-0.080750,1.6857,1.5025,...,80.781766,0.410741,7.767166,29.406022,0.724685,170.347416,0.409886,0.167639,2,0
320,5.410000,2.703214,0.144910,-0.016293,0.787037,1.138889,-0.298281,-0.080750,1.1515,1.5025,...,37.591729,0.382726,7.767166,29.406022,0.724685,170.347416,0.409886,0.167639,2,0
321,4.566006,2.703214,0.145202,-0.016293,0.951389,1.138889,-0.298269,-0.080750,1.2956,1.5025,...,50.989746,0.383794,7.767166,29.406022,0.724685,170.347416,0.409886,0.167639,4,0


# 6. Divide data into reaction classes

In [14]:
filtered_df = alldesc[alldesc['nucleophile SMILES'] == alldesc['starting electrophile SMILES']]
list_one = np.unique(filtered_df['starting electrophile SMILES'])

list_two = ['C=C(OC)O[Si](C)(C)C','C=C(OC)O[Si](C)(C)C(C)(C)C','CO/C(O[Si](C)(C)C)=C(C)/C','C=C(OC)O[Si](CC)(CC)CC',
'C=C(OC)O[Si]1(C(C)C)C(C)CC1C','C=C(OC1CCCCC1)O[Si](C)(C)C(C)(C)C','C=C(OCc1ccccc1)O[Si](C)(C)C',
'C=C(OCc1ccccc1)O[Si](C)(C)C(C)(C)C','CO/C(=C\C(C)C)O[Si](C)(C)C','CO/C(=C/c1ccccc1)O[Si](C)(C)C','CO/C(=C\c1ccccc1)O[Si](C)(C)C',
'C=CO[Si](C)(C)C(C)(C)C','C=CO[Si](CC)(CC)CC','C=C(/C=C/c1ccccc1)O[Si](C)(C)C(C)(C)C','C=C(O[Si](C)(C)C(C)(C)C)c1ccccc1','C=CC[Si](C)(C)C',
'C=C1C=C(O[Si](C)(C)C)OC(C)(C)O1','C=C(OC)O[Si](C(C)C)(C(C)C)C(C)C','C=C(OC)O[Si](C(C)C)(C(C)C)C(C)C','C=C/C=C(OC(C)C)\O[Si](C)(C)C(C)(C)C',
'C=C/C=C(OC(C)C)\O[Si](C)(C)C(C)(C)C'  ,'CO/C(O[Si](C)(C)C)=C/1CCC1','CO/C(O[Si](C)(C)C)=C/1CCCC1','CO/C(O[Si](C)(C)C)=C/1CCCCC1']


unique_values = [alldesc[col].unique() for col in ['nucleophile SMILES', 'starting electrophile SMILES']]
list_three= [value for value in unique_values[0] if (value not in list_one and value not in list_two)]
filtered_df = alldesc[alldesc['nucleophile SMILES'] == alldesc['starting electrophile SMILES']]
len(filtered_df['starting electrophile SMILES'])

def categorize_id(id):
    if id in list_one:
        return 1
    elif id in list_two:
        return 2
    elif id in list_three:
        return 3
    else:
        return 'Unknown'

# Add a new column 'reaction_group' to the DataFrame based on the categorization
alldesc['reaction_group'] = alldesc['nucleophile SMILES'].apply(categorize_id)
react_groups = alldesc['nucleophile SMILES'].apply(categorize_id)
alldesc = pd.get_dummies(alldesc, columns=['reaction_group'])



In [15]:
alldesc.columns

Index(['Kappa1electrophile', 'Kappa1nucleophile',
       'MaxPartialChargeelectrophile', 'MaxPartialChargenucleophile',
       'MinEStateIndexelectrophile', 'MinEStateIndexnucleophile',
       'MinPartialChargeelectrophile', 'MinPartialChargenucleophile',
       'MolLogPelectrophile', 'MolLogPnucleophile', 'ExactMolWtelectrophile',
       'ExactMolWtnucleophile', 'qedelectrophile', 'qednucleophile', 'Kappa1',
       'ExactMolWt', 'MaxPartialCharge', 'MinEStateIndex', 'MinPartialCharge',
       'MolLogP', 'ddG', 'ster1', 'ster2', 'ster3',
       'input electrophile SMILES', 'nucleophile SMILES',
       'starting electrophile SMILES', 'HOMO', 'LUMO',
       'N Catalyst Substituent', 'Temperature (Kelvin)', 'mean_polar',
       'biggest distance', 'major_axis', 'L_e', 'Bmin_e', 'Bmax_e',
       'bur_vol_e', 'L_n', 'Bmin_n', 'Bmax_n', 'bur_vol_n',
       '3,3 Catalyst Substituent ', 'tot_V_e', 'disp_e', 'Vbur75', 'tot_V_n',
       'disp_n', 'Box_vol', 'RAxis_2', 'RAxis_3', 'Ncat_alif', 'Nc

# 7. Split Data 

In [17]:
alldesc

Unnamed: 0,Kappa1electrophile,Kappa1nucleophile,MaxPartialChargeelectrophile,MaxPartialChargenucleophile,MinEStateIndexelectrophile,MinEStateIndexnucleophile,MinPartialChargeelectrophile,MinPartialChargenucleophile,MolLogPelectrophile,MolLogPnucleophile,...,tot_V_n,disp_n,Box_vol,RAxis_2,RAxis_3,Ncat_alif,Ncat_arom,reaction_group_1,reaction_group_2,reaction_group_3
0,7.042665,6.956883,0.149570,0.047963,0.730278,-0.775270,-0.297896,-0.103369,2.6523,2.5106,...,120.049756,0.414050,130.584488,0.473305,0.185862,1,0,False,True,False
1,8.754380,6.956883,0.149570,0.047963,0.696914,-0.775270,-0.496745,-0.103369,2.6609,2.5106,...,120.049756,0.414050,130.584488,0.473305,0.185862,1,0,False,True,False
2,8.448344,6.956883,0.149570,0.047963,0.717428,-0.775270,-0.297896,-0.103369,3.4148,2.5106,...,120.049756,0.414050,130.584488,0.473305,0.185862,1,0,False,True,False
3,5.035138,6.956883,0.149548,0.047963,0.729167,-0.775270,-0.297896,-0.103369,1.4991,2.5106,...,120.049756,0.414050,130.584488,0.473305,0.185862,1,0,False,True,False
4,5.947877,6.956883,0.152494,0.047963,-0.465278,-0.775270,-0.297781,-0.103369,1.6382,2.5106,...,120.049756,0.414050,238.869273,0.404818,0.238722,1,0,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,6.410000,2.703214,0.144920,-0.016293,0.821759,1.138889,-0.298281,-0.080750,1.5416,1.5025,...,29.406022,0.724685,170.347416,0.409886,0.167639,4,0,False,False,True
319,5.544953,2.703214,0.145202,-0.016293,0.971389,1.138889,-0.298269,-0.080750,1.6857,1.5025,...,29.406022,0.724685,170.347416,0.409886,0.167639,2,0,False,False,True
320,5.410000,2.703214,0.144910,-0.016293,0.787037,1.138889,-0.298281,-0.080750,1.1515,1.5025,...,29.406022,0.724685,170.347416,0.409886,0.167639,2,0,False,False,True
321,4.566006,2.703214,0.145202,-0.016293,0.951389,1.138889,-0.298269,-0.080750,1.2956,1.5025,...,29.406022,0.724685,170.347416,0.409886,0.167639,4,0,False,False,True


In [18]:

xtrain, xtest, ytrain, ytest = train_test_split(alldesc, ddg, test_size=0.2, stratify=react_groups,random_state=1)
xtrain = xtrain.drop(columns=['nucleophile SMILES','input electrophile SMILES','ddG','starting electrophile SMILES',
'N Catalyst Substituent','3,3 Catalyst Substituent '])
xtest = xtest.drop(columns=['nucleophile SMILES','input electrophile SMILES','ddG','starting electrophile SMILES',
'N Catalyst Substituent','3,3 Catalyst Substituent '])
trainedscaler = MinMaxScaler().fit(xtrain)
xtrainscaled = trainedscaler.transform(xtrain)
xtestscaled = trainedscaler.transform(xtest)

# 8. Hyperparameter Optimization and LOO CV

In [19]:
from sklearn.pipeline import Pipeline
import scipy.stats as stats
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
weights = np.ones_like(ytrain)  # Use equal weights for all samples initially

# add weights for underrepresented regions
weights[ytrain < 1] = 1
weights[ytrain > 2] = 2
X_train_weighted = np.repeat(xtrainscaled, weights.astype(int), axis=0)
y_train_weighted = np.repeat(ytrain, weights.astype(int))
xgb_param_grid = {
    'alpha': stats.uniform(1, 2),  
    'n_estimators': [300,400, 500, 600, 700, 800,900],
    'learning_rate': [ 0.1, 0.2, 0.3, 0.4],
    'subsample': [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9],
    'max_depth': [7, 8, 9, 10],
    'min_child_weight': [0, 1, 2, 3],
    'colsample_bytree': [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9],
    
    
}
xg_reg = xgb.XGBRegressor(objective='reg:squarederror')
n_features_to_select_range = stats.randint(25, 30)
# Create a pipeline with RFE and XGBoost
pipeline = Pipeline([
    ('rfe', RFE(estimator=xg_reg)),
    ('xgb', xg_reg)
])

# Perform GridSearchCV
grid_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=dict(
        rfe__n_features_to_select=n_features_to_select_range,
        **{'xgb__' + key: value for key, value in xgb_param_grid.items()}
    ),
    scoring='neg_mean_squared_error',n_iter=100,
    cv=10, random_state=43
)
grid_search.fit(X_train_weighted, y_train_weighted)


# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)


Best Hyperparameters: {'rfe__n_features_to_select': 25, 'xgb__alpha': 1.7764522803726124, 'xgb__colsample_bytree': 0.85, 'xgb__learning_rate': 0.3, 'xgb__max_depth': 7, 'xgb__min_child_weight': 3, 'xgb__n_estimators': 500, 'xgb__subsample': 0.85}


In [20]:
from sklearn.model_selection import LeaveOneOut
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

best_estimator = grid_search.best_estimator_
cv_strategy = LeaveOneOut()  
mae_scores = -cross_val_score(best_estimator, X_train_weighted, y_train_weighted,  cv=cv_strategy,scoring=scorer)

# Calculate mean MAE
mae_mean = np.mean(mae_scores)

print("Mean Absolute Error (MAE) using LOO Cross-Validation:", mae_mean)

Mean Absolute Error (MAE) using LOO Cross-Validation: 0.258173042403048
