In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem

def get_ecfc(smiles_list, radius=2, nBits=2048, useCounts=True):
    """
    Calculates the ECFP fingerprint for given SMILES list
    
    :param smiles_list: List of SMILES
    :type smiles_list: list
    :param radius: The ECPF fingerprints radius.
    :type radius: int
    :param nBits: The number of bits of the fingerprint vector.
    :type nBits: int
    :param useCounts: Use count vector or bit vector.
    :type useCounts: bool
    :returns: The calculated ECPF fingerprints for the given SMILES
    :rtype: Dataframe
    """     
    
    ecfp_fingerprints=[]
    erroneous_smiles=[]
    for smiles in smiles_list:
        mol=Chem.MolFromSmiles(smiles)
        if mol is None:
            ecfp_fingerprints.append([None]*nBits)
            erroneous_smiles.append(smiles)
        else:
            mol=Chem.AddHs(mol)
            if useCounts:
                ecfp_fingerprints.append(list(AllChem.GetHashedMorganFingerprint(mol, radius, nBits)))  
            else:    
                ecfp_fingerprints.append(list(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits).ToBitString()))  
    
    # Create dataframe of fingerprints
    df_ecfp_fingerprints = pd.DataFrame(data = ecfp_fingerprints, index = smiles_list)
    # Remove erroneous data
    if len(erroneous_smiles)>0:
        print("The following erroneous SMILES have been found in the data:\n{}.\nThe erroneous SMILES will be removed from the data.".format('\n'.join(map(str, erroneous_smiles))))           
        df_ecfp_fingerprints = df_ecfp_fingerprints.dropna(how='any')    
    
    return df_ecfp_fingerprints

In [2]:
from sklearn.model_selection import train_test_split
import numpy as np 
import pandas as pd

In [3]:
#Get and Arrange data
import pandas as pd
df_data= pd.read_csv('all_data.csv')

train_data = df_data[df_data['data_type'] == 0]
test1_data = df_data[df_data['data_type'] == 1]
test2_data = df_data[df_data['data_type'] == 2]

In [4]:
train_encoded = get_ecfc(train_data["reactant_smiles"])
test1_encoded = get_ecfc(test1_data["reactant_smiles"])
test2_encoded = get_ecfc(test2_data["reactant_smiles"])

In [5]:
# def modeling(train_data, test1_data, test2_data, encoder, model):  - After deciding the list of the smiles

def modeling(train_encoded, test1_encoded, test2_encoded, model):
    
    from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
    import time
    start_time = time.time()
    
    # Training
    X = train_encoded
    y = train_data['reaction_energy']
    
    model.fit(X.values, y)
    
    # Predicting
    pred_train = model.predict(train_encoded.values)
    pred_test1 = model.predict(test1_encoded.values)
    pred_test2 = model.predict(test2_encoded.values)
    
    
    # Scores of Train Data 
    tr_mae = mean_absolute_error(y, pred_train)
    tr_rmse = mean_squared_error(y ,pred_train , squared=False)
    tr_r2 = r2_score(y, pred_train)
    print('##########################  Scores of Train Data  ##########################')
    print('Train set MAE of {}: {:.3f}'.format(model, tr_mae))
    print('Train set RMSE of {}: {:.3f}'.format(model, tr_rmse))
    print('Train set R2 Score of {}: {:.3f}'.format(model, tr_r2))
    
    print("----------------------------------------------------------------------------")
    
    # Test1 Data
    test1_mae = mean_absolute_error(test1_data['reaction_energy'], pred_test1)
    test1_rmse = mean_squared_error(test1_data['reaction_energy'], pred_test1, squared=False)
    test1_r2 = r2_score(test1_data['reaction_energy'], pred_test1)
    print('##########################  Scores of Test1 Data  ##########################')
    print('Test1 set MAE of {}: {:.3f}'.format(model, test1_mae))
    print('Test1 set RMSE of {}: {:.3f}'.format(model, test1_rmse))
    print('Test1 set R2 Score of {}: {:.3f}'.format(model, test1_r2))
    
    print("----------------------------------------------------------------------------")
    
    # Test2 Data
    test2_mae = mean_absolute_error(test2_data['reaction_energy'], pred_test2)
    test2_rmse = mean_squared_error(test2_data['reaction_energy'], pred_test2, squared=False)
    test2_r2 = r2_score(test2_data['reaction_energy'], pred_test2)
    print('##########################  Scores of Test2 Data  ##########################')
    print('Test2 set MAE of {}: {:.3f}'.format(model, test2_mae))
    print('Test2 set RMSE of {}: {:.3f}'.format(model, test2_rmse))
    print('Test2 set R2 Score of {}: {:.3f}'.format(model, test2_r2))
    
    print("----------------------------------------------------------------------------")

    elapsed_time = time.time() - start_time
    print('##########################  Details  ##########################')
    print(f'{elapsed_time:.2f}s elapsed during modeling')

# INITIAL RUNNING

In [6]:
import lightgbm as lgbm

# Model
model = lgbm.LGBMRegressor(random_state=1)

# Training
modeling(train_encoded=train_encoded, test1_encoded=test1_encoded, test2_encoded=test2_encoded, model=model)

##########################  Scores of Train Data  ##########################
Train set MAE of LGBMRegressor(random_state=1): 0.004
Train set RMSE of LGBMRegressor(random_state=1): 0.007
Train set R2 Score of LGBMRegressor(random_state=1): 0.968
----------------------------------------------------------------------------
##########################  Scores of Test1 Data  ##########################
Test1 set MAE of LGBMRegressor(random_state=1): 0.004
Test1 set RMSE of LGBMRegressor(random_state=1): 0.008
Test1 set R2 Score of LGBMRegressor(random_state=1): 0.964
----------------------------------------------------------------------------
##########################  Scores of Test2 Data  ##########################
Test2 set MAE of LGBMRegressor(random_state=1): 0.009
Test2 set RMSE of LGBMRegressor(random_state=1): 0.011
Test2 set R2 Score of LGBMRegressor(random_state=1): 0.845
----------------------------------------------------------------------------
##########################  Detail

In [None]:
0.004	0.007	0.968		0.004	0.008	0.964		0.009	0.011	0.845

# Best without FS

### 1- Final Best without FS -----------> Best: 0.956932 using {'lambda_l1': 0.0, 'lambda_l2': 0.4, 'learning_rate': 0.05, 'min_data_in_leaf': 20, 'num_leaves': 49}

In [None]:
Best: 0.956932 using {'lambda_l1': 0.0, 'lambda_l2': 0.4, 'learning_rate': 0.05, 'min_data_in_leaf': 20, 'num_leaves': 49}

In [28]:
import lightgbm as lgbm

# Model
model = lgbm.LGBMRegressor(random_state=1, lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05, min_data_in_leaf=20, num_leaves=49)

# Training
modeling(train_encoded=train_encoded, test1_encoded=test1_encoded, test2_encoded=test2_encoded, model=model)

##########################  Scores of Train Data  ##########################
Train set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.004
Train set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.008
Train set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.964
----------------------------------------------------------------------------
##########################  Scores of Test1 Data  ##########################
Test1 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.005
Test1 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1)

In [None]:
0.004	0.007	0.968		0.004	0.008	0.964		0.009	0.011	0.845

# Experiment 1

In [None]:
1- Best: 0.956917 using {'lambda_l1': 0.0, 'lambda_l2': 0.6, 'learning_rate': 0.05, 'min_data_in_leaf': 20, 'num_leaves': 49}

In [22]:
import lightgbm as lgbm

# Model
model = lgbm.LGBMRegressor(random_state=1, lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05, min_data_in_leaf=20, num_leaves=49)

# Training
modeling(train_encoded=train_encoded, test1_encoded=test1_encoded, test2_encoded=test2_encoded, model=model)

##########################  Scores of Train Data  ##########################
Train set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.004
Train set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.008
Train set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.964
----------------------------------------------------------------------------
##########################  Scores of Test1 Data  ##########################
Test1 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.005
Test1 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1)

In [None]:
0.004	0.007	0.968		0.004	0.008	0.964		0.009	0.011	0.845

In [None]:
##########################  Scores of Train Data  ##########################
Train set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.004
Train set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.008
Train set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.964
----------------------------------------------------------------------------
##########################  Scores of Test1 Data  ##########################
Test1 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.005
Test1 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.008
Test1 set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.961
----------------------------------------------------------------------------
##########################  Scores of Test2 Data  ##########################
Test2 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.011
Test2 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.015
Test2 set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.6, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.722
----------------------------------------------------------------------------
##########################  Details  ##########################
1.62s elapsed during modeling

# Experiment 2

In [None]:
2- Best: 0.956943 using {'lambda_l1': 0.0, 'lambda_l2': 0.1, 'learning_rate': 0.1, 'min_data_in_leaf': 20, 'num_leaves': 49}

In [36]:
import lightgbm as lgbm

# Model
model = lgbm.LGBMRegressor(random_state=1, lambda_l1=0.0, lambda_l2=0.1, learning_rate=0.1, min_data_in_leaf=20, num_leaves=49)

# Training
modeling(train_encoded=train_encoded, test1_encoded=test1_encoded, test2_encoded=test2_encoded, model=model)

##########################  Scores of Train Data  ##########################
Train set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.003
Train set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.006
Train set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.974
----------------------------------------------------------------------------
##########################  Scores of Test1 Data  ##########################
Test1 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.004
Test1 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.007
Test1 set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_l

In [None]:
0.004	0.007	0.968		0.004	0.008	0.964		0.009	0.011	0.845

In [None]:
##########################  Scores of Train Data  ##########################
Train set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.003
Train set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.006
Train set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.974
----------------------------------------------------------------------------
##########################  Scores of Test1 Data  ##########################
Test1 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.004
Test1 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.007
Test1 set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.967
----------------------------------------------------------------------------
##########################  Scores of Test2 Data  ##########################
Test2 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.009
Test2 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.012
Test2 set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.1, min_data_in_leaf=20, num_leaves=49,
              random_state=1): 0.826
----------------------------------------------------------------------------
##########################  Details  ##########################
1.74s elapsed during modeling

# Experiment 3

In [None]:
3- Best: 0.956932 using {'lambda_l1': 0.0, 'lambda_l2': 0.4, 'learning_rate': 0.05, 'min_data_in_leaf': 20, 'num_leaves': 49}

In [37]:
import lightgbm as lgbm

# Model
model = lgbm.LGBMRegressor(random_state=1, lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05, min_data_in_leaf=20, num_leaves=49)

# Training
modeling(train_encoded=train_encoded, test1_encoded=test1_encoded, test2_encoded=test2_encoded, model=model)

##########################  Scores of Train Data  ##########################
Train set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.004
Train set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.008
Train set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.964
----------------------------------------------------------------------------
##########################  Scores of Test1 Data  ##########################
Test1 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.005
Test1 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1)

In [None]:
##########################  Scores of Train Data  ##########################
Train set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.004
Train set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.008
Train set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.964
----------------------------------------------------------------------------
##########################  Scores of Test1 Data  ##########################
Test1 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.005
Test1 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.008
Test1 set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.962
----------------------------------------------------------------------------
##########################  Scores of Test2 Data  ##########################
Test2 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.010
Test2 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.014
Test2 set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.4, learning_rate=0.05,
              min_data_in_leaf=20, num_leaves=49, random_state=1): 0.755
----------------------------------------------------------------------------
##########################  Details  ##########################
1.75s elapsed during modeling

--------------

# Overfitting Problem 

# 1. Experiment 

In [None]:
Best: 0.953880 using {'lambda_l1': 0.0, 'lambda_l2': 0.2, 'learning_rate': 0.1, 'max_depth': 7, 'min_data_in_leaf': 20}

In [47]:
import lightgbm as lgbm

# Model
model = lgbm.LGBMRegressor(random_state=1, lambda_l1=0.0, lambda_l2=0.2, learning_rate=0.1, max_depth=7, min_data_in_leaf=20)

# Training
modeling(train_encoded=train_encoded, test1_encoded=test1_encoded, test2_encoded=test2_encoded, model=model)

##########################  Scores of Train Data  ##########################
Train set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.004
Train set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.008
Train set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.961
----------------------------------------------------------------------------
##########################  Scores of Test1 Data  ##########################
Test1 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.005
Test1 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.008
Test1 set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
  

In [None]:
0.004	0.007	0.968		0.004	0.008	0.964		0.009	0.011	0.845

In [None]:
##########################  Scores of Train Data  ##########################
Train set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.004
Train set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.008
Train set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.961
----------------------------------------------------------------------------
##########################  Scores of Test1 Data  ##########################
Test1 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.005
Test1 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.008
Test1 set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.959
----------------------------------------------------------------------------
##########################  Scores of Test2 Data  ##########################
Test2 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.009
Test2 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.011
Test2 set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, min_data_in_leaf=20,
              random_state=1): 0.850
----------------------------------------------------------------------------
##########################  Details  ##########################
1.48s elapsed during modeling

# 2. Experiment 

In [None]:
Best: 0.953880 using {'lambda_l1': 0.0, 'lambda_l2': 0.2, 'learning_rate': 0.1, 'max_depth': 7}

In [57]:
import lightgbm as lgbm

# Model
model = lgbm.LGBMRegressor(random_state=1, lambda_l1=0.0, lambda_l2=0.2, learning_rate=0.1, max_depth=7)

# Training
modeling(train_encoded=train_encoded, test1_encoded=test1_encoded, test2_encoded=test2_encoded, model=model)

##########################  Scores of Train Data  ##########################
Train set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, random_state=1): 0.004
Train set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, random_state=1): 0.008
Train set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, random_state=1): 0.961
----------------------------------------------------------------------------
##########################  Scores of Test1 Data  ##########################
Test1 set MAE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, random_state=1): 0.005
Test1 set RMSE of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, random_state=1): 0.008
Test1 set R2 Score of LGBMRegressor(lambda_l1=0.0, lambda_l2=0.2, max_depth=7, random_state=1): 0.959
----------------------------------------------------------------------------
##########################  Scores of Test2 Data  ##########################
Test2 set MAE of LGBM

# FINAL MODEL

# 3. (LAST) Experiment 

In [13]:
import lightgbm as lgbm

# Model
model = lgbm.LGBMRegressor(random_state=1, lambda_l2=0.2, learning_rate=0.1, max_depth=7, num_leaves=31)

# Training
modeling(train_encoded=train_encoded, test1_encoded=test1_encoded, test2_encoded=test2_encoded, model=model)

##########################  Scores of Train Data  ##########################
Train set MAE of LGBMRegressor(lambda_l2=0.2, max_depth=7, random_state=1): 0.004
Train set RMSE of LGBMRegressor(lambda_l2=0.2, max_depth=7, random_state=1): 0.008
Train set R2 Score of LGBMRegressor(lambda_l2=0.2, max_depth=7, random_state=1): 0.961
----------------------------------------------------------------------------
##########################  Scores of Test1 Data  ##########################
Test1 set MAE of LGBMRegressor(lambda_l2=0.2, max_depth=7, random_state=1): 0.005
Test1 set RMSE of LGBMRegressor(lambda_l2=0.2, max_depth=7, random_state=1): 0.008
Test1 set R2 Score of LGBMRegressor(lambda_l2=0.2, max_depth=7, random_state=1): 0.959
----------------------------------------------------------------------------
##########################  Scores of Test2 Data  ##########################
Test2 set MAE of LGBMRegressor(lambda_l2=0.2, max_depth=7, random_state=1): 0.009
Test2 set RMSE of LGBMRegress

In [133]:
model

LGBMRegressor(lambda_l2=0.2, max_depth=7, random_state=1)

# Save Test1 and Test2 Preds

## For Test 1

In [89]:
pred_test1 = model.predict(test1_encoded.values)
pred_test1

array([-0.04543877, -0.01376934,  0.00165234, ..., -0.00923236,
       -0.01095121, -0.01223837])

In [90]:
test1_data.head()

Unnamed: 0,reaction_id,data_package_id,bond_type,functional_group_stoichiometry,reactant_smiles,product_smiles,reactant_inchiKey,product_inchiKey,reactant_solubility,product_solubility,...,product_single_point_job_id,reactant_optimization_job_id,product_optimization_job_id,UMAP-1,UMAP-2,data_type,reactantUFF,reactantMMFF,productUFF,productMMFF
2,3,1,OH,COOH,O=C1CC(=O)C(C(=O)O)=C1C(=O)O,C1C(O)=C(C(=O)O)C(=C1O)C(=O)O,QLGSJNWSAMBDMI-UHFFFAOYSA-N,AEVQXUUICHCAGT-UHFFFAOYSA-N,-0.966,-0.867,...,26.0,19.0,25.0,13.362195,3.405288,1,33.280029,-113.415199,37.053325,-2.716193
3,4,1,OH,F,O=C1CC(=O)C=C1F,C1C(O)=C(F)C=C1O,XVCHEZRSXGJYDT-UHFFFAOYSA-N,LLYKNQJBRVSOKM-UHFFFAOYSA-N,-0.524,-0.499,...,34.0,21.0,33.0,14.856011,2.316219,1,23.112465,-12.882731,24.103198,34.474933
13,14,2,OH,,O=C1NC(=O)C=C1,Oc1[nH]c(O)cc1,PEEHTFAAVSWFBL-UHFFFAOYSA-N,RDUFWJSALXDELV-UHFFFAOYSA-N,-0.533,-0.178,...,68.0,45.0,67.0,12.894115,1.835507,1,,,,
14,15,2,OH,F,O=C1NC(=O)C(F)=C1F,Oc1[nH]c(O)c(F)c1F,WGEANPDRAHQNMF-UHFFFAOYSA-N,WEWQBYWNWVSGIO-UHFFFAOYSA-N,-0.38,-0.439,...,84.0,65.0,83.0,13.093039,1.540677,1,35.312477,-37.025287,38.21408,15.840633
24,25,3,OH,COOH,O=C1SC(=O)C(C(=O)O)=C1C(=O)O,Oc1sc(O)c(C(=O)O)c1C(=O)O,OMUTVOCQJHSMBJ-UHFFFAOYSA-N,HHXTWZYEPJOJGQ-UHFFFAOYSA-N,-1.39,-1.031,...,120.0,103.0,119.0,13.202756,3.411132,1,36.602118,-146.179999,50.843455,-66.445944


In [91]:
test1_data[["reaction_id", "reactant_smiles", "reaction_energy"]]

Unnamed: 0,reaction_id,reactant_smiles,reaction_energy
2,3,O=C1CC(=O)C(C(=O)O)=C1C(=O)O,-0.06394
3,4,O=C1CC(=O)C=C1F,-0.01642
13,14,O=C1NC(=O)C=C1,-0.00731
14,15,O=C1NC(=O)C(F)=C1F,-0.00118
24,25,O=C1SC(=O)C(C(=O)O)=C1C(=O)O,-0.06775
...,...,...,...
15759,15897,O=S(=O)(O)c(c1)cc(S(=O)(=O)O)c(c12)c(O)n(c2O)N...,-0.00215
15762,15900,c1cccc(c12)c(O)n(c2O)N(C3=O)C(=O)c(c34)c(S(=O)...,0.02162
15768,15906,O=S(=O)(O)c(c1)c(S(=O)(=O)O)c(S(=O)(=O)O)c(c12...,-0.01803
15788,15927,O=S(=O)(O)c1ccc(S(=O)(=O)O)c(c12)c(O)n(c2O)N(C...,-0.03881


In [118]:
lgbm_result_test1 = test1_data[["reaction_id", "reactant_smiles", "reaction_energy"]]

In [119]:
lgbm_result_test1

Unnamed: 0,reaction_id,reactant_smiles,reaction_energy
2,3,O=C1CC(=O)C(C(=O)O)=C1C(=O)O,-0.06394
3,4,O=C1CC(=O)C=C1F,-0.01642
13,14,O=C1NC(=O)C=C1,-0.00731
14,15,O=C1NC(=O)C(F)=C1F,-0.00118
24,25,O=C1SC(=O)C(C(=O)O)=C1C(=O)O,-0.06775
...,...,...,...
15759,15897,O=S(=O)(O)c(c1)cc(S(=O)(=O)O)c(c12)c(O)n(c2O)N...,-0.00215
15762,15900,c1cccc(c12)c(O)n(c2O)N(C3=O)C(=O)c(c34)c(S(=O)...,0.02162
15768,15906,O=S(=O)(O)c(c1)c(S(=O)(=O)O)c(S(=O)(=O)O)c(c12...,-0.01803
15788,15927,O=S(=O)(O)c1ccc(S(=O)(=O)O)c(c12)c(O)n(c2O)N(C...,-0.03881


In [120]:
lgbm_result_test1["pred_test1"] = pred_test1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [121]:
lgbm_result_test1

Unnamed: 0,reaction_id,reactant_smiles,reaction_energy,pred_test1
2,3,O=C1CC(=O)C(C(=O)O)=C1C(=O)O,-0.06394,-0.045439
3,4,O=C1CC(=O)C=C1F,-0.01642,-0.013769
13,14,O=C1NC(=O)C=C1,-0.00731,0.001652
14,15,O=C1NC(=O)C(F)=C1F,-0.00118,-0.000202
24,25,O=C1SC(=O)C(C(=O)O)=C1C(=O)O,-0.06775,-0.047565
...,...,...,...,...
15759,15897,O=S(=O)(O)c(c1)cc(S(=O)(=O)O)c(c12)c(O)n(c2O)N...,-0.00215,0.001921
15762,15900,c1cccc(c12)c(O)n(c2O)N(C3=O)C(=O)c(c34)c(S(=O)...,0.02162,-0.000530
15768,15906,O=S(=O)(O)c(c1)c(S(=O)(=O)O)c(S(=O)(=O)O)c(c12...,-0.01803,-0.009232
15788,15927,O=S(=O)(O)c1ccc(S(=O)(=O)O)c(c12)c(O)n(c2O)N(C...,-0.03881,-0.010951


In [None]:
# saving the dataframe
#result_lgbm_test1.to_csv('result_lgbm_test1.csv')

In [122]:
# saving the dataframe
lgbm_result_test1.to_csv(r'.\final_models\lgbm_result_test1.csv', index=False)

## For Test 2

In [97]:
pred_test2 = model.predict(test2_encoded.values)
pred_test2

array([-0.01038539, -0.01016181, -0.01016181, ..., -0.0487231 ,
       -0.04866332, -0.04986921])

In [98]:
test2_data[["reaction_id", "reactant_smiles", "reaction_energy"]]

Unnamed: 0,reaction_id,reactant_smiles,reaction_energy
767,769,c1cccc(c12)cc(nn2)O,-0.01916
772,774,Oc1cccc(c12)cc(nn2)O,-0.01587
776,778,c1c(O)ccc(c12)cc(nn2)O,-0.01793
779,781,c1cc(O)cc(c12)cc(nn2)O,-0.01559
781,783,c1ccc(O)c(c12)cc(nn2)O,-0.01734
...,...,...,...
14233,14338,O=C(O)c1cc(C(=O)O)c(C(=O)O)c(c12)S/C(C2=O)=C(C...,-0.06081
14234,14339,O=C(O)c1c(C(=O)O)cc(C(=O)O)c(c12)S/C(C2=O)=C(C...,-0.04998
14235,14340,O=C(O)c1c(C(=O)O)cc(C(=O)O)c(c12)S/C(C2=O)=C(C...,-0.05763
14236,14341,O=C(O)c1c(C(=O)O)c(C(=O)O)cc(c12)S/C(C2=O)=C(C...,-0.05533


In [123]:
lgbm_result_test2 = test2_data[["reaction_id", "reactant_smiles", "reaction_energy"]]
lgbm_result_test2

Unnamed: 0,reaction_id,reactant_smiles,reaction_energy
767,769,c1cccc(c12)cc(nn2)O,-0.01916
772,774,Oc1cccc(c12)cc(nn2)O,-0.01587
776,778,c1c(O)ccc(c12)cc(nn2)O,-0.01793
779,781,c1cc(O)cc(c12)cc(nn2)O,-0.01559
781,783,c1ccc(O)c(c12)cc(nn2)O,-0.01734
...,...,...,...
14233,14338,O=C(O)c1cc(C(=O)O)c(C(=O)O)c(c12)S/C(C2=O)=C(C...,-0.06081
14234,14339,O=C(O)c1c(C(=O)O)cc(C(=O)O)c(c12)S/C(C2=O)=C(C...,-0.04998
14235,14340,O=C(O)c1c(C(=O)O)cc(C(=O)O)c(c12)S/C(C2=O)=C(C...,-0.05763
14236,14341,O=C(O)c1c(C(=O)O)c(C(=O)O)cc(c12)S/C(C2=O)=C(C...,-0.05533


In [124]:
lgbm_result_test2["pred_test2"] = pred_test2
lgbm_result_test2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,reaction_id,reactant_smiles,reaction_energy,pred_test2
767,769,c1cccc(c12)cc(nn2)O,-0.01916,-0.010385
772,774,Oc1cccc(c12)cc(nn2)O,-0.01587,-0.010162
776,778,c1c(O)ccc(c12)cc(nn2)O,-0.01793,-0.010162
779,781,c1cc(O)cc(c12)cc(nn2)O,-0.01559,-0.010232
781,783,c1ccc(O)c(c12)cc(nn2)O,-0.01734,-0.010232
...,...,...,...,...
14233,14338,O=C(O)c1cc(C(=O)O)c(C(=O)O)c(c12)S/C(C2=O)=C(C...,-0.06081,-0.050642
14234,14339,O=C(O)c1c(C(=O)O)cc(C(=O)O)c(c12)S/C(C2=O)=C(C...,-0.04998,-0.049104
14235,14340,O=C(O)c1c(C(=O)O)cc(C(=O)O)c(c12)S/C(C2=O)=C(C...,-0.05763,-0.048723
14236,14341,O=C(O)c1c(C(=O)O)c(C(=O)O)cc(c12)S/C(C2=O)=C(C...,-0.05533,-0.048663


In [125]:
# saving the dataframe
lgbm_result_test2.to_csv(r'.\final_models\lgbm_result_test2.csv', index=False)

In [127]:
lgbm_result_test2= pd.read_csv(r'.\final_models\lgbm_result_test2.csv')

In [128]:
lgbm_result_test2

Unnamed: 0,reaction_id,reactant_smiles,reaction_energy,pred_test2
0,769,c1cccc(c12)cc(nn2)O,-0.01916,-0.010385
1,774,Oc1cccc(c12)cc(nn2)O,-0.01587,-0.010162
2,778,c1c(O)ccc(c12)cc(nn2)O,-0.01793,-0.010162
3,781,c1cc(O)cc(c12)cc(nn2)O,-0.01559,-0.010232
4,783,c1ccc(O)c(c12)cc(nn2)O,-0.01734,-0.010232
...,...,...,...,...
1475,14338,O=C(O)c1cc(C(=O)O)c(C(=O)O)c(c12)S/C(C2=O)=C(C...,-0.06081,-0.050642
1476,14339,O=C(O)c1c(C(=O)O)cc(C(=O)O)c(c12)S/C(C2=O)=C(C...,-0.04998,-0.049104
1477,14340,O=C(O)c1c(C(=O)O)cc(C(=O)O)c(c12)S/C(C2=O)=C(C...,-0.05763,-0.048723
1478,14341,O=C(O)c1c(C(=O)O)c(C(=O)O)cc(c12)S/C(C2=O)=C(C...,-0.05533,-0.048663


# Save the model

In [106]:
model

LGBMRegressor(lambda_l2=0.2, max_depth=7, random_state=1)

In [107]:
pred_test1 = model.predict(test1_encoded.values)
pred_test1

array([-0.04543877, -0.01376934,  0.00165234, ..., -0.00923236,
       -0.01095121, -0.01223837])

In [108]:
#model.booster_.save_model('final_lgbm_model.txt')

In [109]:
#model.booster_.save_model(r'C:\Users\PC\Desktop\DIFFER\Projects\REDDB-ML\REDDB-ML-dev-main\Code\prediction\final_models\final_lgbm_model.txt')
# Same with below

In [129]:
model.booster_.save_model(r'.\final_models\lgbm_final_model.txt')

<lightgbm.basic.Booster at 0x2b6138d4788>

#### Read Saved model 

In [130]:
lgbm_final_model = lgbm.Booster(model_file= r'.\final_models\lgbm_final_model.txt')

In [131]:
lgbm_final_model

<lightgbm.basic.Booster at 0x2b613f03bc8>

In [132]:
s_pred_test1 = lgbm_final_model.predict(test1_encoded.values)
s_pred_test1

array([-0.04543877, -0.01376934,  0.00165234, ..., -0.00923236,
       -0.01095121, -0.01223837])