In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tc-smiles/Tc_SMILES.csv
/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train.csv
/kaggle/input/neurips-open-polymer-prediction-2025/test.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset2.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv
/kaggle/input/tg-smiles-pid-polymer-class/TgSS_enriched_cleaned.csv
/kaggle/input/rdkit-install-whl/rdkit_wheel/pillow-11.2.1-cp311-cp311-manylinux_2_28_x86_64.whl
/kaggle/input/rdkit-install-whl/rdkit_wheel/numpy-2.3.0-cp311-cp311-manylinux_2_28_x86_64.whl
/kaggle/input/rdkit-install-whl/rdkit_wheel/rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl


# Welcome to my notebook! I am learning about applying machine learning to predict the properties of polymers.

The goal of this project is to predict the fundamental properties of polymers to speed up material discovery and development.

These properties are:
1. Glass Transition Temperature (Tg): Simply put, this is the temperature at which a polymer moves from being stiff and rigid to being flexible.
2. Fractional Free volume (FFV): This is the ratio of the free space (between monomers) to the total volume occupied by the polymer.
3. Thermal Conductivity (Tc): The ability of the polymer to allow heat to pass through it.
4. Density: The density of a polymer refers to how tightly packed the polymer chains are in the material.
5. Radius of Gyration (Rg): This is the root-mean-square distance of the monomers from the polymer's center of mass, i.e. the square root of the average squared distance.



In [2]:
#This allows us to install RDkit for offline use

!pip -q install /kaggle/input/rdkit-install-whl/rdkit_wheel/rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

RDKit is an open-source cheminformatics software toolkit written in C++ and Python. It provides a wide range of tools for manipulating, analyzing, and visualizing chemical information, making it valuable in cheminformatics, computational chemistry, and machine learning applications, particularly in drug discovery and materials science.

RDKit enables us to see the structural formula of compounds like benzene, alcohol etc using python

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
from tqdm.notebook import tqdm
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupKFold, KFold
from rdkit import Chem  #a chemical library used for reading, interpreting and manipulating SMILES strings
from rdkit.Chem import Descriptors, Lipinski, rdMolDescriptors, Fragments, AllChem, Draw
import warnings
warnings.filterwarnings("ignore")
import logging #silencing logs

#====Removes optuna messages while running====#
optuna.logging.set_verbosity(optuna.logging.WARNING)
logging.getLogger('optuna').setLevel(logging.WARNING)

#from pathlib import Path #Handles file paths cleaner

In [4]:
# create some configuration settings

class config_settings:
    """Container for the notebook's run-wide settings.

    Takes no constructor arguments; every setting is a plain instance
    attribute initialised to a fixed value.
    """

    def __init__(self):
        # --- General ---
        self.use_gpu: bool = True
        self.gpu_id: int = 0
        self.random_state: int = 1  # seed handed to the splits and models in later cells

        # --- Training ---
        self.n_folds: int = 5
        self.early_stopping: int = 100
        self.optuna_trials: int = 100
        self.verbose_eval: int = 100

        # --- Targets ---
        self.target_cols: list = ["Tg", "FFV", "Tc", "Density", "Rg"]

In [5]:
# Some polymer molecules can be written in different ways with the SMILES representation.
# To avoid treating the same polymer as different molecules during analysis, we convert SMILES to canonical SMILES.
# This ensures that the SMILES text for the same polymer is always identical.

def convert_to_canonical_smiles(smiles):
    """Return the RDKit canonical form of a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES text to canonicalize.

    Returns
    -------
    The canonical SMILES when RDKit can parse the input. Otherwise the input
    is returned unchanged; this includes non-string values such as None/NaN,
    which are passed through without being handed to RDKit.
    """
    # Missing values are not SMILES: pass them through untouched so that a
    # later dropna() on the column can still see them.
    if not isinstance(smiles, str):
        return smiles

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return smiles
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best-effort fallback: keep the original text if RDKit raises, but
        # (unlike a bare `except:`) let KeyboardInterrupt/SystemExit propagate.
        return smiles

What is SMILES?
SMILES stands for Simplified Molecular Input Line Entry System.
SMILES is simply a textual representation of the structure of a molecule (molecular structure written as text)
For example:
- Propane (C3H8) -> CCC 
- Ethene (C2H4) -> C=C
- Propyne (C3H4) -> C#CC
- Benzene (C6H6) -> c1ccccc1 (lowercase letters for aromatic compounds)
- Ethanol (C2H5OH) -> CC(O)
- Isobutane (CH3CH(CH3)2) -> CC(C)C
- Tert-butanol ((CH3)3COH) -> CC(C)(C)O

There are two ways of encoding SMILES data to a format ML models can understand:
1. Molecular Fingerprints
2. Sequence Models

More on this later...

In [6]:
#load the data
# data1: competition training table (id, SMILES and the five target columns).
# data2: competition test table.
data1 = pd.read_csv("/kaggle/input/neurips-open-polymer-prediction-2025/train.csv")
data2 = pd.read_csv("/kaggle/input/neurips-open-polymer-prediction-2025/test.csv")
data1.head()

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.37041,,,
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.37886,,,
3,519416,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,539187,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.35547,,,


In [7]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7973 entries, 0 to 7972
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       7973 non-null   int64  
 1   SMILES   7973 non-null   object 
 2   Tg       511 non-null    float64
 3   FFV      7030 non-null   float64
 4   Tc       737 non-null    float64
 5   Density  613 non-null    float64
 6   Rg       614 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 436.2+ KB


In [8]:
data1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,7973.0,1080050000.0,621824100.0,87817.0,537664100.0,1079079000.0,1621708000.0,2147438000.0
Tg,511.0,96.45231,111.2283,-148.029738,13.67451,74.04018,161.1476,472.25
FFV,7030.0,0.367212,0.02960878,0.226992,0.3495489,0.3642636,0.38079,0.7770971
Tc,737.0,0.2563341,0.08953785,0.0465,0.186,0.236,0.3305,0.524
Density,613.0,0.9854844,0.1461892,0.748691,0.8902434,0.9481932,1.062096,1.840999
Rg,614.0,16.41979,4.60864,9.728355,12.54033,15.05219,20.41107,34.67291


In [9]:
# Work on copies so the raw frames loaded above (data1/data2) stay untouched.
df1=data1.copy()
dfx = data1.copy()  # in the cells shown here, only the commented-out missing-value EDA uses dfx
df2 = data2.copy()
# Single settings object; cfg.random_state and cfg.target_cols are read from it.
cfg = config_settings()
target_cols = cfg.target_cols

In [10]:
# Calculate descriptors from Smiles
#def get_descriptors(smiles):
#    mol = Chem.MolFromSmiles(smiles)
#    if not mol:
#        return pd.Series([None]*16)
#    return pd.Series([
#        Descriptors.MolWt(mol),                             
#        rdMolDescriptors.CalcNumRings(mol),                 
#        Descriptors.MolLogP(mol),
#        Lipinski.NumRotatableBonds(mol),
#        Descriptors.HeavyAtomCount(mol),
#        rdMolDescriptors.CalcNumAromaticRings(mol),
#        rdMolDescriptors.CalcNumAliphaticRings(mol),
#        Descriptors.TPSA(mol),
 #       Lipinski.NumHAcceptors(mol),
 #       Lipinski.NumHDonors(mol),
 #       Descriptors.BertzCT(mol),
#        Descriptors.BalabanJ(mol),
#        Fragments.fr_ether(mol),
 #       Fragments.fr_ester(mol),
 #       Fragments.fr_amide(mol),
  #  ])

#dfx[['MolWt', 'RingCount', 'LogP','NumRotatableBonds','HeavyAtomCount',
#    'CalcNumAromaticRings','CalcNumAliphaticRings','NumHAcceptors','TPSA',
#    'NumHDonors','BertzCT','BalabanJ','Num_Ether','Num_Ester','Num_Amide']] = dfx['SMILES'].apply(get_descriptors)

#for target in target_cols:
#    miss_col = f"{target}_missing"
    # 1 represent missing value while 0 represent non-null values
 #   dfx[miss_col] = dfx[target].isnull().astype(int)  
    
 #   print(f"\n Checking missing pattern for {target}...")

 #   for desc in ['MolWt', 'RingCount', 'LogP','NumRotatableBonds','HeavyAtomCount',
 #   'CalcNumAromaticRings','CalcNumAliphaticRings','NumHAcceptors','TPSA',
  #  'NumHDonors','BertzCT','BalabanJ','Num_Ether','Num_Ester','Num_Amide']:
 #       plt.figure(figsize=(5, 4))
 #       sns.barplot(x=miss_col, y=desc, data=dfx)
 #       plt.xticks([0, 1], [f'{target} present', f'{target} missing'])
 #       plt.title(f"Avg {desc} when {target} is present vs missing")
 #       plt.xlabel("")
 #       plt.ylabel(desc)
 #       plt.tight_layout()
  #      plt.show()


In [11]:
#plt.figure(figsize=(6,4))

#sns.heatmap(dfx[target_cols].corr(),annot=False, cbar=True, cmap=sns.diverging_palette(240,10))
#plt.show()

In [12]:
'''fig, axes = plt.subplots(2,3,figsize=(15,8))

sns.histplot(data1['Tg'], kde=True, ax=axes[0,0])
axes[0,0].set_title('Transition glass temperature')
axes[0,0].set_xlabel('Tg')

sns.histplot(data1['FFV'], kde=True, ax=axes[0,1])
axes[0,1].set_title('Fractional Free Volume')
axes[0,1].set_xlabel('FFV')

sns.histplot(data1['Tc'], kde=True, ax=axes[0,2])
axes[0,2].set_title('Thermal conductivity')
axes[0,2].set_xlabel('Tc')

sns.histplot(data1['Density'], kde=True, ax=axes[1,0])
axes[1,0].set_title('Density')
axes[1,0].set_xlabel('Density')

sns.histplot(data1['Rg'], kde=True, ax=axes[1,1])
axes[1,1].set_title('Radius of Gyration')
axes[1,1].set_xlabel('Radius of Gyration')

axes[1,2].axis('off')

plt.tight_layout()
plt.show()'''

"fig, axes = plt.subplots(2,3,figsize=(15,8))\n\nsns.histplot(data1['Tg'], kde=True, ax=axes[0,0])\naxes[0,0].set_title('Transition glass temperature')\naxes[0,0].set_xlabel('Tg')\n\nsns.histplot(data1['FFV'], kde=True, ax=axes[0,1])\naxes[0,1].set_title('Fractional Free Volume')\naxes[0,1].set_xlabel('FFV')\n\nsns.histplot(data1['Tc'], kde=True, ax=axes[0,2])\naxes[0,2].set_title('Thermal conductivity')\naxes[0,2].set_xlabel('Tc')\n\nsns.histplot(data1['Density'], kde=True, ax=axes[1,0])\naxes[1,0].set_title('Density')\naxes[1,0].set_xlabel('Density')\n\nsns.histplot(data1['Rg'], kde=True, ax=axes[1,1])\naxes[1,1].set_title('Radius of Gyration')\naxes[1,1].set_xlabel('Radius of Gyration')\n\naxes[1,2].axis('off')\n\nplt.tight_layout()\nplt.show()"

In [13]:
#Let's visualize a polymer molecule
#m1 = Chem.MolFromSmiles("*CC(*)c1ccccc1C(=O)OCCCCCC")
#m1

In [14]:
#mols_list = []  #list to store the polymer molecules after converting from SMILES

#code snippet to visualize the top 6 polymer molecules in the training dataset
#for smiles in df1['SMILES'].head(6):
#    mols = Chem.MolFromSmiles(smiles)
#    mols_list.append(mols)

#mol_imgs = Draw.MolsToGridImage(mols_list, molsPerRow=3, subImgSize=(800,300))
#mol_imgs

In [15]:
#Lets see some information about the polymers
'''for mols in mols_list:
    molwt = Descriptors.MolWt(mols)
    val_electrons = Descriptors.NumValenceElectrons(mols)
    rot_bonds = Descriptors.NumRotatableBonds(mols)
    ring_counts = Descriptors.RingCount(mols)
    h_donors = Descriptors.NumHDonors(mols)
    r_bonds = Descriptors.NumRotatableBonds(mols)
    print("Molecule:", Chem.MolToSmiles(mols))
    print("Molecular weight:", molwt)
    print("Valence electrons:", val_electrons)
    print("Number of Rotatable Bonds:", rot_bonds)
    print("Number of Ring counts:", ring_counts)
    print("Number of Hydrogen donors:", h_donors)
    print("Number of Rotatable bonds:", r_bonds)
    print("-" * 30)'''

'for mols in mols_list:\n    molwt = Descriptors.MolWt(mols)\n    val_electrons = Descriptors.NumValenceElectrons(mols)\n    rot_bonds = Descriptors.NumRotatableBonds(mols)\n    ring_counts = Descriptors.RingCount(mols)\n    h_donors = Descriptors.NumHDonors(mols)\n    r_bonds = Descriptors.NumRotatableBonds(mols)\n    print("Molecule:", Chem.MolToSmiles(mols))\n    print("Molecular weight:", molwt)\n    print("Valence electrons:", val_electrons)\n    print("Number of Rotatable Bonds:", rot_bonds)\n    print("Number of Ring counts:", ring_counts)\n    print("Number of Hydrogen donors:", h_donors)\n    print("Number of Rotatable bonds:", r_bonds)\n    print("-" * 30)'

Sometimes a molecule can be represented by more than one SMILES string. If this is not handled properly, the same molecule will be treated as several different molecules during duplicate checks and featurization. For example, ethanol can be represented like this:

CCO and C(C)O

Both are ethanol but the arrangement is different, if not handled, the calculated properties will be different.
To handle this, we canonicalize the SMILES text. This ensures that any molecule with more than one SMILES representation is always written in the same way. For ethanol, C(C)O will become CCO.

Why do this?

Canonizing the SMILES texts will help us identify duplicates very easily. Think about it!

Checking for duplicates without canonicalizing the SMILES text could report no duplicates even when some exist, because the same molecule may appear under different SMILES strings. Once the SMILES are canonicalized, every representation of the same molecule becomes the same string, so duplicates can easily be found and removed.

In [16]:
#Convert SMILES to Canonical SMILES for both train and test data
# Announce the step before starting it, so the message is visible while the work runs.
print("Converting to SMILES to Canonical SMILES...")
df1['SMILES'] = df1['SMILES'].apply(convert_to_canonical_smiles)
df2['SMILES'] = df2['SMILES'].apply(convert_to_canonical_smiles)

# convert_to_canonical_smiles returns unparseable input unchanged, so this only
# removes rows whose SMILES value was already missing.
df1 = df1.dropna(subset=['SMILES'])
df2 = df2.dropna(subset=['SMILES'])
print('All Finished!')

Converting to SMILES to Canonical SMILES...
All Finished!


In [17]:
#check for duplicates
df1['SMILES'].duplicated().any()

False

In [18]:
#df1.head()

### Feature Engineering

Recall that SMILES is just a representation of the molecular structure in the form of text. As it is, this text is of no use to machine learning models. To make these strings model-friendly so that the models can learn from them, we need to transform them into numerical features that capture each molecule's chemical and structural topology.

This is achieved using RDkit Descriptors

We'll also create Morgan fingerprints for each molecule

Basically, a Morgan fingerprint is a representation of a molecule's structure as a fixed-length vector of 0s and 1s, based on the substructures found in the molecule.

In [19]:
# Transform string to numerical features for modelling
def featurize(smiles):
    """Compute the RDKit descriptor set for a single SMILES string.

    Returns a dict mapping descriptor name to its value, or None when RDKit
    cannot parse the SMILES.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # (feature name, RDKit function) pairs; the dict keeps this order.
    descriptor_fns = (
        ("MolWt", Descriptors.MolWt),
        ("HeavyAtomCount", Descriptors.HeavyAtomCount),
        ("RingCount", rdMolDescriptors.CalcNumRings),
        ("NumRotatableBonds", Lipinski.NumRotatableBonds),
        ("AromaticRingCounts", rdMolDescriptors.CalcNumAromaticRings),
        ("NumAliphaticRings", rdMolDescriptors.CalcNumAliphaticRings),
        ("LogP", Descriptors.MolLogP),
        ("TPSA", Descriptors.TPSA),
        ("NumHAcceptors", Lipinski.NumHAcceptors),
        ("NumHDonors", Lipinski.NumHDonors),
        ("BertzCT", Descriptors.BertzCT),
        ("BalabanJ", Descriptors.BalabanJ),
        ("EtherCount", Fragments.fr_ether),
        ("EsterCount", Fragments.fr_ester),
        ("AmideCount", Fragments.fr_amide),
    )
    return {name: compute(mol) for name, compute in descriptor_fns}

def smiles_to_morgan(smiles, radius=2, nBits=1024):
    """Encode a SMILES string as a Morgan fingerprint, returned as a list.

    When RDKit cannot parse the SMILES, a list of nBits zeros is returned
    instead.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
        return list(bit_vect)
    return [0] * nBits

In [20]:
# Build each descriptor table on the same index as its source frame, so the later
# axis=1 concat with df1/df2 lines rows up by label even if rows were dropped upstream.
featurized_data = df1['SMILES'].apply(featurize)
descriptors_df = pd.DataFrame(featurized_data.tolist(), index=df1.index)
#===============test data=============================#
featurized_data2 = df2['SMILES'].apply(featurize)
descriptors_df2 = pd.DataFrame(featurized_data2.tolist(), index=df2.index)

In [21]:
# Column labels shared by the train and test fingerprint tables (one per bit).
morgan_columns = [f'Morgan_fp{bit}' for bit in range(1024)]

# Train: store the raw bit lists on df1, then expand them into one column per bit.
df1['Morgan_fp'] = df1['SMILES'].apply(smiles_to_morgan)
train_bits = df1['Morgan_fp'].tolist()
fp_df = pd.DataFrame(train_bits, index=df1.index, columns=morgan_columns)

# Test: same two steps on df2.
df2['Morgan_fp'] = df2['SMILES'].apply(smiles_to_morgan)
test_bits = df2['Morgan_fp'].tolist()
fp_df2 = pd.DataFrame(test_bits, index=df2.index, columns=morgan_columns)
fp_df2.head()

Unnamed: 0,Morgan_fp0,Morgan_fp1,Morgan_fp2,Morgan_fp3,Morgan_fp4,Morgan_fp5,Morgan_fp6,Morgan_fp7,Morgan_fp8,Morgan_fp9,...,Morgan_fp1014,Morgan_fp1015,Morgan_fp1016,Morgan_fp1017,Morgan_fp1018,Morgan_fp1019,Morgan_fp1020,Morgan_fp1021,Morgan_fp1022,Morgan_fp1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
#Join df1 + descriptors_df + fp_df
# Train table: original columns + descriptors + fingerprint bits side by side;
# the raw list column 'Morgan_fp' is dropped once it has been expanded.
train_parts = [df1, descriptors_df, fp_df]
train_df = pd.concat(train_parts, axis=1).drop(columns=['Morgan_fp'])

# Test table: assembled the same way.
test_parts = [df2, descriptors_df2, fp_df2]
test_df = pd.concat(test_parts, axis=1).drop(columns=['Morgan_fp'])

train_df.head()

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg,MolWt,HeavyAtomCount,RingCount,...,Morgan_fp1014,Morgan_fp1015,Morgan_fp1016,Morgan_fp1017,Morgan_fp1018,Morgan_fp1019,Morgan_fp1020,Morgan_fp1021,Morgan_fp1022,Morgan_fp1023
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,,232.323,17,1,...,0,0,0,0,0,0,0,0,0,0
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.37041,,,,598.919,45,5,...,0,0,0,0,0,1,0,0,0,0
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.37886,,,,1003.207,73,10,...,0,0,0,0,0,0,0,0,0,0
3,519416,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,,542.726,42,6,...,0,0,0,0,0,0,0,0,0,0
4,539187,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.35547,,,,965.154,70,6,...,0,0,0,0,0,1,0,0,0,0


#### Handling Missing values

To be honest, I don't know why the target values are missing. Since these target values were calculated during simulation, they are probably not missing at random. After some EDA, it seems possible that the simulation failed for some polymers for one reason or another. For example, for the glass transition temperature, I saw that the average molecular weight of molecules with missing values was higher than that of molecules with non-missing values. This could mean that the simulation failed for polymers above a certain molecular weight.

Best way to go:
remove polymers with missing targets and train one model for each target. A lot of data would be lost but the remaining data would be enough to train a good model based on ground truth rather than fake data.

In [23]:
# Every column that is not one of the five targets (this still includes 'id' and 'SMILES').
feature_cols = [
    col for col in train_df.columns
    if col not in ('Tg', 'Tc', 'FFV', 'Rg', 'Density')
]


def _labelled_rows(target):
    """Return the feature columns plus `target`, keeping only rows without any missing value."""
    return train_df[feature_cols + [target]].dropna()


# One training frame per target, each restricted to the rows where that target is known.
tg_data = _labelled_rows('Tg')
tc_data = _labelled_rows('Tc')
ffv_data = _labelled_rows('FFV')
rg_data = _labelled_rows('Rg')
density_data = _labelled_rows('Density')

In [24]:
def _features_and_target(frame, target):
    """Split a per-target frame into (X, y): X drops 'id', 'SMILES' and the target column."""
    return frame.drop(['id', 'SMILES', target], axis=1), frame[target]


# Tg: 10% hold-out
x_tg_data, y_tg_data = _features_and_target(tg_data, 'Tg')
train_x_tg, val_x_tg, train_y_tg, val_y_tg = train_test_split(
    x_tg_data, y_tg_data, test_size=0.1, random_state=cfg.random_state)

# Tc: 10% hold-out
x_tc_data, y_tc_data = _features_and_target(tc_data, 'Tc')
train_x_tc, val_x_tc, train_y_tc, val_y_tc = train_test_split(
    x_tc_data, y_tc_data, test_size=0.1, random_state=cfg.random_state)

# FFV: 20% hold-out
x_ffv_data, y_ffv_data = _features_and_target(ffv_data, 'FFV')
train_x_ffv, val_x_ffv, train_y_ffv, val_y_ffv = train_test_split(
    x_ffv_data, y_ffv_data, test_size=0.2, random_state=cfg.random_state)

# Rg: 10% hold-out
x_rg_data, y_rg_data = _features_and_target(rg_data, 'Rg')
train_x_rg, val_x_rg, train_y_rg, val_y_rg = train_test_split(
    x_rg_data, y_rg_data, test_size=0.1, random_state=cfg.random_state)

# Density: 10% hold-out
x_density_data, y_density_data = _features_and_target(density_data, 'Density')
train_x_den, val_x_den, train_y_den, val_y_den = train_test_split(
    x_density_data, y_density_data, test_size=0.1, random_state=cfg.random_state)


In [25]:
train_x_den.shape, val_x_den.shape, train_y_den.shape, val_y_den.shape

((551, 1039), (62, 1039), (551,), (62,))

In [26]:
'''def optimize_xgb_for_target(X, y, n_trials=cfg.optuna_trials, n_splits=cfg.n_folds, random_state=cfg.random_state):
    def objective(trial):
        params = {
            "max_depth": trial.suggest_int("max_depth", 3, 15),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "random_state": random_state,
            "tree_method": "hist",
            'gpu_id': cfg.gpu_id if cfg.use_gpu else None,
            
        }

        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        fold_scores = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            model = xgb.XGBRegressor(**params)
            model.fit(X_train, y_train,
                      eval_set=[(X_val, y_val)],
                      early_stopping_rounds=cfg.early_stopping,
                      verbose=False)

            preds = model.predict(X_val)
            fold_score = mean_absolute_error(y_val, preds)
            fold_scores.append(fold_score)

        # Print fold scores during optimization (optional)
        print(f"Trial {trial.number}: Fold MAEs = {[round(s, 4) for s in fold_scores]}, Avg MAE = {np.mean(fold_scores):.4f}")
        return np.mean(fold_scores)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    print("\n Best Hyperparameters Found:")
    for k, v in study.best_params.items():
        print(f"  {k}: {v}")
    print(f"\n Best Average CV MAE: {study.best_value:.4f}")

    return study.best_params
    '''


'def optimize_xgb_for_target(X, y, n_trials=cfg.optuna_trials, n_splits=cfg.n_folds, random_state=cfg.random_state):\n    def objective(trial):\n        params = {\n            "max_depth": trial.suggest_int("max_depth", 3, 15),\n            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),\n            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),\n            "subsample": trial.suggest_float("subsample", 0.5, 1.0),\n            \'colsample_bytree\': trial.suggest_float(\'colsample_bytree\', 0.6, 1.0),\n            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),\n            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),\n            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),\n            "random_state": random_state,\n            "tree_method": "hist",\n            \'gpu_id\': cfg.gpu_id if cfg.use_gpu else None,\n            \n        }\n\n        kf = KFold(n_splits=n_splits,

In [27]:
#best_params = optimize_xgb_for_target(train_x_tg, train_y_tg)

In [28]:
#best_params_tc = optimize_xgb_for_target(train_x_tc, train_y_tc)

In [29]:
#best_params_rg = optimize_xgb_for_target(train_x_rg, train_y_rg)

In [30]:
#best_params_den = optimize_xgb_for_target(train_x_den, train_y_den)

In [31]:
#best_params_ffv = optimize_xgb_for_target(train_x_ffv, train_y_ffv)

In [32]:
# Fixed XGBoost settings for the Tg model; the fit uses every labelled Tg row (no hold-out).
tg_params = {
    "max_depth": 4,
    "learning_rate": 0.15040860215701513,
    "n_estimators": 162,
    "subsample": 0.6264768531105072,
    "colsample_bytree": 0.9784979701300981,
    "reg_alpha": 2.089812909157528e-07,
    "reg_lambda": 4.346630154191774e-06,
    "min_child_weight": 10,
    "random_state": cfg.random_state,
}
tg_model = xgb.XGBRegressor(**tg_params)
tg_model.fit(x_tg_data, y_tg_data)

In [33]:
# Fixed XGBoost settings for the Tc model; the fit uses every labelled Tc row (no hold-out).
tc_params = {
    "max_depth": 12,
    "learning_rate": 0.044525558492377486,
    "n_estimators": 214,
    "subsample": 0.7463295028003523,
    "colsample_bytree": 0.9488211584814711,
    "reg_alpha": 1.1468116137572995e-08,
    "reg_lambda": 0.000531880411719889,
    "min_child_weight": 9,
    "random_state": cfg.random_state,
}
tc_model = xgb.XGBRegressor(**tc_params)
tc_model.fit(x_tc_data, y_tc_data)

In [34]:
# Fixed XGBoost settings for the Rg model; the fit uses every labelled Rg row (no hold-out).
rg_params = {
    "max_depth": 15,
    "learning_rate": 0.04036647507368534,
    "n_estimators": 685,
    "subsample": 0.6711141702316268,
    "colsample_bytree": 0.7509012395639328,
    "reg_alpha": 0.013833090657544508,
    "reg_lambda": 1.9887375521322532,
    "min_child_weight": 1,
    "random_state": cfg.random_state,
}
rg_model = xgb.XGBRegressor(**rg_params)
rg_model.fit(x_rg_data, y_rg_data)

In [35]:
# Fixed XGBoost settings for the Density model; the fit uses every labelled Density row (no hold-out).
den_params = {
    "max_depth": 9,
    "learning_rate": 0.02537552871578587,
    "n_estimators": 675,
    "subsample": 0.6829242277806739,
    "colsample_bytree": 0.7798811657441316,
    "reg_alpha": 0.0715796172388684,
    "reg_lambda": 5.597600517185866e-06,
    "min_child_weight": 2,
    "random_state": cfg.random_state,
}
den_model = xgb.XGBRegressor(**den_params)
den_model.fit(x_density_data, y_density_data)

In [36]:
# Fixed XGBoost settings for the FFV model; the fit uses every labelled FFV row (no hold-out).
ffv_params = {
    "max_depth": 8,
    "learning_rate": 0.04217144634180374,
    "n_estimators": 913,
    "subsample": 0.6200728653565712,
    "colsample_bytree": 0.8134754834293325,
    "reg_alpha": 0.013713096174599305,
    "reg_lambda": 6.242728863633394e-05,
    "min_child_weight": 2,
    "random_state": cfg.random_state,
}
ffv_model = xgb.XGBRegressor(**ffv_params)
ffv_model.fit(x_ffv_data, y_ffv_data)

In [37]:
# Keep only the model inputs. errors='ignore' makes the cell safe to re-run:
# without it, a second run raises KeyError because the columns are already gone.
test_df = test_df.drop(columns=['id', 'SMILES'], errors='ignore')
# Preview only the first rows rather than rendering the whole frame.
test_df.head()

Unnamed: 0,MolWt,HeavyAtomCount,RingCount,NumRotatableBonds,AromaticRingCounts,NumAliphaticRings,LogP,TPSA,NumHAcceptors,NumHDonors,...,Morgan_fp1014,Morgan_fp1015,Morgan_fp1016,Morgan_fp1017,Morgan_fp1018,Morgan_fp1019,Morgan_fp1020,Morgan_fp1021,Morgan_fp1022,Morgan_fp1023
0,540.463,39,4,8,4,0,7.3603,43.18,4,0,...,0,0,0,0,0,0,0,0,0,0
1,510.589,39,5,9,5,0,7.2845,52.6,4,0,...,0,0,0,0,0,0,0,0,0,0
2,586.644,44,6,13,4,2,6.1875,93.22,6,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Run each per-target model on the same test feature matrix.
tg_predx, tc_predx, rg_predx, den_predx, ffv_predx = (
    model.predict(test_df)
    for model in (tg_model, tc_model, rg_model, den_model, ffv_model)
)

In [39]:
sub_df = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv')
sub = sub_df.copy()

# Fill each target column of the submission template with the matching model's predictions.
# NOTE(review): values are assigned by position, so this assumes the template rows are in
# the same order as the test rows - confirm.
predictions_by_target = {
    'Tg': tg_predx,
    'FFV': ffv_predx,
    'Tc': tc_predx,
    'Density': den_predx,
    'Rg': rg_predx,
}
for target, preds in predictions_by_target.items():
    sub[target] = preds

In [40]:
sub

Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,165.10701,0.371591,0.208152,1.237848,24.885931
1,1422188626,189.709351,0.376959,0.258537,1.106354,20.149303
2,2032016830,75.977371,0.351827,0.314886,1.09453,19.996471


In [41]:
# index=False: without it pandas writes the row index as an extra unnamed first column,
# so the file would no longer have the id,Tg,FFV,Tc,Density,Rg layout of the template.
sub.to_csv('submission.csv', index=False)