In [None]:
# Attach Google Drive to the Colab filesystem so later cells can read and
# write files below the mount point.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)


In [2]:
import pandas as pd

# Read the training table from the mounted Drive folder.
train_path = '/content/drive/MyDrive/melting-point/train.csv'
train_df = pd.read_csv(train_path)

# Report (rows, columns), then let the first five rows render as the
# cell's rich output.
train_shape = train_df.shape
print("Shape of train data:", train_shape)
train_df.head(5)


Shape of train data: (2662, 427)


Unnamed: 0,id,SMILES,Tm,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,...,Group 415,Group 416,Group 417,Group 418,Group 419,Group 420,Group 421,Group 422,Group 423,Group 424
0,2175,FC1=C(F)C(F)(F)C1(F)F,213.15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1222,c1ccc2c(c1)ccc3Nc4ccccc4c23,407.15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2994,CCN1C(C)=Nc2ccccc12,324.15,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1704,CC#CC(=O)O,351.15,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2526,CCCCC(S)C,126.15,2,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Separate model inputs from the target.
# 'id' and 'SMILES' are identifier/text columns and 'Tm' is the label, so
# none of them may appear in the feature matrix.
non_feature_cols = ['id', 'SMILES', 'Tm']
X = train_df.drop(non_feature_cols, axis=1)
y = train_df.loc[:, 'Tm']

# Sanity-check the resulting dimensions.
print("Feature matrix shape:", X.shape)
print("Target shape:", y.shape)


Feature matrix shape: (2662, 424)
Target shape: (2662,)


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical across re-runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts


In [6]:
!pip install lightgbm --quiet

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping, log_evaluation


# Baseline model
model = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='mae',
    callbacks=[early_stopping(50), log_evaluation(100)]
)

# Evaluate
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print("Validation MAE:", mae)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001266 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 2129, number of used features: 80
[LightGBM] [Info] Start training from score 277.791617
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l1: 44.1402	valid_0's l2: 3918.57
[200]	valid_0's l1: 43.0196	valid_0's l2: 3736.46
[300]	valid_0's l1: 42.5298	valid_0's l2: 3632.88
[400]	valid_0's l1: 42.1768	valid_0's l2: 3563.25
[500]	valid_0's l1: 42.1062	valid_0's l2: 3539.45
Early stopping, best iteration is:
[515]	valid_0's l1: 42.0297	valid_0's l2: 3530.84
Validation MAE: 42.029733912681245


In [7]:
# Load test set
test_path = '/content/drive/MyDrive/melting-point/test.csv'
test_df = pd.read_csv(test_path)

# Build the test feature matrix from exactly the columns the model is trained
# on, in the same order. Selecting by X.columns (instead of only dropping
# 'id' and 'SMILES') guards against reordered or extra columns in test.csv,
# which would otherwise be fed to the model positionally.
missing_cols = [col for col in X.columns if col not in test_df.columns]
if missing_cols:
    raise ValueError(
        f"test.csv is missing {len(missing_cols)} training feature column(s), "
        f"e.g. {missing_cols[:5]}"
    )
X_test = test_df[X.columns]

# Number of trees for the final fit: reuse the best iteration found by early
# stopping on the validation split instead of copying it by hand from the
# training log, so it stays correct when data, seed or parameters change.
# Fall back to the configured upper bound if no best iteration was recorded.
best_iter = model.best_iteration_
if not best_iter or best_iter <= 0:
    best_iter = model.n_estimators

# Re-train model on full data (for best performance), with the same
# hyperparameters as the validated baseline.
final_model = LGBMRegressor(
    n_estimators=best_iter,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# No eval_set here: every labelled row is used for training.
final_model.fit(X, y)

# Predict test set
test_preds = final_model.predict(X_test)

# Prepare submission: one row per test id with its predicted Tm.
submission = pd.DataFrame({
    'id': test_df['id'],
    'Tm': test_preds
})

# Save CSV (without the DataFrame index, so the file has only 'id' and 'Tm').
submission_path = '/content/drive/MyDrive/melting-point/submission.csv'
submission.to_csv(submission_path, index=False)

print("✅ Submission file saved at:", submission_path)
submission.head()


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002492 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 404
[LightGBM] [Info] Number of data points in the train set: 2662, number of used features: 94
[LightGBM] [Info] Start training from score 278.263449
✅ Submission file saved at: /content/drive/MyDrive/melting-point/submission.csv


Unnamed: 0,id,Tm
0,1022,350.343401
1,1146,291.416025
2,79,193.991538
3,2279,191.218102
4,1342,248.887308
