In [55]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import lightgbm as lgb

# Seed shared by the train/validation split and the models so results are repeatable.
RANDOM_SEED = 42

train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# Separate the regression target from the predictors.
# NOTE(review): every other column, including `ID`, stays in X and is fed to
# the model as a feature -- confirm `ID` is not merely a row identifier.
y = train["log_pSat_Pa"]
X = train.drop(columns=["log_pSat_Pa"])

non_numeric_cols = X.select_dtypes(include=['object']).columns

# Learn the category levels from the TRAINING data only and apply the same
# levels to both frames. Encoding each frame independently with
# drop_first=True would drop a different baseline level whenever the two
# files do not contain exactly the same categories, which silently shifts the
# meaning of the dummy columns in `test`. With a shared categorical dtype both
# frames produce the same dummy columns; a level that appears only in `test`
# becomes missing and is encoded as all zeros.
category_dtypes = {
    col: pd.CategoricalDtype(pd.Categorical(X[col]).categories)
    for col in non_numeric_cols
}
X = pd.get_dummies(X.astype(category_dtypes), columns=non_numeric_cols, drop_first=True)
test = pd.get_dummies(test.astype(category_dtypes), columns=non_numeric_cols, drop_first=True)

# Safety net: force `test` onto exactly the training columns (same order).
# Columns absent from `test` are created as NaN by align and then set to 0;
# note that fillna(0) also zero-fills any other missing values in `test`.
X, test = X.align(test, join='left', axis=1)
test = test.fillna(0)

# Hold out 20% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

Finding out the most important features

In [56]:
# How many of the highest-ranked features to display. The same constant feeds
# the printed heading, so the label can no longer disagree with the number of
# rows shown (the heading previously said 15 while 10 rows were printed).
TOP_N_FEATURES = 10

# Baseline gradient-boosted model used to rank the features.
lgb_model = lgb.LGBMRegressor(
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.1,
    n_estimators=200,
    random_state=RANDOM_SEED
)

# The validation split is passed as eval_set so RMSE is evaluated on it during training.
lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse')

# One importance value per training column, in column order.
feature_importance = lgb_model.feature_importances_
feature_names = X_train.columns

# Pair names with importances and rank from most to least important.
most_important_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

print(f"Top {TOP_N_FEATURES} most important features:")
print(most_important_df.head(TOP_N_FEATURES))

# Goodness of fit on the held-out validation rows.
y_pred = lgb_model.predict(X_val)
print(f"R2 Score on Validation Set: {r2_score(y_val, y_pred):.4f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001382 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 925
[LightGBM] [Info] Number of data points in the train set: 21309, number of used features: 28
[LightGBM] [Info] Start training from score -5.539761
Top 15 most important features:
             Feature  Importance
7          NumOfConf         918
0                 ID         746
8      NumOfConfUsed         692
1                 MW         487
3             NumOfC         354
6     NumHBondDonors         310
2         NumOfAtoms         288
11  hydroxyl (alkyl)         270
13            ketone         240
4             NumOfO         210
R2 Score on Validation Set: 0.7432


Finding out the least important features:

In [57]:
# Rank the baseline model's importances from lowest to highest.
# The ranking is derived from `most_important_df` (the baseline table) rather
# than from the `feature_names` / `feature_importance` globals, because the
# re-training cell further down reassigns those globals; re-running this cell
# afterwards would otherwise rank the wrong model's importances.
# The feature name is a secondary sort key so ties are ordered deterministically.
least_important_df = most_important_df.sort_values(
    by=['Importance', 'Feature'], ascending=True
)

# Select every feature the model never split on, instead of a hard-coded
# head(4): with many ties at zero, a fixed row count would keep an arbitrary
# subset of the zero-importance features.
zero_importance_df = least_important_df[least_important_df['Importance'] == 0]

# Display least important features
print("Least important features:")
print(zero_importance_df)

# Plain list of column names, ready to be passed to DataFrame.drop.
least_important_names = zero_importance_df['Feature'].tolist()
print(least_important_names)

Least important features:
                              Feature  Importance
27         parentspecies_apin_toluene           0
26  parentspecies_apin_decane_toluene           0
29       parentspecies_decane_toluene           0
19                  aromatic hydroxyl           0
['parentspecies_apin_toluene', 'parentspecies_apin_decane_toluene', 'parentspecies_decane_toluene', 'aromatic hydroxyl']


After finding the least important features, we can re-train the model without them and change the values of some hyperparameters.

In [109]:
# Remove the features flagged as least important from the training matrix.
X_reduced = X.drop(columns=least_important_names)

# Keep `test` on exactly the reduced training columns; any column that align
# has to create in `test` is NaN and is zero-filled right after.
X_reduced, test = X_reduced.align(test, join='left', axis=1)
test = test.fillna(0)

# Same 80/20 split and seed as before, now on the reduced feature set.
X_train, X_val, y_train, y_val = train_test_split(
    X_reduced,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# Hyperparameters for the re-trained model, collected in one place.
retrain_params = dict(
    boosting_type='gbdt',
    num_leaves=50,
    learning_rate=0.1,
    n_estimators=100,
    random_state=RANDOM_SEED,
)
lgb_model = lgb.LGBMRegressor(**retrain_params)

validation_set = [(X_val, y_val)]
lgb_model.fit(X_train, y_train, eval_set=validation_set, eval_metric='rmse')

# Refresh the importance globals so they describe the re-trained model.
feature_names = X_train.columns
feature_importance = lgb_model.feature_importances_

# Score the re-trained model on the validation rows.
y_pred = lgb_model.predict(X_val)
retrained_r2 = r2_score(y_val, y_pred)
print(f"R2 Score on Re-Trained Validation Set: {retrained_r2:.4f}")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001174 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 923
[LightGBM] [Info] Number of data points in the train set: 21309, number of used features: 27
[LightGBM] [Info] Start training from score -5.539761
R2 Score on Re-Trained Validation Set: 0.7447


After removing the features with zero importance, raising the num_leaves parameter from 31 to 50, and lowering n_estimators from 200 to 100, the R2 score rose from 0.7432 to 0.7447, i.e. an increase of 0.0015. Experimenting with other parameters produced no further improvement in the R2 score.