In [1]:
import pandas as pd

# Read the experimental band-gap table from disk and report its
# (rows, columns) shape as a quick sanity check.
csv_path = "datasets/expt_gap.csv"
df = pd.read_csv(csv_path)
print(df.shape)

(2154, 35)


In [2]:
# Target is the 'gap expt' column; the feature matrix is every other
# column except the 'composition' identifier.
non_feature_cols = ['composition', 'gap expt']
y = df['gap expt']
X = df.drop(columns=non_feature_cols)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# Scorer names passed to cross-validation: R² and negated mean squared error.
scoring = ["r2", "neg_mean_squared_error"]

# Single-step pipeline around a 300-tree random forest with a fixed seed.
# The step is named "model" so the fitted forest can be looked up later.
rf_model = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
rf_pipeline = Pipeline(steps=[("model", rf_model)])

In [7]:
from sklearn.model_selection import KFold

# 5-fold splitter; rows are shuffled with a fixed seed so the folds are
# the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
# Cross-validate the pipeline on the training split only, using the
# shared fold splitter and the shared list of scorers.
rf_cv_results = cross_validate(
    estimator=rf_pipeline,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=kf,
    n_jobs=-1,
    return_train_score=True,
)

# The MSE scorer is reported negated ("neg_mean_squared_error"), so flip
# the sign back to get a plain MSE per fold.
rf_mse = -rf_cv_results["test_neg_mean_squared_error"]
rf_r2 = rf_cv_results["test_r2"]

# Per-fold values followed by their summary statistics.
cv_report = (
    ("RF CV R² per fold:", rf_r2),
    ("RF CV Mean R²:", rf_r2.mean()),
    ("RF CV Std R²:", rf_r2.std()),
    ("RF CV MSE per fold:", rf_mse),
    ("RF CV Mean MSE:", rf_mse.mean()),
)
for label, value in cv_report:
    print(label, value)

RF CV R² per fold: [0.75424475 0.73906364 0.83117549 0.81000551 0.76827089]
RF CV Mean R²: 0.7805520553146709
RF CV Std R²: 0.03462493033370116
RF CV MSE per fold: [0.62374696 0.46988974 0.31384965 0.42546028 0.5417225 ]
RF CV Mean MSE: 0.4749338269418192


In [9]:
from sklearn.metrics import r2_score, mean_squared_error

# Fit on the whole training split, then predict the held-out test split
# in one chained call (fit returns the fitted pipeline).
y_test_pred_rf = rf_pipeline.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions with the same two metrics used in CV.
test_r2 = r2_score(y_test, y_test_pred_rf)
test_mse = mean_squared_error(y_test, y_test_pred_rf)
print("RF Test R²:", test_r2)
print("RF Test MSE:", test_mse)

RF Test R²: 0.8095928597177398
RF Test MSE: 0.39595837382917676


In [10]:
# Pull the fitted forest out of the pipeline and pair each importance
# value with its feature column name, most important first.
fitted_forest = rf_pipeline.named_steps["model"]
rf_importances = pd.Series(fitted_forest.feature_importances_, index=X.columns)
rf_importances = rf_importances.sort_values(ascending=False)

print(rf_importances)

MagpieData maximum Electronegativity    2.951264e-01
MagpieData mean NValence                1.000156e-01
MagpieData mode Electronegativity       8.605140e-02
MagpieData range Electronegativity      6.257233e-02
MagpieData avg_dev Column               5.974381e-02
MagpieData mean CovalentRadius          4.834121e-02
MagpieData minimum NValence             4.505084e-02
MagpieData avg_dev NpValence            2.711576e-02
MagpieData mean Electronegativity       2.576098e-02
MagpieData avg_dev CovalentRadius       2.509705e-02
MagpieData mean NUnfilled               2.509583e-02
MagpieData avg_dev NValence             2.338716e-02
MagpieData mean NpUnfilled              1.606387e-02
MagpieData avg_dev NpUnfilled           1.555055e-02
MagpieData avg_dev NUnfilled            1.539735e-02
MagpieData maximum NValence             1.423442e-02
MagpieData mean NpValence               1.262847e-02
MagpieData maximum NpUnfilled           1.220654e-02
MagpieData maximum NUnfilled            1.1684

In [11]:
import numpy as np

# Predict a gap for every row of the table (training and test rows alike)
# so the whole dataset can be plotted.
# NOTE(review): rows that were used to fit the model get in-sample
# predictions, which will look more accurate than the held-out rows.
# NOTE: this adds columns to `df` in place. Re-running the feature-selection
# cell (which drops only 'composition' and 'gap expt') after this one would
# pull these new columns into X, so re-run from the top instead.
df['predicted_gap'] = rf_pipeline.predict(X)

# Absolute error relative to the measured gap, in percent. A measured gap of
# exactly 0 has no defined relative error; mask those denominators to NaN so
# the result is NaN rather than inf, which would distort any colour scale
# built from this column.
measured_gap = df['gap expt']
abs_error = (df['predicted_gap'] - measured_gap).abs()
df['percentage_error'] = abs_error / measured_gap.where(measured_gap != 0) * 100

In [12]:
import plotly.express as px
import plotly.graph_objects as go 

# Parity plot: one point per row, predicted value on x and measured value on
# y, coloured by percentage error, with the composition shown on hover.
fig = px.scatter(
    df,
    x="predicted_gap",
    y="gap expt",
    hover_name="composition",
    color="percentage_error",
    title="Predicted vs Experimental Band Gaps",
    color_continuous_scale=px.colors.sequential.Bluered,
    width=1200,
    height=800,
)

# Dashed y = x reference line (perfect prediction). It always reaches at
# least 10, and is extended when any predicted or measured value is larger,
# so the line never stops short of the data.
data_max = df[["predicted_gap", "gap expt"]].max().max()
line_end = float(max(10, data_max))  # max() keeps 10 if data_max is NaN
reference_line = go.Scatter(
    x=[0, line_end],
    y=[0, line_end],
    line=dict(color='red', dash='dash'),
    mode="lines",
    showlegend=False
)

fig.add_trace(reference_line)
fig.show()