In [1]:
import pandas as pd

# Load the band-gap table from disk and report its (rows, columns) shape.
csv_path = "datasets/expt_gap.csv"
df = pd.read_csv(csv_path)
print(df.shape)

(2154, 35)


In [2]:
# Separate features and target variable.
# 'composition' and 'gap expt' (the target) must exist, so they are dropped
# strictly: a missing column still raises KeyError.
# 'predicted_gap' and 'percentage_error' are written back into `df` by a later
# cell. If this cell is re-run after that one, they would otherwise become
# input features and leak the target into X. errors='ignore' makes the second
# drop a no-op on a fresh kernel, where those columns do not exist yet.
X = (
    df.drop(columns=['composition', 'gap expt'])
      .drop(columns=['predicted_gap', 'percentage_error'], errors='ignore')
)
y = df['gap expt']

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Baseline model: standardize every feature, then fit ordinary least squares.
# The step names ("scaler", "model") are used later to look up the fitted
# estimator via pipeline.named_steps.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", LinearRegression()),
    ]
)


In [4]:
from sklearn.model_selection import KFold

# 5-fold splitter; rows are shuffled with a fixed seed so the fold
# assignment is reproducible between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
# Cross-validation below only ever sees the remaining 80% (X_train / y_train).
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [6]:
from sklearn.model_selection import cross_validate

# Metrics evaluated on every fold. `scoring` is reused by the Ridge
# cross-validation further down, so it stays a module-level name.
scoring = ["r2", "neg_mean_squared_error"]

# Cross-validate the baseline pipeline on the training portion only.
cv_results = cross_validate(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=kf,
    return_train_score=True,
)


In [7]:
import numpy as np

# Per-fold validation scores from the cross-validation run.
# The MSE scorer is "neg_mean_squared_error", so its sign is flipped back
# to obtain a positive MSE.
r2_test = cv_results["test_r2"]
mse_test = np.negative(cv_results["test_neg_mean_squared_error"])

# R² across folds
print("R² per fold:", r2_test)
print("Mean R²:", np.mean(r2_test))
print("Std R²:", np.std(r2_test))

# MSE across folds
print("MSE per fold:", mse_test)
print("Mean MSE:", np.mean(mse_test))

R² per fold: [0.58190029 0.5734786  0.66259049 0.63570376 0.66874118]
Mean R²: 0.6244828656110475
Std R²: 0.03987883659583335
MSE per fold: [1.06117131 0.76807244 0.62725405 0.81577935 0.77439711]
Mean MSE: 0.8093348523499948


In [None]:
# Final fit: train the baseline pipeline on the whole training split
# (the held-out test rows are not used here).
pipeline.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('scaler', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [10]:
from sklearn.metrics import r2_score, mean_squared_error

# Score the fitted baseline pipeline on the held-out test split.
y_pred = pipeline.predict(X_test)

test_r2 = r2_score(y_true=y_test, y_pred=y_pred)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print("Test R²:", test_r2)
print("Test MSE:", test_mse)

Test R²: 0.6725403959990206
Test MSE: 0.6809638131362352


In [12]:
# Inspect the fitted linear-model coefficients, labelled by feature column.
# The pipeline runs StandardScaler before the model, so these are
# coefficients on standardized features. The sort is by signed value
# (largest positive first), so strongly negative coefficients end up at
# the bottom of the listing.
linear_model = pipeline.named_steps["model"]
coefs = pd.Series(linear_model.coef_, index=X.columns)
coefs = coefs.sort_values(ascending=False)

print(coefs)

MagpieData range Electronegativity      0.783618
MagpieData avg_dev NpValence            0.663122
MagpieData minimum Column               0.601301
MagpieData mean NpValence               0.504271
MagpieData mean NpUnfilled              0.453120
MagpieData range NpUnfilled             0.298877
MagpieData mean Electronegativity       0.281591
MagpieData avg_dev NValence             0.214806
MagpieData maximum CovalentRadius       0.199316
MagpieData avg_dev NUnfilled            0.160562
MagpieData mode Electronegativity       0.127174
MagpieData mean NsValence               0.121643
MagpieData minimum NsValence            0.048970
MagpieData range CovalentRadius         0.022793
MagpieData maximum Row                  0.013869
MagpieData mode NsValence              -0.012786
MagpieData avg_dev Column              -0.023279
MagpieData maximum NsValence           -0.061391
MagpieData range NValence              -0.098324
MagpieData maximum NpUnfilled          -0.107937
MagpieData minimum N

In [14]:
# Predict a gap for every row of the full feature table and store it next
# to the measured value. NOTE: X contains both the training and the test
# rows, so these predictions are a mix of in-sample and out-of-sample.
df['predicted_gap'] = pipeline.predict(X)

# Absolute error relative to the measured gap, in percent.
# A measured gap of exactly 0 would make the ratio inf (or NaN for 0/0) and
# distort any later aggregate such as a mean or max. Those denominators are
# masked to NaN so the percentage is reported as missing where it is
# undefined.
# NOTE(review): whether 'gap expt' actually contains zeros is not visible
# here; the guard is a no-op if it does not.
abs_error = (df['predicted_gap'] - df['gap expt']).abs()
nonzero_gap = df['gap expt'].replace(0, np.nan)
df['percentage_error'] = abs_error / nonzero_gap * 100

In [18]:
import plotly.express as px
import plotly.graph_objects as go 

# Scatter of predicted vs. measured gap, one point per row of `df`,
# coloured by the percentage error and labelled on hover with the composition.
fig = px.scatter(
    data_frame=df,
    x="predicted_gap",
    y="gap expt",
    color="percentage_error",
    hover_name="composition",
    title="Predicted vs Experimental Band Gaps",
    color_continuous_scale=px.colors.sequential.Bluered,
    width=1200,
    height=800,
)

# Dashed red y = x diagonal from (0, 0) to (10, 10): points on this line
# are perfect predictions. It is kept out of the legend.
reference_line = go.Scatter(
    x=[0, 10],
    y=[0, 10],
    mode="lines",
    line={"color": "red", "dash": "dash"},
    showlegend=False,
)

fig.add_trace(reference_line)
fig.show()

In [19]:
# Add ridge
from sklearn.linear_model import Ridge

# Same two-step layout as the baseline pipeline, with the regressor
# swapped for Ridge (L2-regularized least squares, alpha=1.0).
ridge_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", Ridge(alpha=1.0, random_state=42)),
    ]
)

In [20]:
# Cross-validate the Ridge pipeline with the same folds (`kf`) and the same
# metrics (`scoring` = ["r2", "neg_mean_squared_error"]) as the baseline,
# so the two sets of fold scores are directly comparable.
ridge_cv_results = cross_validate(
    estimator=ridge_pipeline,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=kf,
    return_train_score=True,
)

In [21]:
# Per-fold validation scores for Ridge. The MSE scorer is negated
# ("neg_mean_squared_error"), so the sign is flipped back to a positive MSE.
ridge_r2 = ridge_cv_results["test_r2"]
ridge_mse = np.negative(ridge_cv_results["test_neg_mean_squared_error"])

In [22]:
# R² across folds (Ridge)
print("Ridge CV R² per fold:", ridge_r2)
print("Ridge CV Mean R²:", np.mean(ridge_r2))
print("Ridge CV Std R²:", np.std(ridge_r2))

# MSE across folds (Ridge)
print("Ridge CV MSE per fold:", ridge_mse)
print("Ridge CV Mean MSE:", np.mean(ridge_mse))

Ridge CV R² per fold: [0.58282963 0.57570851 0.66276975 0.63487183 0.66840304]
Ridge CV Mean R²: 0.6249165533816339
Ridge CV Std R²: 0.03902764752578866
Ridge CV MSE per fold: [1.05881257 0.76405686 0.6269208  0.81764232 0.7751876 ]
Ridge CV Mean MSE: 0.8085240279423355


In [23]:
# Fit Ridge on the whole training split, then predict the held-out test rows.
# fit() returns the fitted pipeline itself, so the two calls can be chained.
y_test_pred_ridge = ridge_pipeline.fit(X_train, y_train).predict(X_test)


In [24]:
# Held-out test metrics for the Ridge pipeline.
ridge_test_r2 = r2_score(y_true=y_test, y_pred=y_test_pred_ridge)
ridge_test_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred_ridge)

print("Ridge Test R²:", ridge_test_r2)
print("Ridge Test MSE:", ridge_test_mse)

Ridge Test R²: 0.6727077957138345
Ridge Test MSE: 0.6806156995163419


In [25]:
# Fitted Ridge coefficients, labelled by feature column. As with the
# baseline, they apply to standardized features and are sorted by signed
# value (largest positive first), not by magnitude.
ridge_model = ridge_pipeline.named_steps["model"]
ridge_coefs = pd.Series(ridge_model.coef_, index=X.columns)
ridge_coefs = ridge_coefs.sort_values(ascending=False)

print(ridge_coefs)

MagpieData range Electronegativity      0.754878
MagpieData avg_dev NpValence            0.650504
MagpieData minimum Column               0.582466
MagpieData mean NpValence               0.495549
MagpieData mean NpUnfilled              0.441664
MagpieData range NpUnfilled             0.284949
MagpieData mean Electronegativity       0.259461
MagpieData avg_dev NValence             0.212287
MagpieData maximum CovalentRadius       0.205704
MagpieData avg_dev NUnfilled            0.161035
MagpieData mode Electronegativity       0.131814
MagpieData mean NsValence               0.119816
MagpieData minimum NsValence            0.050061
MagpieData range CovalentRadius         0.018152
MagpieData maximum Row                  0.011737
MagpieData mode NsValence              -0.011536
MagpieData avg_dev Column              -0.018139
MagpieData maximum NsValence           -0.061136
MagpieData maximum NpUnfilled          -0.094050
MagpieData range NValence              -0.097767
MagpieData minimum N