In [10]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold,cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge

In [2]:
# Directory that holds the input CSV files. Set the MOF_DATA_DIR environment
# variable to point somewhere else; the fallback is the location this notebook
# was originally run from, so existing setups keep working unchanged.
DATA_DIR = os.environ.get('MOF_DATA_DIR', 'C:/Users/cqyzxy/Downloads')


def _read_feature_csv(filename):
    """Load one comma-separated feature-set file from DATA_DIR as a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, filename), sep=',')


# Both lists are rebuilt from scratch here, so re-running this cell is safe.
# feature_set_labels[k] is the display name of feature_sets[k].
feature_sets = []
feature_set_labels = []

# import three base feature sets
stoich45_pcs = _read_feature_csv('Stoich45_PCA_dataset.csv')
# ... rename principal component columns to be more descriptive
# (columns '0'..'7' become 'stoich45 PC 1'..'stoich45 PC 8')
stoich45_pcs = stoich45_pcs.rename(
    columns={str(i): f'stoich45 PC {i + 1}' for i in range(8)}
)
feature_sets.append(stoich45_pcs)
feature_set_labels.append('Stoich45 PCs')

feature_sets.append(_read_feature_csv('Stoich45_FeatureSelected_dataset.csv'))
feature_set_labels.append('Stoich45 intersection')

feature_sets.append(_read_feature_csv('SCM_PCA_trainingStoich45_dataset.csv'))
feature_set_labels.append('SCM PCs')

# merge to form last two
# NOTE(review): merge() is called without `on=`, so pandas performs an inner
# join on every column name the two frames have in common -- presumably 'MOF'
# and the band-gap column; confirm against the CSV headers. Rows without an
# exact match on all shared columns are dropped silently.
feature_sets.append(feature_sets[0].merge(feature_sets[2]))
feature_set_labels.append('Stoich45 PCs + SCM PCs')

feature_sets.append(feature_sets[1].merge(feature_sets[2]))
feature_set_labels.append('Stoich45 intersection + SCM PCs')

# drop MOF column and rename target in all feature sets
feature_sets = [
    fs.drop(columns=['MOF']).rename(columns={'outputs.hse06.bandgap': 'HSE06 Bandgap'})
    for fs in feature_sets
]



In [3]:
# Column holding the regression target in each feature-set DataFrame.
target = 'HSE06 Bandgap'

# A single shuffled 4-fold splitter with a fixed seed, shared by every call to
# get_mean_cv_mse below.
kfold = KFold(n_splits=4, shuffle=True, random_state=1234)


def get_mean_cv_mse(model, df_feature_set):
    """Return the mean cross-validated mean squared error of a model.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.
    df_feature_set : pandas.DataFrame
        Frame containing the ``target`` column; every other column is used
        as a predictor.

    Returns
    -------
    float
        Mean squared error averaged over the folds of ``kfold``.
    """
    predictors = df_feature_set.drop(columns=[target])
    response = df_feature_set[target]

    fold_scores = cross_val_score(
        model,
        predictors,
        response,
        scoring='neg_mean_squared_error',
        cv=kfold,
        n_jobs=4,
    )
    # The scorer reports the *negated* MSE, so flip the sign of the average.
    return -fold_scores.mean()



# K-Nearest Neighbors Regression (KNR)

In [None]:
# Mean cross-validation MSE of a 5-nearest-neighbours regressor, one entry per
# feature set (same order as feature_set_labels).
# NOTE(review): the features are passed to this distance-based model as-is,
# with no scaling step in this cell -- confirm that is intended.
knn_cv_mse = np.zeros(len(feature_sets))

header = f"{'Feature Set':<35}{'Cross-Validation MSE'}"
print(header)

for i, (label, fs) in enumerate(zip(feature_set_labels, feature_sets)):
    mse = get_mean_cv_mse(KNeighborsRegressor(n_neighbors=5), fs)
    knn_cv_mse[i] = mse
    print(f"{label:<35}{round(knn_cv_mse[i], 6)}")
    

Feature Set                        Cross-Validation MSE
Stoich45 PCs                       0.682927
Stoich45 intersection              0.699272
SCM PCs                            0.897325
Stoich45 PCs + SCM PCs             0.682806
Stoich45 intersection + SCM PCs    0.699215


# Support Vector Regression (SVR)

In [None]:
# Mean cross-validation MSE of an SVR with default settings, one entry per
# feature set (same order as feature_set_labels).
# NOTE(review): the features are passed to SVR as-is, with no scaling step in
# this cell -- confirm that is intended.
svr_cv_mse = np.zeros(len(feature_sets))

header = f"{'Feature Set':<35}{'Cross-Validation MSE'}"
print(header)

for i, (label, fs) in enumerate(zip(feature_set_labels, feature_sets)):
    mse = get_mean_cv_mse(SVR(), fs)
    svr_cv_mse[i] = mse
    print(f"{label:<35}{round(svr_cv_mse[i], 6)}")
    


Feature Set                        Cross-Validation MSE
Stoich45 PCs                       0.63584
Stoich45 intersection              0.810698
SCM PCs                            0.778956
Stoich45 PCs + SCM PCs             0.635658
Stoich45 intersection + SCM PCs    0.813524


# Bayesian Ridge Regression

In [15]:
# Per-feature-set results (same order as feature_set_labels):
#   bayesridge_cv_mse -- mean cross-validation MSE
#   alpha_values      -- alpha_ estimated by a fit on the full feature set
#   lambda_values     -- lambda_ estimated by a fit on the full feature set
n_sets = len(feature_sets)
bayesridge_cv_mse = np.zeros(n_sets)
alpha_values = np.zeros(n_sets)
lambda_values = np.zeros(n_sets)

print(
    f"{'Feature Set':<35}{'Cross-Validation MSE':<25}"
    f"{'Estimated Alpha':<20}{'Estimated Lambda'}"
)

for i, (label, fs) in enumerate(zip(feature_set_labels, feature_sets)):
    X, y = fs.drop(columns=[target]), fs[target]

    # Fit on all rows only to read the estimated hyperparameters.
    model = BayesianRidge().fit(X, y)
    alpha_values[i] = model.alpha_
    lambda_values[i] = model.lambda_

    # cross_val_score fits clones of the estimator, so the CV score does not
    # depend on the full-data fit above.
    bayesridge_cv_mse[i] = get_mean_cv_mse(model, fs)

    row = (
        f"{label:<35}"
        f"{round(bayesridge_cv_mse[i], 6):<25}"
        f"{round(alpha_values[i], 6):<20}"
        f"{round(lambda_values[i], 6)}"
    )
    print(row)
    


Feature Set                        Cross-Validation MSE     Estimated Alpha     Estimated Lambda
Stoich45 PCs                       0.803307                 1.247673            129.998887
Stoich45 intersection              0.687404                 1.466944            18.760048
SCM PCs                            0.862288                 1.164813            0.00403
Stoich45 PCs + SCM PCs             0.752608                 1.336373            0.011948
Stoich45 intersection + SCM PCs    0.683951                 1.50771             0.047704


- $\alpha$: precision of the noise (inverse of the noise variance). A high value suggests low noise in the data.
- $\lambda$: precision of the weights (inverse of the weight variance). A high value suggests stronger regularization, shrinking the weights toward zero.
