In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.decomposition import PCA

In [3]:
# Load the processed MOF table and drop the 'MOFname' column so that it is not
# carried into the feature matrix.
df = pd.read_csv('../data/processed_MOFs.csv').drop(columns='MOFname')

# Split the table into the regression target (Y) and the feature columns (X).
target_column = 'CO2_uptake_P0.15bar_T298K [mmol/g]'
Y = df[target_column]
X = df.drop(columns=target_column)

# Z-score every feature column: subtract the column mean and divide by the
# column standard deviation (pandas' default, i.e. the sample std with ddof=1).
# NOTE(review): a constant column would have std == 0 and turn into NaN here.
feature_means = X.mean(axis=0)
feature_stds = X.std(axis=0)
X_scaled = (X - feature_means) / feature_stds

In [4]:
# Project the standardized features onto their first 4 principal components.
# random_state pins the result in case scikit-learn selects a randomized SVD
# solver for this data size; it has no effect on the exact solvers.
pca = PCA(4, random_state=42)
pca_values = pca.fit_transform(X_scaled)

# Wrap the component scores in a DataFrame named PC1..PCn.
# The index of X_scaled is carried over explicitly: later cells look rows up
# by label (Y.loc[...], X_scaled.loc[...]) using the index of samples drawn
# from X_pca, so the labels here must match those of the source rows.
component_names = [f"PC{i+1}" for i in range(pca_values.shape[1])]
X_pca = pd.DataFrame(pca_values, columns=component_names, index=X_scaled.index)

In [25]:
# Work on a reproducible 1% random sample of the PCA features to keep the
# 5-fold cross-validation of every model cheap; y is the matching target rows.
x = X_pca.sample(frac=0.01, random_state=42)
y = Y.loc[x.index]

# Candidate regressors, all with default hyperparameters.
# Estimators with internal randomness are seeded (same seed as the sampling
# above) so the MAE table printed below is reproducible across kernel restarts.
# This list is reused as-is by the following cell.
models = [
    RandomForestRegressor(random_state=42),
    XGBRegressor(random_state=42),
    LinearRegression(),
    SVR(),
    SGDRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    ElasticNet(),
]

for model in models:
    # The scorer returns the *negated* MAE (higher is better), one per fold.
    fold_scores = cross_val_score(model, x, y, cv=5, scoring='neg_mean_absolute_error')
    score = -fold_scores.mean()  # flip the sign back to a plain MAE
    print(f"{model.__class__.__name__}: Mean MAE = {score:.4f}")



RandomForestRegressor: Mean MAE = 0.2270
XGBRegressor: Mean MAE = 0.2434
LinearRegression: Mean MAE = 0.2725
SVR: Mean MAE = 0.2283
SGDRegressor: Mean MAE = 0.2736
GradientBoostingRegressor: Mean MAE = 0.2373
ElasticNet: Mean MAE = 0.3521


In [26]:
# Same rows as the PCA sample `x`, but with every standardized feature instead
# of the principal components.
sampled_features = X_scaled.loc[x.index]

# Strip '[', ']' and '<' from the column names.
# NOTE(review): presumably required because XGBoost rejects feature names
# containing these characters - confirm.
clean_names = sampled_features.columns.str.replace(r'[\[\]<]', '', regex=True)
x_all = sampled_features.rename(columns=dict(zip(sampled_features.columns, clean_names)))

# Repeat the 5-fold cross-validation for each model on the full feature set.
for model in models:
    fold_scores = cross_val_score(
        model,
        x_all,
        y,
        scoring='neg_mean_absolute_error',
        cv=5,
    )
    # The scorer yields negated MAE values; negate the mean to report MAE.
    score = -fold_scores.mean()
    print(f"{model.__class__.__name__}: Mean MAE = {score:.4f}")


RandomForestRegressor: Mean MAE = 0.1953
XGBRegressor: Mean MAE = 0.2014
LinearRegression: Mean MAE = 0.2747
SVR: Mean MAE = 0.2102
SGDRegressor: Mean MAE = 0.2740
GradientBoostingRegressor: Mean MAE = 0.2004
ElasticNet: Mean MAE = 0.3737
