This notebook generates the cross-validation MSE and MAE, training MSE and MAE, and test MSE and MAE for two models to compare our modeling efforts against:
- predicting the HSE06 bandgap by the mean HSE06 bandgap
    - baseline: Our models should do a lot better than this.
- predicting the HSE06 bandgap as a linear function of the PBE bandgap
    - goalpost: Given the computational expense of computing the PBE bandgap, it would be great if our models matched or beat the accuracy of this model.

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error

##### import training and test bandgaps:

In [43]:
# Load the PBE and HSE06 bandgaps for exactly the MOFs in the Stoich45 training and test sets.
# Only the MOF id column is needed from the feature-set files, so it is requested via
# `usecols` (as is already done for qmof.csv) instead of parsing every feature column.

# import the MOF ids from one of the reduced, training feature sets
df_MOFs_train = pd.read_csv('../data/datasets_main/Stoich45_FeatureSelected_dataset.csv', sep=',', usecols=['MOF'])

# import the MOF ids from the test version of the Stoich45 feature sets
df_MOFs_test = pd.read_csv('../data/datasets_main/Stoich45_FeatureSelected_test_set.csv', sep=',', usecols=['MOF'])

# import qmof_ids and bandgaps from the original qmof csv
df_qmof = pd.read_csv('../data/QMOF/qmof_database/qmof_database/qmof.csv', sep=',', usecols=['qmof_id','outputs.pbe.bandgap','outputs.hse06.bandgap'])

# left-merge df_qmof onto df_MOFs_train to restrict to MOFs in the stoich45 training set and keep them
# in the training set's row order, then delete both id columns so only the two bandgap columns remain
df_bandgaps_train = df_MOFs_train.merge(df_qmof, left_on='MOF', right_on='qmof_id', how='left').drop(columns = ['MOF', 'qmof_id'])

# left-merge df_qmof onto df_MOFs_test to restrict to MOFs in the stoich45 test set and keep them
# in the test set's row order, then delete both id columns so only the two bandgap columns remain
df_bandgaps_test = df_MOFs_test.merge(df_qmof, left_on='MOF', right_on='qmof_id', how='left').drop(columns = ['MOF', 'qmof_id'])

##### make definitions for cross validation scoring:

In [50]:
# Column that every model in this notebook is trained to predict.
target = 'outputs.hse06.bandgap'

# Shared 4-fold splitter; shuffling with a fixed seed keeps the folds the same on every run.
kfold = KFold(
    n_splits=4,
    shuffle=True,
    random_state=1234,
)

def get_mean_cv_mse(model, df_feature_set):
    """Return the mean cross-validation MSE of `model` on `df_feature_set`.

    Every column other than the module-level `target` is used as a feature,
    and the folds come from the module-level `kfold` splitter.
    """
    features = df_feature_set.drop(columns=[target])
    labels = df_feature_set[target]
    # The scorer yields the negated MSE per fold; the sign is flipped back on return.
    neg_mse_per_fold = cross_val_score(
        model,
        X=features,
        y=labels,
        cv=kfold,
        scoring='neg_mean_squared_error',
        n_jobs=4,
    )
    return -neg_mse_per_fold.mean()

def get_mean_cv_mse_and_mae(model, df_feature_set):
    """Return `(mean CV MSE, mean CV MAE)` of `model` on `df_feature_set`.

    Every column other than the module-level `target` is used as a feature,
    and the folds come from the module-level `kfold` splitter.
    """
    features = df_feature_set.drop(columns=[target])
    labels = df_feature_set[target]
    cv_results = cross_validate(
        model,
        X=features,
        y=labels,
        cv=kfold,
        scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'],
        n_jobs=4,
    )
    # Both scorers are negated errors, so flip the sign to report plain MSE / MAE.
    mean_mse = -cv_results['test_neg_mean_squared_error'].mean()
    mean_mae = -cv_results['test_neg_mean_absolute_error'].mean()
    return (mean_mse, mean_mae)

def get_training_and_test_mse_and_mae(model, df_train, df_test):
    """Fit `model` on `df_train` and score it on both `df_train` and `df_test`.

    In each frame the module-level `target` column is the label and all
    remaining columns are features. `model` is fitted in place.

    Returns a dict with keys 'train MSE', 'train MAE', 'test MSE', 'test MAE'.
    """
    X_train = df_train.drop(columns=[target])
    y_train = df_train[target]
    model.fit(X_train, y_train)
    train_preds = model.predict(X_train)

    # The test frame is only touched after fitting, as in a real hold-out evaluation.
    X_test = df_test.drop(columns=[target])
    y_test = df_test[target]
    test_preds = model.predict(X_test)

    errors = {
        'train MSE': mean_squared_error(y_train, train_preds),
        'train MAE': mean_absolute_error(y_train, train_preds),
        'test MSE': mean_squared_error(y_test, test_preds),
        'test MAE': mean_absolute_error(y_test, test_preds)
    }
    return errors

#### baseline
What is the mean cross-validation MSE for the above k-fold split using just the constant predictor?

In [52]:
# Build frames that hold only the target plus a single all-ones feature column.
df_constant_only_train = df_bandgaps_train[[target]].assign(
    constant=np.ones_like(df_bandgaps_train[target])
)
df_constant_only_test = df_bandgaps_test[[target]].assign(
    constant=np.ones_like(df_bandgaps_test[target])
)

# With no intercept and one all-ones feature, least squares fits a single constant:
# the mean of the target over whatever rows the model is fitted on.
model = LinearRegression(fit_intercept=False)
cv_mse_no_features, cv_mae_no_features = get_mean_cv_mse_and_mae(
    model, df_constant_only_train
)
train_test_errors = get_training_and_test_mse_and_mae(
    model, df_constant_only_train, df_constant_only_test
)

# One print call; sep='\n' puts each entry on its own line and the empty string
# yields the blank line between the MSE and MAE groups.
print(
    'Predicting the HSE06 bandgap by its mean:\n',
    f"{'Cross-Validation MSE = ':>25}{cv_mse_no_features}",
    f"{'Training MSE = ':>25}{train_test_errors['train MSE']}",
    f"{'Test MSE = ':>25}{train_test_errors['test MSE']}",
    '',
    f"{'Cross-Validation MAE = ':>25}{cv_mae_no_features}",
    f"{'Training MAE = ':>25}{train_test_errors['train MAE']}",
    f"{'Test MAE = ':>25}{train_test_errors['test MAE']}",
    sep='\n',
)

Predicting the HSE06 bandgap by its mean:

  Cross-Validation MSE = 1.1471062514401023
          Training MSE = 1.14624060698406
              Test MSE = 1.1341719715255374

  Cross-Validation MAE = 0.8076878730740906
          Training MAE = 0.8073838735312149
              Test MAE = 0.8030823244443052


#### goalpost
What is the mean cross-validation MSE for the above k-fold split using the PBE bandgap as the only feature in a linear model?

In [53]:
# Ordinary least squares with an intercept; the only non-target column in the
# bandgap frames is the PBE bandgap, so it is the single feature.
model = LinearRegression()
cv_mse_PBE, cv_mae_PBE = get_mean_cv_mse_and_mae(model, df_bandgaps_train)
train_test_errors = get_training_and_test_mse_and_mae(
    model, df_bandgaps_train, df_bandgaps_test
)

# One print call; sep='\n' puts each entry on its own line and the empty string
# yields the blank line between the MSE and MAE groups.
print(
    'Predicting the HSE06 bandgap as a linear function of the PBE bandgap:\n',
    f"{'Cross-Validation MSE = ':>25}{cv_mse_PBE}",
    f"{'Training MSE = ':>25}{train_test_errors['train MSE']}",
    f"{'Test MSE = ':>25}{train_test_errors['test MSE']}",
    '',
    f"{'Cross-Validation MAE = ':>25}{cv_mae_PBE}",
    f"{'Training MAE = ':>25}{train_test_errors['train MAE']}",
    f"{'Test MAE = ':>25}{train_test_errors['test MAE']}",
    sep='\n',
)

Predicting the HSE06 bandgap as a linear function of the PBE bandgap:

  Cross-Validation MSE = 0.3382744129126684
          Training MSE = 0.33777515353575815
              Test MSE = 0.33892264082561213

  Cross-Validation MAE = 0.44612804418753527
          Training MAE = 0.44587849931130136
              Test MAE = 0.4446879628062199
