This notebook conducts the final test of two selected linear regression models:
- OLS linear regression over the 'raw' Stoich45 Intersection features
- Ridge regression (with alpha = 0.13) over all linear and quadratic polynomial features generated from the Stoich45 Intersection featureset

These two models were selected from the many linear regression models that were investigated in linear_regression_ajk.ipynb for having a good balance between complexity and cross-validation performance.

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error

##### import training and test sets:

In [3]:
# Read the Stoich45 feature-selected training and test CSVs. The 'MOF' column
# is dropped from each frame so that only the remaining columns are handed to
# the models below.
df_stoich45Int_train = (
    pd.read_csv(
        '../data/datasets_main/Stoich45_FeatureSelected_dataset.csv',
        sep=',',
    )
    .drop(columns=['MOF'])
)
df_stoich45Int_test = (
    pd.read_csv(
        '../data/datasets_main/Stoich45_FeatureSelected_test_set.csv',
        sep=',',
    )
    .drop(columns=['MOF'])
)

##### define a function that fits a model, predicts, and prints the training and test errors:

In [14]:
# Name of the DataFrame column that holds the regression target.
target = 'outputs.hse06.bandgap'


def print_training_and_test_mse_and_mae(model, df_train, df_test, target):
    """Fit ``model`` on the training frame and print train/test MSE and MAE.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place on the training data.
    df_train, df_test
        Frames that contain the ``target`` column; every other column is
        passed to the model as a feature.
    target
        Name of the column to predict.

    Returns
    -------
    None
        The four scores are printed, each label right-aligned in a
        25-character field, with a blank line between the MSE pair and the
        MAE pair.
    """
    def _report(label, y_true, y_pred, metric):
        # Score and print in a single step so each line is emitted as soon as
        # its metric has been computed.
        print(f"{label:>25}{metric(y_true, y_pred)}")

    # Split the training frame into features and target, then fit.
    X_train, y_train = df_train.drop(columns=[target]), df_train[target]
    model.fit(X_train, y_train)
    train_preds = model.predict(X_train)

    # The test frame is only used for prediction and scoring, never fitting.
    X_test = df_test.drop(columns=[target])
    test_preds = model.predict(X_test)
    y_test = df_test[target]

    _report('Training MSE = ', y_train, train_preds, mean_squared_error)
    _report('Test MSE = ', y_test, test_preds, mean_squared_error)
    print()
    _report('Training MAE = ', y_train, train_preds, mean_absolute_error)
    _report('Test MAE = ', y_test, test_preds, mean_absolute_error)


#### Model 1: OLS Linear Regression over the 'raw' Stoich45 Intersection features

In [13]:
# Plain ordinary-least-squares fit on the feature columns as loaded,
# with no scaling or feature expansion.
model = LinearRegression()
print_training_and_test_mse_and_mae(
    model=model,
    df_train=df_stoich45Int_train,
    df_test=df_stoich45Int_test,
    target=target,
)

          Training MSE = 0.6782643850534067
              Test MSE = 0.6588399880284861

          Training MAE = 0.6351406931850575
              Test MAE = 0.6238870347745372


#### Model 2: Ridge regression (with alpha = 0.13) over all linear and quadratic polynomial features

In [12]:
# Three-stage pipeline, applied in order:
#   1. expand the features to all degree-2 polynomial terms (no bias column),
#   2. standardize the expanded features,
#   3. fit a ridge regression with alpha = 0.13.
model = Pipeline(
    steps=[
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('scaler', StandardScaler()),
        ('ridge', Ridge(alpha=0.13)),
    ]
)
print_training_and_test_mse_and_mae(
    model=model,
    df_train=df_stoich45Int_train,
    df_test=df_stoich45Int_test,
    target=target,
)

          Training MSE = 0.5385639637119429
              Test MSE = 0.5665684896385375

          Training MAE = 0.5662064870345863
              Test MAE = 0.5705630370873087
