In [1]:
import sys, os
import numpy as np
import time
import pandas as pd
import seaborn as sns
from pprint import pprint
from matplotlib import pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline

def MSE(h, y):
    """
    Compute the Mean Squared Error for hypothesis h and targets y
    Args:
    """

def root_mean_squared_log_error(h, y): 
    """
    Compute the Root Mean Squared Log Error for hypthesis h and targets y
    Args:
        h - numpy array containing predictions with shape (n_samples, n_targets)
        y - numpy array containing targets with shape (n_samples, n_targets)
    """
    return np.sqrt(np.square(np.log(h + 1) - np.log(y + 1)).mean())

def collect_error_score(target, prediction):
    meansquare_error = mean_squared_error (prediction, target)                 # Mean Squared Error  
    r2square_error = r2_score(target, prediction)                              # R Squared  
    rmslog_error = root_mean_squared_log_error(prediction, target)             # Root Mean Square Log Error  
    #meanabsolute_error = mean_absolute_error (target, prediction)              # Absolute Mean Error 
    #msle = mean_squared_log_error(target, prediction)
    
    return ( meansquare_error, r2square_error, rmslog_error)

def error_table (score, labels, sort_col ):
    #labels  = ['Clf','mean absolute error','mean square error','R2 squared', 'Mean Sq Log Error', 'Root Mean Sq Log Error']
    scored_df = pd.DataFrame.from_records(score, columns=labels, index = None)
    sorted_scored = scored_df.sort_values(by = sort_col, ascending=True)
    return sorted_scored

In [2]:
### Prepare data

train = pd.read_csv(os.path.join("data", "train.csv"))  
test = pd.read_csv(os.path.join("data", "test.csv"))
train_size = train.shape
test_size = test.shape
validation_index = int(0.8 * train_size[0])

data_no_id = train.drop(["id"], axis=1)
X = train.drop(["id", 'formation_energy_ev_natom', 'bandgap_energy_ev'], axis=1)
Y1 = train['formation_energy_ev_natom']
Y2 = train['bandgap_energy_ev']

trainX = X[:validation_index]
validationX = X.iloc[validation_index:]
testX = test.drop(["id"], axis=1)
trainY1 = Y1[:validation_index]
trainY2 = Y2[:validation_index]
validationY1 = Y1[validation_index:]
validationY2 = Y2[validation_index:]

print(f"Number of training data ==> X: {trainX.shape[0]}, Y1: {trainY1.shape[0]}, Y2: {trainY2.shape[0]}")
print(f"Number of validation data ==> X: {validationX.shape[0]}, Y1: {validationY1.shape[0]}, Y2: {validationY2.shape[0]}")
print(f"Number of test data ==> X: {testX.shape[0]}")

Number of training data ==> X: 1920, Y1: 1920, Y2: 1920
Number of validation data ==> X: 480, Y1: 480, Y2: 480
Number of test data ==> X: 600


In [3]:
models = {
    "LRG" : LinearRegression(),
    "RIDGE" : Ridge(),
    "LASSO" : Lasso(),
    "DTR" : DecisionTreeRegressor(),
    "RFR" : RandomForestRegressor(),
}
hyperparams = {
    "LGR" : {},
    "RIDGE" : {
        "alpha" : np.arange(0.02, 10, 0.02)
    },
    "LASSO" : {
        "alpha" : np.arange(0.02, 10, 0.02)
    },
    "DTR" : { 'criterion': ['squared_error','absolute_error'],
        'max_depth': [7],
        'max_features': ['sqrt', 'log2'],   
        'max_leaf_nodes': [200] ,
        'min_samples_split':  [20],
        'min_samples_leaf': [7, 10,50 ]
    } ,
    "RFR" : {'max_features' : ['sqrt', 'log2'],
        'max_depth': [7],
        'n_estimators': [90,100],
        'min_samples_split':  [6,7,8,9]
    }
}

best_params = dict.fromkeys(list(models.keys()))
best_models = dict.fromkeys(list(models.keys()))
test_error_scores = []

print ("==== Start training  Regressors ====")
t = time.time()
for i, modelname in enumerate(models):
    #   
    print(f"Model: {modelname}")
    model = models[modelname]
    
    #pipe = make_pipeline(preprocessing.StandardScaler(), model)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    hyperparam = hyperparams[list(hyperparams.keys())[i]]
    search = GridSearchCV(model, hyperparam, scoring='neg_mean_squared_error', cv=15, n_jobs=-1)
    results = search.fit(trainX, trainY1)

    # Predict on validation set
    predicted_validation = results.best_estimator_.predict(validationX)
    print("Mean: ", np.mean(predicted_validation))
    mse, r2, rmsle = collect_error_score(validationY1, predicted_validation)

    # Save results
    best_params[modelname] = results.best_params_
    best_models[modelname] = results.best_estimator_
    test_error_scores.append((modelname, mse, r2, rmsle))

    print('MSE: %.4f' % results.best_score_)
    print('Best Config: %s' % results.best_params_)
    print(f"Calculated errors ==> mse: {mse:.5f}, r2: {r2:.4f}, rmsle: {rmsle:.4f}")
    print (":::::::::::::::::::::::::::\n")
print ("==== Finished training  Regressors ====")    
print (" Total training time :  ({0:.3f} s)\n".format(time.time() - t) , "\n")

==== Start training  Regressors ====
Model: LRG
Mean:  0.19143720517615245
MSE: -0.0066
Best Config: {}
Calculated errors ==> mse: 0.00657, r2: 0.4363, rmsle: 0.0677
:::::::::::::::::::::::::::

Model: RIDGE
Mean:  0.19152123836830706
MSE: -0.0066
Best Config: {'alpha': 0.52}
Calculated errors ==> mse: 0.00663, r2: 0.4310, rmsle: 0.0680
:::::::::::::::::::::::::::

Model: LASSO
Mean:  0.19089438780144724
MSE: -0.0083
Best Config: {'alpha': 0.02}
Calculated errors ==> mse: 0.00821, r2: 0.2962, rmsle: 0.0757
:::::::::::::::::::::::::::

Model: DTR
Mean:  0.1865523386654709
MSE: -0.0026
Best Config: {'criterion': 'squared_error', 'max_depth': 7, 'max_features': 'log2', 'max_leaf_nodes': 200, 'min_samples_leaf': 7, 'min_samples_split': 20}
Calculated errors ==> mse: 0.00274, r2: 0.7650, rmsle: 0.0415
:::::::::::::::::::::::::::

Model: RFR
Mean:  0.18781398127723875
MSE: -0.0018
Best Config: {'max_depth': 7, 'max_features': 'log2', 'min_samples_split': 7, 'n_estimators': 100}
Calculated er

In [4]:
labels  = ['Regressor','Mean square error', 'R Squared', 'Root Mean Sq. Log Error']
print("Formation Energy scores : on test data - ordered by Mean Square Error : \n")
formation_energy_score = error_table (test_error_scores, labels,  'Mean square error' )
formation_energy_score

Formation Energy scores : on test data - ordered by Mean Square Error : 



Unnamed: 0,Regressor,Mean square error,R Squared,Root Mean Sq. Log Error
4,RFR,0.002115,0.818579,0.036236
3,DTR,0.00274,0.765016,0.041504
0,LRG,0.006572,0.436262,0.067674
1,RIDGE,0.006634,0.430989,0.068011
2,LASSO,0.008205,0.296235,0.075676


In [5]:
test_predictions = []

for i, model in best_models.items():
    test_pred = model.predict(testX)
    test_predictions.append(test_pred)


LRG  -  LinearRegression()
RIDGE  -  Ridge(alpha=0.52)
LASSO  -  Lasso(alpha=0.02)
DTR  -  DecisionTreeRegressor(max_depth=7, max_features='log2', max_leaf_nodes=200,
                      min_samples_leaf=7, min_samples_split=20)
RFR  -  RandomForestRegressor(max_depth=7, max_features='log2', min_samples_split=7)
