## Load the necessary modules

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import pickle
import ipywidgets as widgets
from IPython.display import display, HTML
from collections import namedtuple
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from matbench.data_ops import mean_absolute_percentage_error
from ml_utilities import (grid_search, get_feature_importance_plot,
                          get_shap_plot, get_train_test_plot,
                          get_actual_predict_plot, get_metrics_df)

In [None]:
# Load the pandas DataFrame that holds the targets and features used for model training.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code;
# only load this file if it comes from a trusted source.
df = pd.read_pickle('./dataforml_automatminer.pkl')

In [None]:
# DataFrame with the composition of each mpid, indexed by the 'metadata.material_id' column
df_comp = pd.read_csv('mpids.csv', sep=',', index_col='metadata.material_id')

**<font color='red'>Important note</font>:** Check the `n_jobs` parameter in the `models_dicts` definition below and adjust it to match the number of CPU cores available on your system.

In [None]:
# Names of the models to train and evaluate; paired positionally with the
# entries of `models_dicts` below.
models_names = ['RandomForestRegressor']

# Maps each estimator instance to its hyperparameter search grid.
models_dicts = {
    RandomForestRegressor(n_jobs=30): {
        # Number of features to select
        'selector__n_features_to_select': [50, 100, 180],
        'regressor__n_estimators': [500],
    },
}

## The code block below will change the inputs to model training based on user selection

**<font color='red'>Important note</font>:** 
You must first click one of the buttons below to reproduce the results of the corresponding model presented in the paper.

In [None]:
# Build the UI used to choose which input data the model is trained on.
heading = widgets.HTML(
    '<h3>Please select the model you want to train and evaluate before procceding</h3>'
)


def _make_selection_button(description):
    '''
    Return a wide, bold selection button labelled with `description`.
    Every call creates its own Layout object, so the buttons stay independent.
    '''
    return widgets.Button(
        description=description,
        layout=widgets.Layout(width='300px', height='50px'),
        style={'font_weight': 'bold', 'font_size': '16px', 'border_radius': '300px'},
    )


include_button = _make_selection_button('Including LOBSTER features')
exclude_button = _make_selection_button('Excluding LOBSTER features')

# Working directory at the time this cell runs; the button callbacks return
# here before switching to their own results directory.
parent = os.getcwd()


# Define the button click event handlers
def include_button_clicked(b):
    '''
    Callback that selects every feature column (ICOHP features included) for model training.

    Switches the working directory to `<parent>/inc_icohp`, creating it if needed, so that
    results written with relative paths end up there. Publishes the training data through
    the globals `y` (first column of `df`) and `X` (all remaining columns of `df`).

    Parameters
    ----------
    b : object passed in by `Button.on_click`; not used.
    '''
    global y, X
    results_dir = os.path.join(parent, 'inc_icohp')
    # exist_ok avoids the race-prone "check, then create" sequence and makes
    # repeated clicks harmless.
    os.makedirs(results_dir, exist_ok=True)
    os.chdir(results_dir)

    y = df.iloc[:, 0]  # targets
    X = df.iloc[:, 1:]  # features
    print("LOBSTER features will be included in the model evaluation, results will be stored in the 'inc_icohp' directory")
    print("Great! Now you have the necessary data for model training and evaluation. Run the consequent code blocks")
    print("")
def exclude_button_clicked(b):
    '''
    Callback that excludes the ICOHP features from model training.

    Switches the working directory to `<parent>/exc_icohp`, creating it if needed, so that
    results written with relative paths end up there. Publishes the training data through
    the globals `y` (first column of `df`) and `X` (the columns of `df` after the first,
    minus the last 18).

    Parameters
    ----------
    b : object passed in by `Button.on_click`; not used.
    '''
    global y, X
    results_dir = os.path.join(parent, 'exc_icohp')
    # exist_ok avoids the race-prone "check, then create" sequence and makes
    # repeated clicks harmless.
    os.makedirs(results_dir, exist_ok=True)
    os.chdir(results_dir)

    y = df.iloc[:, 0]  # targets
    # NOTE(review): assumes the LOBSTER/ICOHP features are exactly the last 18
    # columns of df - confirm against the dataset.
    X = df.iloc[:, 1:-18]  # features
    print("LOBSTER features will be excluded in the model evaluation, results will be stored in the 'exc_icohp' directory")
    print("Great! Now you have the necessary data for model training and evaluation. Run the consequent code blocks")
    print("")
# Wire each button to its click callback
for button, callback in ((include_button, include_button_clicked),
                         (exclude_button, exclude_button_clicked)):
    button.on_click(callback)

# Render the heading followed by both buttons, in that order
for widget in (heading, include_button, exclude_button):
    display(widget)

# Warn that the training cell needs the inputs created by one of the callbacks
warning_text = (
    "Please ensure you selected either of options presented above.\n"
    "You will encounter error ahead as inputs necessary for model will not be instantiated if \n"
    "you don't select either of the options\n"
)
warning_html = '<div class="alert-warning">{}<h3></h3></div>'.format(warning_text)
display(HTML(warning_html))

In [None]:
# Train and evaluate every configured model with a 5-fold outer cross-validation.
# For each fold the best model returned by grid_search is pickled, its metrics are
# recorded and the feature-importance / SHAP plots are saved; per model, summary
# plots, a metrics dataframe (stats_df) and a predictions dataframe are produced.

# Convenience named tuples that store the raw metric data of the nested CV runs
# as (train, test) pairs.
abs_errors = namedtuple("abs_errors", "train test")
rmse_scores = namedtuple("rmse_scores", "train test")
r2_scores = namedtuple("r2_scores", "train test")
mape_scores = namedtuple("mape_scores", "train test")

# X and y are only created by the button callbacks; fail early with an
# actionable message instead of a bare NameError in the middle of the run.
if 'X' not in globals() or 'y' not in globals():
    raise RuntimeError(
        "Training data has not been selected: click 'Including LOBSTER features' or "
        "'Excluding LOBSTER features' above before running this cell."
    )

print('RandomForestRegressor model training and evaluation initiated with 5 Fold Nested CV', file=sys.stderr)

for name, (model, param) in zip(models_names, models_dicts.items()):
    # Outputs of each model are written to a directory named after the model,
    # relative to the current working directory.
    os.makedirs(name, exist_ok=True)

    cv_outer = KFold(n_splits=5, shuffle=True, random_state=18012019)

    # store out-of-fold metrics
    test_rmse = []
    train_rmse = []
    test_r2 = []
    train_r2 = []
    test_errors = []
    test_labels = []
    train_errors = []
    y_actual = []
    y_predict = []
    mape_train = []
    mape_test = []

    # enumerate splits (fold numbers start at 1 and are used in output file names)
    for iteration, (train_ix, test_ix) in enumerate(cv_outer.split(X), start=1):
        print('Fold:', iteration, file=sys.stderr)
        # split data
        X_train, X_test = X.iloc[train_ix, :].values, X.iloc[test_ix, :].values
        y_train, y_test = y.iloc[train_ix].values, y.iloc[test_ix].values

        # get the best model and gridsearch cv object
        best_model, search = grid_search(model, param, X_train, y_train)

        # get train set predictions of best model
        y_hat_train = best_model.predict(X_train)

        # evaluate model on the hold out test dataset
        y_hat_test = best_model.predict(X_test)

        # evaluate the model performance metrics on train and test sets.
        # RMSE is computed as sqrt(MSE) because the `squared=False` argument of
        # mean_squared_error is deprecated/removed in recent scikit-learn releases.
        rmse_test = np.sqrt(mean_squared_error(y_test, y_hat_test))
        rmse_train = np.sqrt(mean_squared_error(y_train, y_hat_train))

        r2_test = r2_score(y_test, y_hat_test)
        r2_train = r2_score(y_train, y_hat_train)

        test_error = np.abs(y_test - y_hat_test)
        train_error = np.abs(y_train - y_hat_train)

        # store the result of each fold
        test_rmse.append(rmse_test)
        train_rmse.append(rmse_train)

        test_r2.append(r2_test)
        train_r2.append(r2_train)

        test_errors.append(test_error)
        test_labels.append(y.iloc[test_ix].index)
        train_errors.append(train_error)

        mape_train.append(mean_absolute_percentage_error(y_pred=y_hat_train, y_true=y_train))
        mape_test.append(mean_absolute_percentage_error(y_pred=y_hat_test, y_true=y_test))

        y_actual.append(y_test)
        y_predict.append(y_hat_test)

        # pickle the trained model; the context manager guarantees the file is
        # flushed and closed even if dumping fails
        filename = '{}/{}_bestmodel_{}.pkl'.format(name, name, iteration)
        with open(filename, 'wb') as model_file:
            pickle.dump(best_model, model_file)

        print('Best parameters={}'.format(search.best_params_), file=sys.stderr)
        print('MAE  >test={}, >train={}'.format(np.mean(test_error), np.mean(train_error)), file=sys.stderr)
        print('RMSE >test={}, >train={}'.format(rmse_test, rmse_train), file=sys.stderr)

        # save feature importance plot with scores from feature selection algorithm
        get_feature_importance_plot(gridsearchcv_obj=search, modelname=name, features=X.iloc[train_ix, :],
                                    iteration=iteration)

        # save shapley values plot for each fold
        get_shap_plot(modelname=name, X_train=X.iloc[train_ix, :], model=search, iteration=iteration)

        print('', file=sys.stderr)

    # store test and train absolute errors as violin+box plot
    get_train_test_plot(test_errors=test_errors, train_errors=train_errors,
                        labels=test_labels, modelname=name)

    # populate the defined named tuples with raw data of model performance
    train_test_errors = abs_errors(train_errors, test_errors)
    train_test_rmse = rmse_scores(train_rmse, test_rmse)
    train_test_r2 = r2_scores(train_r2, test_r2)
    train_test_mape = mape_scores(mape_train, mape_test)

    # pass the named tuples to obtain summarized model performance metrics pandas dataframe
    stats_df = get_metrics_df(abs_errors=train_test_errors, rmse_scores=train_test_rmse,
                              r2_scores=train_test_r2, mape_scores=train_test_mape, model=name)

    # save the summary stats as csv
    #stats_df.to_csv('summary_results.csv')

    # Collect the out-of-fold predictions of all folds; the composition of every
    # test label is looked up in df_comp in one vectorized call.
    prediction_labels = np.concatenate(test_labels)
    df_predictions = pd.DataFrame(
        {
            'Composition': df_comp.loc[prediction_labels, 'metadata.formula'].to_numpy(),
            'lastphdospeak_actual': np.concatenate(y_actual),
            'lastphdospeak_predicted': np.concatenate(y_predict),
        },
        index=list(prediction_labels),
    )

    # get actual and model predict scatter plot
    get_actual_predict_plot(df_predictions=df_predictions, modelname=name)

    # save the prediction data as csv
    #df_predictions.to_csv('predictions_data.csv')

    print(name + ' model evaluation Finished', file=sys.stderr)

# Leave the results directory only after all models are done, so that every
# model writes its outputs inside the same results directory.
os.chdir('..')

In [None]:
# Display the summary metrics dataframe returned by get_metrics_df in the training cell
stats_df