### Import necessary modules

In [None]:
import numpy as np
import pandas as pd
import neptune as neptune
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the pandas dataframe with features for training model
# NOTE(review): unpickling can execute arbitrary code; only load
# 'dataforml.pkl' if it comes from a trusted source.
df = pd.read_pickle('dataforml.pkl')

In [None]:
# Distribution of the `last_phdos_peak` column (30 bins)
df['last_phdos_peak'].hist(bins=30);

In [None]:
# Exclude extreme outliers from the dataset, using the phonon frequency
# (`last_phdos_peak`) as the criterion: keep rows strictly below 1700.
df = df.loc[df['last_phdos_peak'] < 1700]

In [None]:
# Separate targets and features
y = df.iloc[:, -1]  # targets: the last column
# Features: every column except the last two. An explicit copy is taken so the
# in-place column drops performed later act on an independent frame instead of
# a slice of `df` (avoids pandas' SettingWithCopyWarning / chained-assignment
# ambiguity).
# NOTE(review): the second-to-last column is excluded from the features as
# well; presumably it is not a predictor. Confirm against the dataset layout.
X = df.iloc[:, :-2].copy()

### Dimension reduction (drop redundant features)

In [None]:
# Drop constant features (columns holding exactly one distinct value)
is_constant = X.nunique() == 1
features_constant = is_constant[is_constant].index.to_list()
X.drop(columns=features_constant, inplace=True)

In [None]:
# Drop inter-correlated features: a column is removed when its absolute
# correlation with any column positioned before it exceeds 0.8.
correlation_matrix = X.corr()
# Keep only the strictly lower triangle so that each pair is tested once and a
# column is only ever compared with the columns that precede it. NaN
# correlations compare as False and therefore never trigger a drop.
lower_triangle = np.tril(correlation_matrix.abs().to_numpy(), k=-1)
is_correlated = (lower_triangle > 0.8).any(axis=1)
# Each offending column is listed once, in column order.
correlated_features = correlation_matrix.columns[is_correlated].to_list()
X.drop(labels=correlated_features, axis=1, inplace=True)

### Start modelling section

In [None]:
# Hyperparameter grids that are shared by several of the models below
k_best_grid = [20, 40, 60, 80, 100]  # number of features kept by the selector
alpha_grid = [0.1, 0.5, 0.8, 1.0, 5, 10, 15]  # regularisation strengths
n_estimators_grid = [80, 90, 100, 110, 120, 130, 140, 150]  # ensemble sizes
on_off = [True, False]

# Dict mapping each model instance to the hyperparameter grid that GridSearchCV
# explores. Keys prefixed with `selector__` / `regressor__` address the
# corresponding pipeline steps.
models_dicts = {
    LinearRegression(): {
        'selector__k': k_best_grid,
        'regressor__fit_intercept': on_off,
        'regressor__positive': on_off,
    },
    Ridge(): {
        'selector__k': k_best_grid,
        'regressor__fit_intercept': on_off,
        'regressor__positive': on_off,
        'regressor__alpha': alpha_grid,
    },
    Lasso(): {
        'selector__k': k_best_grid,
        'regressor__fit_intercept': on_off,
        'regressor__positive': on_off,
        'regressor__alpha': alpha_grid,
    },
    RandomForestRegressor(n_jobs=12): {
        'selector__k': k_best_grid,
        'regressor__n_estimators': n_estimators_grid,
    },
    KernelRidge(): {
        'selector__k': k_best_grid,
        'regressor__kernel': ["linear", "rbf", "poly", "cosine"],
        'regressor__alpha': alpha_grid,
    },
    GradientBoostingRegressor(): {
        'selector__k': k_best_grid,
        'regressor__n_estimators': n_estimators_grid,
        'regressor__learning_rate': [0.001, 0.1, 0.3, 0.5, 0.7, 0.9, 1],
    },
}

# List with the names of the models that will be trained and evaluated,
# taken from the class of each dict key so both stay in the same order.
models_names = [type(estimator).__name__ for estimator in models_dicts]

##### `Note`: If you use the Neptune logger to track model details, please add the *project* name and *api_token* corresponding to your own account.
#### More details here: https://docs.neptune.ai/usage/quickstart/

In [None]:
def _apply_axis_style(fig, tick_size):
    """Give both axes of ``fig`` the framed, inside-tick styling shared by all plots."""
    axis_style = dict(
        tickfont=dict(size=tick_size),
        title_font=dict(size=22),
        color='black',
        showline=True, linewidth=1, linecolor='black', mirror=True, showgrid=False,
        ticks="inside", tickwidth=1, tickcolor='black', ticklen=5,
    )
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)
    fig.update_layout(template='simple_white', width=1000, height=1000)


def _feature_score_figure(selector, column_names):
    """Build a horizontal bar chart of the scores of the features kept by ``selector``.

    Returns a ``(figure, feature_names, feature_scores)`` tuple; names are the
    part of each column label after the last ``'|'``.
    """
    feature_names = []
    feature_score = []
    for column, score, selected in zip(column_names, selector.scores_, selector.get_support()):
        if selected:
            feature_score.append(score)
            feature_names.append(column.split('|')[-1])

    fig = go.Figure(data=go.Bar(x=feature_score, y=feature_names, orientation='h'))
    _apply_axis_style(fig, tick_size=11)
    fig.update_layout(title_text='Feature scores', title_x=0.5)
    return fig, feature_names, feature_score


def _error_violin_figure(name, test_abs_errors, train_abs_errors):
    """Build a split violin plot comparing test (right) and train (left) absolute errors."""
    fig = go.Figure()
    fig.add_trace(go.Violin(x0=name, y=test_abs_errors.values,
                            legendgroup='Test', name='Test',
                            side='positive', line_color='blue', box_visible=True))
    fig.add_trace(go.Violin(x0=name, y=train_abs_errors.values,
                            legendgroup='Train', name='Train',
                            side='negative', line_color='orange', box_visible=True))
    fig.update_traces(meanline_visible=True, marker_opacity=0.75)
    fig.update_layout(violingap=0, violinmode='overlay')
    _apply_axis_style(fig, tick_size=18)
    # Set the axis title after the shared styling, as the original cell did.
    fig.update_layout(yaxis_title="Validation Absolute error", yaxis_zeroline=False)
    return fig


def train_validate_models(model_names, models_dict, log_neptune=False):
    """
    Run nested cross-validation for every model and summarise the scores.

    For each ``(name, model)`` pair, the module-level ``X`` / ``y`` are split with
    a 10-fold outer ``KFold``. On every outer training split a ``GridSearchCV``
    (5-fold inner ``KFold``, refit on RMSE) tunes a
    ``StandardScaler -> SelectKBest(f_regression) -> model`` pipeline, and the
    refitted best pipeline is scored on the outer train and test splits.

    Side effects per model:
      * ``./<name>_features.html`` - scores of the features selected by the
        search of the *last* outer fold;
      * ``./<name>_validation.html`` - violin plot of the absolute train/test
        errors pooled over all outer folds;
      * when ``log_neptune`` is true, a Neptune run receiving the metrics, the
        best parameters of the last outer fold and both plots. The project
        name and API token below are placeholders that must be edited first.

    Parameters
    ----------
    model_names : sequence of str
        Labels used for output files, Neptune run names and the result index.
        Paired positionally with the entries of ``models_dict``.
    models_dict : dict
        Maps each estimator instance to its ``GridSearchCV`` parameter grid.
    log_neptune : bool, default False
        Whether to log the results to Neptune.

    Returns
    -------
    pandas.DataFrame
        One row per model with the mean and standard deviation of the train and
        test RMSE and R2 across the outer folds.

    Raises
    ------
    ValueError
        If ``model_names`` and ``models_dict`` differ in length, which would
        otherwise silently mislabel or skip models.
    """
    if len(model_names) != len(models_dict):
        raise ValueError("model_names and models_dict must have the same number of entries")

    scorer = {'RMSE': 'neg_root_mean_squared_error', 'r2': 'r2'}
    summary_columns = ['mean_train_rmse', 'mean_test_rmse', 'mean_train_r2', 'mean_test_r2',
                       'std_train_rmse', 'std_test_rmse', 'std_train_r2', 'std_test_r2']
    summary_rows = []  # one dict of aggregated metrics per model

    for name, (model, param) in zip(model_names, models_dict.items()):
        cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)
        outer_test_rmse, outer_train_rmse = [], []
        outer_test_r2, outer_train_r2 = [], []
        test_errors, train_errors = [], []

        for train_ix, test_ix in cv_outer.split(X):
            # split data
            X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
            y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

            # inner cross-validation used for hyperparameter tuning
            cv_inner = KFold(n_splits=5, shuffle=True, random_state=42)
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('selector', SelectKBest(f_regression)),
                ('regressor', model),
            ])
            search = GridSearchCV(pipeline, param_grid=param, scoring=scorer,
                                  cv=cv_inner, refit='RMSE', return_train_score=True)
            search.fit(X_train, y_train)
            # best pipeline, refitted on the whole outer training split
            best_model = search.best_estimator_

            y_train_pred = best_model.predict(X_train)
            yhat = best_model.predict(X_test)

            # RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
            # mean_squared_error is deprecated in recent scikit-learn releases.
            outer_test_rmse.append(np.sqrt(mean_squared_error(y_test, yhat)))
            outer_train_rmse.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
            outer_test_r2.append(r2_score(y_test, yhat))
            outer_train_r2.append(r2_score(y_train, y_train_pred))

            test_errors.append(abs(y_test - yhat))
            train_errors.append(abs(y_train - y_train_pred))

        error_data_test = pd.concat(test_errors)
        error_data_train = pd.concat(train_errors)

        # Features selected (with their f_regression scores) by the pipeline of
        # the last outer fold; `search` still refers to that fold's search.
        fig_feat, feature_names, feature_score = _feature_score_figure(
            search.best_estimator_.named_steps['selector'], X.columns)
        fig_feat.write_html("./{}_features.html".format(name), include_mathjax='cdn')

        # violin plot for train and test errors
        fig_val = _error_violin_figure(name, error_data_test, error_data_train)
        fig_val.write_html("./{}_validation.html".format(name), include_mathjax='cdn')

        metrics = {
            'mean_train_rmse': np.mean(outer_train_rmse),
            'mean_test_rmse': np.mean(outer_test_rmse),
            'mean_train_r2': np.mean(outer_train_r2),
            'mean_test_r2': np.mean(outer_test_r2),
            'std_train_rmse': np.std(outer_train_rmse),
            'std_test_rmse': np.std(outer_test_rmse),
            'std_train_r2': np.std(outer_train_r2),
            'std_test_r2': np.std(outer_test_r2),
        }

        if log_neptune:
            #add your neptune logger project and api_token
            run = neptune.init_run(
                project="username/nameofprojet", name=name,
                api_token="xxxx",
            )
            try:
                # log train/test set metrics in neptune
                for metric_name, metric_value in metrics.items():
                    run["model/{}".format(metric_name)] = metric_value

                # log best parameters (of the last outer fold) in neptune
                run['model/parameters'] = search.best_params_

                run["preprocessing/features_score_plot"].upload(neptune.types.File.as_html(fig_feat))
                run['preprocessing/features_selected'] = dict(zip(feature_names, feature_score))

                run['validation/violinplot'].upload(neptune.types.File.as_html(fig_val))
            finally:
                # always close the run, even if logging fails part-way
                run.stop()

        summary_rows.append(metrics)

    return pd.DataFrame(summary_rows, index=list(model_names), columns=summary_columns)

In [None]:
# Run the nested-CV training/validation for the configured models.
# NOTE: log_neptune=True requires the placeholder project name and api_token
# inside train_validate_models to be replaced with real credentials first.
results = train_validate_models(model_names=models_names, models_dict=models_dicts,log_neptune=True)

In [None]:
# Display the dataframe returned by train_validate_models
results