In [None]:
import os
import numpy as np
import pandas as pd
import shap
import pickle
import neptune as neptune
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, VarianceThreshold
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')
# Load SHAP's JavaScript support so interactive SHAP plots can render in the notebook.
shap.initjs()

In [None]:
# Load the pandas dataframe with the features used for training the models.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code —
# only load pickles that come from a trusted source.
df = pd.read_pickle('./dataforml_automatminer.pkl')

In [None]:
# Histogram (30 bins) of the 'last phdos peak' column; the trailing ';' hides the repr output.
# NOTE(review): presumably this is the regression target (first column) — confirm.
df['last phdos peak'].hist(bins=30);

In [None]:
# Separate the target (first column) from the features (all remaining columns)
X = df.iloc[:, 1:]  # feature matrix
y = df.iloc[:, 0]   # target vector

In [None]:
# Names of the models that will be trained and evaluated, listed in the same
# order as the entries of models_dicts below. KernelRidge has a grid defined
# but is currently left out of the run.
models_names = [
    'LinearRegression',
    'Ridge',
    'Lasso',
    'RandomForestRegressor',
    'GradientBoostingRegressor',
]  # ,'KernelRidge'

# Candidate values for SelectKBest's k (number of features to keep)
kvals = [50, 'all']

# Estimator instance -> hyperparameter grid explored by GridSearchCV.
# Parameter names carry the pipeline step prefix: 'selector__*' goes to
# SelectKBest, 'regressor__*' goes to the estimator itself.
models_dicts = {
    LinearRegression(n_jobs=-1): dict(
        selector__k=kvals,
        selector__score_func=[f_regression],
        regressor__fit_intercept=[True, False],
        regressor__positive=[True, False],
    ),
    Ridge(): dict(
        selector__k=kvals,
        selector__score_func=[f_regression],
        regressor__fit_intercept=[True, False],
        regressor__positive=[True, False],
        regressor__alpha=[0.1, 1.0, 5, 10, 15],
    ),
    Lasso(max_iter=10000): dict(
        selector__k=kvals,
        selector__score_func=[f_regression],
        regressor__fit_intercept=[True, False],
        regressor__positive=[True, False],
        regressor__alpha=[0.1, 1.0, 5, 10, 15],
    ),
    RandomForestRegressor(n_jobs=24): dict(
        selector__k=kvals,
        selector__score_func=[f_regression],
        # Disabled alternatives: n_estimators=np.linspace(100, 500, 3, dtype=int),
        # min_samples_leaf=np.linspace(2, 6, 5, dtype=int), max_depth=[5, 10, 15]
        regressor__n_estimators=[500],
    ),
    GradientBoostingRegressor(): dict(
        selector__k=kvals,
        selector__score_func=[f_regression],
        # Disabled alternatives: n_estimators=np.linspace(100, 500, 3, dtype=int),
        # learning_rate=[0.1, 0.3, 0.5, 0.7, 0.9, 1],
        # min_samples_leaf=np.linspace(1, 5, 5, dtype=int)
        regressor__n_estimators=[500],
    ),
    KernelRidge(): dict(
        selector__k=kvals,
        selector__score_func=[f_regression],
        regressor__kernel=["linear", "rbf", "poly", "cosine"],
        regressor__alpha=[0.1, 1.0, 5, 10, 15],
    ),
}

In [None]:
def get_feature_importance_plot(gridsearchcv_obj, modelname, features, iteration):
    """Plot the selector scores of the features kept by the best pipeline.

    Parameters
    ----------
    gridsearchcv_obj : fitted search object exposing ``best_estimator_``; the
        second step of that pipeline must provide ``scores_`` and
        ``get_support()`` (the SelectKBest 'selector' step in this notebook).
    modelname : str, used as output directory and file-name prefix.
    features : DataFrame whose columns line up with the selector's inputs.
    iteration : fold identifier appended to the file name.

    Returns
    -------
    plotly Figure with one horizontal bar per selected feature. The figure is
    also written to ``<modelname>/<modelname>_features_<iteration>.html``; the
    directory is expected to exist already.
    """
    selector = gridsearchcv_obj.best_estimator_.steps[1][1]

    # (score, label) for every column the selector kept; the label is the part
    # of the column name after the last '|'
    kept = [
        (score, features.columns[position].split('|')[-1])
        for position, (score, is_kept) in enumerate(zip(selector.scores_, selector.get_support()))
        if is_kept
    ]
    feature_score = [score for score, _ in kept]
    feature_names = [label for _, label in kept]

    fig_feat = go.Figure(data=go.Bar(x=feature_score, y=feature_names, orientation='h'))

    fig_feat.update_layout(
        yaxis=dict(tickfont=dict(size=11)),
        xaxis=dict(tickfont=dict(size=11)),
    )
    # Identical framing for both axes: black mirrored border, inside ticks, no grid
    axis_style = dict(
        title_font=dict(size=22), color='black',
        showline=True, linewidth=1, linecolor='black', mirror=True, showgrid=False,
        ticks="inside", tickwidth=1, tickcolor='black', ticklen=5,
    )
    fig_feat.update_yaxes(**axis_style)
    fig_feat.update_xaxes(**axis_style)
    fig_feat.update_layout(
        template='simple_white',
        width=1000, height=1000,
        title_text='Feature scores', title_x=0.5,
    )

    # Save the figure as a standalone HTML file inside the model's folder
    output_path = "{}/{}_features_{}.html".format(modelname, modelname, iteration)
    fig_feat.write_html(output_path, include_mathjax='cdn')

    return fig_feat

In [None]:
def get_train_test_plot(errors_test, errors_train, modelname):
    """Draw a split violin plot comparing test and train absolute errors.

    The function uses only its arguments; it does not read the notebook
    globals ``name``, ``error_data_test`` or ``error_data_train``.

    Parameters
    ----------
    errors_test : array-like (e.g. pandas Series) of absolute errors on the
        held-out folds; drawn on the positive (right) side in blue.
    errors_train : array-like of absolute errors on the training folds; drawn
        on the negative (left) side in orange.
    modelname : str, used as the x-axis category, the output directory and the
        file-name prefix.

    Returns
    -------
    plotly Figure. The figure is also written to
    ``<modelname>/<modelname>_validation.html``; the directory is expected to
    exist already.
    """
    fig_val = go.Figure()

    fig_val.add_trace(go.Violin(x0=modelname,
                                y=np.asarray(errors_test),
                                legendgroup='Test', name='Test',
                                side='positive',
                                line_color='blue', box_visible=True))
    fig_val.add_trace(go.Violin(x0=modelname,
                                y=np.asarray(errors_train),
                                legendgroup='Train', name='Train',
                                side='negative',
                                line_color='orange', box_visible=True))

    fig_val.update_traces(meanline_visible=True)
    # Overlay the two half-violins so they share one centre line
    fig_val.update_layout(violingap=0, violinmode='overlay')
    fig_val.update_traces(marker_opacity=0.75)

    # Identical framing for both axes: black mirrored border, inside ticks, no grid
    axis_style = dict(
        title_font=dict(size=22), color='black',
        showline=True, linewidth=1, linecolor='black', mirror=True, showgrid=False,
        ticks="inside", tickwidth=1, tickcolor='black', ticklen=5,
    )
    fig_val.update_xaxes(**axis_style)
    fig_val.update_yaxes(**axis_style)

    fig_val.update_layout(
        width=1000, height=1000,
        # Tick font size 18 is the value that was effectively applied before
        # (an earlier size-11 setting was always overridden).
        yaxis=dict(tickfont=dict(size=18)),
        xaxis=dict(tickfont=dict(size=18)),
        yaxis_title="Validation Absolute error",
        template='simple_white',
        yaxis_zeroline=False,
    )
    fig_val.write_html("{}/{}_validation.html".format(modelname, modelname), include_mathjax='cdn')

    return fig_val

In [None]:
def get_shap_plot(model, X_train, iteration, modelname):
    """Save a SHAP summary plot for the features selected by a fitted pipeline.

    Parameters
    ----------
    model : fitted pipeline; step 1 must expose ``scores_``/``get_support()``
        (the selector) and step 2 is the regressor that gets explained.
    X_train : DataFrame whose columns line up with the selector's inputs.
    iteration : fold identifier appended to the file name.
    modelname : str, selects the SHAP explainer and names the output file
        ``<modelname>/<modelname>_<iteration>.svg`` (directory must exist).

    Returns
    -------
    None
    """
    selector = model.steps[1][1]
    regressor = model.steps[2][1]

    # Column names of the features the selector kept
    selected_columns = [
        X_train.columns[position]
        for position, (_, is_kept) in enumerate(zip(selector.scores_, selector.get_support()))
        if is_kept
    ]

    # Standardise the selected columns with a scaler fitted on this same data
    X_selected = X_train.filter(selected_columns)
    X_train_scaled = StandardScaler().fit(X_selected).transform(X_selected)

    # Pick the explainer that matches the regressor family
    if modelname in ('RandomForestRegressor', 'GradientBoostingRegressor'):
        explainer = shap.TreeExplainer(regressor, X_train_scaled)
        shap_values = explainer.shap_values(X_train_scaled, check_additivity=False)
    else:
        if modelname == 'KernelRidge':
            explainer = shap.KernelExplainer(regressor.predict, X_train_scaled)
        else:
            explainer = shap.LinearExplainer(regressor, X_train_scaled)
        shap_values = explainer.shap_values(X_train_scaled)

    # show=False keeps the figure open so it can be written to disk, then closed
    shap.summary_plot(shap_values, features=X_train_scaled,
                      feature_names=X_selected.columns, show=False)
    plt.savefig('{}/{}_{}.svg'.format(modelname, modelname, iteration))
    plt.close()
    return None

In [None]:
# Summary table with one row per model; filled in at the end of each model's run.
# NOTE: this rebinds `df`, which held the loaded dataset until now. X and y were
# already sliced from it, but the data-loading cell must be re-run before the
# raw frame can be used again.
df = pd.DataFrame(index=models_names)
trained_models = {}

# Pair every estimator with its grid by class name instead of by position, so a
# reordering of models_names or models_dicts cannot silently train the wrong
# model under a given name. A missing entry raises KeyError.
models_by_name = {type(estimator).__name__: (estimator, grid)
                  for estimator, grid in models_dicts.items()}

for name in models_names:
    model, param = models_by_name[name]
    print(name + ' model training')
    # exist_ok lets the cell be re-run without failing on an existing output folder
    os.makedirs(name, exist_ok=True)

    # Outer loop of the nested cross-validation: 5 shuffled folds, fixed seed
    cv_outer = KFold(n_splits=5, shuffle=True, random_state=18012019)

    outer_test_rmse, outer_train_rmse = [], []
    outer_test_r2, outer_train_r2 = [], []
    test_errors, train_errors = [], []

    for iteration, (train_ix, test_ix) in enumerate(cv_outer.split(X), start=1):
        # Split data for this outer fold
        X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

        # Inner loop: 2-fold CV used by the grid search to pick hyperparameters
        cv_inner = KFold(n_splits=2, shuffle=True, random_state=18012019)

        # Every model uses the same scale -> select -> regress pipeline
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('selector', SelectKBest()),
            ('regressor', model),
        ])

        search = GridSearchCV(pipeline, param_grid=param,
                              scoring='neg_mean_absolute_error',
                              cv=cv_inner, refit=True,
                              return_train_score=True, n_jobs=24)
        result = search.fit(X_train, y_train)
        # Best configuration, refitted on the whole outer-training split
        best_model = result.best_estimator_

        y_train_pred = best_model.predict(X_train)
        yhat = best_model.predict(X_test)  # predictions on the held-out fold

        # RMSE computed as sqrt(MSE): the `squared=False` keyword is deprecated
        # in recent scikit-learn releases, this form works on every version.
        rmse_test = np.sqrt(mean_squared_error(y_test, yhat))
        rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

        r2_test = r2_score(y_test, yhat)
        r2_train = r2_score(y_train, y_train_pred)

        # Per-sample absolute errors (kept as Series so they can be concatenated)
        test_error = abs(y_test - yhat)
        train_error = abs(y_train - y_train_pred)

        # Store the results of this fold
        outer_test_rmse.append(rmse_test)
        outer_train_rmse.append(rmse_train)
        outer_test_r2.append(r2_test)
        outer_train_r2.append(r2_train)
        test_errors.append(test_error)
        train_errors.append(train_error)

        # Persist the fitted pipeline; the context manager closes the file handle
        filename = '{}/{}_bestmodel_{}.pkl'.format(name, name, iteration)
        with open(filename, 'wb') as model_file:
            pickle.dump(best_model, model_file)

        get_shap_plot(modelname=name, X_train=X_train, model=best_model, iteration=iteration)
        get_feature_importance_plot(gridsearchcv_obj=search, modelname=name,
                                    features=X_train, iteration=iteration)

        # NOTE: despite the 'acc'/'Accuracy' labels, the numbers printed are the
        # test RMSE, the best inner-CV score (negative MAE) and the running
        # mean/std of the test RMSE over the folds seen so far.
        print('>acc=%.3f, est=%.3f, cfg=%s' % (rmse_test, result.best_score_, result.best_params_))
        print('Accuracy: %.3f (%.3f)' % (np.mean(outer_test_rmse), np.std(outer_test_rmse)))

    # Pool the absolute errors of all outer folds
    error_data_test = pd.concat(test_errors)
    error_data_train = pd.concat(train_errors)

    get_train_test_plot(errors_test=error_data_test, errors_train=error_data_train, modelname=name)

    # Keep the objects from the last outer fold for interactive inspection
    trained_models.update({name+'_bestmodel': best_model, name+'_gridsearch': search})

    # Absolute-error statistics over all pooled samples
    df.loc[name, 'mae_train'] = np.mean(error_data_train.values)
    df.loc[name, 'mae_test'] = np.mean(error_data_test.values)
    df.loc[name, 'std_mae_train'] = np.std(error_data_train.values)
    df.loc[name, 'std_mae_test'] = np.std(error_data_test.values)
    df.loc[name, 'max_mae_error_train'] = np.max(error_data_train.values)
    df.loc[name, 'max_mae_error_test'] = np.max(error_data_test.values)
    df.loc[name, 'min_mae_error_train'] = np.min(error_data_train.values)
    df.loc[name, 'min_mae_error_test'] = np.min(error_data_test.values)

    # Per-fold RMSE / R2 statistics
    df.loc[name, 'mean_train_rmse'] = np.mean(outer_train_rmse)
    df.loc[name, 'mean_test_rmse'] = np.mean(outer_test_rmse)
    df.loc[name, 'mean_train_r2'] = np.mean(outer_train_r2)
    df.loc[name, 'mean_test_r2'] = np.mean(outer_test_r2)
    df.loc[name, 'std_train_rmse'] = np.std(outer_train_rmse)
    df.loc[name, 'std_test_rmse'] = np.std(outer_test_rmse)
    df.loc[name, 'std_train_r2'] = np.std(outer_train_r2)
    # Test-fold spread (this column previously repeated the train values)
    df.loc[name, 'std_test_r2'] = np.std(outer_test_r2)

    print(name + ' done')
    print('')

In [None]:
# Display the per-model summary table filled in by the training loop
df

In [None]:
# Write the summary table to 'summary_results.csv' in the working directory
df.to_csv('summary_results.csv')

In [None]:
# Load a saved model: the pipeline pickled for outer fold 5 of the first entry
# in models_names.
# NOTE: pickle.load can execute arbitrary code — only load files written by this
# notebook or obtained from a trusted source.
filename = '{}/{}_bestmodel_5.pkl'.format(models_names[0], models_names[0])
# The context manager guarantees the file handle is closed after loading
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Display the pipeline that was just loaded from disk
loaded_model