In [None]:
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio   
# Set the MathJax source of plotly's Kaleido export scope to None
# (presumably so static exports via fig.write_image do not load MathJax — confirm).
pio.kaleido.scope.mathjax = None

In [None]:
# Result folders are resolved relative to the directory the notebook runs in.
rootdir = os.getcwd()

# Literature reference values; semicolon-separated, one row per task/metric entry.
df_lit = pd.read_csv('literature.csv', sep=';')

# Metrics evaluated per result folder (key = folder name passed to get_df_all).
metric_dict = {
    '15_min': ['r2', 'rmse', 'mse', 'mae'],
    '60_min': ['r2', 'rmse', 'mse', 'mae', 'mape'],
}

# Dataset dimensions: 'size' is divided by 'feat' to obtain the shape ratio.
size_dict = {
    'uci': {'size': 1030, 'feat': 8},
    'atici': {'size': 140, 'feat': 3},
    'bachir': {'size': 112, 'feat': 3},
    'koya': {'size': 110, 'feat': 10},
    'huang': {'size': 114, 'feat': 9},
    'hu_tensile-strength': {'size': 896, 'feat': 27},
    'hu_yield-strength': {'size': 860, 'feat': 27},
    'hu_elongation': {'size': 783, 'feat': 27},
    'yin': {'size': 900, 'feat': 11},
    'su_bond-1': {'size': 122, 'feat': 7},
    'su_bond-2': {'size': 136, 'feat': 5},
    'xiong': {'size': 43, 'feat': 4},
    'guo': {'size': 63162, 'feat': 27},
    'mat-bench': {'size': 312, 'feat': 14},
}

# Per-task [lower, upper] range drawn as the "literature" line in the box plots.
lit_dict = {
    'koya_rup': {'range': [0.238, 1.26]},
    'koya_compressive': {'range': [0.568, 1.23]},
    'hu_tensile-strength': {'range': [0.9901, 1.0099]},
    'hu_yield-strength': {'range': [0.9824, 1.0176]},
    'hu_elongation': {'range': [0.9417, 1.0583]},
}

In [None]:
def get_df_all(data_folder):
    """Load all AutoML results of one time budget and relate them to the literature.

    Reads ``<rootdir>/<data_folder>/<framework>/<dataset>/regression_summary.csv``
    for every framework and dataset directory and stacks them into one frame.

    Added columns:
        Framework, Task   -- taken from the directory names; ``Task`` is the
                             first two ``_``-separated parts of the dataset name.
        Size, Shape_ratio -- looked up in the module-level ``size_dict``
                             (``size`` and ``size / feat``), stored as numbers.
        relative_<metric> -- for every metric in ``metric_dict[data_folder]``:
                             ``value / max(literature)`` for ``r2`` and
                             ``min(literature) / value`` for all other metrics,
                             with literature values taken from ``df_lit``.
        relative_score    -- the first relative metric (in ``metric_dict`` order)
                             that is finite for the row.
        Time              -- ``data_folder``.

    Parameters
    ----------
    data_folder : str
        Name of the result folder below ``rootdir``; must be a key of
        ``metric_dict`` (e.g. ``'15_min'`` or ``'60_min'``).

    Returns
    -------
    pandas.DataFrame
        All runs, sorted by ``Size`` (ascending) and ``Task`` (descending).

    Raises
    ------
    ValueError
        If no dataset directory is found below the data folder.
    KeyError
        If a task has no entry in ``size_dict``.
    """
    data_dir = os.path.join(rootdir, data_folder)

    # Collect the frames in a list and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and re-copying the frame per file is quadratic.
    frames = []
    for framework in sorted(os.listdir(data_dir)):
        framework_dir = os.path.join(data_dir, framework)
        if not os.path.isdir(framework_dir):
            continue  # stray files (e.g. .DS_Store) are not frameworks
        for dataset in sorted(os.listdir(framework_dir)):
            dataset_dir = os.path.join(framework_dir, dataset)
            if not os.path.isdir(dataset_dir):
                continue
            df = pd.read_csv(os.path.join(dataset_dir, 'regression_summary.csv'))
            df['Framework'] = framework
            df['Task'] = '_'.join(dataset.split("_", 2)[:2])
            frames.append(df)
    if not frames:
        raise ValueError('No result directories found in {}'.format(data_dir))
    df_all = pd.concat(frames, ignore_index=True)

    # Size and shape ratio depend only on the task, so compute them once
    # instead of once per metric.
    # Su and Hu are keyed by the full task name (different sizes per task);
    # every other dataset is keyed by its prefix.
    def _size_key(task):
        prefix = task.split('_')[0]
        return task if prefix in ('su', 'hu') else prefix

    size_keys = df_all['Task'].map(_size_key)
    # Lookups go through a function (not a dict) so unknown tasks raise KeyError.
    df_all['Size'] = size_keys.map(lambda key: size_dict[key]['size'])
    df_all['Shape_ratio'] = size_keys.map(
        lambda key: size_dict[key]['size'] / size_dict[key]['feat'])

    df_all['relative_score'] = np.nan
    for metric in metric_dict[data_folder]:
        lit_by_task = df_lit.groupby('Task')[metric]
        if metric == 'r2':
            # Higher is better: compare against the best (largest) literature value.
            reference = df_all['Task'].map(lit_by_task.max())
            relative = df_all[metric] / reference
        else:
            # Error metrics, lower is better: invert so that > 1 beats literature.
            reference = df_all['Task'].map(lit_by_task.min())
            relative = 1 / (df_all[metric] / reference)
        df_all['relative_{}'.format(metric)] = relative

        # Fill the score only where no earlier metric produced a finite value.
        missing = ~np.isfinite(df_all['relative_score'])
        df_all.loc[missing, 'relative_score'] = relative[missing]

    df_all['Time'] = data_folder
    return df_all.sort_values(by=['Size', 'Task'], ascending=[True, False])

def plot_df_all(df_all,colums=None,target='relative_score',save_fig=False,x_range=None):
    """Draw grouped horizontal box plots of ``target`` per task and framework.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Frame with at least the columns ``Task``, ``Framework`` and ``target``.
    colums : list of str, optional
        Tasks to show; all tasks are shown when empty or ``None``.
    target : str
        Column plotted on the x-axis. If its name contains ``'relative'`` a
        vertical line is drawn at 1; otherwise, if ``df_lit`` has a column of
        that name, the best literature value of each shown task is marked
        (largest for ``'r2'``, smallest for every other metric).
    save_fig : str or False
        File path handed to ``fig.write_image``; nothing is written when falsy.
    x_range : list, optional
        ``[min, max]`` range of the x-axis.

    Reads the module-level ``df_lit`` and ``lit_dict``. Shows the figure and
    returns ``None``.
    """
    framework_colours = {'tpot': '#0b0305',
                         'mljar': '#403b7a',
                         'h2o': '#3575a1',
                         'autosklearn': '#62cfac'}

    layout = go.Layout(
        xaxis=dict(title=target,title_font={'size':20},tickfont={'size':16},
                   zeroline=False,linecolor='black',gridcolor='#cccccc'),
        yaxis=dict(linecolor='black',title_font={'size':20},tickfont={'size':16}),
        boxmode='group',
        plot_bgcolor='white',
        legend=dict(traceorder='reversed',font_size=18,orientation="h",
                    yanchor="top",y=1.1,xanchor='center',x=0.5)
    )

    fig = go.Figure(layout=layout)
    if colums:
        df_all = df_all[df_all.Task.isin(colums)]
    shown_tasks = df_all['Task'].unique()

    for framework_name, colour in framework_colours.items():
        rows = df_all['Framework'] == framework_name
        fig.add_trace(go.Box(
            x=df_all.loc[rows, target],
            y=df_all.loc[rows, 'Task'],
            name=framework_name,
            marker_color=colour
        ))

    if 'relative' in target:
        fig.add_vline(x=1, line_color="black")
    elif target in df_lit.columns:
        # Mark the best literature value of the plotted metric (not always r2),
        # and only for tasks that are shown, so filtered-out tasks do not
        # reappear on the y-axis.
        lit_shown = df_lit[df_lit['Task'].isin(shown_tasks)]
        for task in lit_shown['Task'].unique():
            lit_values = lit_shown.loc[lit_shown['Task'] == task, target]
            best_value = lit_values.max() if target == 'r2' else lit_values.min()
            fig.add_trace(go.Scatter(mode='markers',x=[best_value],y=[task],marker_symbol='line-ns',
                                     marker_line_color="midnightblue", marker_color="lightskyblue",
                                     marker_line_width=3, marker_size=20,showlegend=False))

    # literature range
    lit_colour = 'black'
    # sorted() makes the trace order (and the trace carrying the legend entry)
    # deterministic; set iteration order is arbitrary.
    for counter, lit in enumerate(sorted(lit_dict.keys() & set(shown_tasks))):
        range_ = lit_dict[lit]['range']
        # Only the first range contributes a legend entry.
        fig.add_trace(go.Scatter(x=range_, y=[lit,lit],mode='lines',line_width=5,line_color=lit_colour,
                                 name='literature', showlegend=(counter == 0)))
        # End caps of the range line.
        for bound in range_[:2]:
            fig.add_trace(go.Scatter(mode='markers',x=[bound],y=[lit],marker_symbol='line-ns',
                                     marker_line_color=lit_colour,
                                     marker_line_width=5, marker_size=10,marker_color=lit_colour,
                                     showlegend=False))

    fig.update_traces(orientation='h') # horizontal box plots
    fig.update_traces(whiskerwidth=1, selector=dict(type='box'))
    if x_range:
        fig.update_layout(xaxis_range=x_range)
    fig.update_layout(width=1000, height=1000)
    if save_fig:
        fig.write_image(save_fig)
    fig.show()


## Plot Results over all Datasets

In [None]:
# Load the results of the 60-minute runs and plot the default target for every task.
df_all = get_df_all(data_folder='60_min')
plot_df_all(df_all=df_all)

## Plot Very Small Datasets
- Figure 5 Very Small Datasets

In [None]:
# Tasks shown in the "very small datasets" panel of Figure 5.
very_small_datasets = [
    'su_bond-1',
    'huang_flex',
    'huang_compressive',
    'bachir_compressive',
    'koya_compressive',
    'koya_rup',
    'xiong_heigth',
    'xiong_width',
    'atici_compressive',
    'su_bond-2',
]
# To export the figure, additionally pass save_fig='AutoML_small_datasets.svg'.
plot_df_all(df_all, colums=very_small_datasets, x_range=[0, 2.05])

## Plot Small and Large Datasets
- Figure 5 Small and Large Datasets

In [None]:
# Tasks shown in the "small and large datasets" panel of Figure 5.
large_datasets = [
    'guo_tensile',
    'guo_elongation',
    'guo_yield',
    'hu_elongation',
    'hu_yield-strength',
    'hu_tensile-strength',
    'uci_compressive',
    'yin_ifss',
    'yin_pullout-force',
    'mat-bench_yts',
]
# To export the figure, additionally pass save_fig='AutoML_large_datasets_review.svg'.
plot_df_all(df_all, colums=large_datasets, x_range=[0.83, 1.405])

## Data Preparation for Summarizing per Framework

In [None]:
# Summarizing results: stack both time budgets and average the runs of each
# (time budget, framework, task) combination.
df_result = pd.concat([get_df_all('15_min'),get_df_all('60_min')])
# numeric_only=True: only numeric columns can be averaged. Since pandas 2.0 the
# default raises on non-numeric columns instead of silently dropping them.
df_result = df_result.groupby(by=['Time','Framework','Task','Size','Shape_ratio'],as_index=False).mean(numeric_only=True)
# Best framework per task and time budget: the highest relative score among the
# frameworks. String aggregator names replace np.max, which pandas deprecated
# as an agg argument.
df_best = df_result.groupby(by=['Time','Task','Size','Shape_ratio'],as_index=False).agg({'relative_r2':'max','relative_rmse':'max'})
df_best['Framework'] = 'best AutoML'
df_result = pd.concat([df_result, df_best])
# '15_min' -> '15 min' for nicer legend labels.
df_result['Time'] = df_result['Time'].replace(to_replace='_', value=' ',regex=True)

## Plot summarized Results
- METRIC = 'relative_r2'
  - Figure 4 (a)
- METRIC = 'relative_rmse'
  - Figure 4 (b)

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Column shown on the y-axis: 'relative_r2' -> Figure 4 (a), 'relative_rmse' -> Figure 4 (b).
METRIC = 'relative_rmse'

# Box colour per time budget; the keys are matched against df_result['Time'].
dict_time = {
    '15 min': {'colour': '#62cfac'},
    '60 min': {'colour': '#0b0305'},
}

layout = go.Layout(
    yaxis=dict(
        title=METRIC,
        title_font={'size': 20},
        tickfont={'size': 16},
        zeroline=False,
        linecolor='black',
        gridcolor='#cccccc',
    ),
    xaxis=dict(
        title=None,
        linecolor='black',
        title_font={'size': 20},
        tickfont={'size': 16},
    ),
    boxmode='group',
    plot_bgcolor='white',
    legend=dict(font_size=18, orientation="h", yanchor="top", y=1.1, xanchor='center', x=0.5),
)
fig = go.Figure(layout=layout)

# One grouped box trace per time budget, with the frameworks along the x-axis.
for time_label, spec in dict_time.items():
    in_budget = df_result['Time'] == time_label
    fig.add_trace(go.Box(
        x=df_result.loc[in_budget, 'Framework'],
        y=df_result.loc[in_budget, METRIC],
        name=time_label,
        marker_color=spec['colour'],
        line_width=1.5,
    ))

# Reference line at a relative score of 1.
fig.add_hline(y=1, line_color="black")
fig.update_traces(whiskerwidth=1, selector=dict(type='box'))
fig.update_traces(quartilemethod="exclusive")  # alternatives: "inclusive", "linear" (default)
fig.write_image("AutoML_summary.svg")

fig.show()

## Plot Performance over Dataset Size and Shape

### Figure 6(a)

In [None]:
# Figure 6 (a): R2 of the individual frameworks (60 min budget) over dataset size.
layout = go.Layout(
    yaxis_range=[0, 1.1],
    yaxis=dict(
        title='R2',
        title_font={'size': 20},
        tickfont={'size': 16},
        zeroline=False,
        linecolor='black',
        gridcolor='#cccccc',
    ),
    xaxis=dict(
        title='Dataset size',
        linecolor='black',
        title_font={'size': 20},
        tickfont={'size': 16},
    ),
    plot_bgcolor='white',
    colorway=['#3575a1'],
)
fig = go.Figure(layout=layout)

# Drop the synthetic 'best AutoML' rows and keep only the 60-minute runs.
is_framework = df_result['Framework'] != 'best AutoML'
is_60_min = df_result['Time'] == '60 min'
df_plot = df_result.loc[is_framework & is_60_min]

size_box = go.Box(
    x=df_plot['Size'],
    y=df_plot['r2'],
    line_width=1.5,
    width=0.035,
    fillcolor='#aec7d9',
    whiskerwidth=0,
)
fig.add_trace(size_box)

fig.update_xaxes(type="log")
fig.write_image("R2_over_datasize.svg")
fig.show()

### Figure 6 (b)

In [None]:
# Figure 6 (b): R2 of the individual frameworks (60 min budget) over the shape ratio.
layout = go.Layout(
    yaxis_range=[0, 1.1],
    yaxis=dict(
        title='R2',
        title_font={'size': 20},
        tickfont={'size': 16},
        zeroline=False,
        linecolor='black',
        gridcolor='#cccccc',
    ),
    xaxis=dict(
        title='Dataset size / Feature number',
        linecolor='black',
        title_font={'size': 20},
        tickfont={'size': 16},
    ),
    plot_bgcolor='white',
    colorway=['#3575a1'],
)
fig = go.Figure(layout=layout)

# Drop the synthetic 'best AutoML' rows and keep only the 60-minute runs.
is_framework = df_result['Framework'] != 'best AutoML'
is_60_min = df_result['Time'] == '60 min'
df_plot = df_result.loc[is_framework & is_60_min]

shape_box = go.Box(
    x=df_plot['Shape_ratio'],
    y=df_plot['r2'],
    line_width=1.5,
    width=0.03,
    fillcolor='#aec7d9',
)
fig.add_trace(shape_box)

fig.update_xaxes(type="log")
fig.write_image("R2_over_datashape.svg")
fig.show()