In [7]:
from plotly.subplots import make_subplots
from scipy.stats import gaussian_kde, spearmanr

In [8]:
import copy
import numpy as np
import pandas as pd
import plotly.graph_objs as go

In [9]:
from shg_ml_benchmarks.utils import load_train, load_holdout, load_full

# Holdout set

In [10]:
# Full dataset; the scatter plots below drop each holdout's index from it to get the "Train+Val" points.
df_orig = load_full()

## Dist 250

In [11]:
# Load the holdout split of the "distribution_250" task.
df_dist_250 = load_holdout(task="distribution_250")

# Evaluate the `origin == "Naccarato"` filter once. The share is taken
# relative to the actual number of rows (mean of the boolean mask) rather
# than a hard-coded 250, so it stays correct if the split size changes.
is_naccarato_dist_250 = df_dist_250["origin"] == "Naccarato"
n_naccarato_dist_250 = int(is_naccarato_dist_250.sum())
frac_naccarato_dist_250 = float(is_naccarato_dist_250.mean())

print(f"{df_dist_250.shape = }")
print(f"{n_naccarato_dist_250 = }")
print(f"{frac_naccarato_dist_250 = }")
display(df_dist_250.head())

df_dist_250.shape = (250, 48)
len(df_dist_250[df_dist_250['origin']=='Naccarato']) = 67
len(df_dist_250[df_dist_250['origin']=='Naccarato'])/250 = 0.268


Unnamed: 0,formula_reduced,crystal_system,dRMS,dijk,elements,epsij,src_bandgap,src_DB_IDs,src_ehull,src_epsij,...,rot_was_symmetrized,why_not_conventional_original,why_not_conventional_rot,dKP_rot,dijk_full,dKP_full,dijk_full_neum,dKP_full_neum,FOM,is_unique_here
mp-983357,H3F2,triclinic,0.163716,"[[[0.184009792286062, 0.027359620726523003, 0....","[H, F]","[[2.127669569956909, 0.08578493607146101, 0.02...",7.9086,{},0.023431,,...,False,is_conventional,is_conventional,0.163716,"[[0.184009792286062, -0.182349477539268, 0.061...",0.163716,"[[[0.184009792286062, 0.027359620726523003, 0....",0.163716,1.294766,True
mp-8377,TeO2,orthorhombic,1.284568,"[[[0, 0, 0], [0, 0, -1.5199217380658392], [0, ...","[Te, O]","[[5.312814291853512, 0, 0], [0, 4.926657685971...",2.9774,"{'icsd': ['icsd-90733', 'icsd-167817']}",0.0,"[[4.66569314, 0.0, 0.0], [0.0, 5.2470685, 0.0]...",...,False,is_conventional,is_conventional,1.284568,"[[0.0, 0.0, 0.0, -1.5199217380658392, 0.0, 0.0...",1.284568,"[[[0.0, 0.0, 0.0], [0.0, 0.0, -1.5199217380658...",1.284568,3.824674,True
mp-18938,Tl2MoO4,monoclinic,2.445284,"[[[7.502020420862985e-14, 2.896332784132797, -...","[Tl, Mo, O]","[[4.270674163120248, 0, 0.033803653051316], [0...",3.4625,"{'icsd': ['icsd-421983', 'icsd-280608']}",0.0,"[[4.2234208729586555, 9.162841957126609e-08, 0...",...,False,is_conventional,is_conventional,2.445284,"[[0.0, -0.0, 0.0, 0.580045346345563, -0.0, 2.8...",2.445284,"[[[0.0, 2.896332784132797, -0.0], [2.896332784...",2.445284,8.466797,True
mp-5854,LiGaO2,orthorhombic,0.626602,"[[[0, 0, -0.027836245525598], [0, 0, 0], [-0.0...","[Li, Ga, O]","[[3.376550244615672, 0, 0], [0, 3.247679685508...",3.2018,"{'icsd': ['icsd-93086', 'icsd-93087', 'icsd-18...",0.0,"[[3.33914818, 0.0, 0.0], [0.0, 3.20738516, 0.0...",...,False,is_conventional,is_conventional,0.626602,"[[0.0, 0.0, 0.0, 0.0, -0.027836245525598, 0.0]...",0.626602,"[[[0.0, 0.0, -0.027836245525598], [0.0, 0.0, 0...",0.626602,2.006253,True
mp-23778,H4CN2O,tetragonal,0.991038,"[[[0, 0, 0], [0, 0, 1.172612129796452], [0, 1....","[H, C, N, O]","[[2.307967760955439, 0, 0], [0, 2.307967760955...",5.0792,"{'icsd': ['icsd-100304', 'icsd-245371', 'icsd-...",0.0,"[[2.60455784, 0.0, 0.0], [0.0, 2.22152599, 0.0...",...,False,is_conventional,is_conventional,0.991038,"[[0.0, 0.0, 0.0, 1.172612129796452, 0.0, 0.0],...",0.991038,"[[[0.0, 0.0, 0.0], [0.0, 0.0, 1.17261212979645...",0.991038,5.033681,True


In [12]:
# Work on a copy of the holdout frame for all plots of this section.
df_tmp = df_dist_250.copy()

bin_size = 1.0

# The same d_KP histogram is drawn twice: first with raw counts
# (histnorm=''), then with plotly's 'percent' normalisation.
for histnorm, y_axis_label in (
    ("", "Count"),
    ("percent", "Fraction of data (%)"),
):
    fig = go.Figure(
        data=[
            go.Histogram(
                x=df_tmp["dKP_full_neum"],
                histnorm=histnorm,
                xbins={"size": bin_size},
                name="dist_250",
                marker_color="royalblue",
            )
        ]
    )

    # Semi-transparent bars.
    fig.update_traces(opacity=0.5)

    fig.update_layout(
        barmode="overlay",
        xaxis_title_text="<i>d<sub>KP</sub></i> (pm/V)",
        yaxis_title_text=y_axis_label,
        font_size=20,
        autosize=False,
        plot_bgcolor="white",
        # Legend anchored to the top-right corner of the plotting area.
        legend={"yanchor": "top", "y": 0.99, "xanchor": "right", "x": 0.99},
    )

    fig.show()


# dKP-Eg-KDE with log-lin inset
# ==============================================================================================================================
# Left panel (80 % of the width): E_g vs d_KP scatter of the holdout ("Test")
# on top of the remaining data ("Train+Val"), with an inset that repeats both
# scatters on a logarithmic d_KP axis.
# Right panel (20 %): KDE of the holdout's d_KP values, sharing the d_KP axis.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True, horizontal_spacing=0.02, column_widths=[0.8, 0.2])

# ROW 1 COL 1 =================================================================================================================
# Rows of the full dataset that are not in the holdout; computed once and
# reused for x, y and the hover text.
df_train_val = df_orig.drop(df_tmp.index, axis=0)

fig.add_trace(
    go.Scatter(
        x=df_train_val["src_bandgap"],
        y=df_train_val["dKP_full_neum"],
        mode="markers",
        marker=dict(color="cornflowerblue", opacity=0.5),
        showlegend=True,
        legendgroup="train_val",  # lets the legend item also toggle the inset copy
        name="Train+Val",
        text=df_train_val.index.to_list(),  # index labels as hover text
    )
)

fig.add_trace(
    go.Scatter(
        x=df_tmp["src_bandgap"],
        y=df_tmp["dKP_full_neum"],
        mode="markers",
        marker=dict(
            symbol="circle",
            size=5,
            color="lightgreen",
            showscale=False,
            line=dict(width=1, color="black"),
        ),
        showlegend=True,
        legendgroup="test",  # lets the legend item also toggle the inset copy
        name="Test",
        text=df_tmp.index.to_list(),  # index labels as hover text
    )
)

# AXES
fig.update_xaxes(
    title_text="<i>E<sub>g</sub></i> (eV)",
    title_font_size=36,
    range=[-0.1, 10.0],
    row=1,
    col=1,
)
# Applied to every y-axis that exists at this point; the title of the KDE
# panel is blanked again further down.
fig.update_yaxes(title_text="<i>d</i><sub>KP</sub> (pm/V)", title_font_size=36)

fig.update_layout(font={"family": "Arial", "size": 20})

# ROW 1 COL 2 =================================================================================================================
# A scalar `bw_method` is used directly as the KDE bandwidth factor, so the
# narrow 0.02 smoothing is set through the public API instead of overriding
# `covariance_factor` and calling the private `_compute_covariance()`.
dkp_holdout = df_tmp["dKP_full_neum"].to_numpy()
density = gaussian_kde(dkp_holdout, bw_method=0.02)

# Evaluate the KDE on 200 points spanning the observed d_KP range.
x_vals = np.linspace(dkp_holdout.min(), dkp_holdout.max(), 200)
kde_dist = density(x_vals)

# Density on x and d_KP on y, so the curve lines up with the shared d_KP axis.
fig.add_trace(
    go.Scatter(
        x=kde_dist,
        y=x_vals,
        mode="lines",
        marker_color="indianred",
        fill="tozerox",
        showlegend=False,
    ),
    row=1,
    col=2,
)

fig.update_xaxes(title_text="Distribution Test", title_font_size=36, row=1, col=2)
fig.update_yaxes(title_text="", row=1, col=2)

# INSET =======================================================================================================================
# The inset re-plots the two scatter traces on a third axis pair. The copies
# are hidden from the legend; otherwise "Train+Val" and "Test" are listed twice.
inset = copy.deepcopy(fig.data[0])
inset.update(xaxis="x3", yaxis="y3", showlegend=False)

inset_candidates_v1 = copy.deepcopy(fig.data[1])
inset_candidates_v1.update(xaxis="x3", yaxis="y3", showlegend=False)

fig.update_layout(
    xaxis3=dict(
        domain=[0.30, 0.75],
        anchor="y3",
        range=[-0.1, 8.4],
        linecolor="black",
    ),
    yaxis3=dict(
        domain=[0.60, 0.98],
        anchor="x3",
        range=[-3.5, 2.6],  # log axis: range is given as log10 of the values
        type="log",
        tickvals=[0.001, 0.01, 1, 100],
        linecolor="black",
    ),
)

fig.add_trace(inset)
fig.add_trace(inset_candidates_v1)

# Spike lines are configured after the inset axes exist so that they apply to
# those axes as well.
fig.update_xaxes(showspikes=True, spikecolor="gray", spikethickness=2, spikesnap="cursor", spikemode="across")
fig.update_yaxes(showspikes=True, spikecolor="gray", spikethickness=2, spikesnap="cursor", spikemode="across")
fig.update_layout(hoverdistance=5)

# THEME OF GRAPH
fig.update_layout(template="simple_white")

fig.update_layout(width=1000, height=500)

fig.show()

## Dist 125

In [13]:
# Load the holdout split of the "distribution_125" task.
df_dist_125 = load_holdout(task="distribution_125")

# Evaluate the `origin == "Naccarato"` filter once. The share is taken
# relative to the actual number of rows (mean of the boolean mask) rather
# than a hard-coded 125, so it stays correct if the split size changes.
is_naccarato_dist_125 = df_dist_125["origin"] == "Naccarato"
n_naccarato_dist_125 = int(is_naccarato_dist_125.sum())
frac_naccarato_dist_125 = float(is_naccarato_dist_125.mean())

print(f"{df_dist_125.shape = }")
print(f"{n_naccarato_dist_125 = }")
print(f"{frac_naccarato_dist_125 = }")
display(df_dist_125.head())

(125, 48)
len(df_dist_125[df_dist_125['origin']=='Naccarato']) = 27
len(df_dist_125[df_dist_125['origin']=='Naccarato'])/125 = 0.216


Unnamed: 0,formula_reduced,crystal_system,dRMS,dijk,elements,epsij,src_bandgap,src_DB_IDs,src_ehull,src_epsij,...,rot_was_symmetrized,why_not_conventional_original,why_not_conventional_rot,dKP_rot,dijk_full,dKP_full,dijk_full_neum,dKP_full_neum,FOM,is_unique_here
mp-552663,LiScAs2O7,monoclinic,1.116386,"[[[-5.947183391917919e-12, -0.5323990534917831...","[Li, Sc, As, O]","[[3.395783939522365, 0, -0.011046794630537001]...",3.4715,{'icsd': ['icsd-161499']},0.0,"[[3.493750704658318, 0.0, -0.02689810200728126...",...,False,is_conventional,is_conventional,1.116386,"[[-0.0, 0.0, 0.0, -0.7770201728089591, 0.0, -0...",1.116386,"[[[-0.0, -0.5323990534917831, 0.0], [-0.532399...",1.116386,3.875535,True
mp-753401,Sc2TiO5,orthorhombic,2.157938,"[[[0, 0, 1.785585157100647], [0, 0, -0.0017389...","[Sc, Ti, O]","[[4.8320848738880215, 0, 0], [0, 4.86871766489...",3.1316,{},0.0,"[[4.852012729999999, 0.0, 0.0], [0.0, 4.677700...",...,False,is_conventional,is_conventional,2.157938,"[[0.0, 0.0, 0.0, -0.0, 1.785585157100647, 0.0]...",2.157937,"[[[0.0, 0.0, 1.785585157100647], [0.0, 0.0, -0...",2.157937,6.757797,True
mp-23363,NaAlCl4,orthorhombic,0.010129,"[[[0, 0, 0], [0, 0, -0.011985143707408], [0, -...","[Na, Al, Cl]","[[2.589901269066773, 0, 0], [0, 2.589759110792...",5.6586,"{'icsd': ['icsd-30611', 'icsd-35279', 'icsd-71...",0.0,,...,False,is_conventional,is_conventional,0.010129,"[[0.0, 0.0, 0.0, -0.011985143707408, 0.0, 0.0]...",0.010129,"[[[0.0, 0.0, 0.0], [0.0, 0.0, -0.0119851437074...",0.010129,0.057318,True
mp-559961,Tl2SeO4,orthorhombic,0.725453,"[[[0, 0, 0], [0, 0, -0.858367327347738], [0, -...","[Tl, Se, O]","[[4.203852712891507, 0, 0], [0, 4.166937319491...",3.3931,{'icsd': ['icsd-99384']},0.0,"[[3.78617668, 0.0, 0.0], [0.0, 3.8270653, 0.0]...",...,False,is_conventional,is_conventional,0.725453,"[[0.0, 0.0, 0.0, -0.858367327347738, 0.0, 0.0]...",0.725453,"[[[0.0, 0.0, 0.0], [0.0, 0.0, -0.8583673273477...",0.725453,2.461534,True
mp-17066,Y2TeO6,orthorhombic,0.08069,"[[[0, 0, 0], [0, 0, 0.09547416350030201], [0, ...","[Y, Te, O]","[[4.05033252767446, 0, 0], [0, 3.9960659507616...",3.0878,{'icsd': ['icsd-240875']},0.0,"[[4.01280007, 0.0, 0.0], [0.0, 4.08179852, 0.0...",...,False,is_conventional,is_conventional,0.08069,"[[0.0, 0.0, 0.0, 0.09547416350030201, 0.0, 0.0...",0.08069,"[[[0.0, 0.0, 0.0], [0.0, 0.0, 0.09547416350030...",0.08069,0.249156,True


In [14]:
# Work on a copy of the holdout frame for all plots of this section.
df_tmp = df_dist_125.copy()

bin_size = 1.0

# The same d_KP histogram is drawn twice: first with raw counts
# (histnorm=''), then with plotly's 'percent' normalisation.
for histnorm, y_axis_label in (
    ("", "Count"),
    ("percent", "Fraction of data (%)"),
):
    fig = go.Figure(
        data=[
            go.Histogram(
                x=df_tmp["dKP_full_neum"],
                histnorm=histnorm,
                xbins={"size": bin_size},
                # Trace label matches the plotted split (it used to be the
                # copy-pasted 'dist_250').
                name="dist_125",
                marker_color="royalblue",
            )
        ]
    )

    # Semi-transparent bars.
    fig.update_traces(opacity=0.5)

    fig.update_layout(
        barmode="overlay",
        xaxis_title_text="<i>d<sub>KP</sub></i> (pm/V)",
        yaxis_title_text=y_axis_label,
        font_size=20,
        autosize=False,
        plot_bgcolor="white",
        # Legend anchored to the top-right corner of the plotting area.
        legend={"yanchor": "top", "y": 0.99, "xanchor": "right", "x": 0.99},
    )

    fig.show()

# dKP-Eg-KDE with log-lin inset
# ==============================================================================================================================
# Left panel (80 % of the width): E_g vs d_KP scatter of the holdout ("Test")
# on top of the remaining data ("Train+Val"), with an inset that repeats both
# scatters on a logarithmic d_KP axis.
# Right panel (20 %): KDE of the holdout's d_KP values, sharing the d_KP axis.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True, horizontal_spacing=0.02, column_widths=[0.8, 0.2])

# ROW 1 COL 1 =================================================================================================================
# Rows of the full dataset that are not in the holdout; computed once and
# reused for x, y and the hover text.
df_train_val = df_orig.drop(df_tmp.index, axis=0)

fig.add_trace(
    go.Scatter(
        x=df_train_val["src_bandgap"],
        y=df_train_val["dKP_full_neum"],
        mode="markers",
        marker=dict(color="cornflowerblue", opacity=0.5),
        showlegend=True,
        legendgroup="train_val",  # lets the legend item also toggle the inset copy
        name="Train+Val",
        text=df_train_val.index.to_list(),  # index labels as hover text
    )
)

fig.add_trace(
    go.Scatter(
        x=df_tmp["src_bandgap"],
        y=df_tmp["dKP_full_neum"],
        mode="markers",
        marker=dict(
            symbol="circle",
            size=5,
            color="lightgreen",
            showscale=False,
            line=dict(width=1, color="black"),
        ),
        showlegend=True,
        legendgroup="test",  # lets the legend item also toggle the inset copy
        name="Test",
        text=df_tmp.index.to_list(),  # index labels as hover text
    )
)

# AXES
fig.update_xaxes(
    title_text="<i>E<sub>g</sub></i> (eV)",
    title_font_size=36,
    range=[-0.1, 10.0],
    row=1,
    col=1,
)
# Applied to every y-axis that exists at this point; the title of the KDE
# panel is blanked again further down.
fig.update_yaxes(title_text="<i>d</i><sub>KP</sub> (pm/V)", title_font_size=36)

fig.update_layout(font={"family": "Arial", "size": 20})

# ROW 1 COL 2 =================================================================================================================
# A scalar `bw_method` is used directly as the KDE bandwidth factor, so the
# narrow 0.02 smoothing is set through the public API instead of overriding
# `covariance_factor` and calling the private `_compute_covariance()`.
dkp_holdout = df_tmp["dKP_full_neum"].to_numpy()
density = gaussian_kde(dkp_holdout, bw_method=0.02)

# Evaluate the KDE on 200 points spanning the observed d_KP range.
x_vals = np.linspace(dkp_holdout.min(), dkp_holdout.max(), 200)
kde_dist = density(x_vals)

# Density on x and d_KP on y, so the curve lines up with the shared d_KP axis.
fig.add_trace(
    go.Scatter(
        x=kde_dist,
        y=x_vals,
        mode="lines",
        marker_color="indianred",
        fill="tozerox",
        showlegend=False,
    ),
    row=1,
    col=2,
)

fig.update_xaxes(title_text="Distribution Test", title_font_size=36, row=1, col=2)
fig.update_yaxes(title_text="", row=1, col=2)

# INSET =======================================================================================================================
# The inset re-plots the two scatter traces on a third axis pair. The copies
# are hidden from the legend; otherwise "Train+Val" and "Test" are listed twice.
inset = copy.deepcopy(fig.data[0])
inset.update(xaxis="x3", yaxis="y3", showlegend=False)

inset_candidates_v1 = copy.deepcopy(fig.data[1])
inset_candidates_v1.update(xaxis="x3", yaxis="y3", showlegend=False)

fig.update_layout(
    xaxis3=dict(
        domain=[0.30, 0.75],
        anchor="y3",
        range=[-0.1, 8.4],
        linecolor="black",
    ),
    yaxis3=dict(
        domain=[0.60, 0.98],
        anchor="x3",
        range=[-3.5, 2.6],  # log axis: range is given as log10 of the values
        type="log",
        tickvals=[0.001, 0.01, 1, 100],
        linecolor="black",
    ),
)

fig.add_trace(inset)
fig.add_trace(inset_candidates_v1)

# Spike lines are configured after the inset axes exist so that they apply to
# those axes as well.
fig.update_xaxes(showspikes=True, spikecolor="gray", spikethickness=2, spikesnap="cursor", spikemode="across")
fig.update_yaxes(showspikes=True, spikecolor="gray", spikethickness=2, spikesnap="cursor", spikemode="across")
fig.update_layout(hoverdistance=5)

# THEME OF GRAPH
fig.update_layout(template="simple_white")

fig.update_layout(width=1000, height=500)

fig.show()

## Rand 250

In [15]:
# Load the holdout split of the "random_250" task.
df_rand_250 = load_holdout(task="random_250")

# Evaluate the `origin == "Naccarato"` filter once. The share is taken
# relative to the actual number of rows (mean of the boolean mask) rather
# than a hard-coded 250, so it stays correct if the split size changes.
is_naccarato_rand_250 = df_rand_250["origin"] == "Naccarato"
n_naccarato_rand_250 = int(is_naccarato_rand_250.sum())
frac_naccarato_rand_250 = float(is_naccarato_rand_250.mean())

print(f"{df_rand_250.shape = }")
print(f"{n_naccarato_rand_250 = }")
print(f"{frac_naccarato_rand_250 = }")
display(df_rand_250.head())

(250, 48)
len(df_rand_250[df_rand_250['origin']=='Naccarato']) = 57
len(df_rand_250[df_rand_250['origin']=='Naccarato'])/250 = 0.228


Unnamed: 0,formula_reduced,crystal_system,dRMS,dijk,elements,epsij,src_bandgap,src_DB_IDs,src_ehull,src_epsij,...,rot_was_symmetrized,why_not_conventional_original,why_not_conventional_rot,dKP_rot,dijk_full,dKP_full,dijk_full_neum,dKP_full_neum,FOM,is_unique_here
mp-21892,Al2PbO4,orthorhombic,1.670288,"[[[0, 0, 0.375278732038964], [0, 0, 0], [0.375...","[Al, Pb, O]","[[3.738544810667184, 0, 0], [0, 3.602323597500...",4.0403,"{'icsd': ['icsd-33532', 'icsd-80128']}",0.0,"[[3.70763371, 0.0, 0.0], [0.0, 3.5657536100000...",...,False,is_conventional,is_conventional,1.670288,"[[0.0, 0.0, 0.0, 0.0, 0.375278732038964, 0.0],...",1.670288,"[[[0.0, 0.0, 0.375278732038964], [0.0, 0.0, 0....",1.670288,6.748463,True
mp-8070,Li2BeSiO4,monoclinic,0.615357,"[[[-0.9689421869262571, 0, -0.5925942777505401...","[Li, Be, Si, O]","[[2.951925853729255, 0, -0.017267249236169003]...",6.115,"{'icsd': ['icsd-193838', 'icsd-28307']}",0.0,"[[2.75137983, 0.0, 0.004776240000000017], [0.0...",...,False,is_conventional,is_conventional,0.615357,"[[-0.9689421869262571, -0.21086535866074102, 0...",0.615357,"[[[-0.9689421869262571, 0.0, -0.59259427775054...",0.615357,3.762907,True
mp-558491,BaAlBO3F2,hexagonal,0.916306,"[[[1.319868876542579, 0.14540074947442, 0], [0...","[Ba, Al, B, O, F]","[[2.87847405289687, -1.1742012652613529e-11, -...",6.0749,{'icsd': ['icsd-421141']},0.0,"[[2.8007016715447057, 3.2771409620600307e-07, ...",...,False,is_conventional,is_conventional,0.916306,"[[1.319868876542579, -1.319868876506723, -0.0,...",0.916306,"[[[1.319868876542579, 0.14540074947442, 0.0], ...",0.916306,5.566468,True
mp-632684,NaPH3NO3,hexagonal,0.633297,"[[[0, 0, 0.43409278345280605], [0, 0, 3.180367...","[Na, P, H, N, O]","[[2.488008566741476, -7.875236592855791e-12, 0...",5.1089,{'icsd': ['icsd-16608']},0.0,"[[2.5005071138804795, 8.095368818697086e-06, 3...",...,False,is_conventional,is_conventional,0.633297,"[[0.0, 0.0, 0.0, 0.0, 0.43409278345280605, 0.0...",0.633297,"[[[0.0, 0.0, 0.43409278345280605], [0.0, 0.0, ...",0.633297,3.235452,True
mp-23329,Bi2WO6,orthorhombic,8.216684,"[[[0, 0, 6.318289304622575], [0, 0, 0], [6.318...","[Bi, W, O]","[[6.09265151445429, 0, 0], [0, 6.1892275916128...",1.9543,"{'icsd': ['icsd-171327', 'icsd-23584']}",0.00045,"[[6.227617099190112, 5.491908754429176e-07, 0....",...,False,is_conventional,is_conventional,8.216684,"[[0.0, 0.0, 0.0, 0.0, 6.318289304622575, 0.0],...",8.216653,"[[[0.0, 0.0, 6.318289304622575], [0.0, 0.0, 0....",8.216653,16.057806,True


In [16]:
# Work on a copy of the holdout frame for all plots of this section.
df_tmp = df_rand_250.copy()

bin_size = 1.0

# The same d_KP histogram is drawn twice: first with raw counts
# (histnorm=''), then with plotly's 'percent' normalisation.
for histnorm, y_axis_label in (
    ("", "Count"),
    ("percent", "Fraction of data (%)"),
):
    fig = go.Figure(
        data=[
            go.Histogram(
                x=df_tmp["dKP_full_neum"],
                histnorm=histnorm,
                xbins={"size": bin_size},
                # Trace label matches the plotted split (it used to be the
                # copy-pasted 'dist_250').
                name="rand_250",
                marker_color="royalblue",
            )
        ]
    )

    # Semi-transparent bars.
    fig.update_traces(opacity=0.5)

    fig.update_layout(
        barmode="overlay",
        xaxis_title_text="<i>d<sub>KP</sub></i> (pm/V)",
        yaxis_title_text=y_axis_label,
        font_size=20,
        autosize=False,
        plot_bgcolor="white",
        # Legend anchored to the top-right corner of the plotting area.
        legend={"yanchor": "top", "y": 0.99, "xanchor": "right", "x": 0.99},
    )

    fig.show()

# dKP-Eg-KDE with log-lin inset
# ==============================================================================================================================
# Left panel (80 % of the width): E_g vs d_KP scatter of the holdout ("Test")
# on top of the remaining data ("Train+Val"), with an inset that repeats both
# scatters on a logarithmic d_KP axis.
# Right panel (20 %): KDE of the holdout's d_KP values, sharing the d_KP axis.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True, horizontal_spacing=0.02, column_widths=[0.8, 0.2])

# ROW 1 COL 1 =================================================================================================================
# Rows of the full dataset that are not in the holdout; computed once and
# reused for x, y and the hover text.
df_train_val = df_orig.drop(df_tmp.index, axis=0)

fig.add_trace(
    go.Scatter(
        x=df_train_val["src_bandgap"],
        y=df_train_val["dKP_full_neum"],
        mode="markers",
        marker=dict(color="cornflowerblue", opacity=0.5),
        showlegend=True,
        legendgroup="train_val",  # lets the legend item also toggle the inset copy
        name="Train+Val",
        text=df_train_val.index.to_list(),  # index labels as hover text
    )
)

fig.add_trace(
    go.Scatter(
        x=df_tmp["src_bandgap"],
        y=df_tmp["dKP_full_neum"],
        mode="markers",
        marker=dict(
            symbol="circle",
            size=5,
            color="lightgreen",
            showscale=False,
            line=dict(width=1, color="black"),
        ),
        showlegend=True,
        legendgroup="test",  # lets the legend item also toggle the inset copy
        name="Test",
        text=df_tmp.index.to_list(),  # index labels as hover text
    )
)

# AXES
fig.update_xaxes(
    title_text="<i>E<sub>g</sub></i> (eV)",
    title_font_size=36,
    range=[-0.1, 10.0],
    row=1,
    col=1,
)
# Applied to every y-axis that exists at this point; the title of the KDE
# panel is blanked again further down.
fig.update_yaxes(title_text="<i>d</i><sub>KP</sub> (pm/V)", title_font_size=36)

fig.update_layout(font={"family": "Arial", "size": 20})

# ROW 1 COL 2 =================================================================================================================
# A scalar `bw_method` is used directly as the KDE bandwidth factor, so the
# narrow 0.02 smoothing is set through the public API instead of overriding
# `covariance_factor` and calling the private `_compute_covariance()`.
dkp_holdout = df_tmp["dKP_full_neum"].to_numpy()
density = gaussian_kde(dkp_holdout, bw_method=0.02)

# Evaluate the KDE on 200 points spanning the observed d_KP range.
x_vals = np.linspace(dkp_holdout.min(), dkp_holdout.max(), 200)
kde_dist = density(x_vals)

# Density on x and d_KP on y, so the curve lines up with the shared d_KP axis.
fig.add_trace(
    go.Scatter(
        x=kde_dist,
        y=x_vals,
        mode="lines",
        marker_color="indianred",
        fill="tozerox",
        showlegend=False,
    ),
    row=1,
    col=2,
)

fig.update_xaxes(title_text="Distribution Test", title_font_size=36, row=1, col=2)
fig.update_yaxes(title_text="", row=1, col=2)

# INSET =======================================================================================================================
# The inset re-plots the two scatter traces on a third axis pair. The copies
# are hidden from the legend; otherwise "Train+Val" and "Test" are listed twice.
inset = copy.deepcopy(fig.data[0])
inset.update(xaxis="x3", yaxis="y3", showlegend=False)

inset_candidates_v1 = copy.deepcopy(fig.data[1])
inset_candidates_v1.update(xaxis="x3", yaxis="y3", showlegend=False)

fig.update_layout(
    xaxis3=dict(
        domain=[0.30, 0.75],
        anchor="y3",
        range=[-0.1, 8.4],
        linecolor="black",
    ),
    yaxis3=dict(
        domain=[0.60, 0.98],
        anchor="x3",
        range=[-3.5, 2.6],  # log axis: range is given as log10 of the values
        type="log",
        tickvals=[0.001, 0.01, 1, 100],
        linecolor="black",
    ),
)

fig.add_trace(inset)
fig.add_trace(inset_candidates_v1)

# Spike lines are configured after the inset axes exist so that they apply to
# those axes as well.
fig.update_xaxes(showspikes=True, spikecolor="gray", spikethickness=2, spikesnap="cursor", spikemode="across")
fig.update_yaxes(showspikes=True, spikecolor="gray", spikethickness=2, spikesnap="cursor", spikemode="across")
fig.update_layout(hoverdistance=5)

# THEME OF GRAPH
fig.update_layout(template="simple_white")

fig.update_layout(width=1000, height=500)

fig.show()

## Rand 125

In [17]:
# Load the holdout split of the "random_125" task.
df_rand_125 = load_holdout(task="random_125")

# Evaluate the `origin == "Naccarato"` filter once. The share is taken
# relative to the actual number of rows (mean of the boolean mask) rather
# than a hard-coded 125, so it stays correct if the split size changes.
is_naccarato_rand_125 = df_rand_125["origin"] == "Naccarato"
n_naccarato_rand_125 = int(is_naccarato_rand_125.sum())
frac_naccarato_rand_125 = float(is_naccarato_rand_125.mean())

print(f"{df_rand_125.shape = }")
print(f"{n_naccarato_rand_125 = }")
print(f"{frac_naccarato_rand_125 = }")
display(df_rand_125.head())

(125, 48)
len(df_rand_125[df_rand_125['origin']=='Naccarato']) = 35
len(df_rand_125[df_rand_125['origin']=='Naccarato'])/125 = 0.28


Unnamed: 0,formula_reduced,crystal_system,dRMS,dijk,elements,epsij,src_bandgap,src_DB_IDs,src_ehull,src_epsij,...,rot_was_symmetrized,why_not_conventional_original,why_not_conventional_rot,dKP_rot,dijk_full,dKP_full,dijk_full_neum,dKP_full_neum,FOM,is_unique_here
mp-21892,Al2PbO4,orthorhombic,1.670288,"[[[0, 0, 0.375278732038964], [0, 0, 0], [0.375...","[Al, Pb, O]","[[3.738544810667184, 0, 0], [0, 3.602323597500...",4.0403,"{'icsd': ['icsd-33532', 'icsd-80128']}",0.0,"[[3.70763371, 0.0, 0.0], [0.0, 3.5657536100000...",...,False,is_conventional,is_conventional,1.670288,"[[0.0, 0.0, 0.0, 0.0, 0.375278732038964, 0.0],...",1.670288,"[[[0.0, 0.0, 0.375278732038964], [0.0, 0.0, 0....",1.670288,6.748463,True
mp-8070,Li2BeSiO4,monoclinic,0.615357,"[[[-0.9689421869262571, 0, -0.5925942777505401...","[Li, Be, Si, O]","[[2.951925853729255, 0, -0.017267249236169003]...",6.115,"{'icsd': ['icsd-193838', 'icsd-28307']}",0.0,"[[2.75137983, 0.0, 0.004776240000000017], [0.0...",...,False,is_conventional,is_conventional,0.615357,"[[-0.9689421869262571, -0.21086535866074102, 0...",0.615357,"[[[-0.9689421869262571, 0.0, -0.59259427775054...",0.615357,3.762907,True
mp-558491,BaAlBO3F2,hexagonal,0.916306,"[[[1.319868876542579, 0.14540074947442, 0], [0...","[Ba, Al, B, O, F]","[[2.87847405289687, -1.1742012652613529e-11, -...",6.0749,{'icsd': ['icsd-421141']},0.0,"[[2.8007016715447057, 3.2771409620600307e-07, ...",...,False,is_conventional,is_conventional,0.916306,"[[1.319868876542579, -1.319868876506723, -0.0,...",0.916306,"[[[1.319868876542579, 0.14540074947442, 0.0], ...",0.916306,5.566468,True
mp-632684,NaPH3NO3,hexagonal,0.633297,"[[[0, 0, 0.43409278345280605], [0, 0, 3.180367...","[Na, P, H, N, O]","[[2.488008566741476, -7.875236592855791e-12, 0...",5.1089,{'icsd': ['icsd-16608']},0.0,"[[2.5005071138804795, 8.095368818697086e-06, 3...",...,False,is_conventional,is_conventional,0.633297,"[[0.0, 0.0, 0.0, 0.0, 0.43409278345280605, 0.0...",0.633297,"[[[0.0, 0.0, 0.43409278345280605], [0.0, 0.0, ...",0.633297,3.235452,True
mp-23329,Bi2WO6,orthorhombic,8.216684,"[[[0, 0, 6.318289304622575], [0, 0, 0], [6.318...","[Bi, W, O]","[[6.09265151445429, 0, 0], [0, 6.1892275916128...",1.9543,"{'icsd': ['icsd-171327', 'icsd-23584']}",0.00045,"[[6.227617099190112, 5.491908754429176e-07, 0....",...,False,is_conventional,is_conventional,8.216684,"[[0.0, 0.0, 0.0, 0.0, 6.318289304622575, 0.0],...",8.216653,"[[[0.0, 0.0, 6.318289304622575], [0.0, 0.0, 0....",8.216653,16.057806,True


In [18]:
# Work on a copy of the holdout frame for all plots of this section.
df_tmp = df_rand_125.copy()

bin_size = 1.0

# The same d_KP histogram is drawn twice: first with raw counts
# (histnorm=''), then with plotly's 'percent' normalisation.
for histnorm, y_axis_label in (
    ("", "Count"),
    ("percent", "Fraction of data (%)"),
):
    fig = go.Figure(
        data=[
            go.Histogram(
                x=df_tmp["dKP_full_neum"],
                histnorm=histnorm,
                xbins={"size": bin_size},
                # Trace label matches the plotted split (it used to be the
                # copy-pasted 'dist_250').
                name="rand_125",
                marker_color="royalblue",
            )
        ]
    )

    # Semi-transparent bars.
    fig.update_traces(opacity=0.5)

    fig.update_layout(
        barmode="overlay",
        xaxis_title_text="<i>d<sub>KP</sub></i> (pm/V)",
        yaxis_title_text=y_axis_label,
        font_size=20,
        autosize=False,
        plot_bgcolor="white",
        # Legend anchored to the top-right corner of the plotting area.
        legend={"yanchor": "top", "y": 0.99, "xanchor": "right", "x": 0.99},
    )

    fig.show()

# dKP-Eg-KDE with log-lin inset
# ==============================================================================================================================
# Left panel: dKP vs. band gap for the subset in df_tmp ("Test") on top of the
# rest of df_orig ("Train+Val"). Right panel: KDE of the subset's dKP values.
# An inset repeats the scatter on a logarithmic dKP axis.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True, horizontal_spacing=0.02, column_widths=[0.8,0.2])

# Rows of the full dataset that are not in df_tmp; computed once and reused
# for x, y and the per-point text.
df_rest = df_orig.drop(df_tmp.index, axis=0)

# ROW 1 COL 1 =================================================================================================================
fig.add_trace(go.Scatter(x=df_rest['src_bandgap'],
                         y=df_rest['dKP_full_neum'],
                         mode='markers',
                         marker=dict(
                            color="cornflowerblue",
                            opacity=0.5
                         ),
                         showlegend=True,
                         name='Train+Val',
                         text=df_rest.index.tolist()
                         ))

fig.add_trace(go.Scatter(x=df_tmp['src_bandgap'],
                         y=df_tmp['dKP_full_neum'],
                         mode='markers',
                         marker=dict(symbol='circle',
                                     size=5,
                                     color='lightgreen',
                                     showscale=False,
                                     line=dict(width=1, color="black")
                                     ),
                         showlegend=True,
                         name='Test',
                         text=df_tmp.index.tolist()
                         ))


# AXES
fig.update_xaxes(title='<i>E<sub>g</sub></i> (eV)',
                 title_font_size=36,
                 range=[-0.1, 10.0],
                 row=1, col=1)
# Applied before the inset axes exist, so only the two subplot y axes get the title.
fig.update_yaxes(title='<i>d</i><sub>KP</sub> (pm/V)',
                 title_font_size=36)

fig.update_layout(font={'family': 'Arial', 'size': 20})

# ROW 1 COL 2 =================================================================================================================
# A scalar bw_method is used directly as the KDE bandwidth factor; this is the
# public equivalent of overriding covariance_factor and calling the private
# _compute_covariance().
dkp_test = df_tmp['dKP_full_neum'].values
density = gaussian_kde(dkp_test, bw_method=0.02)

# Evaluate the KDE on 200 points spanning the observed dKP range.
x_vals = np.linspace(np.min(dkp_test), np.max(dkp_test), 200)
kde_dist = density(x_vals)

# Density on x and dKP on y, so the curve lines up with the shared y axis.
fig.add_trace(go.Scatter(x=kde_dist,
                         y=x_vals,
                         mode='lines',
                         marker_color='indianred',
                         fill='tozerox',
                         showlegend=False),
              row=1, col=2)

fig.update_xaxes(title='Distribution Test',
                 title_font_size=36,
                 row=1, col=2)
fig.update_yaxes(title='', row=1, col=2)

# INSET =======================================================================================================================
# Copies of the two scatter traces, re-targeted at a third pair of axes.
# NOTE(review): the copies keep showlegend=True and the same names, so each
# entry presumably shows up twice in the legend — confirm this is intended.
inset = copy.deepcopy(fig.data[0])
inset.xaxis = 'x3'
inset.yaxis = 'y3'

inset_candidates_v1 = copy.deepcopy(fig.data[1])
inset_candidates_v1.xaxis = 'x3'
inset_candidates_v1.yaxis = 'y3'

# The inset axes are placed via paper-fraction domains; the y axis is
# logarithmic, so its range is given in log10 units.
fig.update_layout(
    xaxis3=dict(domain=[0.30, 0.75],
                anchor='y3',
                range=[-0.1, 8.4],
                linecolor='black'
                ),
    yaxis3=dict(domain=[0.60, 0.98],
                anchor='x3',
                range=[-3.5, 2.6],
                type='log',
                tickvals=[0.001, 0.01, 1, 100],
                linecolor='black'
                ))

fig.add_trace(inset)
fig.add_trace(inset_candidates_v1)

# Spike lines on every axis, including the inset axes defined above.
fig.update_xaxes(showspikes=True, spikecolor="gray", spikethickness=2, spikesnap="cursor", spikemode="across")
fig.update_yaxes(showspikes=True, spikecolor="gray", spikethickness=2, spikesnap="cursor", spikemode='across')

# Hover behaviour, theme and final figure size.
fig.update_layout(
    hoverdistance=5,
    template='simple_white',
    width=1000,
    height=500
)

fig.show()

# Design new 125 holdout sets

In [19]:
# Re-assign df_orig from load_full() before designing the new holdout sets.
# NOTE(review): df_orig is already assigned the same way near the top of the
# notebook — confirm whether this reload is still needed.
df_orig = load_full()

In [20]:
# Print how many rows of df_orig fall into each 25-wide dKP bin [i, j),
# for lower edges 0, 25, ..., 175.
for i in range(0, 200, 25):
    j = i + 25
    print(f"{i}<=dKP<{j}:")
    n_in_bin = len(df_orig.query(f"dKP_full_neum<{j} and dKP_full_neum>={i}"))
    print(n_in_bin)

0<=dKP<25:
2212
25<=dKP<50:
188
50<=dKP<75:
91
75<=dKP<100:
60
100<=dKP<125:
35
125<=dKP<150:
26
150<=dKP<175:
14
175<=dKP<200:
0


In [21]:
# Histogram of dKP over the full dataset.
df_tmp = df_orig.copy()

bin_size = 1.0  # histogram bin width, in the units of the x axis (pm/V)

# Draw the same histogram twice, sharing every setting except the
# normalisation: first raw counts (histnorm=''), then percent of the data.
for histnorm, yaxis_title in (('', 'Count'), ('percent', 'Fraction of data (%)')):
    fig = go.Figure()

    fig.add_trace(
        go.Histogram(
            x=df_tmp['dKP_full_neum'],
            histnorm=histnorm,
            xbins=dict(size=bin_size),
            # Trace label now matches the data actually plotted
            # (it used to read 'dist_250', a copy-paste leftover).
            name='full dataset',
            marker_color='royalblue',
            opacity=0.5,
        )
    )

    # Only one histogram is drawn; barmode='overlay' is kept so the layout
    # is unchanged from the previous version of this cell.
    fig.update_layout(
        barmode='overlay',
        xaxis_title_text='<i>d<sub>KP</sub></i> (pm/V)',
        yaxis_title_text=yaxis_title,
        font_size=20,
        autosize=False,
        plot_bgcolor="white",
        legend=dict(
            yanchor='top',
            y=0.99,
            xanchor='right',
            x=0.99
        ),
    )

    fig.show()

## Mix: 0.4 uniform + 0.6 random (50 uniformly spread in dKP + 75 random)

In [22]:
np.random.seed(seed=42)

# 50 target values drawn uniformly between 0 and the largest dKP in the dataset.
random_uniform = np.random.uniform(low=0.0, high=np.max(df_orig['dKP_full_neum']), size=50)

# For each target, pick the not-yet-selected entry whose dKP is closest to it,
# then remove that entry from the pool so it cannot be picked twice.
idx_uniform = []
dkp_remaining = df_orig['dKP_full_neum'].copy()
for r in random_uniform:
    idx_min = (dkp_remaining - r).abs().idxmin()
    idx_uniform.append(idx_min)
    dkp_remaining = dkp_remaining.drop(idx_min)

# Top up with 75 entries drawn at random (without replacement) from what is left.
idx_pool = df_orig.drop(idx_uniform, axis=0).index
idx_uniform.extend(np.random.choice(idx_pool, size=75, replace=False))

# Rows of the full dataset selected above.
df_tmp = df_orig.filter(idx_uniform, axis=0)

#========================================================================================================
# Histogram of dKP for the mixed (uniform + random) 125 subset held in df_tmp.
bin_size = 1.0  # histogram bin width, in the units of the x axis (pm/V)

# Draw the same histogram twice, sharing every setting except the
# normalisation: first raw counts (histnorm=''), then percent of the subset.
for histnorm, yaxis_title in (('', 'Count'), ('percent', 'Fraction of data (%)')):
    fig = go.Figure()

    fig.add_trace(
        go.Histogram(
            x=df_tmp['dKP_full_neum'],
            histnorm=histnorm,
            xbins=dict(size=bin_size),
            # Trace label now matches the data actually plotted
            # (it used to read 'dist_250', a copy-paste leftover).
            name='mix_125',
            marker_color='royalblue',
            opacity=0.5,
        )
    )

    # Only one histogram is drawn; barmode='overlay' is kept so the layout
    # is unchanged from the previous version of this cell.
    fig.update_layout(
        barmode='overlay',
        xaxis_title_text='<i>d<sub>KP</sub></i> (pm/V)',
        yaxis_title_text=yaxis_title,
        font_size=20,
        autosize=False,
        plot_bgcolor="white",
        legend=dict(
            yanchor='top',
            y=0.99,
            xanchor='right',
            x=0.99
        ),
    )

    fig.show()

# dKP-Eg-KDE with log-lin inset
# ==============================================================================================================================
# Left panel: dKP vs. band gap for the subset in df_tmp ("Test") on top of the
# rest of df_orig ("Train+Val"). Right panel: KDE of the subset's dKP values.
# An inset repeats the scatter on a logarithmic dKP axis.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True, horizontal_spacing=0.02, column_widths=[0.8,0.2])

# Rows of the full dataset that are not in df_tmp; computed once and reused
# for x, y and the per-point text.
df_rest = df_orig.drop(df_tmp.index, axis=0)

# ROW 1 COL 1 =================================================================================================================
fig.add_trace(go.Scatter(x=df_rest['src_bandgap'],
                         y=df_rest['dKP_full_neum'],
                         mode='markers',
                         marker=dict(
                            color="cornflowerblue",
                            opacity=0.5
                         ),
                         showlegend=True,
                         name='Train+Val',
                         text=df_rest.index.tolist()
                         ))

fig.add_trace(go.Scatter(x=df_tmp['src_bandgap'],
                         y=df_tmp['dKP_full_neum'],
                         mode='markers',
                         marker=dict(symbol='circle',
                                     size=5,
                                     color='lightgreen',
                                     showscale=False,
                                     line=dict(width=1, color="black")
                                     ),
                         showlegend=True,
                         name='Test',
                         text=df_tmp.index.tolist()
                         ))


# AXES
fig.update_xaxes(title='<i>E<sub>g</sub></i> (eV)',
                 title_font_size=36,
                 range=[-0.1, 10.0],
                 row=1, col=1)
# Applied before the inset axes exist, so only the two subplot y axes get the title.
fig.update_yaxes(title='<i>d</i><sub>KP</sub> (pm/V)',
                 title_font_size=36)

fig.update_layout(font={'family': 'Arial', 'size': 20})

# ROW 1 COL 2 =================================================================================================================
# A scalar bw_method is used directly as the KDE bandwidth factor; this is the
# public equivalent of overriding covariance_factor and calling the private
# _compute_covariance().
dkp_test = df_tmp['dKP_full_neum'].values
density = gaussian_kde(dkp_test, bw_method=0.02)

# Evaluate the KDE on 200 points spanning the observed dKP range.
x_vals = np.linspace(np.min(dkp_test), np.max(dkp_test), 200)
kde_dist = density(x_vals)

# Density on x and dKP on y, so the curve lines up with the shared y axis.
fig.add_trace(go.Scatter(x=kde_dist,
                         y=x_vals,
                         mode='lines',
                         marker_color='indianred',
                         fill='tozerox',
                         showlegend=False),
              row=1, col=2)

fig.update_xaxes(title='Distribution Test',
                 title_font_size=36,
                 row=1, col=2)
fig.update_yaxes(title='', row=1, col=2)

# INSET =======================================================================================================================
# Copies of the two scatter traces, re-targeted at a third pair of axes.
# NOTE(review): the copies keep showlegend=True and the same names, so each
# entry presumably shows up twice in the legend — confirm this is intended.
inset = copy.deepcopy(fig.data[0])
inset.xaxis = 'x3'
inset.yaxis = 'y3'

inset_candidates_v1 = copy.deepcopy(fig.data[1])
inset_candidates_v1.xaxis = 'x3'
inset_candidates_v1.yaxis = 'y3'

# The inset axes are placed via paper-fraction domains; the y axis is
# logarithmic, so its range is given in log10 units.
fig.update_layout(
    xaxis3=dict(domain=[0.30, 0.75],
                anchor='y3',
                range=[-0.1, 8.4],
                linecolor='black'
                ),
    yaxis3=dict(domain=[0.60, 0.98],
                anchor='x3',
                range=[-3.5, 2.6],
                type='log',
                tickvals=[0.001, 0.01, 1, 100],
                linecolor='black'
                ))

fig.add_trace(inset)
fig.add_trace(inset_candidates_v1)

# Spike lines on every axis, including the inset axes defined above.
fig.update_xaxes(showspikes=True, spikecolor="gray", spikethickness=2, spikesnap="cursor", spikemode="across")
fig.update_yaxes(showspikes=True, spikecolor="gray", spikethickness=2, spikesnap="cursor", spikemode='across')

# Hover behaviour, theme and final figure size.
fig.update_layout(
    hoverdistance=5,
    template='simple_white',
    width=1000,
    height=500
)

fig.show()

In [23]:
# What's left in the full dataset (=val+train)
df_tmp = df_orig.drop(idx_uniform, axis=0)

bin_size = 1.0  # histogram bin width, in the units of the x axis (pm/V)

# Draw the same histogram twice, sharing every setting except the
# normalisation: first raw counts (histnorm=''), then percent of the data.
for histnorm, yaxis_title in (('', 'Count'), ('percent', 'Fraction of data (%)')):
    fig = go.Figure()

    fig.add_trace(
        go.Histogram(
            x=df_tmp['dKP_full_neum'],
            histnorm=histnorm,
            xbins=dict(size=bin_size),
            # Trace label now matches the data actually plotted
            # (it used to read 'dist_250', a copy-paste leftover).
            name='train+val',
            marker_color='royalblue',
            opacity=0.5,
        )
    )

    # Only one histogram is drawn; barmode='overlay' is kept so the layout
    # is unchanged from the previous version of this cell.
    fig.update_layout(
        barmode='overlay',
        xaxis_title_text='<i>d<sub>KP</sub></i> (pm/V)',
        yaxis_title_text=yaxis_title,
        font_size=20,
        autosize=False,
        plot_bgcolor="white",
        legend=dict(
            yanchor='top',
            y=0.99,
            xanchor='right',
            x=0.99
        ),
    )

    fig.show()

## Rand-2-binned 125 

In [24]:
# Two-bin random holdout: sample a fixed number of entries below and at/above
# a dKP threshold. numpy is already imported at the top of the notebook, so
# the redundant mid-notebook import was dropped.
np.random.seed(seed=42)

lim_bin = 10    # dKP threshold (pm/V) separating the two sampling bins
n_below = 50    # entries drawn with dKP < lim_bin
n_above = 75    # entries drawn with dKP >= lim_bin

# Index labels of each bin, in dataset order (rows with missing dKP fall in neither).
dkp_all = df_orig['dKP_full_neum']
idx_below = df_orig.index[dkp_all < lim_bin]
idx_above = df_orig.index[dkp_all >= lim_bin]

# Draw order matters for reproducibility: low bin first, then high bin.
list_idx_sampled = list(np.random.choice(idx_below, size=n_below, replace=False))
list_idx_sampled.extend(np.random.choice(idx_above, size=n_above, replace=False))

In [25]:
# Histogram of dKP for the two-bin random 125 subset.
df_tmp = df_orig.filter(list_idx_sampled, axis=0).copy()

bin_size = 1.0  # histogram bin width, in the units of the x axis (pm/V)

# Draw the same histogram twice, sharing every setting except the
# normalisation: first raw counts (histnorm=''), then percent of the subset.
for histnorm, yaxis_title in (('', 'Count'), ('percent', 'Fraction of data (%)')):
    fig = go.Figure()

    fig.add_trace(
        go.Histogram(
            x=df_tmp['dKP_full_neum'],
            histnorm=histnorm,
            xbins=dict(size=bin_size),
            # Trace label now matches the data actually plotted
            # (it used to read 'dist_250', a copy-paste leftover).
            name='rand_2_binned_125',
            marker_color='royalblue',
            opacity=0.5,
        )
    )

    # Only one histogram is drawn; barmode='overlay' is kept so the layout
    # is unchanged from the previous version of this cell.
    fig.update_layout(
        barmode='overlay',
        xaxis_title_text='<i>d<sub>KP</sub></i> (pm/V)',
        yaxis_title_text=yaxis_title,
        font_size=20,
        autosize=False,
        plot_bgcolor="white",
        legend=dict(
            yanchor='top',
            y=0.99,
            xanchor='right',
            x=0.99
        ),
    )

    fig.show()

# dKP-Eg-KDE with log-lin inset
# ==============================================================================================================================
# Left panel: dKP vs. band gap for the subset in df_tmp ("Test") on top of the
# rest of df_orig ("Train+Val"). Right panel: KDE of the subset's dKP values.
# An inset repeats the scatter on a logarithmic dKP axis.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True, horizontal_spacing=0.02, column_widths=[0.8,0.2])

# Rows of the full dataset that are not in df_tmp; computed once and reused
# for x, y and the per-point text.
df_rest = df_orig.drop(df_tmp.index, axis=0)

# ROW 1 COL 1 =================================================================================================================
fig.add_trace(go.Scatter(x=df_rest['src_bandgap'],
                         y=df_rest['dKP_full_neum'],
                         mode='markers',
                         marker=dict(
                            color="cornflowerblue",
                            opacity=0.5
                         ),
                         showlegend=True,
                         name='Train+Val',
                         text=df_rest.index.tolist()
                         ))

fig.add_trace(go.Scatter(x=df_tmp['src_bandgap'],
                         y=df_tmp['dKP_full_neum'],
                         mode='markers',
                         marker=dict(symbol='circle',
                                     size=5,
                                     color='lightgreen',
                                     showscale=False,
                                     line=dict(width=1, color="black")
                                     ),
                         showlegend=True,
                         name='Test',
                         text=df_tmp.index.tolist()
                         ))


# AXES
fig.update_xaxes(title='<i>E<sub>g</sub></i> (eV)',
                 title_font_size=36,
                 range=[-0.1, 10.0],
                 row=1, col=1)
# Applied before the inset axes exist, so only the two subplot y axes get the title.
fig.update_yaxes(title='<i>d</i><sub>KP</sub> (pm/V)',
                 title_font_size=36)

fig.update_layout(font={'family': 'Arial', 'size': 20})

# ROW 1 COL 2 =================================================================================================================
# A scalar bw_method is used directly as the KDE bandwidth factor; this is the
# public equivalent of overriding covariance_factor and calling the private
# _compute_covariance().
dkp_test = df_tmp['dKP_full_neum'].values
density = gaussian_kde(dkp_test, bw_method=0.02)

# Evaluate the KDE on 200 points spanning the observed dKP range.
x_vals = np.linspace(np.min(dkp_test), np.max(dkp_test), 200)
kde_dist = density(x_vals)

# Density on x and dKP on y, so the curve lines up with the shared y axis.
fig.add_trace(go.Scatter(x=kde_dist,
                         y=x_vals,
                         mode='lines',
                         marker_color='indianred',
                         fill='tozerox',
                         showlegend=False),
              row=1, col=2)

fig.update_xaxes(title='Distribution Test',
                 title_font_size=36,
                 row=1, col=2)
fig.update_yaxes(title='', row=1, col=2)

# INSET =======================================================================================================================
# Copies of the two scatter traces, re-targeted at a third pair of axes.
# NOTE(review): the copies keep showlegend=True and the same names, so each
# entry presumably shows up twice in the legend — confirm this is intended.
inset = copy.deepcopy(fig.data[0])
inset.xaxis = 'x3'
inset.yaxis = 'y3'

inset_candidates_v1 = copy.deepcopy(fig.data[1])
inset_candidates_v1.xaxis = 'x3'
inset_candidates_v1.yaxis = 'y3'

# The inset axes are placed via paper-fraction domains; the y axis is
# logarithmic, so its range is given in log10 units.
fig.update_layout(
    xaxis3=dict(domain=[0.30, 0.75],
                anchor='y3',
                range=[-0.1, 8.4],
                linecolor='black'
                ),
    yaxis3=dict(domain=[0.60, 0.98],
                anchor='x3',
                range=[-3.5, 2.6],
                type='log',
                tickvals=[0.001, 0.01, 1, 100],
                linecolor='black'
                ))

fig.add_trace(inset)
fig.add_trace(inset_candidates_v1)

# Spike lines on every axis, including the inset axes defined above.
fig.update_xaxes(showspikes=True, spikecolor="gray", spikethickness=2, spikesnap="cursor", spikemode="across")
fig.update_yaxes(showspikes=True, spikecolor="gray", spikethickness=2, spikesnap="cursor", spikemode='across')

# Hover behaviour, theme and final figure size.
fig.update_layout(
    hoverdistance=5,
    template='simple_white',
    width=1000,
    height=500
)

fig.show()

In [26]:
# What's left in the full dataset (=val+train)
df_tmp = df_orig.drop(list_idx_sampled, axis=0)

bin_size = 1.0  # histogram bin width, in the units of the x axis (pm/V)

# Draw the same histogram twice, sharing every setting except the
# normalisation: first raw counts (histnorm=''), then percent of the data.
for histnorm, yaxis_title in (('', 'Count'), ('percent', 'Fraction of data (%)')):
    fig = go.Figure()

    fig.add_trace(
        go.Histogram(
            x=df_tmp['dKP_full_neum'],
            histnorm=histnorm,
            xbins=dict(size=bin_size),
            # Trace label now matches the data actually plotted
            # (it used to read 'dist_250', a copy-paste leftover).
            name='train+val',
            marker_color='royalblue',
            opacity=0.5,
        )
    )

    # Only one histogram is drawn; barmode='overlay' is kept so the layout
    # is unchanged from the previous version of this cell.
    fig.update_layout(
        barmode='overlay',
        xaxis_title_text='<i>d<sub>KP</sub></i> (pm/V)',
        yaxis_title_text=yaxis_title,
        font_size=20,
        autosize=False,
        plot_bgcolor="white",
        legend=dict(
            yanchor='top',
            y=0.99,
            xanchor='right',
            x=0.99
        ),
    )

    fig.show()