In [None]:
import os

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression

In [None]:
# Load the two dataframes holding the pre-saved plot data.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
df, df_lso = (pd.read_pickle(pickle_file) for pickle_file in ('lobdos.pkl', 'lsolobdos.pkl'))

In [None]:
# Column-name stems of the band features that are compared below
features = ['band_{}'.format(statistic) for statistic in ('center', 'width', 'skew', 'kurtosis')]

In [None]:
def _paired_band_values(df, orbital, feature):
    """Return the VASP and LOBSTER columns of one orbital, keeping only rows where both values are present."""
    vasp_column = '{}_{}_VASP'.format(orbital, feature)
    lobs_column = '{}_{}_LOBS'.format(orbital, feature)
    both_present = df[vasp_column].notna() & df[lobs_column].notna()
    return df.loc[both_present, vasp_column], df.loc[both_present, lobs_column]


def get_band_feature_comp_plot(feature, df, path='.', extension='pdf'):
    """Plot one band feature from VASP against LOBSTER for the s, p and d orbitals and save the figure.

    The figure has one panel per orbital. Each panel shows a scatter of the
    ``<orbital>_<feature>_VASP`` column against the ``<orbital>_<feature>_LOBS``
    column, a linear-regression line fitted to the rows where both values are
    present, the R^2 score of that fit and a label naming the band.

    Parameters
    ----------
    feature : str
        Column-name stem such as ``'band_center'``; it must contain an
        underscore because the plot title is built from its first two parts.
    df : pandas.DataFrame
        Frame providing the ``<orbital>_<feature>_VASP`` / ``_LOBS`` columns and
        a ``Composition`` column; the index and ``Composition`` are joined into
        the hover text.
    path : str, optional
        Directory the file is written to. It is created if it does not exist.
    extension : {'pdf', 'svg', 'html'}, optional
        Output format; the file is saved as ``<path>/<feature>.<extension>``.

    Returns
    -------
    plotly.graph_objs.Figure
        The figure that was written to disk.

    Raises
    ------
    ValueError
        If ``extension`` is not one of the supported formats.
    """
    supported_extensions = ('pdf', 'svg', 'html')
    if extension not in supported_extensions:
        # Fail before any work is done instead of silently writing nothing
        raise ValueError("Unsupported extension {!r}; expected one of {}".format(extension,
                                                                                 supported_extensions))

    orbitals = ('s', 'p', 'd')
    fig = make_subplots(rows=1, cols=3, shared_xaxes=False, shared_yaxes=False,
                        x_title='VASP (eV)', y_title='LOBSTER (eV)', horizontal_spacing=0.035)

    # Scatter of all data points, one panel per orbital
    hovertext = df.index+'<br>Composition :'+ df.Composition
    for col, orbital in enumerate(orbitals, start=1):
        fig.add_trace(go.Scatter(x=df['{}_{}_VASP'.format(orbital, feature)],
                                 y=df['{}_{}_LOBS'.format(orbital, feature)],
                                 mode='markers', name=orbital, hovertext=hovertext),
                      row=1, col=col)

    # Linear fit per orbital; added after all scatter traces so the lines are drawn on top
    r_squared = {}
    for col, orbital in enumerate(orbitals, start=1):
        x_fit, y_fit = _paired_band_values(df, orbital, feature)
        if len(x_fit) < 2:
            # Not enough complete data pairs for a fit: leave this panel without line and R^2 label
            continue
        x_2d = np.array(x_fit).reshape(-1, 1)
        model = LinearRegression().fit(x_2d, y_fit)
        fig.add_trace(go.Scatter(x=x_fit, y=model.predict(x_2d), mode='lines',
                                 showlegend=False, line_color='#f57f1f'),
                      row=1, col=col)
        r_squared[orbital] = round(model.score(x_2d, y_fit), 4)

    title_parts = feature.split('_')
    fig.update_layout(title='{} {}'.format(title_parts[0].capitalize(), title_parts[1]),
                      title_x=0.5,
                      height=700, width=1900,
                      showlegend=False,
                      template='simple_white')
    fig.update_traces(marker=dict(size=10, color='#1878b6'))

    # R^2 value (right edge, mid height) and band label (top left) in every panel
    annotation_font = dict(size=24, color='black')
    for col, orbital in enumerate(orbitals, start=1):
        if orbital in r_squared:
            fig.add_annotation(xref='x domain', yref='y domain',
                               x=0.95, y=0.5,
                               text=r"$R^2={}$".format(r_squared[orbital]),
                               showarrow=False, row=1, col=col, font=annotation_font)
        fig.add_annotation(xref='x domain', yref='y domain',
                           x=0.05, y=0.95,
                           text='{} band'.format(orbital),
                           showarrow=False, row=1, col=col, font=annotation_font)

    # Identical frame, tick and font styling for every x- and y-axis of the figure
    axis_style = dict(title_font=dict(size=24), tickfont=dict(size=22), color='black',
                      showline=True, linewidth=1, linecolor='black', mirror=True, autorange=True,
                      ticks="inside", tickwidth=1, tickcolor='black', ticklen=5)
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)

    # Make sure the target directory exists so that the export does not depend on a manual mkdir
    if path:
        os.makedirs(path, exist_ok=True)
    target = "{}/{}.{}".format(path, feature, extension)
    if extension == 'pdf':
        fig.write_image(target, format='pdf', width=1900, height=700)
    elif extension == 'svg':
        fig.write_image(target, width=1900, height=700)
    else:  # 'html' -- the only remaining supported format
        fig.write_html(target, include_mathjax='cdn')
    return fig

#### Create the directories for saving the plots, then run the following two code cells

`mkdir NON_LSO LSO`

In [None]:
# Write every band-feature comparison of `df` into the NON_LSO folder, first as PDF, then as HTML
for feature in features:
    for extension in ('pdf', 'html'):
        get_band_feature_comp_plot(feature=feature, df=df, path='NON_LSO', extension=extension)

In [None]:
# Write every band-feature comparison of `df_lso` into the LSO folder, first as PDF, then as HTML
for feature in features:
    for extension in ('pdf', 'html'):
        get_band_feature_comp_plot(feature=feature, df=df_lso, path='LSO', extension=extension)

### Fingerprint Tanimoto index plots

Procedure followed for obtaining data of the plots:
1. Exclude compounds if the number of electrons obtained from the VASP summed projected DOS exceeds the actual number of valence electrons (integrate the VASP summed PDOS up to the Fermi level to get the total number of valence electrons).
2. Exclude the data points of an orbital if its contribution in the energy range (-15, 0) eV is less than 5 %.
3. Fingerprints are considered to be similar if the Tanimoto index is > 0.7.

In [None]:
# Fingerprints count as similar only for a Tanimoto index strictly above this value,
# matching the "> 0.70" plot annotation and the "<= 0.70" exception lists
SIMILARITY_THRESHOLD = 0.70

# Keep compounds whose NELEC_VASP / NELEC_SYS ratio is at most 1.05
df_fil = df_lso.loc[(df_lso.NELEC_VASP / df_lso.NELEC_SYS) <= 1.05]

# Per-orbital subsets: both the LOBSTER and the VASP contribution must be at least 5
s = df_fil.loc[(df_fil.s_contri_LOBS >= 5) & (df_fil.s_contri_VASP >= 5)]
p = df_fil.loc[(df_fil.p_contri_LOBS >= 5) & (df_fil.p_contri_VASP >= 5)]
d = df_fil.loc[(df_fil.d_contri_LOBS >= 5) & (df_fil.d_contri_VASP >= 5)]

# Calculate fingerprint similarity percentages for annotation in plots
# (order: s, p, d, summed -- the histogram annotation reads percent[0] to percent[3])
frames = [s, p, d, df_fil]
names = ['_s', '_p', '_d', '']
percent = []
for frame, name in zip(frames, names):
    similarity = frame['Tanimoto_similarity{}'.format(name)]
    similarity_count = int(similarity.count())  # non-missing values only
    similar_count = int((similarity > SIMILARITY_THRESHOLD).sum())
    # Integer arithmetic keeps the original round-down behaviour without float
    # artefacts such as int(57 / 100 * 100) == 56
    similar_percentage = similar_count * 100 // similarity_count
    percent.append(similar_percentage)

In [None]:
fig = go.Figure()

# One percent-normalised histogram per orbital subset, followed by the summed PDOS.
# NOTE(review): both nbinsx and xbins.size are set; plotly documents nbinsx as ignored
# when xbins.size is given -- confirm which binning is intended.
histogram_inputs = [('s', s['Tanimoto_similarity_s']),
                    ('p', p['Tanimoto_similarity_p']),
                    ('d', d['Tanimoto_similarity_d']),
                    ('summed', df_fil['Tanimoto_similarity'])]
for label, series in histogram_inputs:
    fig.add_trace(go.Histogram(x=series.values, name=label,
                               nbinsx=56, xbins=dict(size=0.1), histnorm='percent'))
fig.update_traces(opacity=0.65)

# Shared look of both axes: black frame mirrored on all sides, ticks pointing inwards
axis_style = dict(tickfont=dict(size=18), title_font=dict(size=22), color='black',
                  showline=True, linewidth=1, linecolor='black', mirror=True,
                  ticks="inside", tickwidth=1, tickcolor='black', ticklen=5)
fig.update_xaxes(title_text='Tanimoto similarity', **axis_style)
fig.update_yaxes(title_text='Percent of compounds', **axis_style)

legend_font = dict(family="sans-serif", size=20, color="black")
fig.update_layout(barmode='group',
                  template='simple_white',
                  width=1000, height=650,
                  legend=dict(x=0.05, y=0.98, orientation="h",
                              traceorder="normal", font=legend_font))

# Boxed summary of the share of compounds above the similarity threshold (s, p, d, summed)
summary_text = ('Tanimoto index >0.70 <br> <br> s-orbital ~ {} %<br> p-orbital ~ {} %'
                '<br> d-orbital ~ {} %<br>Summed: {} %').format(*percent[:4])
fig.add_annotation(text=summary_text,
                   align='center',
                   showarrow=False,
                   xref='x domain', yref='y domain',
                   x=0.09, y=0.9,
                   bordercolor='black', borderwidth=1,
                   font=dict(family="sans-serif", size=20, color="black"))
# Optional export -- uncomment to write the figure to disk
#fig.write_image("lso_spd_tanimoto.pdf",width=1000, height=650, format='pdf')
#fig.write_html("lso_spd_tanimoto.html",include_mathjax = 'cdn')

In [None]:
# Histogram of the low Tanimoto indices (<= 0.70), shown as absolute counts
fig = go.Figure()

# NOTE(review): both nbinsx and xbins.size are set; plotly documents nbinsx as ignored
# when xbins.size is given -- confirm which binning is intended.
low_similarity_inputs = [('s', s, 'Tanimoto_similarity_s'),
                         ('p', p, 'Tanimoto_similarity_p'),
                         ('d', d, 'Tanimoto_similarity_d'),
                         ('summed', df_fil, 'Tanimoto_similarity')]
for label, frame, column in low_similarity_inputs:
    low_values = frame.loc[frame[column] <= 0.70, column].values
    fig.add_trace(go.Histogram(x=low_values, name=label,
                               nbinsx=56, xbins=dict(size=0.1)))
fig.update_traces(opacity=0.65)

# Shared look of both axes: black frame mirrored on all sides, ticks pointing inwards
axis_style = dict(tickfont=dict(size=18), title_font=dict(size=22), color='black',
                  showline=True, linewidth=1, linecolor='black', mirror=True,
                  ticks="inside", tickwidth=1, tickcolor='black', ticklen=5)
fig.update_xaxes(title_text='Tanimoto similarity', **axis_style)
fig.update_yaxes(title_text='Number of compounds', **axis_style)

fig.update_layout(barmode='group',
                  template='simple_white',
                  width=1000, height=650,
                  legend=dict(x=0, y=1, traceorder="normal",
                              font=dict(family="sans-serif", size=20, color="black")))
# Optional export -- uncomment to write the figure to disk
#fig.write_image("lso_spd_tanimoto_low.pdf",width=1000, height=650, format='pdf')
#fig.write_html("lso_spd_tanimoto_low.html",include_mathjax = 'cdn')

### s-orbital exception list

In [None]:
# Rows of the s-orbital subset whose Tanimoto similarity is at most 0.70
s[s['Tanimoto_similarity_s'] <= 0.70]

### p-orbital exception list

In [None]:
# Rows of the p-orbital subset whose Tanimoto similarity is at most 0.70
p[p['Tanimoto_similarity_p'] <= 0.70]

### d-orbital exception list

In [None]:
# Rows of the d-orbital subset whose Tanimoto similarity is at most 0.70
d[d['Tanimoto_similarity_d'] <= 0.70]

### summed PDOS exception list

In [None]:
# Rows of the filtered frame whose summed-PDOS Tanimoto similarity is at most 0.70
df_fil[df_fil['Tanimoto_similarity'] <= 0.70]