In [None]:
import pandas as pd
import plotly.express as px
import umap
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
# Load the reaction database, then split it into the training reactions and
# the virtual reactions that use the 'L15' catalyst aryl group.

df = pd.read_csv('../Data/CPA-virtual_database.csv')

is_training = df['Rxn_Type'] == 'Training'
df_training = df[is_training]
df_training_rxns = df_training['rxn']

# Both conditions are applied in one mask; the index is renumbered from 0 so
# that row labels and row positions coincide in df_virtual.
is_virtual_l15 = (df['Rxn_Type'] == 'Virtual') & (df['Catalyst_Ar_grp'] == 'L15')
df_virtual = df[is_virtual_l15].reset_index(drop=True)

In [3]:
# Display the filtered virtual-reaction frame (Rxn_Type == 'Virtual', Catalyst_Ar_grp == 'L15').
df_virtual

Unnamed: 0.1,Unnamed: 0,Rxn_Type,Imine,Nucleophile,rxn,net_rxn,Catalyst_Ar_grp,Temp,ddG,iminium_nNH,...,nuc_iXH,nuc_HOMO,nuc_LUMO,nuc_Polarizability,nuc_L,nuc_B1,nuc_B5,nuc_bondDistanceHX,nuc_H_X_Nu,nuc_H_X_CNu
0,364,Virtual,(Z)-Iminium 45,N11,(Z)-Iminium 45N11,(Z)-Iminium 45N11L15,L15,,,3515.80,...,39.7567,-0.34143,0.03293,59.54,3.90,1.43,4.94,0.96,180,180
1,365,Virtual,(E)-Iminium 90,N11,(E)-Iminium 90N11,(E)-Iminium 90N11L15,L15,,,3578.89,...,39.7567,-0.34143,0.03293,59.54,3.90,1.43,4.94,0.96,180,180
2,366,Virtual,(E)-Iminium 70,N11,(E)-Iminium 70N11,(E)-Iminium 70N11L15,L15,,,3521.09,...,39.7567,-0.34143,0.03293,59.54,3.90,1.43,4.94,0.96,180,180
3,367,Virtual,(E)-Iminium 72,N11,(E)-Iminium 72N11,(E)-Iminium 72N11L15,L15,,,3583.29,...,39.7567,-0.34143,0.03293,59.54,3.90,1.43,4.94,0.96,180,180
4,368,Virtual,(E)-Iminium 77,N11,(E)-Iminium 77N11,(E)-Iminium 77N11L15,L15,,,3574.03,...,39.7567,-0.34143,0.03293,59.54,3.90,1.43,4.94,0.96,180,180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8359,8723,Virtual,(E)-Iminium 151,N26,(E)-Iminium 151N26,(E)-Iminium 151N26L15,L15,,,3566.65,...,6.6755,-0.26243,-0.01304,141.58,6.16,1.67,8.42,1.08,180,180
8360,8724,Virtual,(E)-Iminium 111,N26,(E)-Iminium 111N26,(E)-Iminium 111N26L15,L15,,,3556.35,...,6.6755,-0.26243,-0.01304,141.58,6.16,1.67,8.42,1.08,180,180
8361,8725,Virtual,(Z)-Iminium 3,N26,(Z)-Iminium 3N26,(Z)-Iminium 3N26L15,L15,,,3493.49,...,6.6755,-0.26243,-0.01304,141.58,6.16,1.67,8.42,1.08,180,180
8362,8726,Virtual,(E)-Iminium 112,N26,(E)-Iminium 112N26,(E)-Iminium 112N26L15,L15,,,3552.69,...,6.6755,-0.26243,-0.01304,141.58,6.16,1.67,8.42,1.08,180,180


In [4]:
# For every virtual reaction whose 'rxn' label also occurs among the training
# reactions, collect the index labels of the df_virtual rows carrying that
# label. The result is a list with one pandas Index per matching row.
training_rxn_values = df_training_rxns.values
indexes = [
    df_virtual[df_virtual['rxn'] == rxn_label].index
    for rxn_label in df_virtual['rxn']
    if rxn_label in training_rxn_values
]

In [5]:
## Initial Processing ##

# Descriptor columns used for the embedding, grouped by column prefix.
# Edit these lists to change which descriptors are used.
iminium_columns = [
    'iminium_nNH', 'iminium_iNH', 'iminium_N', 'iminium_H', 'iminium_C',
    'iminium_SubL', 'iminium_SubS', 'iminium_PG', 'iminium_PGL', 'iminium_PGB1',
    'iminium_PGB5', 'iminium_SL', 'iminium_SB1', 'iminium_SB5', 'iminium_LL',
    'iminium_LB1', 'iminium_LB5', 'iminium_HOMO', 'iminium_LUMO',
    'iminium_Polarizability', 'iminium_EnergyDiff',
]
nucleophile_columns = [
    'nuc_H', 'nuc_X', 'nuc_Nu', 'nuc_nXH', 'nuc_iXH', 'nuc_HOMO', 'nuc_LUMO',
    'nuc_Polarizability', 'nuc_L', 'nuc_B1', 'nuc_B5', 'nuc_bondDistanceHX',
    'nuc_H_X_Nu', 'nuc_H_X_CNu',
]
descriptor_frame = df_virtual.loc[:, iminium_columns + nucleophile_columns]

# Fit the variance filter (default settings) on the descriptors, then apply it.
vt = VarianceThreshold()
thres = vt.fit(descriptor_frame)
parameters = thres.transform(descriptor_frame)

# Standardise the surviving descriptors before the embedding step.
scaled_data = StandardScaler().fit_transform(parameters)

In [6]:
## UMAP for Visualisation ##
import umap.umap_ as umap

# Neighbourhood size: half of the samples. Floor division keeps this an
# integer neighbour count; true division would pass a float, and a fractional
# one whenever the number of rows is odd.
n_neighbors_vis = len(scaled_data) // 2  # Default is 15

reducer_vis = umap.UMAP(
    random_state=25,  # fixed seed for the embedding
    min_dist=0,  # Default is 0.1
    n_neighbors=n_neighbors_vis,
    n_components=2)

# 2-D embedding with one row per row of scaled_data (i.e. per df_virtual row).
umap_2d = reducer_vis.fit_transform(scaled_data)


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [7]:
# Relabel the virtual reactions that also occur in the training set so that
# they are drawn in their own colour.
#
# The target column is addressed by name rather than by position: column
# position 1 is not guaranteed to be 'Rxn_Type' (the frame displayed above
# lists 'Unnamed: 0.1', 'Unnamed: 0', 'Rxn_Type', ...), and the plot below
# colours by 'Rxn_Type'. Every collected row label is used, and an empty
# `indexes` list is a no-op instead of an error.
indx = pd.DataFrame(indexes)  # kept from the original cell; the relabelling below no longer uses it
matched_rows = [label for row_labels in indexes for label in row_labels]
if matched_rows:
    df_virtual.loc[matched_rows, 'Rxn_Type'] = 'Original'

# Scatter of the 2-D UMAP embedding, coloured by reaction type.
fig = px.scatter(x=umap_2d[:,0], y=umap_2d[:,1], hover_data=[],
                labels={'x':'UMAP1', 'y':'UMAP2'},
                width=980, height=720,
                color_discrete_sequence=[px.colors.qualitative.T10[0], px.colors.qualitative.T10[2]],
                template='simple_white',
                color=df_virtual.Rxn_Type)

# Large markers with a black outline.
fig.update_traces(marker=dict(size=17,
                              line=dict(width=2,
                                        color='Black')),
                  selector=dict(mode='markers'))

# Horizontal, untitled legend anchored at the top-left of the plot area,
# plus the figure-wide font settings.
fig.update_layout(
    legend=dict(
        yanchor="top",
        y=.99,
        xanchor="left",
        x=0.01,
        orientation='h',
        bordercolor="Black",
        borderwidth = 0
    ),
    legend_title_text='',
    font_size=20,
    font_family="Arial",
    showlegend=True
)

# Mirror the axis lines onto the opposite sides of the plot.
fig.update_xaxes(mirror=True)
fig.update_yaxes(mirror=True)

fig.show()


In [8]:
# Load the table of cluster labels and predicted ee values for the virtual reactions.
# NOTE: this rebinds `df`, which until here held the frame read from
# '../Data/CPA-virtual_database.csv'; every later cell sees the predictions table.
df = pd.read_csv('../Data/CPA_virtual_clusters_predictions.csv')
df

Unnamed: 0,Rxn_Type,Imine,Nucleophile,rxn,net_rxn,Catalyst_Ar_grp,cluster_label,Predicted ee
0,Virtual,(Z)-Iminium 45,N11,(Z)-Iminium 45N11,(Z)-Iminium 45N11L15,L15,3,62.699802
1,Virtual,(E)-Iminium 90,N11,(E)-Iminium 90N11,(E)-Iminium 90N11L15,L15,14,74.473810
2,Virtual,(E)-Iminium 70,N11,(E)-Iminium 70N11,(E)-Iminium 70N11L15,L15,16,84.758680
3,Virtual,(E)-Iminium 72,N11,(E)-Iminium 72N11,(E)-Iminium 72N11L15,L15,14,75.421135
4,Virtual,(E)-Iminium 77,N11,(E)-Iminium 77N11,(E)-Iminium 77N11L15,L15,14,78.793106
...,...,...,...,...,...,...,...,...
125455,Virtual,(E)-Iminium 151,N26,(E)-Iminium 151N26,(E)-Iminium 151N26L9,L9,39,72.179700
125456,Virtual,(E)-Iminium 111,N26,(E)-Iminium 111N26,(E)-Iminium 111N26L9,L9,48,80.431854
125457,Virtual,(Z)-Iminium 3,N26,(Z)-Iminium 3N26,(Z)-Iminium 3N26L9,L9,0,65.993330
125458,Virtual,(E)-Iminium 112,N26,(E)-Iminium 112N26,(E)-Iminium 112N26L9,L9,48,73.164080


# Reaction space ee plot

In [13]:
# One UMAP scatter per catalyst aryl group, coloured by the predicted ee of
# each reaction and written to its own HTML file.
cats = np.unique(df['Catalyst_Ar_grp'])

# Styling shared by every figure in the loop.
marker_style = dict(size=17, line=dict(width=2, color='Black'))
legend_style = dict(
    yanchor="top",
    y=.99,
    xanchor="left",
    x=0.01,
    orientation='h',
    bordercolor="Black",
    borderwidth=0,
)

for cat in cats:
    df_cat = df[df['Catalyst_Ar_grp'] == cat]
    # Despite its name, `ddg` holds this catalyst's 'Predicted ee' column.
    # The values are matched to the embedding rows purely by position.
    ddg = df_cat['Predicted ee']

    fig = px.scatter(
        x=umap_2d[:, 0],
        y=umap_2d[:, 1],
        hover_data=[],
        labels={'x': 'UMAP1', 'y': 'UMAP2'},
        width=980,
        height=720,
        color=ddg,
        template='simple_white',
    )

    fig.update_traces(marker=marker_style, selector=dict(mode='markers'))

    fig.update_layout(
        legend=legend_style,
        legend_title_text='',
        font_size=20,
        font_family="Arial",
        showlegend=True,
    )

    fig.update_xaxes(mirror=True)
    fig.update_yaxes(mirror=True)

    fig.write_html(f"../Plots/CPA_react_space_plot_{cat}.html")

# Reaction space $\Delta$ ee plot (each catalyst relative to the L2 reference)

In [14]:
# One UMAP scatter per catalyst, coloured by the difference between that
# catalyst's predicted ee and the predicted ee of the reference catalyst 'L2'.

# The reference rows do not depend on the loop variable, so they are selected
# once instead of on every iteration.
df_cat_ref = df[df['Catalyst_Ar_grp'] == 'L2']
ref_ee = np.array(df_cat_ref['Predicted ee'])

for cat in cats:
    df_cat = df[df['Catalyst_Ar_grp'] == cat]
    # Despite its name, `ddg` holds the ee difference (catalyst minus reference).
    # The subtraction is positional: it assumes every catalyst lists the same
    # reactions in the same row order as 'L2' and as umap_2d -- TODO confirm.
    ddg = np.array(df_cat['Predicted ee']) - ref_ee
    fig = px.scatter(x=umap_2d[:,0], y=umap_2d[:,1], hover_data=[],
                labels={'x':'UMAP1', 'y':'UMAP2'},
                width=980, height=720,
                color=ddg,
                template='simple_white',color_continuous_scale='RdYlGn'
                )

    # Large markers with a black outline.
    fig.update_traces(marker=dict(size=17,
                                line=dict(width=2,
                                            color='Black')),
                    selector=dict(mode='markers'))

    # Horizontal, untitled legend at the top-left, plus figure-wide font settings.
    fig.update_layout(
        legend=dict(
            yanchor="top",
            y=.99,
            xanchor="left",
            x=0.01,
            orientation='h',
            bordercolor="Black",
            borderwidth = 0
        ),
        legend_title_text='',
        font_size=20,
        font_family="Arial",
        showlegend=True
    )

    fig.update_xaxes(mirror=True)
    fig.update_yaxes(mirror=True)

    fig.write_html(f"../Plots/CPA_Diff_react_space_plot_{cat}.html")

# Difficult reaction subspace $\Delta$ ee plot (reactions with predicted ee < 40 for L2)

In [15]:
# Same Delta-ee plot, restricted to the reactions for which the reference
# catalyst 'L2' has a predicted ee below 40.

# The reference selection does not depend on the loop variable, so it is
# computed once instead of on every iteration.
df_cat_ref = df[(df['Catalyst_Ar_grp'] == 'L2') & (df['Predicted ee'] < 40)]
ref_rxns = df_cat_ref['rxn'].values
ref_ee = np.array(df_cat_ref['Predicted ee'])

for cat in cats:

    # reset_index() renumbers this catalyst's rows from 0, so the labels that
    # survive the filter below are row positions within the catalyst.
    df_cat = df[(df['Catalyst_Ar_grp'] == cat)].reset_index()
    df_cat = df_cat[df_cat['rxn'].isin(ref_rxns)]
    # Positional subtraction: assumes the kept rows appear in the same reaction
    # order as the reference rows -- TODO confirm.
    ddg = np.array(df_cat['Predicted ee']) - ref_ee

    # Pick the embedding rows at the surviving positions (assumes umap_2d rows
    # follow the same per-catalyst reaction order -- TODO confirm).
    umap_filtered = umap_2d[df_cat.index.to_numpy()]

    # Create the scatter plot
    fig = px.scatter(x=umap_filtered[:, 0], y=umap_filtered[:, 1],
                     hover_data=[], labels={'x': 'UMAP1', 'y': 'UMAP2'},
                     width=980, height=720, color=ddg,
                     template='simple_white', color_continuous_scale='RdYlGn')

    fig.update_traces(marker=dict(size=17, line=dict(width=2, color='Black')), selector=dict(mode='markers'))

    fig.update_layout(
        legend=dict(yanchor="top", y=.99, xanchor="left", x=0.01, orientation='h', bordercolor="Black", borderwidth=0),
        legend_title_text='',
        font_size=20,
        font_family="Arial",
        showlegend=True
    )

    fig.update_xaxes(mirror=True)
    fig.update_yaxes(mirror=True)

    fig.write_html(f"../Plots/CPA_Filtered_Diff_react_space_plot_{cat}.html")