In [None]:
#from openbabel import pybel
#import openbabel
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import sys

import rdkit
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import QED
from rdkit.Chem import RDConfig
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer

from IPython.display import Image, display

import selfies as sf

def display_images(filenames):
    """Show every image file in `filenames` inline, in the order given."""
    for image_path in filenames:
        rendered = Image(image_path)
        display(rendered)

def mols_to_pngs(mols, basename="generated_mol"):
    """Draw each RDKit mol to '<basename><index>.png' and return the file names in input order."""
    def _render(position, mol):
        # The index is appended directly to the basename, e.g. generated_mol0.png.
        target = "%s%d.png" % (basename, position)
        Draw.MolToFile(mol, target)
        return target

    return [_render(position, mol) for position, mol in enumerate(mols)]

def preprocess_smiles(smiles):
  """Convert `smiles` to its SELFIES form by delegating to `sf.encoder`."""
  selfies_string = sf.encoder(smiles)
  return selfies_string

def keys_int(symbol_to_int):
  """Map each key's position in `symbol_to_int` (0, 1, 2, ...) to that key.

  Positions follow the dict's iteration order; the values stored in
  `symbol_to_int` are never consulted. When those values are exactly the
  positions 0..n-1 in iteration order (as for an enumerate-built lookup
  table), the result is the inverse mapping.

  Args:
    symbol_to_int: dict whose keys should be numbered.

  Returns:
    dict mapping int position -> key. Empty input gives an empty dict.
  """
  return dict(enumerate(symbol_to_int.keys()))

In [None]:
# Load Dataset_after_cycle_4.csv (first column is the index), discard rows
# containing any missing value, then keep one row per distinct Original_SMILES.
new_vis = (
    pd.read_csv('Dataset_after_cycle_4.csv', index_col=0)
    .dropna()
    .drop_duplicates(subset=['Original_SMILES'])
)

In [None]:
def _qed_and_sa(smiles):
    """Return (QED, SA score) for `smiles` with the 'OP(C)(=O)F' group appended.

    Both values are NaN when the molecule cannot be built or QED fails; only
    the SA score is NaN when the SA calculation alone fails. Every failure is
    reported on stdout so the affected SMILES can be traced.
    """
    try:
        mol = Chem.MolFromSmiles(smiles + 'OP(C)(=O)F')
        if mol is None:
            # MolFromSmiles reports unparsable or unsanitizable input by
            # returning None rather than raising, and it already sanitizes
            # by default, so no separate SanitizeMol call is needed.
            print(f"Invalid molecule for {smiles}: RDKit could not parse or sanitize the SMILES")
            return np.nan, np.nan
        qed = QED.default(mol)
    except Exception as e:
        print(f"Error processing {smiles} due to {str(e)}")
        return np.nan, np.nan

    try:
        sas_score = sascorer.calculateScore(mol)
    except Exception as e:
        # Keep the QED value even when only the SA score is unavailable.
        print(f"SAS calculation failed for {smiles} due to {str(e)}")
        sas_score = np.nan

    return qed, sas_score


# Score every molecule once, then assign whole columns instead of writing
# cell by cell while iterating over the frame.
scores = [_qed_and_sa(smiles) for smiles in new_vis['Original_SMILES']]
new_vis["QED"] = [qed for qed, _ in scores]
new_vis["SA_score"] = [sas_score for _, sas_score in scores]

# Rows whose scores could not be computed (NaN) are removed here.
new_vis = new_vis.dropna()

# Visualization

In [None]:
# All SELFIES strings currently held in the dataset.
selfies_list = np.asanyarray(new_vis["selfies"])

# Token vocabulary of those strings; '[nop]' ("no operation") is added so it
# can serve as the padding character.
alphabet_symbols = sf.get_alphabet_from_selfies(selfies_list)
alphabet_symbols.add('[nop]')
selfies_alphabet = sorted(alphabet_symbols)

# One slot per vocabulary symbol.
chunk_size = len(selfies_alphabet)

def split_list(lst, chunk_size):
    """Cut `lst` into consecutive slices of at most `chunk_size` items each."""
    chunks = []
    for start in range(0, len(lst), chunk_size):
        stop = start + chunk_size
        chunks.append(lst[start:stop])
    return chunks

In [None]:
# Origins to compare, in the order their columns will appear in the histograms.
keys = ['Initial', 'First Iter', 'Second Iter', 'Third Iter']
values = []

# The vocabulary and the padded length are derived from the whole dataset so
# that every origin is encoded against the same symbol table and width.
selfies_list = np.asanyarray(new_vis.selfies)
selfies_alphabet = sf.get_alphabet_from_selfies(selfies_list)
selfies_alphabet.add('[nop]')  # Add the "no operation" symbol as a padding character
selfies_alphabet = sorted(selfies_alphabet)
symbol_to_int = {symbol: position for position, symbol in enumerate(selfies_alphabet)}
int_mol = keys_int(symbol_to_int)
largest_selfie_len = max(sf.len_selfies(s) for s in selfies_list)
chunk_size = len(selfies_alphabet)

for origin in keys:
    print(origin)
    subset = new_vis.loc[new_vis['Origin'] == origin]  # Initial; First Iter; Second Iter ...
    subset_selfies = np.asanyarray(subset.selfies)

    onehots = sf.batch_selfies_to_flat_hot(subset_selfies, symbol_to_int, largest_selfie_len)

    # Every flat one-hot vector is a run of chunk_size-wide one-hot rows, one
    # per token position. Stacking all rows and summing column-wise yields how
    # often each symbol occurs in this origin. An origin without molecules
    # gives an all-zero vector instead of reusing the previous origin's counts.
    # NOTE(review): presumably the '[nop]' count includes padding positions
    # added up to largest_selfie_len -- confirm against the selfies docs.
    summation = (
        np.asarray(onehots, dtype=int)
        .reshape(-1, chunk_size)
        .sum(axis=0)
        .tolist()
    )
    values.append(summation)

# Scale each origin's count vector to unit Euclidean length.
normalized_values = []
for counts in values:
    v = np.array(counts)
    norm = np.sqrt(np.sum(v**2))
    # An all-zero vector stays zero; dividing by a zero norm would give NaN.
    n_v = v / norm if norm > 0 else v.astype(float)
    normalized_values.append(n_v.tolist())

# Columns 0..3 follow the order of `keys`; 'Selfie' holds the symbol labels.
histogram = dict(enumerate(normalized_values))
histogram['Selfie'] = selfies_alphabet

histogram2 = dict(enumerate(values))
histogram2['Selfie'] = selfies_alphabet

In [None]:
# import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))

# The integer keys 0-3 of `histogram` become readable column names, and the
# rows are ordered by their value in the 'Initial' column.
series_columns = ['Initial', '1st Iter', '2nd Iter', '3rd Iter']
d_f1 = pd.DataFrame(histogram).rename(columns=dict(enumerate(series_columns)))
d_f2 = d_f1.sort_values('Initial', ascending=True)

# Grouped bars: one group per SELFIES token, one bar per series.
d_f2.plot(kind='bar', ax=ax, x='Selfie', y=series_columns, rot=90)

ax.set_xlabel("SELFIES token", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
ax.set_yscale('log')

# Legend entries are relabelled in series order and the box is restyled.
legend_style = dict(
    title="Iterations",
    loc='upper left',
    fontsize=12,
    title_fontsize=14,
    fancybox=True,
    framealpha=0.8,
)
ax.legend(
    labels=["Training dataset", "1st Iteration", "2nd Iteration", "3rd Iteration"],
    **legend_style
)

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

columns = 2  # Grid width
rows = 2  # Grid height; 2 x 2 gives one panel per plotted property

fig, axes = plt.subplots(rows, columns, figsize=(15, rows * 5))  # Adjust size as needed
axes = axes.flatten()  # Flatten axes array for easy indexing

df = new_vis.dropna(subset=["DFT_opt_Charge_P"])
if "SELFIES_Length" not in df.columns:
    # Derive the length column here when it is absent, so this cell does not
    # depend on a later cell having been executed first.
    df = df.assign(SELFIES_Length=df["selfies"].apply(sf.len_selfies))

# One entry per panel, in axes order: (column to plot, x label, y label).
panels = [
    ("DFT_opt_Charge_P", "DFT Charge on Phosphorus", "Frequency"),
    ("SELFIES_Length", "Length of SELFIES representation", ""),
    ("QED", "QED", "Frequency"),
    ("SA_score", "SA score", ""),
]

# Overlay one step histogram per 'Origin' category in every panel.
unique_origins = df["Origin"].unique()
colors = plt.cm.tab10.colors  # Using a color map to differentiate

for i, origin in enumerate(unique_origins):
    subset = df[df["Origin"] == origin]
    origin_color = colors[i % len(colors)]  # wrap around if origins outnumber the palette
    for panel, (column_name, _, _) in zip(axes, panels):
        panel.hist(subset[column_name], bins=10, alpha=0.7,
                   label=origin, histtype='step', linestyle='-', linewidth=2,
                   color=origin_color)

for panel, (_, x_label, y_label) in zip(axes, panels):
    panel.grid(True, linestyle='--', alpha=0.5)
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
    for spine in panel.spines.values():
        spine.set_edgecolor('black')  # Black border around the panel
        spine.set_linewidth(1)

# Only the first panel carries the legend; the colors are shared by all panels.
axes[0].legend(title="")

plt.tight_layout()

# To save the figure at publication quality, uncomment:
# plt.savefig('Heavy_atoms_histograms_kde_plot_publication.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
  
sns.set_theme(style="whitegrid")

# Phosphorus charge distribution per origin, using only rows without any
# missing value; a KDE curve is overlaid on each step histogram.
complete_rows = new_vis.dropna()
sns.histplot(
    data=complete_rows,
    x="DFT_opt_Charge_P",
    hue="Origin",
    kde=True,
    element="step",
)

plt.show()

In [None]:
# Number of SELFIES symbols in each molecule's string, stored as a new column.
new_vis['SELFIES_Length'] = new_vis['selfies'].apply(sf.len_selfies)

import seaborn as sns
import matplotlib.pyplot as plt

fig, ax = plt.subplots()

sns.set_theme(style="whitegrid")

# SELFIES length distribution per origin (rows with any missing value are
# excluded), drawn as step histograms with a KDE overlay.
length_rows = new_vis.dropna()
ax = sns.histplot(
    data=length_rows,
    x="SELFIES_Length",
    hue="Origin",
    kde=True,
    element="step",
    bins=16,
)
ax.set_xlabel('Selfie String Length')

In [None]:
fig, ax = plt.subplots()

sns.set_theme(style="whitegrid")

# SA score distribution per origin as step histograms with a KDE overlay.
sa_plot_options = dict(x="SA_score", hue="Origin", kde=True, element="step", bins=16)
ax = sns.histplot(data=new_vis, **sa_plot_options)
ax.set_xlabel('SA_score')

In [None]:
fig, ax = plt.subplots()

sns.set_theme(style="whitegrid")

# QED distribution per origin as step histograms with a KDE overlay.
qed_plot_options = dict(x="QED", hue="Origin", kde=True, element="step", bins=8)
ax = sns.histplot(data=new_vis, **qed_plot_options)
ax.set_xlabel('QED')

In [None]:
# Inspect the current contents of new_vis (rendered as this cell's output).
new_vis

In [None]:
# For each origin, show the three rows with the lowest DFT_opt_Charge_P.
for origin in ['Initial', 'First Iter', 'Second Iter', 'Third Iter']:
    filtered_df = new_vis[new_vis['Origin'] == origin]
    filtered_df = filtered_df.sort_values('DFT_opt_Charge_P', ascending=True)  # Sort by DFT_opt_Charge_P
    # display() renders the frame as a rich table in Jupyter; print() would
    # only emit its plain-text form.
    display(filtered_df.head(3))

In [None]:
# Molecules to draw, in grid order. The last entry (A230) is the reference
# compound and is annotated in red.
smiles_list = [
    'CC1CC2C3CC23C1OP(C)(=O)F', 'NNOP(C)(=O)F', 'C=CNNOP(C)(=O)F', 'CN=NCNNOP(C)(=O)F',     # The lowest DFT charge  0.471910, 0.456156, 0.444674, 0.398281
    'CC1CC2C1C2C=COP(C)(=O)F', 'N#COP(C)(=O)F', 'C=1=NC=1OP(C)(=O)F', 'N#CC#COP(C)(=O)F', # The highest DFT charge  0.489211, 0.522957, 0.516866, 0.535253
    'CCN(CC)C(C)=N[P](C)(F)=O'] # A230

# NOTE(review): not used in this cell, and ordered highest-charge first,
# i.e. differently from smiles_list -- confirm before pairing the two.
smiles_list_names = ['CC1CC2C1C2C=C', 'N#C', 'C=1=NC=1', 'N#CC#C', # The highest DFT charge  0.489211, 0.522957, 0.516866, 0.535253
              'CC1CC2C3CC23C1', 'NN', 'C=CNN', 'CN=NCNN',     # The lowest DFT charge  0.471910, 0.456156, 0.444674, 0.398281
              'CCN(CC)C(C)=N[P](C)(F)=O'] # A230

# Value printed under each molecule; same order as smiles_list.
smiles_property_list = [0.471910, 0.456156, 0.444674, 0.398281,
                        0.489211, 0.522957, 0.516866, 0.535253,
                        0.528948]

if len(smiles_property_list) != len(smiles_list):
    raise ValueError("smiles_list and smiles_property_list must have the same length")

molecules = []
for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for invalid input; fail here with the
        # offending SMILES instead of later inside the drawing call.
        raise ValueError(f"RDKit could not parse SMILES: {smiles}")
    molecules.append(mol)

# Grid geometry
num_molecules = len(molecules)
columns = 4  # Number of columns in the grid
rows = (num_molecules + columns - 1) // columns  # Ceiling division: rows needed to fit every molecule

# Create the figure and axes for the grid layout
fig, axes = plt.subplots(rows, columns, figsize=(13, rows * 2.5))

# Flatten axes array for easy indexing
axes = axes.flatten()

column_titles = ['Initial Dataset', '1st Iteration', '2nd Iteration', '3rd Iteration']
highlight_index = len(smiles_list) - 1  # the reference compound sits last in smiles_list

# Draw each molecule on its own axis and annotate it with its SMILES and value
for i, mol in enumerate(molecules):
    panel = axes[i]
    img = Draw.MolToImage(mol, size=(300, 300))
    panel.imshow(img)
    panel.axis('off')  # Hide axes to make the plot cleaner

    annotation_style = dict(
        ha='center', va='top', fontsize=8.75,
        color='red' if i == highlight_index else 'black',
        transform=panel.transAxes,
    )
    panel.text(0.5, -0.1, smiles_list[i], **annotation_style)
    panel.text(0.5, 0, smiles_property_list[i], **annotation_style)

# Column headings go on the first row of panels only
for panel, title in zip(axes, column_titles):
    panel.text(0.5, 0.9, title, ha='center', va='top', fontsize=8.75, color='black', transform=panel.transAxes)

# Hide any unused subplots if there are fewer molecules than axes
for i in range(num_molecules, len(axes)):
    axes[i].axis('off')

# Adjust layout to prevent overlapping labels and make the plot more readable
plt.tight_layout()
plt.subplots_adjust(hspace=0.2, wspace=0.2)  # Adjust vertical and horizontal spacing

# Show the plot
plt.show()