### Summary:

In this notebook, I calculated the **SkNN Score** for each **Analog Compound** based on the **Tanimoto similarity**, **binary labels** (binders or non-binders), and **rank** among the nearest neighbors. The **SkNN score** is computed using the following formula:

$$
S_{kNN} = \sum_{i=1}^{k} \left( \frac{s_i^2 \cdot b_i}{r_i} \right)
$$

Where:
- $s_i$ is the Tanimoto similarity between the analog and its $i$-th nearest neighbor.
- $b_i$ is a binary label: 1 if the neighbor is a binder ($K_D < 1$ M), 0 otherwise.
- $r_i$ is the rank of the $i$-th neighbor when neighbors are sorted by descending similarity (the most similar neighbor has rank 1).

After calculating the scores, I sorted them in descending order, so the higher scores appear at the top. Finally, I saved the results in a new Excel file named `MU3116_Analog_SkNN_Scores_Ranked.xlsx`.


In [1]:
import pandas as pd

# Workbook holding the nearest-neighbour table (one row per analog/neighbour pair)
file_path = 'Top_5_similar_compounds_per_MU3116_analog.xlsx'

# Read the workbook into a DataFrame
df = pd.read_excel(io=file_path)

# Preview the top rows as a quick sanity check on the load
df.head(n=5)


Unnamed: 0,Analog Compound,Cache Challenge Compound,Cache KD (M),Similarity
0,Analog 1,Cc1ccc2c(c1)c(cc(c1cccs1)n2)C(=O)Nc1cccc(c1)C(...,1.0,1.0
1,Analog 1,Cc1c(ccc(c2cccs2)n1)C(=O)Nc1cccc(c1)C(=O)N(C)C...,1.0,0.710145
2,Analog 1,Cc1ccc2c(c1)c(cc(C1CC1)n2)C(=O)Nc1cccc(c1)C(=O...,8e-07,0.704225
3,Analog 1,Cc1ccc(c2cc(C(=O)Nc3cccc(c3)C(=O)N(C)CC(=O)N)c...,1.0,0.605263
4,Analog 1,Cc1ccc(c2cc(C(=O)Nc3cccc(c3)C(=O)NCCC(=O)N)c3c...,1.0,0.423529


In [2]:
def calculate_skNN_score(df, kd_threshold=1, k=None):
    """Compute the SkNN score of every analog compound.

    For each analog, its neighbours are ranked by descending similarity
    (rank 1 = most similar) and the score is

        S_kNN = sum_i (s_i ** 2 * b_i) / r_i

    where ``s_i`` is the neighbour's similarity, ``b_i`` is 1 when the
    neighbour's KD is strictly below ``kd_threshold`` (0 otherwise) and
    ``r_i`` is the neighbour's rank.

    Args:
        df: DataFrame with the columns 'Analog Compound', 'Cache KD (M)'
            and 'Similarity', one row per analog/neighbour pair.
        kd_threshold: KD cutoff below which a neighbour counts as a binder.
            Defaults to 1, the value originally hard-coded here.
        k: Optional number of top-ranked neighbours to use per analog.
            ``None`` (default) uses every neighbour present in ``df``.

    Returns:
        DataFrame with the columns 'Analog Compound' and 'SkNN Score', one
        row per analog in order of first appearance in ``df``. The columns
        are present even when ``df`` has no rows, so downstream sorting by
        'SkNN Score' keeps working on empty input. Rows whose analog name is
        missing (NaN) are skipped.

    Raises:
        KeyError: If a required column is absent from ``df``.
        ValueError: If ``k`` is given but smaller than 1.
    """
    required = ('Analog Compound', 'Cache KD (M)', 'Similarity')
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Input is missing required column(s): {missing}")
    if k is not None and k < 1:
        raise ValueError(f"k must be a positive integer or None, got {k!r}")

    scores = []

    # A single groupby pass replaces re-filtering the whole frame once per analog;
    # sort=False keeps the analogs in order of first appearance.
    for analog, neighbours in df.groupby('Analog Compound', sort=False):
        # Stable sort: neighbours with equal similarity keep their input order,
        # so rank assignment among ties is deterministic.
        ranked = neighbours.sort_values(by='Similarity', ascending=False, kind='stable')
        if k is not None:
            ranked = ranked.head(k)

        similarities = ranked['Similarity'].to_numpy()
        kd_values = ranked['Cache KD (M)'].to_numpy()

        skNN_score = 0
        for rank, (similarity, KD) in enumerate(zip(similarities, kd_values), start=1):
            # Binary label: 1 for binders (KD below the threshold), else 0.
            # A NaN KD compares False and is therefore treated as a non-binder.
            binary_label = 1 if KD < kd_threshold else 0

            # SkNN term: s_i^2 * b_i / r_i
            skNN_score += (similarity ** 2 * binary_label) / rank

        scores.append({'Analog Compound': analog, 'SkNN Score': skNN_score})

    # Explicit columns guarantee the schema even when there are no rows
    return pd.DataFrame(scores, columns=['Analog Compound', 'SkNN Score'])


In [3]:
# Score every analog found in the loaded neighbour table
skNN_scores_df = df.pipe(calculate_skNN_score)

# Preview the first few per-analog scores
skNN_scores_df.head(n=5)

Unnamed: 0,Analog Compound,SkNN Score
0,Analog 1,0.165311
1,Analog 2,0.0
2,Analog 3,0.0
3,Analog 4,0.0
4,Analog 5,0.0


In [4]:
# Rank the analogs from highest to lowest SkNN score.
# A stable sort is requested so that analogs with tied scores (many analogs score 0.0)
# keep their original relative order instead of an arbitrary one, which makes the
# ranked output reproducible.
skNN_scores_df_sorted = skNN_scores_df.sort_values(by='SkNN Score', ascending=False, kind='stable')

# Show the top of the ranking
skNN_scores_df_sorted.head()


Unnamed: 0,Analog Compound,SkNN Score
132,Analog 133,0.437633
272,Analog 273,0.425331
257,Analog 258,0.356427
159,Analog 160,0.346021
98,Analog 99,0.324024


In [5]:
# Write the ranked scores to a new Excel file; index=False leaves the DataFrame index out of the sheet
ranked_scores_path = 'MU3116_Analog_SkNN_Scores_Ranked.xlsx'
skNN_scores_df_sorted.to_excel(ranked_scores_path, index=False)


In [6]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from openpyxl import Workbook
from openpyxl.drawing.image import Image
from openpyxl.styles import Alignment, Font
import io

# Read the ranked SkNN scores and the SMILES lookup table from disk
sknn_df = pd.read_excel('MU3116_Analog_SkNN_Scores_Ranked.xlsx')
smiles_df = pd.read_excel('MU3116-clustered_compounds_with_smiles.xlsx')

# Trim surrounding whitespace from both join keys so that names such as "Analog 1" match exactly
for frame, key in ((sknn_df, 'Analog Compound'), (smiles_df, 'Compound ID')):
    frame[key] = frame[key].str.strip()

# Left-join the SMILES onto the scores: every scored analog is kept, matched or not
smiles_lookup = smiles_df[['Compound ID', 'Structure (SMILES)']]
merged_df = sknn_df.merge(
    smiles_lookup,
    how='left',
    left_on='Analog Compound',
    right_on='Compound ID',
)

# Function to generate image from SMILES
def smiles_to_image(smiles):
    """Render a SMILES string as a 300x300 molecule image.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        The image object produced by RDKit's ``Draw.MolToImage``.

    Raises:
        ValueError: If ``smiles`` is not a string (for example the NaN left
            behind by an unmatched left merge) or RDKit cannot parse it.
    """
    if not isinstance(smiles, str):
        raise ValueError(f"Expected a SMILES string, got {smiles!r}")

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles reports a parse failure by returning None instead of raising
    if mol is None:
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")

    img = Draw.MolToImage(mol, size=(300, 300))
    return img

# Create a new workbook that will hold the scores, SMILES strings and embedded structure images
wb = Workbook()
ws = wb.active

# Add the header row
header_row = ["Analog Compound", "SkNN Score", "Structure (SMILES)", "Analog Images"]
ws.append(header_row)

# Style the header cells: bold 14-pt font, centred in both directions, wrapped text
for cell in ws[1]:
    cell.font = Font(bold=True, size=14)
    cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)

# Column widths for better visibility
ws.column_dimensions['A'].width = 20  # "Analog Compound"
ws.column_dimensions['B'].width = 15  # "SkNN Score"
ws.column_dimensions['C'].width = 40  # "Structure (SMILES)"
ws.column_dimensions['D'].width = 20  # "Analog Images"

# Give the header row more vertical space
ws.row_dimensions[1].height = 40

# In-memory PNG buffers, one entry per row of merged_df (None where no image could be drawn)
image_paths = []

# Write each data row and embed its structure image in a single pass.
# enumerate() supplies the row position explicitly, so the Excel row number never
# depends on what the DataFrame index labels happen to be.
for position, (_, row) in enumerate(merged_df.iterrows()):
    excel_row = position + 2  # row 1 is the header
    smiles = row['Structure (SMILES)']
    # The left merge leaves a non-string (NaN) SMILES for analogs without a match
    has_smiles = isinstance(smiles, str)

    # Write the compound data (an empty cell instead of NaN when the SMILES is missing)
    ws.append([row['Analog Compound'], row['SkNN Score'], smiles if has_smiles else None])

    # Adjust row height to fit the image
    ws.row_dimensions[excel_row].height = 150

    if not has_smiles:
        # Keep the row in the sheet rather than aborting the whole export
        image_paths.append(None)
        print(f"No SMILES found for {row['Analog Compound']}; row written without an image.")
        continue

    # Render the molecule and keep the PNG in memory (as a byte object)
    img_byte_arr = io.BytesIO()
    smiles_to_image(smiles).save(img_byte_arr, format='PNG')
    img_byte_arr.seek(0)  # Rewind the BytesIO object
    image_paths.append(img_byte_arr)

    # Create an image from the byte data and anchor it in the 4th column of this row
    img = Image(img_byte_arr)
    img.width = 150
    img.height = 150
    ws.add_image(img, ws.cell(row=excel_row, column=4).coordinate)

# Save the Excel file with embedded images
wb.save('MU3116_Analog_SkNN_Scores_with_SMILES_and_Images_Formatted.xlsx')

print("Excel file saved with images and formatting successfully.")


Excel file saved with images and formatting successfully.


In [7]:
import openpyxl
from openpyxl.styles import Alignment

# Re-open the formatted workbook and work on its active sheet
wb = openpyxl.load_workbook('MU3116_Analog_SkNN_Scores_with_SMILES_and_Images_Formatted.xlsx')
ws = wb.active

# Columns holding "Analog Compound", "SkNN Score" and "Structure (SMILES)"
columns_to_center = ['A', 'B', 'C']

# One alignment spec shared by every targeted cell: centred horizontally and vertically, text wrapped
centered_wrapped = Alignment(horizontal='center', vertical='center', wrap_text=True)

# Apply the alignment to every cell of each listed column
for column_letter in columns_to_center:
    column_cells = ws[column_letter]
    for cell in column_cells:
        cell.alignment = centered_wrapped

# Keep the SMILES column wide enough for wrapped strings to stay readable
ws.column_dimensions['C'].width = 40

# Write the result under a new file name so the input workbook is left untouched
wb.save('MU3116_Analog_SkNN_Scores_with_SMILES_and_Images_Formatted_Updated.xlsx')

print("The file with updated formatting has been saved as 'MU3116_Analog_SkNN_Scores_with_SMILES_and_Images_Formatted_Updated.xlsx'.")


The file with updated formatting has been saved as 'MU3116_Analog_SkNN_Scores_with_SMILES_and_Images_Formatted_Updated.xlsx'.
