In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, Descriptors, AllChem
from rdkit.Chem.Draw import IPythonConsole

from itertools import combinations

import IPython
from IPython.display import display, Image
from PIL import Image

import numpy as np
import pandas as pd

import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("SMILES_Big_Data_Set.csv")
df["SMILES"][0]

'O=S(=O)(Nc1cccc(-c2cnc3ccccc3n2)c1)c1cccs1'

In [8]:
# Define SMILES parsing into atoms and bonds
def parse_smiles(smiles):
    atoms = []
    bonds = []
    atom_stack = []
    
    i = 0
    while i < len(smiles):
        char = smiles[i]
        
        if char.isalpha():  # Atom symbol (e.g., C, O, N)
            atoms.append(char)
            if atom_stack:
                bonds.append((atom_stack[-1], len(atoms) - 1))  # Single bond to last atom
            atom_stack.append(len(atoms) - 1)  # Add atom index to stack
        elif char == '(':  # Branch start (if applicable)
            atom_stack.append(len(atoms))  # Start new branch
        elif char == ')':  # Branch end (if applicable)
            atom_stack.pop()  # Pop last branch
        elif char in '-=#':  # Bond types (single, double, triple)
            pass  # For simplicity, assume single bond for now
        i += 1
    
    return atoms, bonds

smiles = 'O=S(=O)(Nc1cccc(-c2cnc3ccccc3n2)c1)c1cccs1' 
atoms, bonds = parse_smiles(smiles)

print("Atoms:", atoms)
print("Bonds:", bonds)


Atoms: ['O', 'S', 'O', 'N', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'n', 'c', 'c', 'c', 'c', 'c', 'c', 'n', 'c', 'c', 'c', 'c', 'c', 's']
Bonds: [(0, 1), (2, 2), (3, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (9, 9), (9, 10), (10, 11), (11, 12), (12, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (17, 19), (17, 20), (20, 21), (21, 22), (22, 23), (23, 24)]


In [17]:
def generate_fingerprint(atoms, bonds):
    # Define possible atom types for this simple example (C, O, N, H)
    atom_types = atoms
    bond_types = ['-','=','#']  # Single bonds only for simplicity
    fingerprint_length = len(atom_types) * len(bond_types)
    
    fingerprint = [0] * fingerprint_length  # Initialize a fingerprint array of zeros

    # Set bits for atom types present in the molecule
    for atom in atoms:
        if atom in atom_types:
            fingerprint[atom_types.index(atom)] = 1

    # Set bits for bond types (simplified for single bonds only)
    for bond in bonds:
        # In this simple version, we'll only set a bond type bit for a single bond
        fingerprint[len(atom_types) + 0] = 1  # Single bond between atoms
    
    return fingerprint

# Generate the fingerprint for the molecule
fingerprint = generate_fingerprint(atoms, bonds)

print("Fingerprint Array:", fingerprint)


Fingerprint Array: [1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
