# In [85]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

# Load the toxicity table. Code later in this file reads its 'SMILES' and
# 'mouse_intraperitoneal_LD50' columns. The path is relative, so it is
# resolved against the current working directory.
df_toxicity = pd.read_csv('Acute Toxicity_mouse_intraperitoneal_LD50.csv')

def identify_features(smiles):
    """Derive coarse structural feature flags for one molecule.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A dict with these keys:

        * ``'Rings'``, ``'O'``, ``'N'``, ``'S'``, ``'P'``, ``'Other Metal'``,
          ``'Hydroxyl or Ether'`` and
          ``'Ketones, aldehydes, carboxylic acids, or esters'``: 0/1 flags.
        * ``'Ring Types'``: set drawn from ``'Aromatic'``,
          ``'Partially Aromatic'`` and ``'Aliphatic'``.
        * ``'N Forms'``: set of labels chosen from each nitrogen's degree.
        * ``'Halogens'``: set of halogen element symbols found.
        * ``'Other Metal Symbol'``: symbol of the last atom found in the
          module-level ``other_metals_symbols`` set, or ``''`` if none.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with a clear message instead of an opaque error further down.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    features = {
        'Rings': 0,
        'Ring Types': set(),
        'O': 0,
        'Hydroxyl or Ether': 0,
        'Ketones, aldehydes, carboxylic acids, or esters': 0,
        'N': 0,
        'N Forms': set(),
        'Halogens': set(),
        'S': 0,
        'P': 0,
        'Other Metal': 0,
        'Other Metal Symbol': ''
    }

    if rdMolDescriptors.CalcNumRings(molecule) > 0:
        features['Rings'] = 1
        for ring in molecule.GetRingInfo().AtomRings():
            aromatic_flags = [
                molecule.GetAtomWithIdx(idx).GetIsAromatic() for idx in ring
            ]
            if all(aromatic_flags):
                features['Ring Types'].add('Aromatic')
            elif any(aromatic_flags):
                features['Ring Types'].add('Partially Aromatic')
            else:
                features['Ring Types'].add('Aliphatic')

    for atom in molecule.GetAtoms():
        symbol = atom.GetSymbol()
        if symbol == 'O':
            features['O'] = 1
            bond_orders = [bond.GetBondTypeAsDouble() for bond in atom.GetBonds()]
            if 2 in bond_orders:
                # Any double bond on the oxygen sets this flag; the partner
                # atom is not inspected, so it is not limited to C=O.
                features['Ketones, aldehydes, carboxylic acids, or esters'] = 1
            elif bond_orders and all(order == 1 for order in bond_orders):
                # One single bond (hydroxyl-like) or several single bonds
                # (ether-like). Requiring exactly one bond here would leave
                # every ether oxygen unflagged.
                features['Hydroxyl or Ether'] = 1
        elif symbol == 'N':
            features['N'] = 1
            # Labels are picked purely from the atom's degree, so they are
            # approximate; any other degree adds no label.
            degree = atom.GetDegree()
            if degree == 3:
                features['N Forms'].add('Tertiary Amine or Amide Nitrogen')
            elif degree == 2:
                features['N Forms'].add('Secondary Amine or Aromatic Nitrogen')
            elif degree == 1:
                features['N Forms'].add('Primary Amine or Ammonium Ion')
        elif symbol in {'Cl', 'F', 'Br', 'I', 'At'}:
            features['Halogens'].add(symbol)
        elif symbol == 'S':
            features['S'] = 1
        elif symbol == 'P':
            features['P'] = 1
        elif symbol in other_metals_symbols:
            features['Other Metal'] = 1
            # Only one symbol is stored: a later match overwrites an earlier one.
            features['Other Metal Symbol'] = symbol

    return features

# Element symbols that set the 'Other Metal' feature in identify_features.
# NOTE(review): despite the name, the set also holds noble gases (He, Ar, Kr,
# Xe, Rn) and elements such as B, Si, As, Se and Te, and it does not contain
# Na or Mg. Whether that is intentional is not clear from this file -- confirm.
other_metals_symbols = {
    'He', 'Li', 'Be', 'B', 'Al', 'Si', 'Ar', 'K', 'Ca', 'Sc', 'Ti',
    'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As',
    'Se', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh',
    'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'Xe', 'Cs', 'Ba', 'La',
    'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er',
    'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au',
    'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'Rn', 'Fr', 'Ra', 'Ac', 'Th', 'Pa',
    'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No',
    'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Fl',
}

# Build one output row per input molecule: the 0/1 feature flags, readable
# listings of the set-valued features, and the LD50 value copied from the
# input table.
result_rows = []
for index, row in df_toxicity.iterrows():
    smiles = row['SMILES']
    feature_dict = identify_features(smiles)
    result_rows.append({
        "SMILES": smiles,
        "Rings": feature_dict['Rings'],
        "O": feature_dict['O'],
        "N": feature_dict['N'],
        "Halogens": 1 if feature_dict['Halogens'] else 0,
        "S": feature_dict['S'],
        "P": feature_dict['P'],
        "Other Metal": feature_dict['Other Metal'],
        # Set iteration order is not stable for strings across runs, so sort
        # before joining; otherwise the same combination of labels could be
        # written as different cell values.
        "Existing forms of Rings": ', '.join(sorted(feature_dict['Ring Types'])),
        "Hydroxyl or Ether": feature_dict['Hydroxyl or Ether'],
        "Ketones, aldehydes, carboxylic acids, or esters": feature_dict['Ketones, aldehydes, carboxylic acids, or esters'],
        "Existing forms of N": ', '.join(sorted(feature_dict['N Forms'])),
        "Which Halogens": ', '.join(sorted(feature_dict['Halogens'])),
        "Which Other Metal": feature_dict['Other Metal Symbol'],
        "mouse intraperitoneal LD50": row['mouse_intraperitoneal_LD50']
    })

# Write the collected rows to an Excel workbook without the index column.
df_result = pd.DataFrame(result_rows)
df_result.to_excel('Result.xlsx', index=False)
