<a href="https://colab.research.google.com/github/mallikaalvala/demo/blob/master/toxicity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
!pip install rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Function to calculate molecular descriptors
def calculate_descriptors(smiles):
    """Compute a small set of RDKit descriptors for one molecule.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        dict with the keys 'MolWt', 'MolLogP', 'NumHAcceptors' and
        'NumHDonors' (in that order), each holding the value returned by the
        rdkit.Chem.Descriptors function of the same name.

    Raises:
        ValueError: if RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None instead of
    # raising; fail here with the offending input rather than letting the
    # descriptor calls below choke on None.
    if mol is None:
        raise ValueError(f'Could not parse SMILES string: {smiles!r}')

    return {
        'MolWt': Descriptors.MolWt(mol),
        'MolLogP': Descriptors.MolLogP(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
    }

# Toy dataset: SMILES strings paired with binary toxicity labels
# (1 = toxic, 0 = non-toxic).
data = {
    'SMILES': ['CCO', 'CCN', 'CCCl', 'CCC', 'CCBr'],
    'Toxicity': [0, 1, 1, 0, 1],
}

# Tabular view of the raw dataset, one row per molecule
df = pd.DataFrame(data)

# Compute one descriptor dict per SMILES string, turn the dicts into feature
# columns, and attach them to the molecule table by row position
descriptors_list = df['SMILES'].map(calculate_descriptors).tolist()
descriptors_df = pd.DataFrame(descriptors_list)
df = df.join(descriptors_df)

# Separate the label column from the descriptor features, then hold out
# 20% of the rows for testing.
# NOTE: with five rows and test_size=0.2 the held-out split contains a single
# molecule, so the accuracy printed below can only be 0.0 or 1.0.
y = df['Toxicity']
X = df.drop(columns=['SMILES', 'Toxicity'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training rows (seeded for repeatability)
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows and report the fraction classified correctly
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Classify one previously unseen molecule: compute its descriptors, wrap them
# in a single-row frame with the same feature columns used for training, and
# ask the fitted forest for a label.
new_smiles = 'COCC'
new_descriptors = calculate_descriptors(smiles=new_smiles)
new_data = pd.DataFrame(data=[new_descriptors])
toxicity_prediction = clf.predict(X=new_data)
print(f'Toxicity prediction for {new_smiles}: {toxicity_prediction}')


Accuracy: 0.0
Toxicity prediction for COCC: [1]
