In [None]:
# Uncomment to install the notebook's dependencies into the kernel's environment:
# %pip install PyTDC
# %pip install pysmiles
# %pip install rdkit

In [None]:
from tdc.single_pred import Tox
import pandas as pd
from rdkit import Chem
import rdkit
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the AMES dataset through TDC's `Tox` loader and request its default
# split; the result is indexed by 'train', 'valid' and 'test' further down.
data = Tox(name="AMES")
split = data.get_split()

In [3]:
# Wrap each partition of the split in its own DataFrame
# (order: train, validation, test).
train_df, val_df, test_df = (
    pd.DataFrame(split[part]) for part in ('train', 'valid', 'test')
)

# Data checking + getting features

In [4]:
# Keep only the rows whose mutagenicity label "Y" is exactly 0 or 1;
# rows carrying any other value are dropped from their split.
train_df, val_df, test_df = (
    frame[frame["Y"].isin([0, 1])] for frame in (train_df, val_df, test_df)
)

In [22]:
def get_descriptor(dataset):
    """Build a table of RDKit descriptors plus the label for each molecule.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a "Drug" column holding SMILES strings and a "Y" column
        holding the label. The frame's index may be arbitrary (for example
        after boolean filtering); rows are visited in positional order.

    Returns
    -------
    pandas.DataFrame
        One row per molecule whose SMILES could be parsed, with the columns
        'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3',
        'ExactMolWt', 'HeavyAtomCount' and 'Label'. Rows whose SMILES cannot
        be parsed are skipped together with their label, so features and
        labels always stay aligned. The result has a fresh 0..n-1 index.
    """
    columns = ['FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'ExactMolWt', "HeavyAtomCount"]
    rows = []
    labels = []

    # Iterate over the two columns in lockstep rather than using
    # `dataset.loc[i, ...]` with `range(len(dataset))`: label-based lookup
    # breaks as soon as the index has gaps (e.g. after rows were filtered out).
    for smiles, target in zip(dataset["Drug"], dataset["Y"]):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparsable SMILES: skip the feature row AND its label so the
            # two lists keep the same length.
            continue

        rows.append([
            Descriptors.FpDensityMorgan1(mol),
            Descriptors.FpDensityMorgan2(mol),
            Descriptors.FpDensityMorgan3(mol),
            Descriptors.ExactMolWt(mol),
            Descriptors.HeavyAtomCount(mol),
        ])
        labels.append(target)

    descriptor_df = pd.DataFrame(rows, columns=columns)
    descriptor_df['Label'] = labels

    return descriptor_df

# Splitting data into features and labels

In [None]:
# Report the class balance of every split: first the number of positives
# (Y == 1), then the number of negatives (Y == 0), and finally the totals
# accumulated over all three splits.
separator = "========================================="
total_positive = 0
total_negative = 0

for heading, frame in (("Train:", train_df), ("Test:", test_df), ("Val:", val_df)):
    n_positive = int((frame["Y"] == 1).sum())
    n_negative = int((frame["Y"] == 0).sum())
    total_positive += n_positive
    total_negative += n_negative

    print(heading)
    print(n_positive)
    print(n_negative)
    print(separator)

print("Total:")
print(total_positive)
print(total_negative)

In [24]:
# Compute the descriptor table for each split.
descriptor_df_train = get_descriptor(train_df)
descriptor_df_val = get_descriptor(val_df)
descriptor_df_test = get_descriptor(test_df)

# Separate each table into the feature matrix (everything except 'Label')
# and the target vector (the 'Label' column).
X_train = descriptor_df_train.drop('Label', axis=1)
y_train = descriptor_df_train['Label']

X_val = descriptor_df_val.drop('Label', axis=1)
y_val = descriptor_df_val['Label']

X_test = descriptor_df_test.drop('Label', axis=1)
y_test = descriptor_df_test['Label']

In [None]:
# To export the training descriptor table as CSV, uncomment the next line:
# descriptor_df_train.to_csv('full_dataset.csv', index=False)

# Preview the first ten rows of the training descriptor table.
descriptor_df_train.head(n=10)

# Fit RFC

In [None]:
# Random forest with 1000 trees; the fixed seed makes the fit repeatable.
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=1000,
)

# Train on the training descriptors (the fitted estimator is the cell output).
rf_classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import roc_auc_score

# Hard class predictions on the test split (kept available under `y_pred`).
y_pred = rf_classifier.predict(X_test)

# ROC AUC measures how well the model *ranks* positives above negatives, so it
# needs a continuous score. Feeding it hard 0/1 predictions collapses the ROC
# curve to a single operating point. Use the predicted probability of the
# positive class instead: column 1 of `predict_proba`, i.e. the larger of the
# two class labels (assumes both classes occur in `y_train`).
auc_test = roc_auc_score(y_test, rf_classifier.predict_proba(X_test)[:, 1])
auc_train = roc_auc_score(y_train, rf_classifier.predict_proba(X_train)[:, 1])
auc_val = roc_auc_score(y_val, rf_classifier.predict_proba(X_val)[:, 1])

print(f"Test AUC: {auc_test}")
print(f"Train AUC: {auc_train}")
print(f"Val AUC: {auc_val}")

In [None]:

# Pair every feature name with the importance reported by the fitted random
# forest and list them from most to least important.
feature_names = X_train.columns
importances = rf_classifier.feature_importances_

feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print(feature_importance_df)

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the feature importances held in
# `feature_importance_df`.
fig, ax = plt.subplots()
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
ax.invert_yaxis()  # first row of the (sorted) table ends up at the top
ax.set_xlabel('Importance')
ax.set_title('Feature Importance')
plt.show()

# Fit kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbour classifier using brute-force search, p=1 and
# distance-weighted votes, running on all available cores.
# NOTE(review): the features are fed in unscaled, so large-valued columns
# may dominate the distance — consider standardising; confirm intent.
kNN = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='brute',
    p=1,
    n_jobs=-1,
)

# Fit on the training descriptors (the fitted estimator is the cell output).
kNN.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import roc_auc_score

# Hard class predictions on the test split (kept available under `y_pred`).
y_pred = kNN.predict(X_test)

# ROC AUC needs a continuous ranking score; hard 0/1 predictions reduce the
# ROC curve to a single point. Score with the predicted probability of the
# positive class instead: column 1 of `predict_proba`, i.e. the larger of the
# two class labels (assumes both classes occur in `y_train`).
auc_test = roc_auc_score(y_test, kNN.predict_proba(X_test)[:, 1])
auc_train = roc_auc_score(y_train, kNN.predict_proba(X_train)[:, 1])
auc_val = roc_auc_score(y_val, kNN.predict_proba(X_val)[:, 1])

print(f"Test AUC: {auc_test}")
print(f"Train AUC: {auc_train}")
print(f"Val AUC: {auc_val}")

In [None]:
from sklearn.inspection import permutation_importance
import pandas as pd

# Calculate permutation importance for kNN on the held-out validation split.
# The model uses weights='distance', so on the training data every sample is
# its own zero-distance neighbour and the baseline score there reflects
# memorisation; permuting any feature then mostly measures the loss of that
# self-match. Scoring on unseen data shows which features actually help the
# model generalise.
perm_importance = permutation_importance(
    kNN,
    X_val,
    y_val,
    scoring='roc_auc',
    n_repeats=30,
    random_state=42,
)

# Prepare a DataFrame pairing each feature with its mean importance
# (mean drop in ROC AUC over the repeats).
feature_importance_df = pd.DataFrame({
    'Feature': X_val.columns,
    'Importance': perm_importance.importances_mean
})

# Sort by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print the feature importance DataFrame
print(feature_importance_df)

In [None]:
import matplotlib.pyplot as plt

# Importances reported by the fitted random forest.
rfc_importances = rf_classifier.feature_importances_

# Side-by-side table: one row per feature, one column per model. The kNN
# column holds the mean permutation importances computed earlier.
feature_importance_comparison = pd.DataFrame({
    'Feature': X_train.columns,
    'RFC Importance': rfc_importances,
    'kNN Importance': perm_importance.importances_mean
})

# Grouped bar chart with the features along the x-axis.
ax = feature_importance_comparison.set_index('Feature').plot.bar()
ax.set_title('Feature Importance Comparison: RFC vs kNN')
ax.set_ylabel('Importance')
ax.set_xlabel(' ')
ax.legend(title='Model')
plt.tight_layout()
plt.show()

In [None]:
# Uncomment to freeze the environment's package versions into requirements.txt:
# !pip freeze > requirements.txt