In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score,
    log_loss,
    f1_score,
    roc_auc_score,
    precision_score,
    recall_score,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

# Data preparation

In [2]:
# Load the training split and derive a binary target from the "Liver" column.
pd_train = pd.read_parquet("data/training_class_mixed.parquet")
# Vectorised encoding instead of a row-wise apply: 1 where the class is
# "Hepatotoxicity", 0 for any other label. The dtype is pinned to int64.
pd_train["label"] = pd_train["Liver"].eq("Hepatotoxicity").astype("int64")
print(pd_train.shape)
print(pd_train["label"].value_counts())
pd_train.head()

(1221, 16095)
label
1    723
0    498
Name: count, dtype: int64


Unnamed: 0_level_0,Smiles,Liver,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,...,APC2D10_I_B,APC2D10_I_Si,APC2D10_I_X,APC2D10_B_B,APC2D10_B_Si,APC2D10_B_X,APC2D10_Si_Si,APC2D10_Si_X,APC2D10_X_X,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
998,N1C[C@H]2C[C@H](c3c2cc2nccnc2c3)C1,NonHepatotoxicity,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,CCC1=C(C2=CC=CC=C2O1)C(=O)C3=CC(=C(C(=C3)I)O)I,Hepatotoxicity,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
283,O[C@@]12[C@H]3[C@@H]([C@@]4([C@H](CC3)C[C@@H](...,NonHepatotoxicity,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1082,O=C1[C@@]2(C(C3C([C@@]4(C(=CC3)C[C@@H](O)CC4)C...,Hepatotoxicity,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
122,O[C@@]12[C@@]3(CCN([C@H]1Cc1c3cc(O)cc1)CC1CCC1...,NonHepatotoxicity,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Load the testing split and derive a binary target from the "Liver" column.
pd_test = pd.read_parquet("data/testing_class_mixed.parquet")
# Vectorised encoding instead of a row-wise apply: 1 where the class is
# "Hepatotoxicity", 0 for any other label. The dtype is pinned to int64.
pd_test["label"] = pd_test["Liver"].eq("Hepatotoxicity").astype("int64")
print(pd_test.shape)
print(pd_test["label"].value_counts())
pd_test.head()

(306, 16095)
label
1    181
0    125
Name: count, dtype: int64


Unnamed: 0_level_0,Smiles,Liver,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,...,APC2D10_I_B,APC2D10_I_Si,APC2D10_I_X,APC2D10_B_B,APC2D10_B_Si,APC2D10_B_X,APC2D10_Si_Si,APC2D10_Si_X,APC2D10_X_X,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
724,Clc1c(cc([C@]2(O)CCN(CC2)CCC[C@@H](c2ccc(F)cc2...,NonHepatotoxicity,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
82,CN(C)N/N=C\1/C(=NC=N1)C(=O)N,Hepatotoxicity,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
384,F[C@]12[C@H]([C@H]3[C@@]([C@](O)(CC3)C(=O)CO)(...,NonHepatotoxicity,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
627,O=C(N1[C@@H](Cc2c(C1)cc(OC)c(OC)c2)C(=O)O)[C@@...,Hepatotoxicity,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
896,Oc1cc([C@@H]([C@H](CN(C)C)C)CC)ccc1,Hepatotoxicity,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# Stack the two splits row-wise into one frame. ignore_index is not set, so
# each row keeps the index label it had in its source frame.
pd_data = pd.concat((pd_train, pd_test))
print(pd_data.shape)

(1527, 16095)


In [5]:
# Feature matrix: every column except the raw class, the encoded target and
# the SMILES string. Target vector: the encoded "label" column.
X = pd_data.drop(["Liver", "label", "Smiles"], axis="columns")
y = pd_data.loc[:, "label"]

# Report both shapes, one per line
print(X.shape, y.shape, sep="\n")

(1527, 16092)
(1527,)


# Model training

## TPOT classifier

In [6]:
# No hold-out split is made in this cell: the search below is fitted on the
# full X / y and candidates are scored with TPOT's internal cross-validation.

# Configure the TPOT search
tpot = TPOTClassifier(
    generations=10,        # number of optimisation iterations
    population_size=40,    # pipelines retained in the population per generation
    cv=5,                  # 5-fold cross-validation when scoring a pipeline
    n_jobs=16,             # parallel worker processes
    random_state=42,       # fixed seed for the search
    verbosity=2,           # show progress information while running
)

# Run the search on X and y
tpot.fit(X, y)

Optimization Progress:   0%|          | 0/440 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7380435015536269

Generation 2 - Current best internal CV score: 0.7419779277831352

Generation 3 - Current best internal CV score: 0.744598735669131

Generation 4 - Current best internal CV score: 0.744598735669131

Generation 5 - Current best internal CV score: 0.7452544733740492

Generation 6 - Current best internal CV score: 0.7511325404478731

Generation 7 - Current best internal CV score: 0.7511325404478731

Generation 8 - Current best internal CV score: 0.7511325404478731

Generation 9 - Current best internal CV score: 0.7511325404478731

Generation 10 - Current best internal CV score: 0.7511325404478731

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.4, min_samples_leaf=3, min_samples_split=3, n_estimators=100)


In [7]:
# Export the best pipeline as a standalone Python script.
EXPORT_PATH = 'models/tpot_best_pipeline_automl_mixed.py'
# Create the output directory first so that a missing "models/" folder cannot
# make the export fail after the (long) search has already finished.
Path(EXPORT_PATH).parent.mkdir(parents=True, exist_ok=True)
tpot.export(EXPORT_PATH)