In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt



In [2]:
# Read the combined BBB classification table and preview its first rows
DATA_CSV = '../data/combined_bbb_classification.csv'
df = pd.read_csv(DATA_CSV)
df.head()

Unnamed: 0,name,smiles,BBB,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,sulphasalazine,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,0,12.34101,12.34101,0.023055,-3.794932,0.540588,11.428571,398.4,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,moxalactam,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,0,13.190522,13.190522,0.042537,-2.144257,0.133795,22.0,520.48,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,clioquinol,Oc1c(I)cc(Cl)c2cccnc12,0,9.654043,9.654043,0.195,0.195,0.758308,10.615385,305.502,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,bbcpd11 (cimetidine analog) (y-g13),CCNC(=NCCSCc1ncccc1Br)NC#N,0,8.544584,8.544584,0.532052,0.532052,0.272365,10.894737,342.266,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,schembl614298,CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,0,11.445328,11.445328,0.165306,-1.798901,0.346256,45.30303,461.467,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Data Preprocessing


In [3]:
# Count the missing entries in every column
df.isnull().sum()

name                 1109
smiles                  0
BBB                     0
MaxAbsEStateIndex      13
MaxEStateIndex         13
                     ... 
fr_thiazole            13
fr_thiocyan            13
fr_thiophene           13
fr_unbrch_alkane       13
fr_urea                13
Length: 220, dtype: int64

In [4]:
# Inspect the 13 rows whose fr_urea descriptor is missing
df.loc[df['fr_urea'].isna()]

Unnamed: 0,name,smiles,BBB,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
4933,mepenzolatebromide,C[N+]1(C)CCC[C@H](OC(=O)[C+](O)(c2ccccc2)c2ccc...,0,,,,,,,,...,,,,,,,,,,
7627,tiotidine,C[N+]1(C)CCCC(OC(=O)[C+](O)(c2ccccc2)c2ccccc2)C1,0,,,,,,,,...,,,,,,,,,,
7753,15,O=N([O-])C1=C(CN=C1NCCSCc2ncccc2)Cc3ccccc3,1,,,,,,,,...,,,,,,,,,,
7755,22767,c1(nc(NC(N)=[NH2])sc1)CSCCNC(=[NH]C#N)NC,1,,,,,,,,...,,,,,,,,,,
8052,ICI17148,Cc1nc(sc1)\[NH]=C(\N)N,1,,,,,,,,...,,,,,,,,,,
8230,5-6,s1cc(CSCCN\C(NC)=[NH]\C#N)nc1\[NH]=C(\N)N,1,,,,,,,,...,,,,,,,,,,
8257,12,c1c(c(ncc1)CSCCN\C(=[NH]\C#N)NCC)Br,0,,,,,,,,...,,,,,,,,,,
8260,16,n1c(csc1\[NH]=C(\N)N)c1ccccc1,1,,,,,,,,...,,,,,,,,,,
8261,17,n1c(csc1\[NH]=C(\N)N)c1cccc(c1)N,0,,,,,,,,...,,,,,,,,,,
8262,18,n1c(csc1\[NH]=C(\N)N)c1cccc(c1)NC(C)=O,0,,,,,,,,...,,,,,,,,,,


In [5]:
# Remove the rows that have no fr_urea value
df = df.dropna(subset=['fr_urea'])

In [6]:
# Drop the identifier columns (name, smiles); they are not used as model inputs.
# Reassignment with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second execution no longer raises KeyError for columns that
# the first execution already removed.
df = df.drop(columns=['name', 'smiles'], errors='ignore')

# Replace any remaining missing values with zero
df = df.fillna(0)

In [7]:
# Preview the first rows after the name/smiles columns were removed
df.head()

Unnamed: 0,BBB,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0,12.34101,12.34101,0.023055,-3.794932,0.540588,11.428571,398.4,384.288,398.068491,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,13.190522,13.190522,0.042537,-2.144257,0.133795,22.0,520.48,500.32,520.101247,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,9.654043,9.654043,0.195,0.195,0.758308,10.615385,305.502,300.462,304.910439,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,8.544584,8.544584,0.532052,0.532052,0.272365,10.894737,342.266,326.138,341.030979,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,11.445328,11.445328,0.165306,-1.798901,0.346256,45.30303,461.467,434.251,461.168581,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Standardizing data

In [8]:
from sklearn.preprocessing import StandardScaler

### Standardizing data
# Column labels are taken from the feature frame itself rather than from
# df.columns[1:], which is only correct while 'BBB' happens to be the first
# column of df and would silently mislabel every feature otherwise.
feature_df = df.drop(columns='BBB')
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_df)
scaled_df = pd.DataFrame(scaled_features, columns=feature_df.columns)

# .values attaches the target by position: scaled_df has a fresh 0..n-1 index,
# whereas df still carries its original row labels (rows were dropped earlier).
scaled_df['BBB'] = df['BBB'].values

# NOTE(review): the scaler is fitted on all rows before any train/test or
# cross-validation split, so held-out rows influence the scaling statistics.
# Consider fitting it inside each training fold instead.

In [9]:
# Show the standardized frame; 'BBB' was re-attached as the last column
scaled_df

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,BBB
0,0.250015,0.250034,-0.571809,-2.157358,-0.221684,-1.081406,0.103693,0.181675,0.104283,-0.022282,...,5.173161,-0.141653,-0.094397,-0.118378,-0.176261,0.0,-0.129887,-0.142629,-0.183929,0
1,0.523433,0.523449,-0.502652,-0.871079,-2.041236,-0.247425,0.839054,0.930083,0.839923,0.720790,...,-0.161298,-0.141653,-0.094397,8.447538,-0.176261,0.0,-0.129887,-0.142629,-0.183929,0
2,-0.614792,-0.614766,0.038556,0.951774,0.752159,-1.145558,-0.455888,-0.359004,-0.457294,-1.198811,...,-0.161298,-0.141653,-0.094397,-0.118378,-0.176261,0.0,-0.129887,-0.142629,-0.183929,0
3,-0.971875,-0.971845,1.235018,1.214421,-1.421426,-1.123520,-0.234436,-0.193394,-0.239552,-0.641508,...,-0.161298,-0.141653,-0.094397,-0.118378,-0.176261,0.0,-0.129887,-0.142629,-0.183929,0
4,-0.038263,-0.038241,-0.066849,-0.601962,-1.090916,1.590951,0.483583,0.503937,0.484664,0.535022,...,-0.161298,-0.141653,-0.094397,-0.118378,-0.176261,0.0,-0.129887,-0.142629,-0.183929,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9619,-0.147463,-0.147441,-0.519876,0.019848,-0.554977,-1.138419,-0.633386,-0.536063,-0.637902,-0.858237,...,-0.161298,-0.141653,-0.094397,-0.118378,-0.176261,0.0,-0.129887,-0.142629,-0.183929,1
9620,-0.278757,-0.278733,0.094755,-0.201012,-1.096669,0.202540,0.103964,0.129952,0.104697,0.070602,...,-0.161298,-0.141653,-0.094397,-0.118378,-0.176261,0.0,-0.129887,-0.142629,-0.183929,1
9621,0.100731,0.100751,-0.317458,0.410353,0.427956,-1.059335,-0.354294,-0.334739,-0.353413,-0.331895,...,-0.161298,-0.141653,-0.094397,-0.118378,-0.176261,0.0,-0.129887,-0.142629,5.147803,1
9622,-0.894903,-0.894873,1.534880,1.280246,-0.008154,-1.132121,0.007676,0.000842,0.008560,0.070602,...,-0.161298,-0.141653,-0.094397,-0.118378,-0.176261,0.0,-0.129887,-0.142629,-0.183929,1


# Build Model

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the standardized rows as a test partition (seeded split)
train_df, test_df = train_test_split(
    scaled_df,
    random_state=42,
    test_size=0.2,
)


In [11]:
# Shapes (rows, columns) of the train and test partitions
train_df.shape, test_df.shape

((7699, 218), (1925, 218))

### Using Lazy Predict

In [12]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

# Separate the target column from the descriptor columns
y = scaled_df['BBB']
X = scaled_df.drop(columns='BBB')

# Even 50/50 split used only for this quick model screen
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
    test_size=0.5,
)

# Fit LazyClassifier's model collection and gather its score table
clf = LazyClassifier(custom_metric=None, ignore_warnings=True, verbose=0)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 3173, number of negative: 1639
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021782 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23792
[LightGBM] [Info] Number of data points in the train set: 4812, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.659393 -> initscore=0.660591
[LightGBM] [Info] Start training from score 0.660591
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.89               0.87     0.87      0.89   
ExtraTreesClassifier               0.89               0.87     0.87      0.89   
RandomForestClassifier             0.89               0.87     0.87      0.89   
BaggingClassifier                  0.88               0.86     0.86      0.88   
LabelSpreading         

### QSAR Models

In [13]:
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, matthews_corrcoef, average_precision_score
import joblib


In [15]:
# Number of target labels in y
y.shape

(9624,)

In [17]:
# Define the models
#
# cross_val_predict is called twice per model (labels, then probabilities).
# Estimators with internal randomness (e.g. the random forest, or SVC's
# probability calibration) therefore get a fixed seed, so both passes fit the
# same models and a rerun reproduces the numbers.
from sklearn.model_selection import StratifiedKFold

SEED = 42

models = [
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('SVM', SVC(probability=True, random_state=SEED)),
    ('RF', RandomForestClassifier(max_depth=8, n_estimators=100, random_state=SEED)),
    ('LR', LogisticRegression(max_iter=1000)),
    # Configured once here. The loop used to swap this entry in via
    # `name is 'XGB'`, an identity test on a string literal that only works
    # through string interning and raises a SyntaxWarning on Python 3.8+.
    ('XGB', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=SEED)),
]


def _binary_metrics(y_true, y_pred, y_proba=None):
    """Compute summary metrics for a binary classifier's predictions.

    Parameters
    ----------
    y_true : array-like
        True class labels (two classes; the confusion matrix is unpacked
        into exactly four cells).
    y_pred : array-like
        Predicted class labels.
    y_proba : 2-D array or None
        Predicted class probabilities; column 1 is used as the positive-class
        score. When None, 'roc_auc' and 'pr_auc' are reported as None.

    Returns
    -------
    tuple(dict, ndarray)
        Metric name -> value mapping, and the confusion matrix.
    """
    cm = confusion_matrix(y_true, y_pred)
    TN, FP, FN, TP = cm.ravel()

    # Ratios fall back to 0 when their denominator is empty
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    specificity = TN / (TN + FP) if (TN + FP) != 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    # Ranking metrics need probability scores
    if y_proba is not None:
        positive_scores = y_proba[:, 1]
        roc_auc = roc_auc_score(y_true, positive_scores)
        pr_auc = average_precision_score(y_true, positive_scores)
    else:
        roc_auc = None
        pr_auc = None

    metrics = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'Specificity': specificity,
        'F1 Score': f1,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'mcc': matthews_corrcoef(y_true, y_pred),
    }
    return metrics, cm


# using stratified kfold
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
results = []
conf_matrices = []
saved_models = {}

# NOTE(review): the recorded run stopped at XGB with
# "'super' object has no attribute '__sklearn_tags__'". That looks like an
# xgboost / scikit-learn version mismatch rather than a fault in this loop.
# Confirm the installed versions are compatible before rerunning.

# Perform cross-validation on each model
for name, model in models:
    # Out-of-fold label predictions
    print('Performing cross validation on ', name)
    y_pred = cross_val_predict(model, X, y, cv=fold)

    # Out-of-fold probabilities; best-effort, metrics degrade to None on failure
    try:
        y_proba = cross_val_predict(model, X, y, cv=fold, method='predict_proba')
    except Exception as e:
        print(f"Warning: Could not get probabilities for {name}: {e}")
        y_proba = None

    metrics, cm = _binary_metrics(y, y_pred, y_proba)

    # Save results
    results.append({'Model': name, **metrics})

    # Store confusion matrix
    conf_matrices.append(cm)

    # Refit on the full data set for later use
    print('Fitting ', name)
    model.fit(X, y)

    # Kept in memory only; nothing is written to disk here
    saved_models[name] = model
    print(f'... Successfully saved {name} model.')

Performing cross validation on  KNN
Fitting  KNN
... Successfully saved KNN model.
Performing cross validation on  SVM
Fitting  SVM
... Successfully saved SVM model.
Performing cross validation on  RF
Fitting  RF
... Successfully saved RF model.
Performing cross validation on  LR
Fitting  LR
... Successfully saved LR model.
Performing cross validation on  XGB


AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [None]:

# Optional persistence step, currently disabled: write each fitted model to
# '<name>_model.pkl' with joblib.
# for name, model in saved_models.items():
#     joblib.dump(model, f'{name}_model.pkl')

# Collect the per-model metric dicts into one table (one row per model)
results_df = pd.DataFrame(results)

# Display the results
results_df