In [None]:
from sklearn.model_selection import  GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix
import os
import pickle
import datetime

In [None]:
pip install fancyimpute

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Statistics, feature selection, imputation and modelling utilities
from scipy.stats import ttest_ind, chi2_contingency

from sklearn.feature_selection import SelectFromModel
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
import numpy as np
from fancyimpute import KNN
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Read the ERBB1 table from the working directory.
data = pd.read_csv('ERBB1.csv')

# Print the leading rows as a quick check that the file parsed as expected.
preview = data.head()
print(preview)


       CHEMBL_ID   ALogP    ALogp2      AMR       apol  naAromAtom  nAromBond  \
0  CHEMBL3970738 -1.9574  3.831415  39.6468  69.184239          22         23   
1  CHEMBL4099008 -0.8417  0.708459  96.6035  82.723755          12         12   
2   CHEMBL122132 -0.2815  0.079242  16.1212  38.097930          16         17   
3  CHEMBL2048907  0.2958  0.087498  54.4247  73.422239          24         26   
4  CHEMBL4562744 -2.6356  6.946387  72.0931  83.590169          21         23   

   nAtom  nHeavyAtom  nH  ...   AATSC0s   AATSC1s   AATSC2s   AATSC3s  \
0     57          34  23  ...  1.857649 -0.358469  0.364975 -0.119000   
1     73          38  35  ...  1.143273 -0.018942  0.098510  0.026696   
2     29          19  10  ...  0.582240 -0.151546  0.069277 -0.083976   
3     59          36  23  ...  1.187459 -0.161320  0.075772  0.048802   
4     73          40  33  ...  0.828027 -0.017491  0.072255 -0.086880   

    AATSC4s   AATSC5s   AATSC6s   AATSC7s   AATSC8s   Activity  
0 -0.1673

In [None]:

# Count rows that are exact copies of an earlier row.
duplicate_count = data.duplicated().sum()
print('Number of duplicates in the dataset:', duplicate_count)


Number of duplicates in the dataset: 0


In [None]:

# Remove duplicates
# Rebind rather than mutate in place, and renumber the rows so the index stays
# 0..n-1 even when rows are dropped; frames derived from `data` then keep the
# same labels as frames that are built with a fresh default index.
data = data.drop_duplicates().reset_index(drop=True)

In [None]:
# Feature matrix: every column except the compound identifier and the target.
# Built from `data` in a single step so the cell can be re-run safely (dropping
# "Activity" from an X that no longer has it would raise KeyError).
X = data.drop(columns=["CHEMBL_ID", "Activity"])

In [None]:
# Encode the Activity labels as integers and keep them in a one-column frame.
le = LabelEncoder()
encoded_activity = le.fit_transform(data['Activity'])
Y = pd.DataFrame(encoded_activity, columns=['Activity'])

In [None]:
# Show the first five encoded labels.
Y.head(n=5)

Unnamed: 0,Activity
0,0
1,0
2,1
3,0
4,1


In [None]:

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def preprocess(X):
    X = pd.DataFrame(KNN(k=3).fit_transform(X), columns = X.columns)
    X = pd.DataFrame(np.log(X.abs() + 1), columns = X.columns)
    scaler = MinMaxScaler()
    X =  pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

    return X

In [None]:
X_train = preprocess(X_train)

In [None]:
X_test = preprocess(X_test)

In [None]:
X_test = preprocess(X_test)


In [None]:

# Candidate classifiers; the short label also keys the hyper-parameter grids
# and names the pickle files.
models = [
    ("LR", LogisticRegression(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("RF", RandomForestClassifier(random_state=42)),
    ("KNN", KNeighborsClassifier()),
    ("NB", GaussianNB()),
    ("ADA", AdaBoostClassifier(random_state=42)),
    ("XGB", XGBClassifier(random_state=42)),
]

# Search grids. Keys carry the "model__" prefix because each estimator is
# wrapped in a Pipeline step named "model".
regularisation_grid = np.logspace(-4, 4, 9)
ensemble_sizes = [10, 50, 100, 200, 500]

params = {
    "LR": {"model__C": regularisation_grid},
    "SVM": {
        "model__C": regularisation_grid,
        "model__kernel": ["linear", "poly", "rbf", "sigmoid"],
    },
    "RF": {"model__n_estimators": ensemble_sizes},
    "KNN": {"model__n_neighbors": [3, 5, 7, 9, 11]},
    "NB": {},  # nothing to tune; the grid search only cross-validates
    "ADA": {"model__n_estimators": ensemble_sizes},
    "XGB": {"model__n_estimators": ensemble_sizes},
}

# Create folder to save models and results
# The time fields are joined with "-" because ":" is not allowed in Windows
# path components; exist_ok makes the creation safe to repeat.
model_dir = os.path.join("ML_Classifiers", datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
os.makedirs(model_dir, exist_ok=True)

# Train models and save to file
# Each candidate is tuned with a 5-fold grid search, scored on the held-out
# test split, and its best estimator is pickled into model_dir.

# scikit-learn expects a 1-D target. y_train / y_test are single-column frames,
# so flatten them once instead of letting fits emit DataConversionWarning.
y_train_1d = np.ravel(y_train)
y_test_1d = np.ravel(y_test)

accuracy_list = []
for name, model in models:
    # Single-step pipeline so the "model__" prefixes used in params resolve.
    clf = Pipeline([('model', model)])
    grid_search = GridSearchCV(clf, params[name], cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train_1d)

    # predict() uses the best parameter setting refit on the full training set.
    y_pred = grid_search.predict(X_test)
    accuracy_list.append(accuracy_score(y_test_1d, y_pred))

    model_path = os.path.join(model_dir, name + ".pkl")
    with open(model_path, 'wb') as f:
        pickle.dump(grid_search.best_estimator_, f)

# Bar chart of the test accuracy reached by each model, on a fixed 0-1 scale.
model_names = [label for label, _ in models]
plt.bar(model_names, accuracy_list)
plt.ylim(0.0, 1.0)
plt.title('Model Accuracy Comparison')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.show()


# Tell the reader where the pickled estimators were written.
summary = "Models and results saved in {}".format(model_dir)
print(summary)