In [30]:
def create_imputation(df):
    """Impute missing values in a copy of ``df`` and record the fill values.

    Numeric (``int``/``float``) columns are filled with their mean and
    ``object``/``category`` columns with their most frequent value. Columns
    named ``'CLASS'`` or ``'ID'`` are left untouched. A column that is entirely
    missing has no mean or mode, so it falls back to ``0`` (numeric), ``""``
    (object) or the first declared category (categorical).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to learn the fill values from. It is not modified.

    Returns
    -------
    tuple
        ``(df_filled, imputation)``: the imputed copy, and a dict mapping each
        processed column name to the value used to fill it (the mapping that
        ``apply_imputation`` consumes).
    """
    df_copy = df.copy()
    imputation = {}
    skipped_columns = ('CLASS', 'ID')

    # Handle numeric columns
    for column in df_copy.select_dtypes(include=['int', 'float']).columns:
        if column in skipped_columns:
            continue

        series = df_copy[column]
        fill_value = 0 if series.isnull().all() else series.mean()

        # Assign the filled column back instead of calling fillna(inplace=True)
        # on df_copy[column]: the chained in-place form is deprecated and does
        # not update df_copy when pandas Copy-on-Write is active.
        df_copy[column] = series.fillna(fill_value)
        imputation[column] = fill_value

    # Handle categorical and object columns
    for column in df_copy.select_dtypes(include=['object', 'category']).columns:
        if column in skipped_columns:
            continue

        series = df_copy[column]
        if not series.isnull().all():
            fill_value = series.mode()[0]
        elif isinstance(series.dtype, pd.CategoricalDtype):
            categories = series.cat.categories
            if len(categories) > 0:
                fill_value = categories[0]
            else:
                # No category is declared, so register "" as one; a categorical
                # can only be filled with a value that is among its categories.
                fill_value = ""
                series = series.cat.add_categories([fill_value])
        else:
            fill_value = ""

        # All-missing columns are filled too, so the returned frame agrees with
        # the values recorded in `imputation`.
        df_copy[column] = series.fillna(fill_value)
        imputation[column] = fill_value

    return df_copy, imputation

def apply_imputation(df, imputation):
    """Fill missing values in a copy of ``df`` using precomputed fill values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.
    imputation : dict
        Mapping of column name to fill value, e.g. the second element returned
        by ``create_imputation``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which missing values of every mapped column are
        replaced by that column's fill value. Unmapped columns are unchanged.

    Raises
    ------
    KeyError
        If ``imputation`` names a column that ``df`` does not contain.
    """
    df_copy = df.copy()

    for column, value in imputation.items():
        # Assign the result back rather than using fillna(inplace=True) on
        # df_copy[column]: the chained in-place call is deprecated and leaves
        # df_copy unchanged when pandas Copy-on-Write is active.
        df_copy[column] = df_copy[column].fillna(value)

    return df_copy

In [22]:
# Import necessary libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def test_hyperparameter_all_models_random_search(X, y, test_size=0.3):
    """Tune several classifiers with randomized search and compare test accuracy.

    The data is split once into train and test parts. For every model that has
    a search space, ``RandomizedSearchCV`` (10 sampled candidates, 5-fold
    cross-validation on the training part) picks the best hyperparameters, and
    the refitted best estimator is scored on the held-out test part. Results
    are printed, plotted as a bar chart and returned.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels.
    test_size : float, default 0.3
        Passed to ``train_test_split`` as the held-out test proportion.

    Returns
    -------
    list of dict
        One ``{'Model': name, 'Accuracy': test_accuracy}`` entry per tuned
        model, in evaluation order. Models without a search space are reported
        as unsupported and omitted.
    """
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    # Candidate models. Stochastic estimators are seeded so repeated runs give
    # the same best parameters and accuracies.
    # NOTE(review): "Linear Regression" (a regressor) and "Gaussian Naive Bayes"
    # have no search space below, so they are only reported as unsupported.
    models = [
        ("Linear Regression", LinearRegression()),
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("Support Vector Machine", SVC()),
        ("K-Nearest Neighbors", KNeighborsClassifier()),
        ("Gaussian Naive Bayes", GaussianNB()),
        ("Multi-layer Perceptron", MLPClassifier(max_iter=1000, random_state=42)),
    ]

    # Hyperparameter search space per model name
    param_grids = {
        "Random Forest": {
            'n_estimators': [10, 50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False],
        },
        "Support Vector Machine": {
            'C': [0.1, 1, 10, 100],
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'gamma': ['scale', 'auto'],
        },
        "K-Nearest Neighbors": {
            'n_neighbors': [3, 5, 7, 10],
            'weights': ['uniform', 'distance'],
            'p': [1, 2],
        },
        "Multi-layer Perceptron": {
            'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
            'activation': ['relu', 'tanh'],
            'alpha': [0.0001, 0.001, 0.01],
        },
    }

    # Results storage
    results = []

    for model_name, model in models:
        param_grid = param_grids.get(model_name)
        if param_grid is None:
            print(f"Unsupported model: {model_name}")
            continue

        # Randomized search with 5-fold cross-validation on the training split
        # only; the test split is never seen during tuning.
        random_search = RandomizedSearchCV(
            model, param_grid, n_iter=10, cv=5, random_state=42, n_jobs=-1
        )
        random_search.fit(X_train, y_train)

        # Score the best estimator found by the search on the held-out test set
        best_model = random_search.best_estimator_
        y_pred = best_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Print the results
        print(f"\n{model_name} Best Parameters: {random_search.best_params_}")
        print(f"{model_name} Accuracy: {accuracy:.4f}")

        # Store results
        results.append({
            'Model': model_name,
            'Accuracy': accuracy,
        })

    # Create a bar plot for accuracy
    models_names = [result['Model'] for result in results]
    accuracies = [result['Accuracy'] for result in results]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(models_names, accuracies, color='blue', alpha=0.7)
    ax.set_xlabel('Model')
    ax.set_ylabel('Accuracy')
    ax.set_title('Accuracy for Different Models')
    plt.show()

    # Hand the scores back so callers can use them, not just read the printout.
    return results



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3129096e-2078-4742-bd9d-c05dc1a0bf39' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>

In [23]:
from sklearn.impute import SimpleImputer

# Load the feature table and drop the SMILES column.
x = pd.read_csv("training_with_207_features.csv").drop(columns="SMILES")

# Replace missing feature values with the per-column mean.
# NOTE(review): the imputer is fitted on every row, i.e. before the train/test
# split performed inside test_hyperparameter_all_models_random_search.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(x)

# Labels are read from a separate file and cast to a categorical target.
# NOTE(review): presumably its rows line up with the feature table - verify.
traning_data = pd.read_csv("training_smiles.csv")
y = traning_data["ACTIVE"].astype("category")

# NOTE(review): the recorded run shows a Random Forest accuracy of 1.0000;
# confirm the feature table does not also contain the label.
test_hyperparameter_all_models_random_search(X_imputed, y)


Random Forest Best Parameters: {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': True}
Random Forest Accuracy: 1.0000


KeyboardInterrupt: 

In [41]:
from skopt import BayesSearchCV

def find_hyperparameters_bayes_optimization(X, y, test_size=0.2):
    """Tune several classifiers with Bayesian optimization and score them.

    The data is split once into train and test parts. For each model,
    ``BayesSearchCV`` (50 iterations, 5-fold cross-validation on the training
    part) searches the model's space, then the tuned search object is scored on
    the held-out test part.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as the held-out test proportion.

    Returns
    -------
    dict
        Maps each model name to ``{'params': best_params, 'accuracy': score}``,
        where ``score`` is the value of ``opt.score(X_test, y_test)``.
    """
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    # Define hyperparameter search spaces for each algorithm
    models = {
        # Random Forest search is currently disabled:
        # 'Random Forest': (RandomForestClassifier(),
        #                   {'n_estimators': (10, 200),
        #                    'max_depth': (1, 20),
        #                    'min_samples_split': (2, 10),
        #                    'min_samples_leaf': (1, 10)}),
        'Support Vector Machine': (
            SVC(),
            {
                'C': (1e-6, 1e+6, 'log-uniform'),
                'gamma': (1e-6, 1e+1, 'log-uniform'),
            },
        ),
        'K-Nearest Neighbors': (
            KNeighborsClassifier(),
            {
                'n_neighbors': (1, 10),
                'weights': ['uniform', 'distance'],
            },
        ),
        'Multi-layer Perceptron': (
            # Seeded so the network's random initialisation is repeatable.
            MLPClassifier(random_state=42),
            {
                'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50, 25)],
                'activation': ['relu', 'tanh'],
                'alpha': (1e-6, 1e-2, 'log-uniform'),
            },
        ),
    }

    best_hyperparameters = {}

    for model_name, (model, params) in models.items():
        # Perform Bayesian optimization for each algorithm. The search is
        # seeded, like the train/test split above, so reruns explore the same
        # candidates and report the same best parameters.
        opt = BayesSearchCV(model, params, n_iter=50, cv=5, random_state=42)
        opt.fit(X_train, y_train)

        # Evaluate on the test set and calculate accuracy
        accuracy = opt.score(X_test, y_test)

        # Save the best hyperparameters and test accuracy for each algorithm
        best_hyperparameters[model_name] = {'params': opt.best_params_, 'accuracy': accuracy}

        # Print the results for each model
        print(f"{model_name} - Best Hyperparameters: {opt.best_params_}, Accuracy: {accuracy}")

    return best_hyperparameters

# Example usage:
# best_hyperparameters = find_hyperparameters_bayes_optimization(X, y)
# print(best_hyperparameters)



In [42]:

# Relies on `x` and `y` created by the data-loading cell (In [23]).
# Impute missing values in the feature frame; `imputation` keeps the
# per-column fill values that were used.
x_imp, imputation = create_imputation(x)

# Run the Bayesian hyperparameter search on the imputed features; the returned
# dict of best hyperparameters is the cell's last expression.
find_hyperparameters_bayes_optimization(x_imp, y)