In [None]:
!pip install optuna

In [None]:
!pip install catboost

In [None]:
!pip install scikeras

In [None]:
# Import libraries
import optuna
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier
# Load datasets
# All four feature tables are read from the same folder; keeping that folder in
# one constant means the location only has to be changed in a single place.
DATA_DIR = "/content/drive/MyDrive/NEW_WORK/work_2/5_CTDT"

# "main" tables are concatenated into the training set further down.
main_p = pd.read_csv(f"{DATA_DIR}/CTDT_main_positive_features.csv")
main_n = pd.read_csv(f"{DATA_DIR}/CTDT_main_negative_features.csv")
# "validation" tables are loaded here for later use.
validation_p = pd.read_csv(f"{DATA_DIR}/CTDT_validation_positive_features.csv")
validation_n = pd.read_csv(f"{DATA_DIR}/CTDT_validation_negative_features.csv")



In [None]:
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# Build the training set: rows of main_p first, rows of main_n after them.
X_train = pd.concat((main_p, main_n))
# Labels follow the same order: 1.0 for every main_p row, 0.0 for every main_n row.
y_train = np.repeat([1.0, 0.0], [len(main_p), len(main_n)])
def cross_val_accuracy(model, X_train, y_train):
    """Return the mean accuracy of `model` over a 5-fold stratified CV.

    The folds are shuffled with a fixed seed (42), so every call splits the
    data the same way.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=folds)
    return fold_scores.mean()

# Define the objective function for grouped models
def objective(trial, model_name, X_train, y_train):
    """Optuna objective shared by the SVM, Decision Tree and Random Forest searches.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        model_name: One of "SVM", "Decision Tree" or "Random Forest".
        X_train: Feature table passed to cross-validation.
        y_train: Labels passed to cross-validation.

    Returns:
        Mean cross-validation accuracy of the sampled model, as computed by
        `cross_val_accuracy`.

    Raises:
        ValueError: If `model_name` is not one of the supported names.
    """
    if model_name == "SVM":
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        C = trial.suggest_float('C', 1e-5, 1e5, log=True)
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
        model = SVC(kernel=kernel, C=C, gamma=gamma)

    elif model_name == "Decision Tree":
        max_depth = trial.suggest_int('max_depth', 3, 30)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
        criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
        model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split,
                                       min_samples_leaf=min_samples_leaf, criterion=criterion)

    elif model_name == "Random Forest":
        n_estimators = trial.suggest_int('n_estimators', 100, 300)
        max_depth = trial.suggest_int('max_depth', 3, 30)
        max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                        max_features=max_features, min_samples_split=min_samples_split)

    else:
        # Without this branch an unknown name reached the scoring call with
        # `model` unbound and failed with an unhelpful UnboundLocalError.
        raise ValueError(f"Unknown model_name: {model_name!r}")

    # Calculate cross-validation accuracy
    return cross_val_accuracy(model, X_train, y_train)

# Models tuned through the shared `objective` function
grouped_models = ["SVM", "Decision Tree", "Random Forest"]

# Outcome of each search: best parameter dict per model, best accuracy in model order
grouped_best_params = dict()
grouped_best_accuracies = list()

for model_name in grouped_models:
    print(f"Optimizing {model_name}...")
    # One independent 50-trial study per model, maximizing CV accuracy.
    study = optuna.create_study(direction="maximize")
    study.optimize(
        func=lambda trial: objective(trial, model_name, X_train, y_train),
        n_trials=50,
    )
    grouped_best_accuracies.append(study.best_value)
    grouped_best_params[model_name] = study.best_params

# Summary table: one row per tuned model
grouped_results = pd.DataFrame({
    'Model': grouped_models,
    'Best Accuracy': grouped_best_accuracies,
    'Best Parameters': [grouped_best_params[name] for name in grouped_models],
})
print(grouped_results)


In [None]:
#Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

def objective_gradient_boosting(trial):
    """Optuna objective for GradientBoostingClassifier.

    Samples n_estimators, learning_rate (log scale), max_depth and subsample
    from `trial`, then scores the model on the notebook-level X_train / y_train.

    Returns:
        Mean cross-validation accuracy from `cross_val_accuracy`.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)

    model = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample
    )

    return cross_val_accuracy(model, X_train, y_train)

# Gradient Boosting search: 50 trials, keeping the highest CV accuracy
study_gradient_boosting = optuna.create_study(direction="maximize")
study_gradient_boosting.optimize(func=objective_gradient_boosting, n_trials=50)

# Report the winning trial
print("Gradient Boosting Results:")
print("Best Parameters:", study_gradient_boosting.best_params)
print("Best Accuracy:", study_gradient_boosting.best_value)

# Single-row summary table for this model
gradient_boosting_results = pd.DataFrame({
    'Model': ['Gradient Boosting'],
    'Best Accuracy': [study_gradient_boosting.best_value],
    'Best Parameters': [study_gradient_boosting.best_params],
})


In [None]:
#XGBoost

from xgboost import XGBClassifier

def objective_xgboost(trial):
    """Optuna objective for XGBClassifier.

    Samples n_estimators, max_depth, learning_rate (log scale) and
    colsample_bytree from `trial`, then scores the model on the notebook-level
    X_train / y_train.

    Returns:
        Mean cross-validation accuracy from `cross_val_accuracy`.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)

    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        colsample_bytree=colsample_bytree,
        use_label_encoder=False,
        eval_metric="logloss"
    )

    return cross_val_accuracy(model, X_train, y_train)

# XGBoost search: 50 trials, keeping the highest CV accuracy
study_xgboost = optuna.create_study(direction="maximize")
study_xgboost.optimize(func=objective_xgboost, n_trials=50)

# Report the winning trial
print("XGBoost Results:")
print("Best Parameters:", study_xgboost.best_params)
print("Best Accuracy:", study_xgboost.best_value)

# Single-row summary table for this model
xgboost_results = pd.DataFrame({
    'Model': ['XGBoost'],
    'Best Accuracy': [study_xgboost.best_value],
    'Best Parameters': [study_xgboost.best_params],
})


In [None]:
#LightGBM

from lightgbm import LGBMClassifier

def objective_lightgbm(trial):
    """Optuna objective for LGBMClassifier.

    Samples n_estimators, max_depth, learning_rate (log scale) and num_leaves
    from `trial`, then scores the model on the notebook-level X_train / y_train.

    Returns:
        Mean cross-validation accuracy from `cross_val_accuracy`.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    num_leaves = trial.suggest_int('num_leaves', 20, 50)

    model = LGBMClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        num_leaves=num_leaves
    )

    return cross_val_accuracy(model, X_train, y_train)

# LightGBM search: 50 trials, keeping the highest CV accuracy
study_lightgbm = optuna.create_study(direction="maximize")
study_lightgbm.optimize(func=objective_lightgbm, n_trials=50)

# Report the winning trial
print("LightGBM Results:")
print("Best Parameters:", study_lightgbm.best_params)
print("Best Accuracy:", study_lightgbm.best_value)

# Single-row summary table for this model
lightgbm_results = pd.DataFrame({
    'Model': ['LightGBM'],
    'Best Accuracy': [study_lightgbm.best_value],
    'Best Parameters': [study_lightgbm.best_params],
})


In [None]:
#CatBoost

def objective_catboost(trial):
    """Optuna objective for CatBoostClassifier.

    Samples depth, learning_rate (log scale), iterations and subsample from
    `trial`, then scores the (silent, verbose=0) model on the notebook-level
    X_train / y_train.

    Returns:
        Mean cross-validation accuracy from `cross_val_accuracy`.
    """
    depth = trial.suggest_int('depth', 3, 12)
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    iterations = trial.suggest_int('iterations', 50, 200)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    model = CatBoostClassifier(depth=depth, learning_rate=learning_rate,
                               iterations=iterations, subsample=subsample, verbose=0)
    return cross_val_accuracy(model, X_train, y_train)

# CatBoost search: 50 trials, keeping the highest CV accuracy
study_catboost = optuna.create_study(direction="maximize")
study_catboost.optimize(func=objective_catboost, n_trials=50)
# Report the winning parameter set
print("CatBoost Results:", study_catboost.best_params)


In [None]:
#AdaBoost

def objective_adaboost(trial):
    """Optuna objective for AdaBoostClassifier.

    Samples n_estimators, learning_rate (log scale) and the boosting algorithm
    from `trial`, then scores the model on the notebook-level X_train / y_train.

    Returns:
        Mean cross-validation accuracy from `cross_val_accuracy`.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    # NOTE(review): recent scikit-learn releases deprecate/remove 'SAMME.R' —
    # confirm this choice is still accepted by the installed version.
    algorithm = trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R'])
    model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algorithm)
    return cross_val_accuracy(model, X_train, y_train)

# AdaBoost search: 50 trials, keeping the highest CV accuracy
study_adaboost = optuna.create_study(direction="maximize")
study_adaboost.optimize(func=objective_adaboost, n_trials=50)
# Report the winning parameter set
print("AdaBoost Results:", study_adaboost.best_params)


In [None]:
#Neural Network
def create_nn(num_units, dropout_rate, learning_rate, input_shape):
    """Build and compile a binary classifier with two equally sized hidden layers.

    Args:
        num_units: Width of each of the two ReLU hidden layers.
        dropout_rate: Dropout rate applied after each hidden layer.
        learning_rate: Learning rate passed to the Adam optimizer.
        input_shape: Input shape passed to the first Dense layer.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, accuracy metric).
    """
    layers = [
        Dense(num_units, activation='relu', input_shape=input_shape),
        Dropout(dropout_rate),
        Dense(num_units, activation='relu'),
        Dropout(dropout_rate),
        # Single sigmoid unit: the output is read as a probability.
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    optimizer = Adam(learning_rate=learning_rate)
    network.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return network

def objective_nn(trial):
    """Optuna objective for the Keras network built by `create_nn`.

    Samples num_units, dropout_rate and learning_rate (log scale) from `trial`,
    wraps `create_nn` in a scikeras KerasClassifier (5 epochs, batch size 32)
    and scores it on the notebook-level X_train / y_train.

    Returns:
        Mean cross-validation accuracy from `cross_val_accuracy`.
    """
    num_units = trial.suggest_int('num_units', 32, 256)
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
    # scikeras names the builder argument `model`; `build_fn` is its deprecated alias.
    model = KerasClassifier(model=create_nn, num_units=num_units,
                            dropout_rate=dropout_rate, learning_rate=learning_rate,
                            input_shape=(X_train.shape[1],), epochs=5, batch_size=32, verbose=0)
    return cross_val_accuracy(model, X_train, y_train)

# Neural network search: 50 trials, keeping the highest CV accuracy
study_nn = optuna.create_study(direction="maximize")
study_nn.optimize(func=objective_nn, n_trials=50)
# Report the winning parameter set
print("Neural Network Results:", study_nn.best_params)

In [None]:
#MLP

from sklearn.neural_network import MLPClassifier

def objective_mlp(trial):
    """Optuna objective for scikit-learn's MLPClassifier.

    Samples hidden_layer_sizes, alpha (log scale), learning_rate_init (log
    scale) and activation from `trial`, then scores the model (max_iter=300)
    on the notebook-level X_train / y_train.

    Returns:
        Mean cross-validation accuracy from `cross_val_accuracy`.
    """
    hidden_layer_sizes = trial.suggest_categorical('hidden_layer_sizes', [(32,), (64,), (128,), (64, 64)])
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    alpha = trial.suggest_float('alpha', 1e-5, 1e-1, log=True)
    learning_rate_init = trial.suggest_float('learning_rate_init', 1e-5, 1e-1, log=True)
    activation = trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu'])
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, alpha=alpha,
                          learning_rate_init=learning_rate_init, activation=activation, max_iter=300)
    return cross_val_accuracy(model, X_train, y_train)

# MLP search: 50 trials, keeping the highest CV accuracy
study_mlp = optuna.create_study(direction="maximize")
study_mlp.optimize(func=objective_mlp, n_trials=50)
# Report the winning parameter set
print("MLP Results:", study_mlp.best_params)


In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier

# Start the output table with the label column; one probability column per model is added later
probability_dataset = pd.DataFrame({"Target": y_train})

def add_probabilities(model, model_name, X_train, probability_dataset):
    """Score X_train with a fitted model and store the scores as a new column.

    Args:
        model: Fitted estimator exposing `predict_proba` or, failing that, `predict`.
        model_name: Name of the column to write.
        X_train: Rows to score.
        probability_dataset: DataFrame that receives the column (modified in place).

    Returns:
        The same `probability_dataset` object, with the `model_name` column set.
    """
    scores = (
        # Column 1 of predict_proba is taken as the positive-class probability.
        model.predict_proba(X_train)[:, 1]
        if hasattr(model, "predict_proba")
        # Fallback for models without predict_proba: flatten predict() output.
        else model.predict(X_train).flatten()
    )
    probability_dataset[model_name] = scores
    return probability_dataset

# Train and add probabilities for each algorithm
#
# Each model is refit on the full training set with its tuned hyperparameters,
# and its positive-class score for every training row is appended to
# `probability_dataset` as one column.
# NOTE(review): these are in-sample predictions (each model scores rows it was
# fitted on). If this table feeds a second-stage model, out-of-fold predictions
# would avoid leakage — confirm intent.

# 1. SVM
print("Training SVM...")
# probability=True is required for SVC to expose predict_proba.
svm_model = SVC(**grouped_best_params["SVM"], probability=True)
svm_model.fit(X_train, y_train)
probability_dataset = add_probabilities(svm_model, "SVM", X_train, probability_dataset)

# 2. Decision Tree
print("Training Decision Tree...")
decision_tree_model = DecisionTreeClassifier(**grouped_best_params["Decision Tree"])
decision_tree_model.fit(X_train, y_train)
probability_dataset = add_probabilities(decision_tree_model, "Decision_Tree", X_train, probability_dataset)

# 3. Random Forest
print("Training Random Forest...")
random_forest_model = RandomForestClassifier(**grouped_best_params["Random Forest"])
random_forest_model.fit(X_train, y_train)
probability_dataset = add_probabilities(random_forest_model, "Random_Forest", X_train, probability_dataset)

# 4. Gradient Boosting
print("Training Gradient Boosting...")
gradient_boosting_model = GradientBoostingClassifier(**study_gradient_boosting.best_params)
gradient_boosting_model.fit(X_train, y_train)
probability_dataset = add_probabilities(gradient_boosting_model, "Gradient_Boosting", X_train, probability_dataset)

# 5. XGBoost
print("Training XGBoost...")
xgboost_model = XGBClassifier(**study_xgboost.best_params, use_label_encoder=False, eval_metric="logloss")
xgboost_model.fit(X_train, y_train)
probability_dataset = add_probabilities(xgboost_model, "XGBoost", X_train, probability_dataset)

# 6. LightGBM
print("Training LightGBM...")
lightgbm_model = LGBMClassifier(**study_lightgbm.best_params)
lightgbm_model.fit(X_train, y_train)
probability_dataset = add_probabilities(lightgbm_model, "LightGBM", X_train, probability_dataset)

# 7. CatBoost
print("Training CatBoost...")
catboost_model = CatBoostClassifier(**study_catboost.best_params, verbose=0)
catboost_model.fit(X_train, y_train)
probability_dataset = add_probabilities(catboost_model, "CatBoost", X_train, probability_dataset)

# 8. AdaBoost
print("Training AdaBoost...")
adaboost_model = AdaBoostClassifier(**study_adaboost.best_params)
adaboost_model.fit(X_train, y_train)
probability_dataset = add_probabilities(adaboost_model, "AdaBoost", X_train, probability_dataset)

# 9. Logistic Regression
print("Training Logistic Regression...")
# The tuning loop visible in this notebook only fills grouped_best_params for
# SVM / Decision Tree / Random Forest, so a direct ["Logistic Regression"]
# lookup raised KeyError. Use tuned parameters when present, otherwise the
# estimator defaults.
logistic_model = LogisticRegression(**grouped_best_params.get("Logistic Regression", {}))
logistic_model.fit(X_train, y_train)
probability_dataset = add_probabilities(logistic_model, "Logistic_Regression", X_train, probability_dataset)

# 10. k-NN
print("Training k-NN...")
# Same fallback as Logistic Regression: no "k-NN" entry is produced by the
# visible tuning loop.
knn_model = KNeighborsClassifier(**grouped_best_params.get("k-NN", {}))
knn_model.fit(X_train, y_train)
probability_dataset = add_probabilities(knn_model, "k-NN", X_train, probability_dataset)

# 11. Naive Bayes
print("Training Naive Bayes...")
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train)
probability_dataset = add_probabilities(naive_bayes_model, "Naive_Bayes", X_train, probability_dataset)

# 12. Neural Network
print("Training Neural Network...")
# Build the network through create_nn so the tuned learning rate is applied
# together with the tuned width and dropout. The previous inline model was
# compiled with optimizer='adam', which silently dropped the tuned learning_rate.
nn_model = create_nn(
    num_units=study_nn.best_params["num_units"],
    dropout_rate=study_nn.best_params["dropout_rate"],
    learning_rate=study_nn.best_params["learning_rate"],
    input_shape=(X_train.shape[1],),
)
# Same training budget as used during the search (5 epochs, batch size 32).
nn_model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=0)
nn_probs = nn_model.predict(X_train).flatten()
probability_dataset["Neural_Network"] = nn_probs

# 13. MLP
print("Training MLP...")
mlp_model = MLPClassifier(**study_mlp.best_params, max_iter=300)
mlp_model.fit(X_train, y_train)
probability_dataset = add_probabilities(mlp_model, "MLP", X_train, probability_dataset)

# Save the Combined Probability Dataset
output_path = "/content/CTDT_OPTUNA_Dataset.csv"
probability_dataset.to_csv(output_path, index=False)
print(f"Combined probability dataset saved at {output_path}")
