# DPC

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


CPFV (Combined Probability and Class Feature Vector)

In [2]:
#marge the column
import pandas as pd

# Define the file paths and prefixes
data_paths = {
    "ACC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CPFV (Combined Probability and Class Feature Vector)/CPFV_AAC.csv",
    "CTDC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CPFV (Combined Probability and Class Feature Vector)/CPFV_CTDC.csv",
    "CTD": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CPFV (Combined Probability and Class Feature Vector)/CPFV_CTD.csv",
    "GDC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CPFV (Combined Probability and Class Feature Vector)/CPFV_GDC.csv",
    "PAAC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CPFV (Combined Probability and Class Feature Vector)/CPFV_PAAC.csv",
    "PCP": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CPFV (Combined Probability and Class Feature Vector)/CPFV_PCP.csv",
    "TPC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CPFV (Combined Probability and Class Feature Vector)/CPFV_TPC.csv",
    "CTDT": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CPFV (Combined Probability and Class Feature Vector)/CPFV_CTDT.csv",
    "DPC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CPFV (Combined Probability and Class Feature Vector)/CPFV_DPC.csv",
    "CTDD": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CPFV (Combined Probability and Class Feature Vector)/CPFV_CTDD.csv",
}

processed_dfs = []
final_target_column = None  # To store the unique Target column

# Process each file in the dictionary
for prefix, path in data_paths.items():
    # Load the dataset
    df = pd.read_csv(path)

    # If a 'Target' column exists, store it and ensure only one is retained
    if 'Target' in df.columns:
        if final_target_column is None:
            final_target_column = df['Target']  # Retain the first Target column
        df.drop(columns=['Target'], inplace=True)  # Drop from the current dataset

    # Drop the 'True_Label' column if it exists
    df.drop(columns=['True_Label'], inplace=True, errors='ignore')

    # Rename columns with the prefix
    df = df.rename(columns=lambda col: f"{col}_{prefix}" if 'probabilities' not in col else col)

    # Append the processed DataFrame to the list
    processed_dfs.append(df)

# Concatenate all DataFrames column-wise
combined_df = pd.concat(processed_dfs, axis=1)

# Add the retained 'Target' column to the final dataset
if final_target_column is not None:
    combined_df['Target'] = final_target_column

# Save the combined DataFrame
output_file = "/content/Dataset Marge CPFV.csv"
combined_df.to_csv(output_file, index=False)

print(f"Processed dataset saved to: {output_file}")

Processed dataset saved to: /content/Dataset Marge CPFV.csv


In [3]:
# Check the shape of the merged dataset
print("Shape of the merged dataset:", combined_df.shape)

Shape of the merged dataset: (300, 240)


In [5]:
df=pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CPFV (Combined Probability and Class Feature Vector)/Optuna_Dataset Marge CPFV.csv")

In [6]:
df.shape

(300, 241)

In [7]:
!pip install deap

Collecting deap
  Downloading deap-1.4.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading deap-1.4.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deap
Successfully installed deap-1.4.2


In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from deap import base, creator, tools, algorithms
import random

# Load the dataset
data_path = '/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CPFV (Combined Probability and Class Feature Vector)/Optuna_Dataset Marge CPFV.csv'
data = pd.read_csv(data_path)

# Assuming the last column is the target, split features and labels
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target

# Encode target labels if necessary
if y.dtype == object or np.issubdtype(y.dtype, np.number):  # Handle both string and numeric labels
    le = LabelEncoder()
    y = le.fit_transform(y.astype(str))  # Ensure all targets are treated as strings for classification

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the individual and fitness functions
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Classifier to evaluate fitness
def evaluate(individual):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    if len(selected_features) == 0:  # Prevent division by zero
        return 0,

    X_train_selected = X_train.iloc[:, selected_features]
    X_val_selected = X_val.iloc[:, selected_features]

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_val_selected)

    accuracy = accuracy_score(y_val, y_pred)
    return accuracy,

toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate)

# Parameters for the Genetic Algorithm
population_size = 50
generations = 20
crossover_probability = 0.8
mutation_probability = 0.1

# Initialize population
population = toolbox.population(n=population_size)

# Run the Genetic Algorithm
result_population, logbook = algorithms.eaSimple(
    population,
    toolbox,
    cxpb=crossover_probability,
    mutpb=mutation_probability,
    ngen=generations,
    verbose=True
)

# Find the best individual
best_individual = tools.selBest(result_population, k=1)[0]
selected_features = [i for i, bit in enumerate(best_individual) if bit == 1 and i < len(X.columns)]  # Bounds check

# Select top 20 features based on their importance
if len(selected_features) > 15:
    feature_importances = pd.Series(best_individual).sort_values(ascending=False)
    selected_features = list(feature_importances.head(15).index)

# Evaluate performance using the top 15 features
X_train_selected = X_train.iloc[:, selected_features]
X_val_selected = X_val.iloc[:, selected_features]

final_model = RandomForestClassifier(random_state=42)
final_model.fit(X_train_selected, y_train)
final_predictions = final_model.predict(X_val_selected)
final_accuracy = accuracy_score(y_val, final_predictions)

print(f"Top 15 Selected features: {selected_features}")
print(f"Final Accuracy with top 15 selected features: {final_accuracy}")

# Save the top 15 selected features
pd.DataFrame({'Selected Features': selected_features}).to_csv('/content/CPFV_top_15_selected_features.csv', index=False)


gen	nevals
0  	50    
1  	45    
2  	45    
3  	42    
4  	44    
5  	40    
6  	37    
7  	38    
8  	38    
9  	44    
10 	37    
11 	42    
12 	41    
13 	30    
14 	41    
15 	43    
16 	44    
17 	48    
18 	41    
19 	40    
20 	40    
Top 15 Selected features: [0, 78, 133, 138, 139, 140, 142, 148, 155, 156, 162, 164, 165, 167, 169]
Final Accuracy with top 15 selected features: 0.9166666666666666


In [10]:
selected_feature_columns = data.columns[selected_features]
# Create a filtered dataset with only the selected top 15 features
filtered_data = data[selected_feature_columns.tolist() + [data.columns[-1]]]
filtered_data_path = '/content/CPFV_Top_15_Features.csv'
filtered_data.to_csv(filtered_data_path, index=False)

print(f"Filtered dataset with top 15 features saved to: {filtered_data_path}")

Filtered dataset with top 15 features saved to: /content/CPFV_Top_15_Features.csv


In [12]:
data=pd.read_csv("/content/CPFV_Top_15_Features.csv")


In [13]:
data.shape

(300, 16)

Class Feature Vector (CFV)

In [None]:
#marge the column
import pandas as pd

# Define the file paths and prefixes
data_paths = {
    "ACC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Class Feature Vector (CFV)/CFV_AAC.csv",
    "CTDC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Class Feature Vector (CFV)/CFV_CTDC (1).csv",
    "CTD": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Class Feature Vector (CFV)/CFV_CTD (3).csv",
    "GDC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Class Feature Vector (CFV)/CFV_GDC.csv",
    "PAAC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Class Feature Vector (CFV)/CFV_PAAC.csv",
    "PCP": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Class Feature Vector (CFV)/CFV_PCP.csv",
    "TPC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Class Feature Vector (CFV)/CFV_TPC.csv",
    "CTDT": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Class Feature Vector (CFV)/CFV_CTDT.csv",
    "DPC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Class Feature Vector (CFV)/CFV_DPC (1).csv",
    "CTDD": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Class Feature Vector (CFV)/CFV_CTDD.csv",
}

processed_dfs = []
final_target_column = None  # To store the unique Target column

# Process each file in the dictionary
for prefix, path in data_paths.items():
    # Load the dataset
    df = pd.read_csv(path)

    # If a 'Target' column exists, store it and ensure only one is retained
    if 'Target' in df.columns:
        if final_target_column is None:
            final_target_column = df['Target']  # Retain the first Target column
        df.drop(columns=['Target'], inplace=True)  # Drop from the current dataset

    # Drop the 'True_Label' column if it exists
    df.drop(columns=['True_Label'], inplace=True, errors='ignore')

    # Rename columns with the prefix
    df = df.rename(columns=lambda col: f"{col}_{prefix}" if 'probabilities' not in col else col)

    # Append the processed DataFrame to the list
    processed_dfs.append(df)

# Concatenate all DataFrames column-wise
combined_df = pd.concat(processed_dfs, axis=1)

# Add the retained 'Target' column to the final dataset
if final_target_column is not None:
    combined_df['Target'] = final_target_column

# Save the combined DataFrame
output_file = "/content/Optuna_Dataset Marge CFV.csv"
combined_df.to_csv(output_file, index=False)

print(f"Processed dataset saved to: {output_file}")

Processed dataset saved to: /content/Optuna_Dataset Marge CFV.csv


In [None]:
# Check the shape of the merged dataset
print("Shape of the merged dataset:", combined_df.shape)

Shape of the merged dataset: (300, 120)


In [None]:
df=pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Class Feature Vector (CFV)/Optuna_Dataset Marge CFV.csv")

In [None]:
df.shape

(300, 121)

In [None]:
!pip install deap



In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from deap import base, creator, tools, algorithms
import random

# Load the dataset
data_path = '/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Class Feature Vector (CFV)/Optuna_Dataset Marge CFV.csv'
data = pd.read_csv(data_path)

# Assuming the last column is the target, split features and labels
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target

# Encode target labels if necessary
if y.dtype == object or np.issubdtype(y.dtype, np.number):  # Handle both string and numeric labels
    le = LabelEncoder()
    y = le.fit_transform(y.astype(str))  # Ensure all targets are treated as strings for classification

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the individual and fitness functions
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Classifier to evaluate fitness
def evaluate(individual):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    if len(selected_features) == 0:  # Prevent division by zero
        return 0,

    X_train_selected = X_train.iloc[:, selected_features]
    X_val_selected = X_val.iloc[:, selected_features]

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_val_selected)

    accuracy = accuracy_score(y_val, y_pred)
    return accuracy,

toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate)

# Parameters for the Genetic Algorithm
population_size = 50
generations = 20
crossover_probability = 0.8
mutation_probability = 0.1

# Initialize population
population = toolbox.population(n=population_size)

# Run the Genetic Algorithm
result_population, logbook = algorithms.eaSimple(
    population,
    toolbox,
    cxpb=crossover_probability,
    mutpb=mutation_probability,
    ngen=generations,
    verbose=True
)

# Find the best individual
best_individual = tools.selBest(result_population, k=1)[0]
selected_features = [i for i, bit in enumerate(best_individual) if bit == 1 and i < len(X.columns)]  # Bounds check

# Select top 15 features based on their importance
if len(selected_features) > 15:
    feature_importances = pd.Series(best_individual).sort_values(ascending=False)
    selected_features = list(feature_importances.head(15).index)

# Evaluate performance using the top 15 features
X_train_selected = X_train.iloc[:, selected_features]
X_val_selected = X_val.iloc[:, selected_features]

final_model = RandomForestClassifier(random_state=42)
final_model.fit(X_train_selected, y_train)
final_predictions = final_model.predict(X_val_selected)
final_accuracy = accuracy_score(y_val, final_predictions)

print(f"Top 15 Selected features: {selected_features}")
print(f"Final Accuracy with top 15 selected features: {final_accuracy}")

# Save the top 20 selected features
pd.DataFrame({'Selected Features': selected_features}).to_csv('/content/CFV_top_15_selected_features.csv', index=False)




gen	nevals
0  	50    
1  	40    
2  	38    
3  	36    
4  	35    
5  	46    
6  	42    
7  	46    
8  	47    
9  	37    
10 	36    
11 	37    
12 	48    
13 	48    
14 	43    
15 	43    
16 	41    
17 	39    
18 	42    
19 	34    
20 	41    
Top 15 Selected features: [60, 73, 36, 46, 48, 50, 55, 56, 58, 59, 64, 65, 67, 69, 71]
Final Accuracy with top 15 selected features: 0.9166666666666666


In [15]:
selected_feature_columns = data.columns[selected_features]
# Create a filtered dataset with only the selected top 15 features
filtered_data = data[selected_feature_columns.tolist() + [data.columns[-1]]]
filtered_data_path = '/content/CFV_Top_15_Features.csv'
filtered_data.to_csv(filtered_data_path, index=False)

print(f"Filtered dataset with top 15 features saved to: {filtered_data_path}")

Filtered dataset with top 15 features saved to: /content/CFV_Top_15_Features.csv


PFV (Probability Feature Vector)

In [None]:
#marge the column
import pandas as pd

# Define the file paths and prefixes
data_paths = {
    "ACC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Probabilites Dataset Marge/N_AAC_OPTUNA_probability_predictions.csv",
    "CTDC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Probabilites Dataset Marge/N_CTDC_OPTUNA_probability_predictions.csv",
    "CTD": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Probabilites Dataset Marge/N_CTD_OPTUNA_probability_predictions.csv",
    "GDC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Probabilites Dataset Marge/N_GDC_OPTUNA_probability_predictions.csv",
    "PAAC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Probabilites Dataset Marge/N_PAAC_OPTUNA_probability_predictions.csv",
    "PCP": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Probabilites Dataset Marge/N_PCP_OPTUNA_probability_predictions.csv",
    "TPC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Probabilites Dataset Marge/N_TPC_OPTUNA_probability_predictions.csv",
    "CTDT": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Probabilites Dataset Marge/N_CTDT_OPTUNA_probability_predictions.csv",
    "DPC": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Probabilites Dataset Marge/N_DPC_OPTUNA_probability_predictions.csv",
    "CTDD": "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Probabilites Dataset Marge/N_CTDD_OPTUNA_probability_predictions.csv",
}

processed_dfs = []
final_target_column = None  # To store the unique Target column

# Process each file in the dictionary
for prefix, path in data_paths.items():
    # Load the dataset
    df = pd.read_csv(path)

    # If a 'Target' column exists, store it and ensure only one is retained
    if 'Target' in df.columns:
        if final_target_column is None:
            final_target_column = df['Target']  # Retain the first Target column
        df.drop(columns=['Target'], inplace=True)  # Drop from the current dataset

    # Drop the 'True_Label' column if it exists
    df.drop(columns=['True_Label'], inplace=True, errors='ignore')

    # Rename columns with the prefix
    df = df.rename(columns=lambda col: f"{col}_{prefix}" if 'probabilities' not in col else col)

    # Append the processed DataFrame to the list
    processed_dfs.append(df)

# Concatenate all DataFrames column-wise
combined_df = pd.concat(processed_dfs, axis=1)

# Add the retained 'Target' column to the final dataset
if final_target_column is not None:
    combined_df['Target'] = final_target_column

# Save the combined DataFrame
output_file = "/content/Optuna_Dataset Marge PFV.csv"
combined_df.to_csv(output_file, index=False)

print(f"Processed dataset saved to: {output_file}")

Processed dataset saved to: /content/Optuna_Dataset Marge PFV.csv


In [None]:
# Check the shape of the merged dataset
print("Shape of the merged dataset:", combined_df.shape)

Shape of the merged dataset: (300, 121)


In [None]:
df=pd.read_csv("/content/Optuna_Dataset Marge PFV.csv")

In [None]:
df.shape

(300, 121)

In [None]:
df.columns

Index(['SVM_ACC', 'Decision Tree_ACC', 'Random Forest_ACC',
       'Logistic Regression_ACC', 'k-NN_ACC', 'Naive Bayes_ACC',
       'Gradient Boosting_ACC', 'XGBoost_ACC', 'LightGBM_ACC', 'AdaBoost_ACC',
       ...
       'Logistic Regression_CTDD', 'k-NN_CTDD', 'Naive Bayes_CTDD',
       'Gradient Boosting_CTDD', 'XGBoost_CTDD', 'LightGBM_CTDD',
       'AdaBoost_CTDD', 'Neural Network_CTDD', 'MLP_CTDD', 'Target'],
      dtype='object', length=121)

In [None]:
!pip install deap



In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from deap import base, creator, tools, algorithms
import random

# Load the dataset
data_path = '/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/Probabilites Dataset Marge/Optuna_Dataset Marge PFV (1).csv'
data = pd.read_csv(data_path)

# Assuming the last column is the target, split features and labels
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target

# Encode target labels if necessary
if y.dtype == object or np.issubdtype(y.dtype, np.number):  # Handle both string and numeric labels
    le = LabelEncoder()
    y = le.fit_transform(y.astype(str))  # Ensure all targets are treated as strings for classification

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the individual and fitness functions
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Classifier to evaluate fitness
def evaluate(individual):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    if len(selected_features) == 0:  # Prevent division by zero
        return 0,

    X_train_selected = X_train.iloc[:, selected_features]
    X_val_selected = X_val.iloc[:, selected_features]

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_val_selected)

    accuracy = accuracy_score(y_val, y_pred)
    return accuracy,

toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate)

# Parameters for the Genetic Algorithm
population_size = 50
generations = 20
crossover_probability = 0.8
mutation_probability = 0.1

# Initialize population
population = toolbox.population(n=population_size)

# Run the Genetic Algorithm
result_population, logbook = algorithms.eaSimple(
    population,
    toolbox,
    cxpb=crossover_probability,
    mutpb=mutation_probability,
    ngen=generations,
    verbose=True
)

# Find the best individual
best_individual = tools.selBest(result_population, k=1)[0]
selected_features = [i for i, bit in enumerate(best_individual) if bit == 1 and i < len(X.columns)]  # Bounds check

# Select top 15 features based on their importance
if len(selected_features) > 15:
    feature_importances = pd.Series(best_individual).sort_values(ascending=False)
    selected_features = list(feature_importances.head(15).index)

# Evaluate performance using the top 15 features
X_train_selected = X_train.iloc[:, selected_features]
X_val_selected = X_val.iloc[:, selected_features]

final_model = RandomForestClassifier(random_state=42)
final_model.fit(X_train_selected, y_train)
final_predictions = final_model.predict(X_val_selected)
final_accuracy = accuracy_score(y_val, final_predictions)

print(f"Top 15 Selected features: {selected_features}")
print(f"Final Accuracy with top 15 selected features: {final_accuracy}")

# Save the top 20 selected features
pd.DataFrame({'Selected Features': selected_features}).to_csv('/content/PFV_top_15_selected_features.csv', index=False)




gen	nevals
0  	50    
1  	36    
2  	41    
3  	43    
4  	42    
5  	36    
6  	38    
7  	35    
8  	40    
9  	46    
10 	41    
11 	38    
12 	45    
13 	45    
14 	43    
15 	34    
16 	41    
17 	39    
18 	44    
19 	40    
20 	39    
Top 15 Selected features: [0, 36, 75, 74, 72, 70, 69, 68, 66, 65, 55, 52, 51, 49, 48]
Final Accuracy with top 15 selected features: 0.8833333333333333


In [17]:
selected_feature_columns = data.columns[selected_features]
# Create a filtered dataset with only the selected top 15 features
filtered_data = data[selected_feature_columns.tolist() + [data.columns[-1]]]
filtered_data_path = '/content/PFV_Top_15_Features.csv'
filtered_data.to_csv(filtered_data_path, index=False)

print(f"Filtered dataset with top 15 features saved to: {filtered_data_path}")

Filtered dataset with top 15 features saved to: /content/PFV_Top_15_Features.csv


Three technique dataset are marge

In [18]:
#marge the column
import pandas as pd

# Define the file paths and prefixes
data_paths = {
    "CFV": "/content/CFV_Top_15_Features.csv",
    "CPFV": "/content/CPFV_Top_15_Features.csv",
    "PFV": "/content/PFV_Top_15_Features.csv",
}

processed_dfs = []
final_target_column = None  # To store the unique Target column

# Process each file in the dictionary
for prefix, path in data_paths.items():
    # Load the dataset
    df = pd.read_csv(path)

    # If a 'Target' column exists, store it and ensure only one is retained
    if 'Target' in df.columns:
        if final_target_column is None:
            final_target_column = df['Target']  # Retain the first Target column
        df.drop(columns=['Target'], inplace=True)  # Drop from the current dataset

    # Drop the 'True_Label' column if it exists
    df.drop(columns=['True_Label'], inplace=True, errors='ignore')

    # Rename columns with the prefix
    df = df.rename(columns=lambda col: f"{col}_{prefix}" if 'probabilities' not in col else col)

    # Append the processed DataFrame to the list
    processed_dfs.append(df)

# Concatenate all DataFrames column-wise
combined_df = pd.concat(processed_dfs, axis=1)

# Add the retained 'Target' column to the final dataset
if final_target_column is not None:
    combined_df['Target'] = final_target_column

# Save the combined DataFrame
output_file = "/content/three_T_Marge.csv"
combined_df.to_csv(output_file, index=False)

print(f"Processed dataset saved to: {output_file}")

Processed dataset saved to: /content/three_T_Marge.csv


In [19]:
df=pd.read_csv("/content/three_T_Marge.csv")

In [20]:
df.shape

(300, 46)

Hybrid CNN-LSTM Model for Cell-Penetrating Peptide Classification

In [21]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Load the dataset
dataset_path = "/content/three_T_Marge.csv"
data = pd.read_csv(dataset_path)

# Inspect the dataset (Optional: Uncomment if needed to verify column names)
print("Dataset Columns:", data.columns)
print(data.head())

# Ensure the dataset contains a 'Target' column for binary classification
if 'Target' not in data.columns:
    raise ValueError("The dataset must include a 'Target' column for binary classification.")

# Separate features and labels
X = data.drop(columns=['Target']).values  # Features (all columns except 'Target')
y = data['Target'].values                 # Labels (the 'Target' column)

# Split data into training and validation sets (80-20 split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Reshape data for Conv1D input
X_train = X_train[..., np.newaxis]  # Adding channel dimension
X_val = X_val[..., np.newaxis]      # Adding channel dimension

# Model Architecture
model = Sequential()

# Stacked Conv1D layers with BatchNormalization and Dropout
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Conv1D(filters=256, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

# LSTM layer for sequential dependencies
model.add(LSTM(64, return_sequences=False, activation='relu'))

# Dense Layers for final prediction with Dropout for regularization
model.add(Dense(128, activation='swish'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='swish'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model Summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=32, verbose=1)

# Evaluate the model on the validation data
val_predictions = (model.predict(X_val) > 0.5).astype(int)
accuracy = accuracy_score(y_val, val_predictions)

print("\nValidation Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_val, val_predictions))




Dataset Columns: Index(['SVM_PCP_CFV', 'Decision Tree_TPC_CFV', 'SVM_GDC_CFV',
       'Neural Network_GDC_CFV', 'SVM_PAAC_CFV', 'Random Forest_PAAC_CFV',
       'XGBoost_PAAC_CFV', 'LightGBM_PAAC_CFV', 'Neural Network_PAAC_CFV',
       'MLP_PAAC_CFV', 'k-NN_PCP_CFV', 'Naive Bayes_PCP_CFV',
       'XGBoost_PCP_CFV', 'AdaBoost_PCP_CFV', 'MLP_PCP_CFV',
       'SVM_Class_ACC_CPFV', 'Logistic Regression_Class_GDC_CPFV',
       'Gradient Boosting_Prob_PCP_CPFV', 'AdaBoost_Class_PCP_CPFV',
       'AdaBoost_Prob_PCP_CPFV',
       'Neural Network (MLPClassifier)_Class_PCP_CPFV',
       'Multilayer Perceptron (Custom MLP)_Class_PCP_CPFV',
       'Random Forest_Class_TPC_CPFV', 'Naive Bayes_Prob_TPC_CPFV',
       'Gradient Boosting_Class_TPC_CPFV', 'AdaBoost_Class_TPC_CPFV',
       'Neural Network (MLPClassifier)_Class_TPC_CPFV',
       'Neural Network (MLPClassifier)_Prob_TPC_CPFV',
       'Multilayer Perceptron (Custom MLP)_Prob_TPC_CPFV',
       'SVM_Prob_CTDT_CPFV', 'SVM_ACC_PFV', 'SVM_GDC_PF

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 124ms/step - accuracy: 0.7756 - loss: 0.4972 - val_accuracy: 0.8889 - val_loss: 0.6517
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.8987 - loss: 0.5374 - val_accuracy: 0.8333 - val_loss: 0.6514
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.9011 - loss: 0.2552 - val_accuracy: 0.8111 - val_loss: 0.6474
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.9262 - loss: 0.2188 - val_accuracy: 0.7889 - val_loss: 0.6427
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.9351 - loss: 0.1633 - val_accuracy: 0.7778 - val_loss: 0.6397
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - accuracy: 0.9323 - loss: 0.2195 - val_accuracy: 0.6556 - val_loss: 0.6464
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━

Deep learning approach combining Conv1D, LSTM, and Dense layers

In [22]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Load the combined AAC dataset
dataset_path = "/content/three_T_Marge.csv"
data = pd.read_csv(dataset_path)

# Ensure the dataset contains the 'Target' column
if 'Target' not in data.columns:
    raise ValueError("The dataset must include a 'Target' column for binary classification.")

# Separate features and labels
X = data.drop(columns=['Target']).values  # Features (all columns except 'Target')
y = data['Target'].values                 # Labels (the 'Target' column)

# Split data into training and validation sets (80-20 split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Reshape input for Conv1D
X_train = X_train[..., np.newaxis]  # Adding channel dimension for Conv1D input
X_val = X_val[..., np.newaxis]      # Adding channel dimension for Conv1D input

# Model Architecture
model = Sequential()

# Stacked Conv1D layers with BatchNormalization and Dropout
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1), padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Conv1D(filters=256, kernel_size=3, activation='relu', padding='same'))
model.add(BatchNormalization())
# Removed problematic pooling layer here to avoid negative dimension issue
model.add(Dropout(0.3))

# LSTM layer for sequential dependencies
model.add(LSTM(64, return_sequences=False, activation='relu'))

# Dense Layers for final prediction with Dropout for regularization
model.add(Dense(128, activation='swish'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='swish'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))   # Output layer for binary classification

# Compile the Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model Summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=32, verbose=1)

# Evaluate the model
val_predictions = (model.predict(X_val) > 0.5).astype(int)
accuracy = accuracy_score(y_val, val_predictions)

print("\nValidation Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_val, val_predictions))

# Save the trained model
model.save("mmCombinedDataset_AAC_model.h5")
print("Model saved as 'mmCombinedDataset_AAC_model.h5'.")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 128ms/step - accuracy: 0.7081 - loss: 0.5305 - val_accuracy: 0.8556 - val_loss: 0.6583
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.8933 - loss: 0.3455 - val_accuracy: 0.7778 - val_loss: 0.6563
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 77ms/step - accuracy: 0.9156 - loss: 0.2331 - val_accuracy: 0.7444 - val_loss: 0.6541
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 71ms/step - accuracy: 0.9031 - loss: 0.2202 - val_accuracy: 0.7889 - val_loss: 0.6389
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 73ms/step - accuracy: 0.9051 - loss: 0.2379 - val_accuracy: 0.6111 - val_loss: 0.6227
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 77ms/step - accuracy: 0.9147 - loss: 0.2378 - val_accuracy: 0.6000 - val_loss: 0.6248
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━




Validation Accuracy: 0.9111111111111111

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.93      0.91        45
           1       0.93      0.89      0.91        45

    accuracy                           0.91        90
   macro avg       0.91      0.91      0.91        90
weighted avg       0.91      0.91      0.91        90

Model saved as 'mmCombinedDataset_AAC_model.h5'.


PeptidePredictor_CNN

In [23]:
from keras.models import Model
from keras.layers import Input, Conv1D, BatchNormalization, Dropout, GlobalMaxPooling1D, Dense
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.metrics import classification_report
from keras import regularizers

# Load Dataset
file_path = '/content/three_T_Marge.csv'
data = pd.read_csv(file_path)

# Separate Features and Labels
X = data.iloc[:, :-1].values  # Features
y = data.iloc[:, -1].values   # Labels

# Handle missing values (imputation)
imputer = SimpleImputer(strategy='mean')  # You can also use median or most_frequent
X = imputer.fit_transform(X)

# Normalize Features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# SMOTE for oversampling the minority class
smote = SMOTE(sampling_strategy='auto', random_state=42)
X, y = smote.fit_resample(X, y)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build Model (same as the one discussed previously)
def build_model(input_dim):
    inputs = Input(shape=(input_dim, 1))  # 1D CNN expects a 3D input shape (samples, features, 1)

    # Conv1D Layer
    x = Conv1D(64, kernel_size=3, activation='relu', padding='same')(inputs)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)

    # Another Conv1D Layer
    x = Conv1D(128, kernel_size=3, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)

    # Max Pooling
    x = GlobalMaxPooling1D()(x)

    # Dense Layer
    x = Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
    x = Dropout(0.4)(x)

    # Output Layer
    outputs = Dense(1, activation='sigmoid')(x)  # For binary classification

    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Reshaping input for Conv1D (samples, features, 1)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Get Input Dimension
input_dim = X_train.shape[1]

# Initialize Model
model = build_model(input_dim)

# Early Stopping (optional, but can be added for faster convergence)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the Model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")

# Evaluate the Model using Classification Report
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))

# Save the Model
model.save('/content/improved_probability_based_model.h5')


Epoch 1/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 67ms/step - accuracy: 0.5467 - loss: 2.9845 - val_accuracy: 0.8778 - val_loss: 1.8366
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.8758 - loss: 1.8068 - val_accuracy: 0.9000 - val_loss: 1.7769
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.9245 - loss: 1.6025 - val_accuracy: 0.9000 - val_loss: 1.7315
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.9028 - loss: 1.5903 - val_accuracy: 0.9000 - val_loss: 1.6963
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.8548 - loss: 1.5002 - val_accuracy: 0.9000 - val_loss: 1.6675
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.8876 - loss: 1.4302 - val_accuracy: 0.9000 - val_loss: 1.6358
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━━



Test Accuracy: 0.8889
[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 136ms/step



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step




              precision    recall  f1-score   support

           0       0.88      0.88      0.88        42
           1       0.90      0.90      0.90        48

    accuracy                           0.89        90
   macro avg       0.89      0.89      0.89        90
weighted avg       0.89      0.89      0.89        90



In [27]:
from keras.models import Model
from keras.layers import Input, Conv1D, BatchNormalization, Dropout, GlobalMaxPooling1D, Dense, LeakyReLU
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from keras import regularizers
import pandas as pd

# Load Dataset
file_path = '/content/three_T_Marge.csv'
data = pd.read_csv(file_path)

# Separate Features and Labels
X = data.iloc[:, :-1].values  # Features
y = data.iloc[:, -1].values   # Labels

# Handle missing values (imputation)
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Normalize Features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# SMOTE for oversampling the minority class
smote = SMOTE(sampling_strategy='auto', random_state=42)
X, y = smote.fit_resample(X, y)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build Model
def build_model(input_dim):
    inputs = Input(shape=(input_dim, 1))

    # First Conv1D Layer
    x = Conv1D(128, kernel_size=3, padding='same')(inputs)
    x = LeakyReLU(alpha=0.1)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)

    # Second Conv1D Layer
    x = Conv1D(256, kernel_size=5, padding='same')(x)
    x = LeakyReLU(alpha=0.1)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)

    # Third Conv1D Layer
    x = Conv1D(512, kernel_size=3, padding='same')(x)
    x = LeakyReLU(alpha=0.1)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)

    # Global Max Pooling
    x = GlobalMaxPooling1D()(x)

    # Fully Connected Layers
    x = Dense(256, kernel_regularizer=regularizers.l2(0.01))(x)
    x = LeakyReLU(alpha=0.1)(x)
    x = Dropout(0.5)(x)

    x = Dense(128, kernel_regularizer=regularizers.l2(0.01))(x)
    x = LeakyReLU(alpha=0.1)(x)
    x = Dropout(0.5)(x)

    x = Dense(64, kernel_regularizer=regularizers.l2(0.01))(x)
    x = LeakyReLU(alpha=0.1)(x)
    x = Dropout(0.5)(x)

    # Output Layer
    outputs = Dense(1, activation='sigmoid')(x)

    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Reshaping input for Conv1D
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Initialize Model
input_dim = X_train.shape[1]
model = build_model(input_dim)

# Early Stopping and Learning Rate Scheduler
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)

# Train the Model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=150,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")

# Classification Report
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))

# Save the Model
model.save('/content/high_accuracy_model.h5')




Epoch 1/150
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 267ms/step - accuracy: 0.5848 - loss: 8.2814 - val_accuracy: 0.8667 - val_loss: 6.5102 - learning_rate: 0.0010
Epoch 2/150
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 187ms/step - accuracy: 0.7976 - loss: 6.8492 - val_accuracy: 0.8667 - val_loss: 6.3487 - learning_rate: 0.0010
Epoch 3/150
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 287ms/step - accuracy: 0.8091 - loss: 6.8224 - val_accuracy: 0.8667 - val_loss: 6.2096 - learning_rate: 0.0010
Epoch 4/150
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 320ms/step - accuracy: 0.8555 - loss: 6.4537 - val_accuracy: 0.8889 - val_loss: 6.0873 - learning_rate: 0.0010
Epoch 5/150
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 191ms/step - accuracy: 0.8454 - loss: 6.1727 - val_accuracy: 0.8889 - val_loss: 5.9675 - learning_rate: 0.0010
Epoch 6/150
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 



              precision    recall  f1-score   support

           0       0.88      0.86      0.87        42
           1       0.88      0.90      0.89        48

    accuracy                           0.88        90
   macro avg       0.88      0.88      0.88        90
weighted avg       0.88      0.88      0.88        90



In [25]:
from keras.models import Model
from keras.layers import Input, Conv1D, BatchNormalization, Dropout, GlobalMaxPooling1D, Dense
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import pandas as pd
from keras import regularizers

# Load Dataset
file_path = '/content/three_T_Marge.csv'
data = pd.read_csv(file_path)

# Separate Features and Labels
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Normalize Features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# SMOTE for oversampling
smote = SMOTE(sampling_strategy='auto', random_state=42)
X, y = smote.fit_resample(X, y)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Reshaping input for Conv1D
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Build Model
def build_model(input_dim):
    inputs = Input(shape=(input_dim, 1))
    x = Conv1D(128, kernel_size=3, activation='relu', padding='same')(inputs)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)

    x = Conv1D(256, kernel_size=5, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)

    x = Conv1D(512, kernel_size=7, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)

    x = GlobalMaxPooling1D()(x)

    x = Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
    x = Dropout(0.5)(x)

    outputs = Dense(1, activation='sigmoid')(x)
    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])
    return model

input_dim = X_train.shape[1]
model = build_model(input_dim)

early_stopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)

# Train Model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=150,
    batch_size=32,
    class_weight={0: 1, 1: 1},
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate Model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")

# Classification Report
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))

# Save Model
model.save('/content/high_accuracy_model.keras')


Epoch 1/150
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 327ms/step - accuracy: 0.7240 - loss: 5.5424 - val_accuracy: 0.8778 - val_loss: 3.8520
Epoch 2/150
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 324ms/step - accuracy: 0.8760 - loss: 4.9690 - val_accuracy: 0.8889 - val_loss: 3.7310
Epoch 3/150
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 461ms/step - accuracy: 0.8989 - loss: 3.7968 - val_accuracy: 0.8778 - val_loss: 3.6459
Epoch 4/150
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 401ms/step - accuracy: 0.8942 - loss: 3.6658 - val_accuracy: 0.8556 - val_loss: 3.5547
Epoch 5/150
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 279ms/step - accuracy: 0.8770 - loss: 3.3767 - val_accuracy: 0.8222 - val_loss: 3.4699
Epoch 6/150
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 271ms/step - accuracy: 0.9053 - loss: 3.3158 - val_accuracy: 0.8000 - val_loss: 3.4017
Epoch 7/150
[1m7/7[0m [32m━━━━

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
import optuna
from keras.models import Model
from keras.layers import Input, Conv1D, BatchNormalization, Dropout, GlobalMaxPooling1D, Dense
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.metrics import classification_report

# Load the dataset
file_path = '/content/three_T_Marge.csv'
data = pd.read_csv(file_path)

# Separate features and labels
X = data.iloc[:, :-1].values  # Features
y = data.iloc[:, -1].values   # Labels

# Handle missing values (imputation)
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Normalize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# SMOTE for oversampling the minority class
smote = SMOTE(sampling_strategy='auto', random_state=42)
X, y = smote.fit_resample(X, y)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Reshaping the input for Conv1D (samples, features, 1)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Objective function for Optuna
def objective(trial):
    # Hyperparameters to tune
    filters_1 = trial.suggest_int('filters_1', 32, 128)
    filters_2 = trial.suggest_int('filters_2', 64, 256)
    kernel_size_1 = trial.suggest_int('kernel_size_1', 3, 5)
    kernel_size_2 = trial.suggest_int('kernel_size_2', 3, 5)
    dropout_1 = trial.suggest_float('dropout_1', 0.2, 0.5)
    dropout_2 = trial.suggest_float('dropout_2', 0.2, 0.5)
    dense_units = trial.suggest_int('dense_units', 64, 256)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)

    # Build model
    inputs = Input(shape=(X_train.shape[1], 1))  # 1D CNN expects a 3D input shape (samples, features, 1)

    # First Conv1D Layer
    x = Conv1D(filters_1, kernel_size=kernel_size_1, activation='relu', padding='same')(inputs)
    x = BatchNormalization()(x)
    x = Dropout(dropout_1)(x)

    # Second Conv1D Layer
    x = Conv1D(filters_2, kernel_size=kernel_size_2, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Dropout(dropout_2)(x)

    # Max Pooling
    x = GlobalMaxPooling1D()(x)

    # Dense Layer
    x = Dense(dense_units, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
    x = Dropout(0.4)(x)

    # Output Layer
    outputs = Dense(1, activation='sigmoid')(x)  # For binary classification

    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model
    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=50,
        batch_size=32,
        callbacks=[early_stopping],
        verbose=0
    )

    # Evaluate the model
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    return accuracy  # We want to maximize accuracy

# Create an Optuna study and start optimizing
study = optuna.create_study(direction='maximize')  # We want to maximize the accuracy

# Optimize the hyperparameters
study.optimize(objective, n_trials=50)  # n_trials is the number of different hyperparameter sets to try

# Print the best hyperparameters and accuracy
print(f"Best Hyperparameters: {study.best_params}")
print(f"Best Accuracy: {study.best_value:.4f}")


[I 2025-01-16 16:44:19,237] A new study created in memory with name: no-name-9ce818a1-8954-409b-ae3c-0d3c3465e7d1
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
[I 2025-01-16 16:44:49,664] Trial 0 finished with value: 0.8999999761581421 and parameters: {'filters_1': 127, 'filters_2': 99, 'kernel_size_1': 3, 'kernel_size_2': 5, 'dropout_1': 0.3846746952997323, 'dropout_2': 0.23942198137150264, 'dense_units': 198, 'learning_rate': 1.8600464785366256e-05}. Best is trial 0 with value: 0.8999999761581421.
[I 2025-01-16 16:45:08,252] Trial 1 finished with value: 0.9111111164093018 and parameters: {'filters_1': 125, 'filters_2': 251, 'kernel_size_1': 5, 'kernel_size_2': 4, 'dropout_1': 0.3130175125421953, 'dropout_2': 0.3254846559253608, 'dense_units': 90, 'learning_rate': 6.0115629311913246e-05}. Best is trial 1 with value: 0.9111111164093018.
[I 2025-01-16 16:45:27,511] Trial 2 finished with value: 0.8777777552604675 and parameters: {'filters_1': 39, 'filters_2': 17

Best Hyperparameters: {'filters_1': 60, 'filters_2': 197, 'kernel_size_1': 4, 'kernel_size_2': 5, 'dropout_1': 0.2598558372020141, 'dropout_2': 0.3931967458563041, 'dense_units': 104, 'learning_rate': 0.00018393736277396444}
Best Accuracy: 0.9222
