# In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, f_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Read the augmented dataset and discard rows that are exact duplicates
# across every column.
data = pd.read_csv('augmented_data.csv').drop_duplicates()

# Standardise every non-target column and integer-encode the 'Target' labels.
# NOTE(review): the scaler statistics are computed from all rows, including
# rows that later fall into cross-validation test folds.
scaler = StandardScaler()
features = scaler.fit_transform(data.drop(columns=['Target']))
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(data['Target'])

# Conventional X / y aliases for the scaled feature matrix and encoded labels.
X, y = features, target

# Balance the classes by synthesising minority-class rows with SMOTE
# (seeded so the resampled set is reproducible).
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Feature selection: score every feature with five complementary methods,
# average the scores and keep the highest-ranked features.
N_RFE_FEATURES = 30   # features retained by recursive feature elimination
N_TOP_FEATURES = 70   # features kept for the downstream models


def _unit_scale(values):
    """Min-max scale a 1-D score vector to the range [0, 1].

    NaN entries (e.g. the F-statistic of a constant column) are treated as the
    lowest finite score and +/-inf is clipped to the finite range. A vector
    without any spread (or without finite entries) is returned as all zeros.
    """
    values = np.asarray(values, dtype=float)
    finite = np.isfinite(values)
    if not finite.any():
        return np.zeros_like(values)
    low, high = values[finite].min(), values[finite].max()
    if high == low:
        return np.zeros_like(values)
    values = np.clip(np.where(np.isnan(values), low, values), low, high)
    return (values - low) / (high - low)


n_features = X_resampled.shape[1]

# 1. Mutual information between each feature and the class label.
mi = mutual_info_classif(X_resampled, y_resampled, random_state=42)

# 2. ANOVA F-statistic of each feature.
f_test, _ = f_classif(X_resampled, y_resampled)

# 3. Recursive feature elimination with logistic regression.
#    `support_` is a boolean mask over the features, so it is used directly as
#    a 0/1 vote (testing `i in rfe.support_` would compare feature indices with
#    booleans and only ever match indices 0 and 1).
rfe = RFE(
    LogisticRegression(max_iter=500),
    n_features_to_select=min(N_RFE_FEATURES, n_features),
)
rfe.fit(X_resampled, y_resampled)
rfe_scores = rfe.support_.astype(float)

# 4. L1-penalised logistic regression: mean absolute coefficient over all
#    rows of `coef_`, so multiclass targets are not reduced to the first class
#    (for a binary target `coef_` has a single row and this equals |coef_[0]|).
lasso_model = LogisticRegression(penalty='l1', solver='liblinear', max_iter=500)
lasso_model.fit(X_resampled, y_resampled)
lasso_scores = np.abs(lasso_model.coef_).mean(axis=0)

# 5. Random-forest impurity importances (seeded like the other random steps).
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(X_resampled, y_resampled)
tree_scores = forest_model.feature_importances_

# The raw scores live on very different scales (F-statistics can be orders of
# magnitude larger than the others), so each one is rescaled to [0, 1] before
# summing; otherwise a single method would decide the ranking on its own.
scores = np.zeros(n_features)
for method_scores in (mi, f_test, rfe_scores, lasso_scores, tree_scores):
    scores += _unit_scale(method_scores)

# Average the scores
average_scores = scores / 5

# Select the top features (all of them when fewer than N_TOP_FEATURES exist)
top_indices = np.argsort(average_scores)[-N_TOP_FEATURES:]
X_selected = X_resampled[:, top_indices]

# Stratified, shuffled 6-fold cross-validation with a fixed seed.
n_splits = 6
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Per-model metric histories; evaluate_dl_model appends one value per call.
_model_names = ('MLP', 'CNN', 'LSTM', 'GRU', 'Transformer')
accuracies = {name: [] for name in _model_names}
precisions = {name: [] for name in _model_names}
recalls = {name: [] for name in _model_names}
f1_scores = {name: [] for name in _model_names}

# True labels and predictions pooled over every evaluation call.
all_y_true, all_y_pred = [], []

# Function to evaluate deep learning models
def evaluate_dl_model(model, X_train, X_test, y_train, y_test, model_name):
    """Train ``model`` on one fold, score it on the held-out part and record the results.

    Args:
        model: Compiled Keras model whose output is one score per class.
        X_train: Training inputs passed to ``model.fit``.
        X_test: Held-out inputs passed to ``model.predict``.
        y_train: Integer-encoded training labels.
        y_test: Integer-encoded held-out labels.
        model_name: Key into the module-level ``accuracies``, ``precisions``,
            ``recalls`` and ``f1_scores`` dictionaries.

    Side effects:
        Appends the fold's weighted metrics to the four metric dictionaries,
        extends the module-level ``all_y_true`` / ``all_y_pred`` lists and
        prints a per-class classification report.
    """
    # fit() receives no validation data, so the callbacks' default 'val_loss'
    # metric never exists and they would silently do nothing. Monitor the
    # training loss instead. (validation_split is avoided on purpose: it takes
    # the *last* rows of the training array without shuffling.)
    early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, verbose=1)
    model.fit(
        X_train,
        y_train,
        epochs=50,
        batch_size=32,
        verbose=0,
        callbacks=[early_stopping, reduce_lr],
    )

    # Predicted class = index of the largest output score.
    y_pred = np.argmax(model.predict(X_test), axis=1)
    all_y_true.extend(y_test)
    all_y_pred.extend(y_pred)

    # zero_division=0 yields the same value as the default but without a
    # warning when a class receives no predictions in this fold.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    accuracies[model_name].append(acc)
    precisions[model_name].append(prec)
    recalls[model_name].append(rec)
    f1_scores[model_name].append(f1)

    # Pass the full label set and string names explicitly so the report does
    # not fail when a class is absent from this fold or the original class
    # values are not strings.
    class_labels = np.arange(len(label_encoder.classes_))
    class_names = [str(name) for name in label_encoder.classes_]
    print(f"{model_name} Fold Results:")
    print(classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=class_names,
        zero_division=0,
    ))

# Cross-validation loop
# Folds are drawn from the original (pre-SMOTE) rows restricted to the selected
# features, so every test fold contains only real samples. Splitting the
# SMOTE-resampled matrix instead would place synthetic rows in the test folds
# and let rows interpolated from test samples into the training folds, which
# inflates the reported metrics.
n_classes = len(label_encoder.classes_)  # fixed output width for every fold
X_cv = X[:, top_indices]

for fold_number, (train_index, test_index) in enumerate(skf.split(X_cv, y), start=1):
    print(f"\nFold {fold_number}:")
    X_train, X_test = X_cv[train_index], X_cv[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Oversample the training part only; the test fold keeps its natural
    # class distribution.
    # NOTE(review): SMOTE needs more samples per class than its neighbour
    # count in every training fold - confirm the rarest class is large enough.
    X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

    # Print number of data points in training and test sets
    # (the training size is the size after oversampling).
    print(f"Training set size: {X_train.shape[0]}")
    print(f"Test set size: {X_test.shape[0]}")

    # Reshape for CNN, LSTM, GRU, and Transformer input: each feature becomes
    # one step of a length-n_features sequence with a single channel.
    n_features = X_train.shape[1]
    sequence_shape = (n_features, 1)
    X_train_reshaped = X_train.reshape(X_train.shape[0], n_features, 1)
    X_test_reshaped = X_test.reshape(X_test.shape[0], n_features, 1)

    # MLP Model
    mlp_model = models.Sequential([
        layers.Input(shape=(n_features,)),
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(n_classes, activation='softmax'),
    ])
    mlp_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    evaluate_dl_model(mlp_model, X_train, X_test, y_train, y_test, 'MLP')

    # CNN Model
    cnn_model = models.Sequential([
        layers.Input(shape=sequence_shape),
        layers.Conv1D(64, kernel_size=3, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(n_classes, activation='softmax'),
    ])
    cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    evaluate_dl_model(cnn_model, X_train_reshaped, X_test_reshaped, y_train, y_test, 'CNN')

    # LSTM Model
    lstm_model = models.Sequential([
        layers.Input(shape=sequence_shape),
        layers.LSTM(64),
        layers.Dense(64, activation='relu'),
        layers.Dense(n_classes, activation='softmax'),
    ])
    lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    evaluate_dl_model(lstm_model, X_train_reshaped, X_test_reshaped, y_train, y_test, 'LSTM')

    # GRU Model
    gru_model = models.Sequential([
        layers.Input(shape=sequence_shape),
        layers.GRU(64),
        layers.Dense(64, activation='relu'),
        layers.Dense(n_classes, activation='softmax'),
    ])
    gru_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    evaluate_dl_model(gru_model, X_train_reshaped, X_test_reshaped, y_train, y_test, 'GRU')

    # Transformer Model (Functional API): one self-attention layer over the
    # feature sequence, averaged over the sequence axis before the classifier.
    input_layer = layers.Input(shape=sequence_shape)
    attention_output = layers.MultiHeadAttention(num_heads=4, key_dim=2)(input_layer, input_layer)
    pooling_output = layers.GlobalAveragePooling1D()(attention_output)
    dense_output = layers.Dense(64, activation='relu')(pooling_output)
    final_output = layers.Dense(n_classes, activation='softmax')(dense_output)

    transformer_model = models.Model(inputs=input_layer, outputs=final_output)
    transformer_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    evaluate_dl_model(transformer_model, X_train_reshaped, X_test_reshaped, y_train, y_test, 'Transformer')

# Report every model's metrics averaged over its recorded folds.
for model_name, fold_accuracies in accuracies.items():
    fold_precisions = precisions[model_name]
    fold_recalls = recalls[model_name]
    fold_f1_scores = f1_scores[model_name]
    print(f"\nAverage results for {model_name}:")
    print(f"Accuracy: {np.mean(fold_accuracies):.4f}")
    print(f"Precision: {np.mean(fold_precisions):.4f}")
    print(f"Recall: {np.mean(fold_recalls):.4f}")
    print(f"F1 Score: {np.mean(fold_f1_scores):.4f}")

# Pooled confusion matrix and classification report.
# NOTE(review): all_y_true / all_y_pred are extended on every
# evaluate_dl_model call, so these pooled figures combine the predictions of
# all models and folds rather than describing a single model.
class_names = label_encoder.classes_
final_confusion_matrix = confusion_matrix(all_y_true, all_y_pred)
final_classification_report = classification_report(
    all_y_true,
    all_y_pred,
    target_names=class_names,
)

print("\nFinal Confusion Matrix:", final_confusion_matrix, sep="\n")
print("\nFinal Classification Report:", final_classification_report, sep="\n")
