# In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, f_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt

# Read the augmented dataset and discard rows that are exact duplicates
# across every column.
data = pd.read_csv('augmented_data.csv').drop_duplicates()

# Split off the label column, standardise the remaining columns, and map the
# labels to integer codes.
# NOTE(review): the scaler is fitted on every row before any train/validation
# split happens further down, so validation rows influence the scaling.
target = data['Target']
scaler = StandardScaler()
features = scaler.fit_transform(data.drop(columns=['Target']))
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(target)

# Short aliases used by the rest of the pipeline
X, y = features, target

# Oversample with SMOTE so the feature-ranking step sees balanced classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Ensemble feature ranking: five selectors each score every feature on the
# SMOTE-balanced data; the per-selector scores are brought onto a common
# [0, 1] scale, summed into `scores`, and averaged.


def _unit_scale(raw_scores):
    """Min-max scale a 1-D score vector to [0, 1].

    NaN entries are treated as the lowest finite score and +/-inf entries as
    the highest/lowest finite score, so the result is always finite. A vector
    with no finite entries, or whose finite entries are all equal, maps to
    all zeros (it carries no ranking information).
    """
    values = np.asarray(raw_scores, dtype=float)
    finite = values[np.isfinite(values)]
    if finite.size == 0:
        return np.zeros_like(values)
    low, high = finite.min(), finite.max()
    if high == low:
        return np.zeros_like(values)
    values = np.nan_to_num(values, nan=low, posinf=high, neginf=low)
    return (values - low) / (high - low)


# Initialize an empty score array (one accumulated score per feature)
scores = np.zeros(X_resampled.shape[1])

# 1. Mutual Information (seeded so the ranking is reproducible)
mi = mutual_info_classif(X_resampled, y_resampled, random_state=42)
scores += _unit_scale(mi)

# 2. ANOVA F-test
# F statistics are unbounded and can be NaN (e.g. for constant columns);
# scaling keeps them from swamping the other selectors and keeps NaN from
# being sorted to the "best" end by argsort.
f_test, _ = f_classif(X_resampled, y_resampled)
scores += _unit_scale(f_test)

# 3. Recursive Feature Elimination (RFE)
model = LogisticRegression(max_iter=500)
rfe = RFE(model, n_features_to_select=30)
rfe.fit(X_resampled, y_resampled)
# `support_` is a boolean mask over features, so convert it directly:
# 1.0 for features RFE kept, 0.0 for the rest.
rfe_scores = rfe.support_.astype(float)
scores += rfe_scores

# 4. L1 Regularization (Lasso)
model = LogisticRegression(penalty='l1', solver='liblinear', max_iter=500)
model.fit(X_resampled, y_resampled)
# `coef_` has one row per class in the multiclass case (a single row for
# binary), so average the magnitudes over rows instead of using row 0 only.
lasso_scores = np.abs(model.coef_).mean(axis=0)
scores += _unit_scale(lasso_scores)

# 5. Tree-based Feature Importance (seeded so the ranking is reproducible)
model = RandomForestClassifier(random_state=42)
model.fit(X_resampled, y_resampled)
tree_scores = model.feature_importances_
scores += _unit_scale(tree_scores)

# Average the scores
average_scores = scores / 5

# Select the top-scoring features.
# The variable keeps its historical name `top_100_indices`, but it holds
# `n_top_features` (70) indices, or all of them if there are fewer features.
n_top_features = 70
top_100_indices = np.argsort(average_scores)[-n_top_features:]
# The selection is applied to the original (non-resampled) matrix X.
X_selected = X[:, top_100_indices]

# Reshape the selected features for model input:
# (n_samples, n_features) -> (n_samples, 1, n_features)
X_selected_reshaped = X_selected.reshape(X_selected.shape[0], 1, X_selected.shape[1])

# Stratified, shuffled K-fold splitter used to evaluate the selected features
n_splits = 6
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=n_splits)

# Per-fold metric accumulators (one entry appended per fold)
accuracies, precisions, recalls, f1_scores = [], [], [], []
classification_reports = []

# Ground truths and predictions pooled across all folds
y_true_all, y_pred_all = [], []

# Fold counter, starting at the first fold
fold = 1

def build_transformer_model(input_shape, num_classes, num_layers=6):
    """Build a stacked self-attention classifier.

    Each encoder block applies multi-head self-attention, a residual add,
    layer normalization and a 512-unit ReLU dense layer. The blocks are
    chained, so every block consumes the previous block's output. The block
    outputs are average-pooled over the sequence axis and passed through a
    dense head ending in a softmax.

    Args:
        input_shape: Shape of one sample, without the batch dimension, as
            passed to ``layers.Input``.
        num_classes: Number of output units of the softmax layer.
        num_layers: Number of encoder blocks to stack (default 6).

    Returns:
        An uncompiled Keras ``Model`` mapping inputs to class probabilities.
    """
    inputs = layers.Input(shape=input_shape)

    # Transformer encoder blocks. `x` is threaded through the loop so the
    # blocks actually stack; attending over `inputs` on every iteration would
    # leave all but the last block disconnected from the model output.
    x = inputs
    for _ in range(num_layers):
        attention_output = layers.MultiHeadAttention(num_heads=32, key_dim=64, dropout=0.2)(x, x)
        # Residual connection around the attention sub-layer.
        x = layers.Add()([attention_output, x])
        x = layers.LayerNormalization(epsilon=1e-6)(x)
        x = layers.Dense(512, activation='relu')(x)

    # Global average pooling over the sequence axis
    x = layers.GlobalAveragePooling1D()(x)

    # Dense head and output
    x = layers.Dense(512, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    model = models.Model(inputs, outputs)
    return model

# Loop invariants hoisted out of the fold loop.
num_classes = len(np.unique(y))
# Explicit label ids and string names so the reports and confusion matrix
# always cover every class, even when a fold (or the predictions) miss one,
# and so numeric class labels do not break classification_report.
class_labels = np.arange(len(label_encoder.classes_))
class_names = [str(name) for name in label_encoder.classes_]

# NOTE(review): each fold's validation split drives early stopping and the
# learning-rate schedule AND is the data the metrics are computed on, so the
# reported scores are optimistic estimates.
for fold, (train_index, val_index) in enumerate(skf.split(X_selected_reshaped, y), start=1):
    print(f"Training on fold {fold}...")

    # Release Keras state left over from the previous fold's model so memory
    # does not grow with every fold.
    tf.keras.backend.clear_session()

    X_train, X_val = X_selected_reshaped[train_index], X_selected_reshaped[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Build and compile a fresh model for this fold
    model = build_transformer_model(input_shape=(X_train.shape[1], X_train.shape[2]), num_classes=num_classes)
    model.compile(optimizer=tf.keras.optimizers.AdamW(learning_rate=3e-4),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # Callbacks: stop when val_loss stalls (restoring the best weights) and
    # shrink the learning rate on plateaus
    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)

    # Train the model and store the history
    history = model.fit(X_train, y_train,
                        validation_data=(X_val, y_val),
                        epochs=100,
                        batch_size=128,
                        callbacks=[early_stopping, reduce_lr],
                        verbose=1)

    # Evaluate the model: class probabilities -> predicted class ids
    y_pred = model.predict(X_val)
    y_pred = np.argmax(y_pred, axis=1)

    # Append the predictions and true labels for final evaluation
    y_true_all.extend(y_val)
    y_pred_all.extend(y_pred)

    # Calculate fold metrics. zero_division=0 scores a never-predicted class
    # as 0 (the value the default already uses) without emitting a warning.
    accuracy = accuracy_score(y_val, y_pred)
    precis = precision_score(y_val, y_pred, average='weighted', zero_division=0)
    recal = recall_score(y_val, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_val, y_pred, average='weighted', zero_division=0)

    report = classification_report(y_val, y_pred, labels=class_labels,
                                   target_names=class_names, zero_division=0)

    # Store fold results
    accuracies.append(accuracy)
    precisions.append(precis)
    recalls.append(recal)
    f1_scores.append(f1)
    classification_reports.append(report)

    # Print fold results
    print(f"Fold {fold} Accuracy: {accuracy:.4f}")
    print(f"Fold {fold} Precision: {precis:.4f}")
    print(f"Fold {fold} Recall: {recal:.4f}")
    print(f"Fold {fold} F1-Score: {f1:.4f}")
    print(f"Fold {fold} Classification Report:")
    print(report)

# Compute final confusion matrix from all predictions and ground truths
# (labels= fixes the matrix to one row/column per known class)
conf_matrix_final = confusion_matrix(y_true_all, y_pred_all, labels=class_labels)

# Calculate the final classification report for all classes after cross-validation
final_classification_report = classification_report(y_true_all, y_pred_all, labels=class_labels,
                                                    target_names=class_names, zero_division=0)

# Print final confusion matrix and classification report
print(f"\nFinal Confusion Matrix:\n{conf_matrix_final}")
print(f"\nFinal Classification Report:\n{final_classification_report}")

# Summary of Cross-Validation Results (mean over folds of the weighted metrics)
mean_accuracy = np.mean(accuracies)
mean_precision = np.mean(precisions)
mean_recall = np.mean(recalls)
mean_f1 = np.mean(f1_scores)

print(f"\nMean Cross-Validation Accuracy: {mean_accuracy:.4f}")
print(f"Mean Cross-Validation Precision: {mean_precision:.4f}")
print(f"Mean Cross-Validation Recall: {mean_recall:.4f}")
print(f"Mean Cross-Validation F1-Score: {mean_f1:.4f}")

