# In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, f_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras import layers, models

# Read the augmented dataset and discard rows that exactly repeat an
# earlier row across every column.
data = pd.read_csv('augmented_data.csv').drop_duplicates()

# Preprocessing: all columns except 'Target' are standardised with
# StandardScaler, and the 'Target' labels are mapped to integer codes.
# Both fitted transformers stay available at module level; label_encoder
# is used later to recover the class names for the reports.
scaler = StandardScaler()
label_encoder = LabelEncoder()
features = scaler.fit_transform(data.drop(columns=['Target']))
target = label_encoder.fit_transform(data['Target'])

# Short aliases for the feature matrix and the encoded labels.
X, y = features, target

# Oversample with SMOTE (fixed seed) to handle class imbalance.
# NOTE(review): scaling and oversampling are fitted on the full dataset,
# before the cross-validation split further down, so the held-out folds can
# contain synthetic points and statistics derived from training rows --
# TODO confirm whether the reported scores are meant to be leak-free.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Ensemble feature scoring: four selectors each contribute one score per
# feature; the scores are averaged and the highest-ranked features are kept.


def _unit_scale(values):
    """Return ``values`` min-max rescaled to [0, 1] as a float array.

    NaN entries are treated as the lowest score and infinite entries are
    clipped to the finite extremes, so an undefined statistic can never
    outrank a real one. A vector with no spread (or no finite entry) maps
    to all zeros.
    """
    values = np.array(values, dtype=float)
    finite = np.isfinite(values)
    if not finite.any():
        return np.zeros_like(values)
    lowest, highest = values[finite].min(), values[finite].max()
    values[np.isnan(values)] = lowest
    values = np.clip(values, lowest, highest)
    spread = highest - lowest
    if spread <= 0:
        return np.zeros_like(values)
    return (values - lowest) / spread


# Accumulator with one slot per feature column.
scores = np.zeros(X_resampled.shape[1])

# Feature selection
# Each selector's output is rescaled to [0, 1] before being added: the raw
# values live on very different scales (F-statistics are unbounded, the RFE
# vote is 0/1), so an unscaled sum would be dominated by a single selector.

# Mutual Information (seeded like the other randomised steps in this script)
mi = mutual_info_classif(X_resampled, y_resampled, random_state=42)
scores += _unit_scale(mi)

# ANOVA F-test
# f_classif can yield NaN for constant columns; np.argsort places NaN last,
# which would otherwise put those columns among the "top" features.
f_test, _ = f_classif(X_resampled, y_resampled)
scores += _unit_scale(f_test)

# Recursive Feature Elimination (RFE)
model = LogisticRegression(max_iter=500)
rfe = RFE(model, n_features_to_select=30)
rfe.fit(X_resampled, y_resampled)
# support_ is a boolean mask over the columns, so the 0/1 vote is the mask
# itself; testing `i in rfe.support_` compares indices against True/False
# and does not tell which features were kept.
rfe_scores = rfe.support_.astype(float)
scores += rfe_scores

# L1 Regularization (Lasso)
model = LogisticRegression(penalty='l1', solver='liblinear', max_iter=500)
model.fit(X_resampled, y_resampled)
# coef_ has one row per class in the multiclass case (a single row for a
# binary target); average the magnitudes over rows so every class counts.
lasso_scores = np.abs(model.coef_).mean(axis=0)
scores += _unit_scale(lasso_scores)

# Average the scores
average_scores = scores / 4

# Select the top features (the slice keeps every column when there are
# fewer than 70).
top_indices = np.argsort(average_scores)[-70:]
X_selected = X_resampled[:, top_indices]

# Cross-validation setup: six stratified, shuffled folds with a fixed seed.
n_splits = 6
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=n_splits)

# Per-model metric stores: each metric gets its own dict holding one
# independent list per model, to which one value is appended per fold.
# The key order is the order in which the averages are printed at the end.
accuracies, precisions, recalls, f1_scores = (
    {
        'Logistic Regression': [],
        'SVM': [],
        'KNN': [],
        'Gradient Boosting': [],
        'Neural Network': [],
    }
    for _ in range(4)
)

# Function to evaluate the model
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on one fold, score it on the held-out part, record the metrics.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; ``predict``
            must return integer-encoded class labels.
        X_train: Training features for this fold.
        X_test: Held-out features for this fold.
        y_train: Integer-encoded training labels.
        y_test: Integer-encoded held-out labels.
        model_name: Key into the module-level ``accuracies``, ``precisions``,
            ``recalls`` and ``f1_scores`` dicts; one value is appended to each.

    Side effects:
        Appends to the four metric dicts and prints a per-class
        classification report for the fold.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute everything before recording anything, so a failing metric
    # cannot leave the four stores with different lengths.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracies[model_name].append(acc)
    precisions[model_name].append(prec)
    recalls[model_name].append(rec)
    f1_scores[model_name].append(f1)

    # Pass the full label set explicitly so the names stay aligned even if a
    # class is missing from this fold, and stringify the names because the
    # report measures their length (non-string classes would raise).
    class_ids = np.arange(len(label_encoder.classes_))
    class_names = [str(name) for name in label_encoder.classes_]
    print(f"{model_name} Fold Results:")
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

class _KerasArgmaxClassifier:
    """Give a compiled Keras softmax network the fit/predict interface of evaluate_model.

    ``fit`` trains the wrapped network silently with the configured epochs
    and batch size; ``predict`` returns the index of the highest-probability
    class for each row, i.e. integer labels like the scikit-learn models.
    """

    def __init__(self, network, epochs=50, batch_size=32):
        self.network = network
        self.epochs = epochs
        self.batch_size = batch_size

    def fit(self, X, y):
        self.network.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self

    def predict(self, X):
        return np.argmax(self.network.predict(X), axis=1)


# Cross-validation loop
# Every fold trains a fresh instance of each model and records its metrics
# through evaluate_model, so all five models are scored by the same code.
for train_index, test_index in skf.split(X_selected, y_resampled):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y_resampled[train_index], y_resampled[test_index]

    # Logistic Regression
    lr_model = LogisticRegression(max_iter=500)
    evaluate_model(lr_model, X_train, X_test, y_train, y_test, 'Logistic Regression')

    # Support Vector Machine (SVM)
    svm_model = SVC()
    evaluate_model(svm_model, X_train, X_test, y_train, y_test, 'SVM')

    # K-Nearest Neighbors (KNN)
    knn_model = KNeighborsClassifier()
    evaluate_model(knn_model, X_train, X_test, y_train, y_test, 'KNN')

    # Gradient Boosting
    gb_model = GradientBoostingClassifier()
    evaluate_model(gb_model, X_train, X_test, y_train, y_test, 'Gradient Boosting')

    # Neural Network
    # The output layer is sized from the encoder's full class list rather
    # than from the classes present in this training fold, so the label
    # indices always fit the softmax width.
    nn_model = models.Sequential([
        layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        layers.Dense(32, activation='relu'),
        layers.Dense(len(label_encoder.classes_), activation='softmax')
    ])
    nn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    evaluate_model(
        _KerasArgmaxClassifier(nn_model, epochs=50, batch_size=32),
        X_train, X_test, y_train, y_test, 'Neural Network',
    )

# Report, for each model, the mean of every metric over all folds.
# Models are listed in the key order of the accuracies dict.
metric_tables = (
    ('Accuracy', accuracies),
    ('Precision', precisions),
    ('Recall', recalls),
    ('F1 Score', f1_scores),
)
for model_name in accuracies:
    print(f"\nAverage results for {model_name}:")
    for metric_label, per_model in metric_tables:
        print(f"{metric_label}: {np.mean(per_model[model_name]):.4f}")
