In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset
filename = '/content/Accounts.csv'
dataset = pd.read_csv(filename)

# NaN values
cols_with_nan = dataset.columns[dataset.isna().any()].tolist()
for col in cols_with_nan:
    if dataset[col].isna().any():
        dataset[col] = dataset.groupby('Class')[col].transform(lambda x: x.fillna(x.mode().iloc[0]))
    else:
        print(f"")

X = dataset.iloc[:, 1:46].values
y = dataset.iloc[:, 47].values

# features names
feature_names = dataset.columns[1:46]

# Hyperparameters for SVM linear
svm_parameters = {'C': [10**i for i in range(-3, 0, 10)]}

# Split the data into training, testing, and validation sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Nested cross-validation strategy
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Performance metric lists
svm_accuracy_scores = []
svm_precision_scores = []
svm_recall_scores = []
svm_f1_scores = []

# Initialize a list
feature_importance_scores = []

# Perform nested cross-validation
for train_index, test_index in outer_cv.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

    # SVM linear
    svm_classifier = SVC(kernel='linear', random_state=42)
    svm_grid_search = GridSearchCV(svm_classifier, svm_parameters, cv=inner_cv)
    svm_grid_search.fit(X_train_fold, y_train_fold)
    svm_best_params = svm_grid_search.best_params_
    svm_best_model = SVC(kernel='linear', random_state=42, **svm_best_params)

    # Ablation Study
    ablation_scores = []
    for feature_index in range(X_train_fold.shape[1]):
        ablated_X_train = np.delete(X_train_fold, feature_index, axis=1)
        ablated_X_test = np.delete(X_test_fold, feature_index, axis=1)
        svm_best_model.fit(ablated_X_train, y_train_fold)
        svm_predictions = svm_best_model.predict(ablated_X_test)
        ablation_scores.append(accuracy_score(y_test_fold, svm_predictions))

    # Store feature importance scores
    feature_importance_scores.append(ablation_scores)

    # Evaluate SVM linear performance metrics
    svm_accuracy_scores.append(np.mean(ablation_scores))
    svm_precision_scores.append(precision_score(y_test_fold, svm_predictions))
    svm_recall_scores.append(recall_score(y_test_fold, svm_predictions))
    svm_f1_scores.append(f1_score(y_test_fold, svm_predictions))

# Calculate average performance scores
svm_average_accuracy = np.mean(svm_accuracy_scores)
svm_average_precision = np.mean(svm_precision_scores)
svm_average_recall = np.mean(svm_recall_scores)
svm_average_f1 = np.mean(svm_f1_scores)
svm_std_accuracy = np.std(svm_accuracy_scores)
svm_std_precision = np.std(svm_precision_scores)
svm_std_recall = np.std(svm_recall_scores)
svm_std_f1 = np.std(svm_f1_scores)

# Print the average performance scores
print("\nSVM Linear:")
print("Average Accuracy:", round(svm_average_accuracy,4))
print("Standard Deviation Accuracy:",round(svm_std_accuracy,4))
print("Average Precision:", round(svm_average_precision,4))
print("Standard Deviation Precision:", round(svm_std_precision,4))
print("Average Recall:", round(svm_average_recall,4))
print("Standard Deviation Recall:", round(svm_std_recall,4))
print("Average F1-score:", round(svm_average_f1,4))
print("Standard Deviation F1-score:", round(svm_std_f1,4))



SVM Linear:
Average Accuracy: 0.9767
Standard Deviation Accuracy: 0.0061
Average Precision: 0.9838
Standard Deviation Precision: 0.0077
Average Recall: 0.9685
Standard Deviation Recall: 0.0112
Average F1-score: 0.976
Standard Deviation F1-score: 0.007


In [2]:
# Select the best-performing features
best_feature_indices = np.argsort(np.mean(feature_importance_scores, axis=0))[::-1]

num_features_to_display = 10
X_best_features = X[:, best_feature_indices[:num_features_to_display]]

# Calculate and print feature importance with feature names
average_importance_scores = np.mean(feature_importance_scores, axis=0)
sorted_indices = np.argsort(average_importance_scores)[::-1]

print("\nTop Feature Importance (Ablation):")
for i in range(num_features_to_display):
    feature_index = sorted_indices[i]
    importance_score = round(average_importance_scores[feature_index], 5)
    print(f"{feature_names[feature_index]}: {importance_score}")

#................................................................


Top Feature Importance (Ablation):
Gunning Fog Mean: 0.97775
 Count Blanks STD: 0.9775
Count Blanks Mean: 0.9775
Automated Readability Index Mean: 0.9775
 Num Lowercase Chars STD: 0.97725
Total Characters STD: 0.97725
Average Words Length Mean: 0.97725
Dale Chall Readability Mean: 0.97725
 Dale Chall Readability STD: 0.97725
Linsear Write STD: 0.97725
