In [12]:
#Decision Tree
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Load dataset
filename = '/content/Accounts.csv'
dataset = pd.read_csv(filename)

# Remove NaN values
cols_with_nan = dataset.columns[dataset.isna().any()].tolist()
for col in cols_with_nan:
    if dataset[col].isna().any():
        dataset[col] = dataset.groupby('Class')[col].transform(lambda x: x.fillna(x.mode().iloc[0]))
    else:
        print(f"")

X = dataset.iloc[:, 1:46].values
y = dataset.iloc[:, 47].values

feature_names = dataset.columns[1:46]

# Hyperparameters for Decision Tree
dt_parameters = {'max_depth': range(1, 5), 'criterion': ['gini', 'entropy']}

# Split the data into training, testing, and validation sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Nested cross-validation strategy
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Initialize performance metric lists
dt_accuracy_scores = []
dt_precision_scores = []
dt_recall_scores = []
dt_f1_scores = []

# Store feature importance scores
feature_importance_scores = []

# Perform nested cross-validation
for train_index, test_index in outer_cv.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

    # Decision Tree
    dt_classifier = DecisionTreeClassifier(random_state=42)
    dt_grid_search = GridSearchCV(dt_classifier, dt_parameters, cv=inner_cv)
    dt_grid_search.fit(X_train, y_train)
    dt_best_params = dt_grid_search.best_params_
    dt_best_model = DecisionTreeClassifier(random_state=42, **dt_best_params)

    # Ablation Study
    ablation_scores = []
    for feature_index in range(X_train.shape[1]):
        ablated_X_train = np.delete(X_train, feature_index, axis=1)
        ablated_X_test = np.delete(X_test, feature_index, axis=1)
        dt_best_model.fit(ablated_X_train, y_train)
        dt_predictions = dt_best_model.predict(ablated_X_test)
        ablation_scores.append(accuracy_score(y_test, dt_predictions))

    # Store feature importance scores
    feature_importance_scores.append(ablation_scores)

    # Evaluate performance metrics
    dt_accuracy_scores.append(np.mean(ablation_scores))
    dt_precision_scores.append(precision_score(y_test, dt_predictions))
    dt_recall_scores.append(recall_score(y_test, dt_predictions))
    dt_f1_scores.append(f1_score(y_test, dt_predictions))

# Calculate average performance scores
dt_average_accuracy = np.mean(dt_accuracy_scores)
dt_average_precision = np.mean(dt_precision_scores)
dt_average_recall = np.mean(dt_recall_scores)
dt_average_f1 = np.mean(dt_f1_scores)
dt_std_accuracy= np.std(dt_accuracy_scores)
dt_std_precision = np.std(dt_precision_scores)
dt_std_recall = np.std(dt_recall_scores)
dt_std_f1 = np.std(dt_f1_scores)

# Print the average scores
print("\nDecision Tree")
print("Average Accuracy:", round(dt_average_accuracy,4))
print("Standard Deviation Accuracy:",round(dt_std_accuracy,4))
print("Average Precision:", round(dt_average_precision,4))
print("Standard Deviation Precision:", round(dt_std_precision,4))
print("Average Recall:", round(dt_average_recall,4))
print("Standard Deviation Recall:", round(dt_std_recall,4))
print("Average F1-score:", round(dt_average_f1,4))
print("Standard Deviation F1-score:", round(dt_std_f1,4))


Decision Tree
Average Accuracy: 0.9852
Standard Deviation Accuracy: 0.0
Average Precision: 0.992
Standard Deviation Precision: 0.0
Average Recall: 0.9763
Standard Deviation Recall: 0.0
Average F1-score: 0.9841
Standard Deviation F1-score: 0.0


In [11]:
# Select the best-performing features
best_feature_indices = np.argsort(np.mean(feature_importance_scores, axis=0))[::-1]
num_features_to_display = 10

X_best_features = X[:, best_feature_indices[:num_features_to_display]]

average_importance_scores = np.mean(feature_importance_scores, axis=0)
sorted_indices = np.argsort(average_importance_scores)[::-1]

print("\nTop Feature Importance With Scores:")
for i in range(num_features_to_display):
    feature_index = sorted_indices[i]
    importance_score = round(average_importance_scores[feature_index], 5)
    print(f"{feature_names[feature_index]}: {importance_score}")


Top Feature Importance With Scores:
linsear_write: 0.86933
ari: 0.86933
gunning_fog: 0.86933
coleman_liau_index: 0.86933
count_special_characters: 0.86933
count_numbers: 0.86933
num_lowercase_words: 0.86933
total_characters: 0.86933
num_uppercase_chars: 0.86867
count_unique_words: 0.86867
