<a href="https://colab.research.google.com/github/mahdyabdnia/kalabama/blob/master/%D8%A8%D8%B1%D9%86%D8%A7%D9%85%D9%87_%D8%A7%D8%B5%D9%84%DB%8C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/LongParameterList.csv')

# آماده‌سازی داده‌ها
X = data.drop('is_long_parameters_list', axis=1)
y = data['is_long_parameters_list']

# توابع انتخاب ویژگی
def select_features_by_rfecv(data, target_name='is_long_parameters_list', estimator=DecisionTreeClassifier(), n_features_to_select=None):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    if n_features_to_select:
        selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    else:
        selector = RFECV(estimator, step=1, cv=StratifiedKFold(10), scoring='accuracy')
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_long_parameters_list', estimator=DecisionTreeClassifier(), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select=n_features_to_select, direction='forward', scoring='accuracy', cv=StratifiedKFold(10))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_rfe(data, target_name='is_long_parameters_list', estimator=DecisionTreeClassifier(), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_custom_genetic(data, target_name='is_long_parameters_list', pop_size=20, n_generations=50, mutation_rate=0.01, n_features_to_select=10):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    # انتخاب n ویژگی برتر
    top_indices = np.argsort(best_solution)[-n_features_to_select:]
    best_solution = np.zeros_like(best_solution)
    best_solution[top_indices] = 1
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

# توابع الگوریتم ژنتیک
def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

# تابع ارزیابی دقت
def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

# تابع برای محاسبه اهمیت ویژگی‌ها
def calculate_feature_importance(X, y):
    model = DecisionTreeClassifier()
    model.fit(X, y)
    return model.feature_importances_

# تابع برای انتخاب بهترین n ویژگی بر اساس اهمیت
def select_top_n_features(feature_importance, feature_names, n):
    top_indices = np.argsort(feature_importance)[-n:]
    selected_features = [feature_names[i] for i in top_indices]
    return selected_features

# تابع جدید برای انتخاب ویژگی‌ها بر اساس اهمیت
def select_features_by_importance(data, target_name='is_long_parameters_list', n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    feature_importance = calculate_feature_importance(X, y)
    selected_features = select_top_n_features(feature_importance, X.columns, n_features_to_select)
    return {feature: feature in selected_features for feature in X.columns}

# توابع برای اجتماع، اشتراک و رأی‌گیری حداکثری
def union_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update({feature for feature, selected in results.items() if selected})
    return list(all_features)

def intersection_features(results_dicts):
    intersection_features = set(results_dicts[0].keys())
    for results in results_dicts:
        intersection_features.intersection_update({feature for feature, selected in results.items() if selected})
    return list(intersection_features)

def majority_voting_features(results_dicts):
    feature_count = {}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] = feature_count.get(feature, 0) + 1
    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

# تابع اصلی برای کنترل تعداد ویژگی‌ها و بررسی دقت
def evaluate_feature_selection(data, target_name='is_long_parameters_list', feature_counts=[5, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]):
    results = {}
    methods = {
        "RFECV": select_features_by_rfecv,
        "SFS": select_features_by_sfs,
        "RFE": select_features_by_rfe,
        "Genetic": select_features_by_custom_genetic,
        "Importance": select_features_by_importance  # اضافه کردن روش جدید
    }

    for method_name, method_func in methods.items():
        print(f"\nارزیابی روش: {method_name}")
        best_accuracy = 0
        best_feature_count = 0
        best_features_dict = None

        for n_features in feature_counts:
            print(f"تعداد ویژگی‌ها: {n_features}")
            selected_features_dict = method_func(data, target_name=target_name, n_features_to_select=n_features)
            selected_features = [feature for feature, selected in selected_features_dict.items() if selected]

            # محاسبه دقت
            solution = np.array([1 if feature in selected_features else 0 for feature in data.drop(target_name, axis=1).columns])
            accuracy = fitness(solution, data.drop(target_name, axis=1).values, data[target_name].values)
            print(f"دقت: {accuracy}")

            # به‌روزرسانی بهترین دقت و تعداد ویژگی‌ها
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature_count = n_features
                best_features_dict = selected_features_dict

        results[method_name] = {
            "best_accuracy": best_accuracy,
            "best_feature_count": best_feature_count,
            "best_features_dict": best_features_dict
        }

    # محاسبه اجتماع، اشتراک و رأی‌گیری حداکثری
    print("\nمحاسبه اجتماع، اشتراک و رأی‌گیری حداکثری:")
    all_results_dicts = [results[method]["best_features_dict"] for method in results]
    union = union_features(all_results_dicts)
    intersection = intersection_features(all_results_dicts)
    majority = majority_voting_features(all_results_dicts)

    print(f"اجتماع: {union}")
    print(f"اشتراک: {intersection}")
    print(f"رأی‌گیری حداکثری: {majority}")

    return results

# اجرای تابع اصلی
results = evaluate_feature_selection(data, target_name='is_long_parameters_list')

# چاپ نتایج
for method_name, result in results.items():
    print(f"\nنتایج برای روش {method_name}:")
    print(f"بهترین تعداد ویژگی‌ها: {result['best_feature_count']}")
    print(f"بهترین دقت: {result['best_accuracy']}")
    print(f"ویژگی‌های انتخاب‌شده: {[feature for feature, selected in result['best_features_dict'].items() if selected]}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

ارزیابی روش: RFECV
تعداد ویژگی‌ها: 5
دقت: 0.8888888888888888
تعداد ویژگی‌ها: 10
دقت: 0.9206349206349206
تعداد ویژگی‌ها: 11
دقت: 0.9206349206349206
تعداد ویژگی‌ها: 12
دقت: 0.9285714285714286
تعداد ویژگی‌ها: 13
دقت: 0.9444444444444444
تعداد ویژگی‌ها: 14
دقت: 0.9206349206349206
تعداد ویژگی‌ها: 15
دقت: 0.9444444444444444
تعداد ویژگی‌ها: 16
دقت: 0.9365079365079365
تعداد ویژگی‌ها: 17
دقت: 0.9126984126984127
تعداد ویژگی‌ها: 18
دقت: 0.9047619047619048
تعداد ویژگی‌ها: 19
دقت: 0.8968253968253969
تعداد ویژگی‌ها: 20
دقت: 0.9206349206349206
تعداد ویژگی‌ها: 21
دقت: 0.9047619047619048
تعداد ویژگی‌ها: 22
دقت: 0.9126984126984127
تعداد ویژگی‌ها: 23
دقت: 0.9285714285714286
تعداد ویژگی‌ها: 24
دقت: 0.9126984126984127
تعداد ویژگی‌ها: 25
دقت: 0.8809523809523809
تعداد ویژگی‌ها: 26
دقت: 0.8968253968253969
تعداد ویژگی‌ها: 27
دقت: 0.9126984126984127
تعداد ویژگی‌ها: 28
دقت: 0.920634920

In [7]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/SwitchStatements.csv')

# آماده‌سازی داده‌ها
X = data.drop('is_switch_statements', axis=1)
y = data['is_switch_statements']

# توابع انتخاب ویژگی
def select_features_by_rfecv(data, target_name='is_switch_statements', estimator=DecisionTreeClassifier(), n_features_to_select=None):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    if n_features_to_select:
        selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    else:
        selector = RFECV(estimator, step=1, cv=StratifiedKFold(10), scoring='accuracy')
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_switch_statements', estimator=DecisionTreeClassifier(), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select=n_features_to_select, direction='forward', scoring='accuracy', cv=StratifiedKFold(10))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_rfe(data, target_name='is_switch_statements', estimator=DecisionTreeClassifier(), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_custom_genetic(data, target_name='is_switch_statements', pop_size=20, n_generations=50, mutation_rate=0.01, n_features_to_select=10):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    # انتخاب n ویژگی برتر
    top_indices = np.argsort(best_solution)[-n_features_to_select:]
    best_solution = np.zeros_like(best_solution)
    best_solution[top_indices] = 1
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

# توابع الگوریتم ژنتیک
def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

# تابع ارزیابی دقت
def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

# تابع برای محاسبه اهمیت ویژگی‌ها
def calculate_feature_importance(X, y):
    model = DecisionTreeClassifier()
    model.fit(X, y)
    return model.feature_importances_

# تابع برای انتخاب بهترین n ویژگی بر اساس اهمیت
def select_top_n_features(feature_importance, feature_names, n):
    top_indices = np.argsort(feature_importance)[-n:]
    selected_features = [feature_names[i] for i in top_indices]
    return selected_features

# تابع جدید برای انتخاب ویژگی‌ها بر اساس اهمیت
def select_features_by_importance(data, target_name='is_switch_statements', n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    feature_importance = calculate_feature_importance(X, y)
    selected_features = select_top_n_features(feature_importance, X.columns, n_features_to_select)
    return {feature: feature in selected_features for feature in X.columns}

# توابع برای اجتماع، اشتراک و رأی‌گیری حداکثری
def union_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update({feature for feature, selected in results.items() if selected})
    return list(all_features)

def intersection_features(results_dicts):
    intersection_features = set(results_dicts[0].keys())
    for results in results_dicts:
        intersection_features.intersection_update({feature for feature, selected in results.items() if selected})
    return list(intersection_features)

def majority_voting_features(results_dicts):
    feature_count = {}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] = feature_count.get(feature, 0) + 1
    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

# تابع اصلی برای کنترل تعداد ویژگی‌ها و بررسی دقت
def evaluate_feature_selection(data, target_name='is_switch_statements', feature_counts=[10, 15, 20, 25]):
    results = {}
    methods = {
        "RFECV": select_features_by_rfecv,
        "SFS": select_features_by_sfs,
        "RFE": select_features_by_rfe,
        "Genetic": select_features_by_custom_genetic,
        "Importance": select_features_by_importance  # اضافه کردن روش جدید
    }

    for method_name, method_func in methods.items():
        print(f"\nارزیابی روش: {method_name}")
        best_accuracy = 0
        best_feature_count = 0
        best_features_dict = None

        for n_features in feature_counts:
            print(f"تعداد ویژگی‌ها: {n_features}")
            selected_features_dict = method_func(data, target_name=target_name, n_features_to_select=n_features)
            selected_features = [feature for feature, selected in selected_features_dict.items() if selected]

            # محاسبه دقت
            solution = np.array([1 if feature in selected_features else 0 for feature in data.drop(target_name, axis=1).columns])
            accuracy = fitness(solution, data.drop(target_name, axis=1).values, data[target_name].values)
            print(f"دقت: {accuracy}")

            # بررسی اگر دقت کاهش یافت
            if accuracy < best_accuracy:
                print(f"دقت کاهش یافت. متوقف کردن فرآیند برای این روش.")
                break

            # به‌روزرسانی بهترین دقت و تعداد ویژگی‌ها
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature_count = n_features
                best_features_dict = selected_features_dict

        results[method_name] = {
            "best_accuracy": best_accuracy,
            "best_feature_count": best_feature_count,
            "best_features_dict": best_features_dict
        }

    # محاسبه اجتماع، اشتراک و رأی‌گیری حداکثری
    print("\nمحاسبه اجتماع، اشتراک و رأی‌گیری حداکثری:")
    all_results_dicts = [results[method]["best_features_dict"] for method in results]
    union = union_features(all_results_dicts)
    intersection = intersection_features(all_results_dicts)
    majority = majority_voting_features(all_results_dicts)

    print(f"اجتماع: {union}")
    print(f"اشتراک: {intersection}")
    print(f"رأی‌گیری حداکثری: {majority}")

    return results

# اجرای تابع اصلی
results = evaluate_feature_selection(data, target_name='is_switch_statements')

# چاپ نتایج
for method_name, result in results.items():
    print(f"\nنتایج برای روش {method_name}:")
    print(f"بهترین تعداد ویژگی‌ها: {result['best_feature_count']}")
    print(f"بهترین دقت: {result['best_accuracy']}")
    print(f"ویژگی‌های انتخاب‌شده: {[feature for feature, selected in result['best_features_dict'].items() if selected]}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

ارزیابی روش: RFECV
تعداد ویژگی‌ها: 10
دقت: 0.9126984126984127
تعداد ویژگی‌ها: 15
دقت: 0.8968253968253969
دقت کاهش یافت. متوقف کردن فرآیند برای این روش.

ارزیابی روش: SFS
تعداد ویژگی‌ها: 10
دقت: 0.8650793650793651
تعداد ویژگی‌ها: 15
دقت: 0.873015873015873
تعداد ویژگی‌ها: 20
دقت: 0.8571428571428571
دقت کاهش یافت. متوقف کردن فرآیند برای این روش.

ارزیابی روش: RFE
تعداد ویژگی‌ها: 10
دقت: 0.9126984126984127
تعداد ویژگی‌ها: 15
دقت: 0.9206349206349206
تعداد ویژگی‌ها: 20
دقت: 0.9285714285714286
تعداد ویژگی‌ها: 25
دقت: 0.8888888888888888
دقت کاهش یافت. متوقف کردن فرآیند برای این روش.

ارزیابی روش: Genetic
تعداد ویژگی‌ها: 10
دقت: 0.7857142857142857
تعداد ویژگی‌ها: 15
دقت: 0.8888888888888888
تعداد ویژگی‌ها: 20
دقت: 0.9365079365079365
تعداد ویژگی‌ها: 25
دقت: 0.9126984126984127
دقت کاهش یافت. متوقف کردن فرآیند برای این روش.

ارزیابی روش: Importance
تعداد ویژگی‌ها: 10
دقت

In [6]:
# توابع برای اجتماع، اشتراک و رأی‌گیری حداکثری
def union_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update({feature for feature, selected in results.items() if selected})
    return list(all_features)

def intersection_features(results_dicts):
    intersection_features = set(results_dicts[0].keys())
    for results in results_dicts:
        intersection_features.intersection_update({feature for feature, selected in results.items() if selected})
    return list(intersection_features)

def majority_voting_features(results_dicts):
    feature_count = {}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] = feature_count.get(feature, 0) + 1
    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

# تابع اصلی برای کنترل تعداد ویژگی‌ها و بررسی دقت
def evaluate_feature_selection(data, target_name='is_switch_statements', feature_counts=[10, 15, 20, 25]):
    results = {}
    methods = {
        "RFECV": select_features_by_rfecv,
        "SFS": select_features_by_sfs,
        "RFE": select_features_by_rfe,
        "Genetic": select_features_by_custom_genetic
    }

    for method_name, method_func in methods.items():
        print(f"\nارزیابی روش: {method_name}")
        best_accuracy = 0
        best_feature_count = 0
        best_features_dict = None

        for n_features in feature_counts:
            print(f"تعداد ویژگی‌ها: {n_features}")
            selected_features_dict = method_func(data, target_name=target_name, n_features_to_select=n_features)
            selected_features = [feature for feature, selected in selected_features_dict.items() if selected]

            # محاسبه دقت
            solution = np.array([1 if feature in selected_features else 0 for feature in data.drop(target_name, axis=1).columns])
            accuracy = fitness(solution, data.drop(target_name, axis=1).values, data[target_name].values)
            print(f"دقت: {accuracy}")

            # بررسی اگر دقت کاهش یافت
            if accuracy < best_accuracy:
                print(f"دقت کاهش یافت. متوقف کردن فرآیند برای این روش.")
                break

            # به‌روزرسانی بهترین دقت و تعداد ویژگی‌ها
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature_count = n_features
                best_features_dict = selected_features_dict

        results[method_name] = {
            "best_accuracy": best_accuracy,
            "best_feature_count": best_feature_count,
            "best_features_dict": best_features_dict
        }

    # محاسبه اجتماع، اشتراک و رأی‌گیری حداکثری
    print("\nمحاسبه اجتماع، اشتراک و رأی‌گیری حداکثری:")
    all_results_dicts = [results[method]["best_features_dict"] for method in results]
    union = union_features(all_results_dicts)
    intersection = intersection_features(all_results_dicts)
    majority = majority_voting_features(all_results_dicts)

    print(f"اجتماع: {union}")
    print(f"اشتراک: {intersection}")
    print(f"رأی‌گیری حداکثری: {majority}")

    return results

# اجرای تابع اصلی
results = evaluate_feature_selection(data, target_name='is_switch_statements')

# چاپ نتایج
for method_name, result in results.items():
    print(f"\nنتایج برای روش {method_name}:")
    print(f"بهترین تعداد ویژگی‌ها: {result['best_feature_count']}")
    print(f"بهترین دقت: {result['best_accuracy']}")
    print(f"ویژگی‌های انتخاب‌شده: {[feature for feature, selected in result['best_features_dict'].items() if selected]}")


ارزیابی روش: RFECV
تعداد ویژگی‌ها: 10
دقت: 0.9285714285714286
تعداد ویژگی‌ها: 15
دقت: 0.9206349206349206
دقت کاهش یافت. متوقف کردن فرآیند برای این روش.

ارزیابی روش: SFS
تعداد ویژگی‌ها: 10
دقت: 0.873015873015873
تعداد ویژگی‌ها: 15
دقت: 0.8650793650793651
دقت کاهش یافت. متوقف کردن فرآیند برای این روش.

ارزیابی روش: RFE
تعداد ویژگی‌ها: 10
دقت: 0.8888888888888888
تعداد ویژگی‌ها: 15
دقت: 0.9126984126984127
تعداد ویژگی‌ها: 20
دقت: 0.9047619047619048
دقت کاهش یافت. متوقف کردن فرآیند برای این روش.

ارزیابی روش: Genetic
تعداد ویژگی‌ها: 10
دقت: 0.8095238095238095
تعداد ویژگی‌ها: 15
دقت: 0.8095238095238095
تعداد ویژگی‌ها: 20
دقت: 0.8809523809523809
تعداد ویژگی‌ها: 25
دقت: 0.873015873015873
دقت کاهش یافت. متوقف کردن فرآیند برای این روش.

محاسبه اجتماع، اشتراک و رأی‌گیری حداکثری:
اجتماع: ['NOAV_method', 'FANOUT_method', 'MaMCL_method', 'AMWNAMM_type', 'LOC_type', 'CBO_type', 'CC_method', 'NOC_type', 'NOAM_type', 'WMCNAMM_type', 'WMC_type', 'DIT_type', 'NOPA_type', 'NMCS_method', 'LCOM5_type', 'NO

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.metrics import accuracy_score

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/LongParameterList.csv')

# آماده‌سازی داده‌ها
X = data.drop('is_long_parameters_list', axis=1)
y = data['is_long_parameters_list']

# انتخاب ویژگی‌ها
def select_features_by_rfecv(data, target_name='is_long_parameters_list', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFECV(estimator, step=1, cv=StratifiedKFold(10), scoring='accuracy')
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_long_parameters_list', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select='auto', direction='forward', scoring='accuracy', cv=StratifiedKFold(10))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_rfe(data, target_name='is_long_parameters_list', estimator=LogisticRegression(solver='liblinear'), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

def select_features_by_custom_genetic(data, target_name='is_long_parameters_list', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

# اجرای روش‌های انتخاب ویژگی
rfecv_results = select_features_by_rfecv(data, target_name='is_long_parameters_list')
sfs_results = select_features_by_sfs(data, target_name='is_long_parameters_list')
rfe_results = select_features_by_rfe(data, target_name='is_long_parameters_list', n_features_to_select=10)
genetic_results = select_features_by_custom_genetic(data, target_name='is_long_parameters_list')

# چاپ ویژگی‌های انتخاب‌شده توسط هر الگوریتم
def print_selected_features(method_name, results):
    selected_features = [feature for feature, selected in results.items() if selected]
    print(f"ویژگی‌های انتخاب‌شده با روش {method_name}: {selected_features}")

print_selected_features("RFECV", rfecv_results)
print_selected_features("SFS", sfs_results)
print_selected_features("RFE", rfe_results)
print_selected_features("الگوریتم ژنتیک", genetic_results)

# ترکیب نتایج
results_dicts = [rfecv_results, sfs_results, rfe_results, genetic_results]

# رای‌گیری حداکثری
def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

# عملیات اجتماع و اشتراک
def union_and_intersection_features(results_dicts):
    all_features = set()
    intersection_features = set(results_dicts[0].keys())

    for results in results_dicts:
        all_features.update(results.keys())
        intersection_features.intersection_update({feature for feature, selected in results.items() if selected})

    return list(all_features), list(intersection_features)

# چاپ ویژگی‌های انتخاب‌شده
selected_features = majority_voting_features(results_dicts)
print(f"ویژگی‌های انتخاب‌شده با رای‌گیری حداکثری: {selected_features}")

# چاپ اجتماع و اشتراک
union_features, intersection_features = union_and_intersection_features(results_dicts)
print(f"ویژگی‌های اجتماع: {union_features}")
print(f"ویژگی‌های اشتراک: {intersection_features}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ویژگی‌های انتخاب‌شده با روش RFECV: ['NOP_method', 'LCOM5_type']
ویژگی‌های انتخاب‌شده با روش SFS: ['NOP_method', 'CC_method', 'ATFD_method', 'CM_method', 'LOC_method', 'CYCLO_method', 'NMCS_method', 'NOLV_method', 'MaMCL_method', 'NOAV_method', 'ATLD_method', 'MeMCL_method', 'CDISP_method', 'NOAM_type', 'NOM_type', 'NMO_type', 'ATFD_type', 'FANOUT_type', 'NOA_type', 'CFNAMM_type', 'NOPA_type', 'CBO_type', 'RFC_type', 'NOC_type', 'WMC_type', 'NOMNAMM_package', 'NOM_package']
ویژگی‌های انتخاب‌شده با روش RFE: ['NOP_method', 'MAXNESTING_method', 'MaMCL_method', 'LAA_method', 'ATLD_method', 'MeMCL_method', 'NOII_type', 'NOCS_type', 'TCC_type', 'LCOM5_type']
ویژگی‌های انتخاب‌شده با روش الگوریتم ژنتیک: ['NOP_method', 'ATFD_method', 'CM_method', 'MAXNESTING_method', 'NMCS_method', 'NOLV_method', 'MaMCL_method', 'LAA_method', 'FANOUT_method', 'CINT_method', 'CDISP_meth

In [None]:
# main program 12
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as C

best_kernel = C(1.0) * Matern(length_scale=0.5, nu=1.5)

def select_features_by_rfecv(data, target_name='is_god_class', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFECV(estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_god_class', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select='auto', direction='forward', scoring='accuracy', cv=StratifiedKFold(5))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_rfe(data, target_name='is_god_class', estimator=LogisticRegression(solver='liblinear'), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

def select_features_by_custom_genetic(data, target_name='is_god_class', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/GodClass.csv')

X = data.drop('is_god_class', axis=1)
y = data['is_god_class']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

rfecv_results = select_features_by_rfecv(data_normalized, target_name='is_god_class')
sfs_results = select_features_by_sfs(data_normalized, target_name='is_god_class')
rfe_results = select_features_by_rfe(data_normalized, target_name='is_god_class', n_features_to_select=10)
genetic_results = select_features_by_custom_genetic(data_normalized, target_name='is_god_class')

results_dicts = [rfecv_results, sfs_results, rfe_results, genetic_results]
majority_voting_features = majority_voting_features(results_dicts)

common_features = set()
for results in results_dicts:
    common_features.update(results.keys())

intersection_features = set.intersection(*[set(results.keys()) for results in results_dicts])

# Define parameter grids for GridSearchCV
param_grids = {
    "Decision Tree": {"criterion": ["gini", "entropy"], "max_depth": [10, 20, 30, None]},
    "SVM": {"C": [0.1, 1, 10, 100], "gamma": ["scale", "auto"], "kernel": ["linear", "rbf"]},

    "Logistic Regression": {"C": [0.01, 0.1, 1, 10, 100], "penalty": ["l1", "l2"], "solver": ["liblinear", "saga"]},
    "Random Forest": {"n_estimators": [100, 200, 300], "max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]},
    "Linear Discriminant Analysis": {"solver": ["svd", "lsqr", "eigen"]},
    "Gaussian Process": {"kernel": [best_kernel, None]}  # Example parameter grid for GPC
}

base_classifiers = [
    ("Decision Tree", DecisionTreeClassifier()),
    ("SVM", SVC(probability=True)),

    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier()),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Gaussian Process", GaussianProcessClassifier())
]

optimized_classifiers = []
for name, clf in base_classifiers:
    grid_search = GridSearchCV(clf, param_grids[name], cv=StratifiedKFold(5), scoring='accuracy')
    grid_search.fit(X_normalized, y)
    optimized_classifiers.append((name, grid_search.best_estimator_))

stacking_classifier = StackingClassifier(estimators=optimized_classifiers, final_estimator=MLPClassifier(max_iter=2000,activation='relu',solver='adam'))

feature_sets = {
    "All Features": X_normalized.columns.tolist(),
    "RFECV Features": [feature for feature, selected in rfecv_results.items() if selected],
    "SFS Features": [feature for feature, selected in sfs_results.items() if selected],
    "RFE Features": [feature for feature, selected in rfe_results.items() if selected],
    "Genetic Features": [feature for feature, selected in genetic_results.items() if selected],
    "Union of All Features": list(common_features),
    "Intersection Set": list(intersection_features),
    "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

results_summary = []

for feature_set_name, features in feature_sets.items():
    X_selected = X_normalized[features]
    metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
    metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for name, clf in optimized_classifiers:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

            metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics[name]["f1"].append(f1_score(y_test, y_pred))
            metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

        # Stacking Classifier
        stacking_classifier.fit(X_train, y_train)
        y_pred = stacking_classifier.predict(X_test)
        y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

        metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
        metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
        metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

    print(f"Results for {feature_set_name}:")

    for name, scores in metrics.items():
        avg_accuracy = np.mean(scores["accuracy"])
        avg_f1 = np.mean(scores["f1"])
        avg_auc = np.mean(scores["auc"])
        print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

        results_summary.append({
            "Feature Set": feature_set_name,
            "Classifier": name,
            "Accuracy": avg_accuracy,
            "F1": avg_f1,
            "AUC": avg_auc,
            "Selected Features": features
        })

    print("-" * 40)

# Convert the results summary to a DataFrame for better visualization
results_df = pd.DataFrame(results_summary)

# Display the DataFrame containing the selected features and their evaluation metrics
print(results_df)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Results for All Features:
Decision Tree: Accuracy=0.9690, F1=0.9538, AUC=0.9679
SVM: Accuracy=0.9667, F1=0.9475, AUC=0.9954
Logistic Regression: Accuracy=0.9643, F1=0.9432, AUC=0.9957
Random Forest: Accuracy=0.9762, F1=0.9648, AUC=0.9974
Linear Discriminant Analysis: Accuracy=0.9500, F1=0.9175, AUC=0.9936
Gaussian Process: Accuracy=0.9690, F1=0.9534, AUC=0.9926
Stacking Classifier: Accuracy=0.9714, F1=0.9571, AUC=0.9959
----------------------------------------
Results for RFECV Features:
Decision Tree: Accuracy=0.9619, F1=0.9437, AUC=0.9607
SVM: Accuracy=0.9643, F1=0.9438, AUC=0.9954
Logistic Regression: Accuracy=0.9643, F1=0.9432, AUC=0.9957
Random Forest: Accuracy=0.9738, F1=0.9613, AUC=0.9982
Linear Discriminant Analysis: Accuracy=0.9500, F1=0.9186, AUC=0.9941
Gaussian Process: Accuracy=0.9714, F1=0.9569, AUC=0.9934
Stacking Classifier: Accuracy=0.9762, F1

In [None]:
# Import libraries
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SequentialFeatureSelector as SFS

# Mount Google Drive
drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'

# Load dataset
data = pd.read_csv(Path + '/LongParameterList.csv')

# Normalize data
X = data.drop('is_long_parameters_list', axis=1)
y = data['is_long_parameters_list']
scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)


# ------------------------------
# Feature Selection Algorithms
# ------------------------------

# Forward Selection (FS) with Best Performance
def select_features_by_fs_best(data, target_name='is_long_parameters_list', estimator=LogisticRegression(max_iter=1000)):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, direction='forward', scoring='accuracy', cv=StratifiedKFold(10))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

# Backward Elimination (BE) with Best Performance
def select_features_by_be_best(data, target_name='is_long_parameters_list', estimator=LogisticRegression(max_iter=1000)):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, direction='backward', scoring='accuracy', cv=StratifiedKFold(10))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

# Recursive Feature Elimination with Cross-Validation (RFECV)
def select_features_by_rfecv(data, target_name='is_long_parameters_list', estimator=LogisticRegression(max_iter=1000)):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFECV(estimator, step=2, scoring='accuracy', cv=StratifiedKFold(10))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))


# Genetic Algorithm
def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

def select_features_by_custom_genetic(data, target_name='is_long_parameters_list', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))


# ------------------------------
# Majority Voting, Union, and Intersection
# ------------------------------

def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features


# ------------------------------
# Display Selected Features and Evaluate
# ------------------------------

def print_selected_features(method_name, feature_dict):
    selected_features = [feature for feature, selected in feature_dict.items() if selected]
    print(f"\n{method_name} Selected Features:")
    print(", ".join(selected_features))
    print("-" * 40)
    return selected_features

def train_and_evaluate(data, selected_features, target_name='is_long_parameters_list'):
    if not selected_features:
        print("\nNo features selected")
        return

    X = data[selected_features]
    y = data[target_name]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy with selected features ({len(selected_features)} features): {accuracy:.4f}")
    print("=" * 40)


# ------------------------------
# Main Execution
# ------------------------------

# Run all feature selection methods
fs_results = select_features_by_fs_best(data_normalized)
be_results = select_features_by_be_best(data_normalized)
rfecv_results = select_features_by_rfecv(data_normalized)
genetic_results = select_features_by_custom_genetic(data_normalized)

# Print selected features for each method
fs_selected = print_selected_features("Forward Selection", fs_results)
be_selected = print_selected_features("Backward Elimination", be_results)
rfecv_selected = print_selected_features("RFECV", rfecv_results)
genetic_selected = print_selected_features("Genetic Algorithm", genetic_results)

# Combine results (Majority Voting, Union, Intersection)
results_dicts = [fs_results, be_results]

# Majority Voting
majority_features = majority_voting_features(results_dicts)
print("\nMajority Voting Selected Features:")
print(", ".join(majority_features))
print("=" * 40)

# Union of Features
union_features = list(set().union(*[set(d.keys()) for d in results_dicts if any(d.values())]))
print("\nUnion of Selected Features:")
print(", ".join(union_features))
print("=" * 40)

# Intersection of Features
intersection_features = list(set.intersection(*[set([k for k, v in d.items() if v]) for d in results_dicts]))
print("\nIntersection of Selected Features:")
print(", ".join(intersection_features))
print("=" * 40)

# Evaluate models
print("\nEvaluating Models:")
train_and_evaluate(data_normalized, fs_selected)
train_and_evaluate(data_normalized, be_selected)
train_and_evaluate(data_normalized, rfecv_selected)
train_and_evaluate(data_normalized, genetic_selected)
train_and_evaluate(data_normalized, majority_features)
train_and_evaluate(data_normalized, union_features)
train_and_evaluate(data_normalized, intersection_features)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Forward Selection Selected Features:
NOP_method, CC_method, ATFD_method, CM_method, MAXNESTING_method, CYCLO_method, NMCS_method, MaMCL_method, NOAV_method, CFNAMM_method, CDISP_method, NOII_type, NOAM_type, NOCS_type, NOM_type, NMO_type, FANOUT_type, NOA_type, DIT_type, LOC_type, LOCNAMM_type, CFNAMM_type, NOPA_type, CBO_type, RFC_type, NOC_type, LCOM5_type
----------------------------------------

Backward Elimination Selected Features:
NOP_method, MAXNESTING_method, NOAV_method, CINT_method, NOCS_type, NOA_type, DIT_type, LOC_type, LOCNAMM_type, CFNAMM_type, TCC_type, NOPA_type, CBO_type, RFC_type, NOC_type, LCOM5_type, WOC_type, WMCNAMM_type, AMWNAMM_type, NOCS_package, NOMNAMM_package, NOI_package, LOC_package, NOM_package, NOPK_project, NOI_project, NOM_project, NOMNAMM_project
----------------------------------------

RFECV Selected Features:
NOP_meth

In [None]:
# Import libraries
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SequentialFeatureSelector as SFS

# Mount Google Drive
drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'

# Load dataset
data = pd.read_csv(Path + '/LongParameterList.csv')

# Normalize data
X = data.drop('is_long_parameters_list', axis=1)
y = data['is_long_parameters_list']
scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)


# ------------------------------
# Feature Selection Algorithms
# ------------------------------

# Forward Selection (FS) with Best Performance
def select_features_by_fs_best(data, target_name='is_long_parameters_list', estimator=LogisticRegression(max_iter=1000)):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, direction='forward', scoring='accuracy', cv=StratifiedKFold(10))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

# Backward Elimination (BE) with Best Performance
def select_features_by_be_best(data, target_name='is_long_parameters_list', estimator=LogisticRegression(max_iter=1000)):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, direction='backward', scoring='accuracy', cv=StratifiedKFold(10))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

# Recursive Feature Elimination with Cross-Validation (RFECV)
def select_features_by_rfecv(data, target_name='is_long_parameters_list', estimator=LogisticRegression(max_iter=1000)):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFECV(estimator, step=1, scoring='accuracy', cv=StratifiedKFold(10))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))


# Genetic Algorithm
def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

def select_features_by_custom_genetic(data, target_name='is_long_parameters_list', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))


# ------------------------------
# Majority Voting, Union, and Intersection
# ------------------------------

def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features


# ------------------------------
# Display Selected Features
# ------------------------------

# ------------------------------
# Display Selected Features
# ------------------------------

def print_selected_features(method_name, feature_dict):
    selected_features = [feature for feature, selected in feature_dict.items() if selected]
    print(f"\n{method_name} Selected Features:")
    print(", ".join(selected_features))
    print("-" * 40)


# اجرای الگوریتم‌های انتخاب ویژگی
print("Running Feature Selection Algorithms...\n")

# FS (Forward Selection)
fs_results = select_features_by_fs_best(data_normalized, target_name='is_long_parameters_list')
print_selected_features("Forward Selection (FS)", fs_results)

# BE (Backward Elimination)
be_results = select_features_by_be_best(data_normalized, target_name='is_long_parameters_list')
print_selected_features("Backward Elimination (BE)", be_results)

# RFECV
rfecv_results = select_features_by_rfecv(data_normalized, target_name='is_long_parameters_list')
print_selected_features("Recursive Feature Elimination (RFECV)", rfecv_results)

# Genetic Algorithm
genetic_results = select_features_by_custom_genetic(data_normalized, target_name='is_long_parameters_list')
print_selected_features("Genetic Algorithm", genetic_results)
# جمع‌بندی نتایج
results_dicts = [fs_results, be_results, rfecv_results, genetic_results]

# اتحاد (Union) ویژگی‌ها
union_features = set()
for results in results_dicts:
    union_features.update([feature for feature, selected in results.items() if selected])
print(f"\nUnion of Selected Features: {list(union_features)}")

# اشتراک (Intersection) ویژگی‌ها
intersection_features = set.intersection(*[set([feature for feature, selected in results.items() if selected]) for results in results_dicts])
print(f"\nIntersection of Selected Features: {list(intersection_features)}")

# رأی‌گیری حداکثری (Majority Voting)
majority_features = majority_voting_features(results_dicts)
print(f"\nMajority Voting Selected Features: {majority_features}")

Mounted at /content/drive
Running Feature Selection Algorithms...


Forward Selection (FS) Selected Features:
NOP_method, CC_method, ATFD_method, CM_method, MAXNESTING_method, CYCLO_method, NMCS_method, MaMCL_method, NOAV_method, CFNAMM_method, CDISP_method, NOII_type, NOAM_type, NOCS_type, NOM_type, NMO_type, FANOUT_type, NOA_type, DIT_type, LOC_type, LOCNAMM_type, CFNAMM_type, NOPA_type, CBO_type, RFC_type, NOC_type, LCOM5_type
----------------------------------------

Backward Elimination (BE) Selected Features:
NOP_method, MAXNESTING_method, NOAV_method, CINT_method, NOCS_type, NOA_type, DIT_type, LOC_type, LOCNAMM_type, CFNAMM_type, TCC_type, NOPA_type, CBO_type, RFC_type, NOC_type, LCOM5_type, WOC_type, WMCNAMM_type, AMWNAMM_type, NOCS_package, NOMNAMM_package, NOI_package, LOC_package, NOM_package, NOPK_project, NOI_project, NOM_project, NOMNAMM_project
----------------------------------------

Recursive Feature Elimination (RFECV) Selected Features:
NOP_method, NOLV_method, NOA

In [None]:
# Import libraries
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS

# Mount Google Drive
drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'

# Load dataset
data = pd.read_csv(Path + '/LongParameterList.csv')

# Normalize data
X = data.drop('is_long_parameters_list', axis=1)
y = data['is_long_parameters_list']
scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)


# ------------------------------
# Feature Selection Algorithms
# ------------------------------

# Forward Selection (FS)
def select_features_by_fs(data, target_name='is_long_parameters_list', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select=10, direction='forward', scoring='accuracy', cv=StratifiedKFold(10))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

# Backward Elimination (BE)
def select_features_by_be(data, target_name='is_long_parameters_list', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select=10, direction='backward', scoring='accuracy', cv=StratifiedKFold(10))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

# Recursive Feature Elimination (RFE)
def select_features_by_rfe(data, target_name='is_long_parameters_list', estimator=LogisticRegression(solver='liblinear'), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))


# Genetic Algorithm
def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

def select_features_by_custom_genetic(data, target_name='is_long_parameters_list', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))


# ------------------------------
# Majority Voting, Union, and Intersection
# ------------------------------

def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features


# ------------------------------
# Display Selected Features
# ------------------------------

def print_selected_features(method_name, feature_dict):
    print(f"\n{method_name} Selected Features:")
    for feature, selected in feature_dict.items():
        print(f"{feature}: {'Selected' if selected else 'Not Selected'}")
    print("-" * 40)


# اجرای الگوریتم‌های انتخاب ویژگی
print("Running Feature Selection Algorithms...\n")

# FS (Forward Selection)
fs_results = select_features_by_fs(data_normalized, target_name='is_long_parameters_list')
print_selected_features("Forward Selection (FS)", fs_results)

# BE (Backward Elimination)
be_results = select_features_by_be(data_normalized, target_name='is_long_parameters_list')
print_selected_features("Backward Elimination (BE)", be_results)

# RFE (Recursive Feature Elimination)
rfe_results = select_features_by_rfe(data_normalized, target_name='is_long_parameters_list', n_features_to_select=10)
print_selected_features("Recursive Feature Elimination (RFE)", rfe_results)

# Genetic Algorithm
genetic_results = select_features_by_custom_genetic(data_normalized, target_name='is_long_parameters_list')
print_selected_features("Genetic Algorithm", genetic_results)

# جمع‌بندی نتایج
results_dicts = [fs_results, be_results, rfe_results, genetic_results]

# اتحاد (Union) ویژگی‌ها
union_features = set()
for results in results_dicts:
    union_features.update([feature for feature, selected in results.items() if selected])
print(f"\nUnion of Selected Features: {list(union_features)}")

# اشتراک (Intersection) ویژگی‌ها
intersection_features = set.intersection(*[set([feature for feature, selected in results.items() if selected]) for results in results_dicts])
print(f"\nIntersection of Selected Features: {list(intersection_features)}")

# رأی‌گیری حداکثری (Majority Voting)
majority_features = majority_voting_features(results_dicts)
print(f"\nMajority Voting Selected Features: {majority_features}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Running Feature Selection Algorithms...


Forward Selection (FS) Selected Features:
NOP_method: Selected
CC_method: Selected
ATFD_method: Not Selected
FDP_method: Not Selected
CM_method: Selected
MAXNESTING_method: Not Selected
LOC_method: Not Selected
CYCLO_method: Selected
NMCS_method: Not Selected
NOLV_method: Not Selected
MaMCL_method: Not Selected
NOAV_method: Selected
LAA_method: Not Selected
FANOUT_method: Not Selected
CFNAMM_method: Not Selected
ATLD_method: Not Selected
CINT_method: Selected
MeMCL_method: Not Selected
CDISP_method: Not Selected
NOII_type: Selected
NOAM_type: Selected
NOCS_type: Not Selected
NOM_type: Not Selected
NMO_type: Not Selected
ATFD_type: Not Selected
FANOUT_type: Not Selected
NOMNAMM_type: Not Selected
NOA_type: Not Selected
NIM_type: Not Selected
DIT_type: Not Selected
LOC_type: Not Selected
LOCNAMM_type: Selected
CFNAMM_ty

In [None]:
# main program 12
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as C

best_kernel = C(1.0) * Matern(length_scale=0.5, nu=1.5)

def select_features_by_rfecv(data, target_name='is_god_class', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFECV(estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_god_class', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select='auto', direction='forward', scoring='accuracy', cv=StratifiedKFold(5))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))
    # Forward Selection (FS) implementation

def select_features_by_fs(data, target_name='is_god_class', estimator=RandomForestClassifier()):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select=10, direction='forward', scoring='accuracy', cv=StratifiedKFold(10))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

# Backward Elimination (BE) implementation
def select_features_by_be(data, target_name='is_god_class', estimator=RandomForestClassifier()):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select=10, direction='backward', scoring='accuracy', cv=StratifiedKFold(10))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_rfe(data, target_name='is_god_class', estimator=LogisticRegression(solver='liblinear'), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

def select_features_by_custom_genetic(data, target_name='is_god_class', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/GodClass.csv')

X = data.drop('is_god_class', axis=1)
y = data['is_god_class']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)


fs_results = select_features_by_fs(data_normalized, target_name='is_god_class')
be_results = select_features_by_be(data_normalized, target_name='is_god_class')
rfe_results = select_features_by_rfe(data_normalized, target_name='is_god_class', n_features_to_select=10)
genetic_results = select_features_by_custom_genetic(data_normalized, target_name='is_god_class')

results_dicts = [be_results,fs_results, rfe_results, genetic_results]
majority_voting_features = majority_voting_features(results_dicts)

common_features = set()
for results in results_dicts:
    common_features.update(results.keys())

intersection_features = set.intersection(*[set(results.keys()) for results in results_dicts])

# Define parameter grids for GridSearchCV
param_grids = {
    "Decision Tree": {"criterion": ["gini", "entropy"], "max_depth": [10, 20, 30, None]},
    "SVM": {"C": [0.1, 1, 10, 100], "gamma": ["scale", "auto"], "kernel": ["linear", "rbf"]},

    "Logistic Regression": {"C": [0.01, 0.1, 1, 10, 100], "penalty": ["l1", "l2"], "solver": ["liblinear", "saga"]},
    "Random Forest": {"n_estimators": [100, 200, 300], "max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]},
    "Linear Discriminant Analysis": {"solver": ["svd", "lsqr", "eigen"]},
    "Gaussian Process": {"kernel": [best_kernel, None]}  # Example parameter grid for GPC
}

base_classifiers = [
    ("Decision Tree", DecisionTreeClassifier()),
    ("SVM", SVC(probability=True)),

    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier()),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Gaussian Process", GaussianProcessClassifier())
]

optimized_classifiers = []
for name, clf in base_classifiers:
    grid_search = GridSearchCV(clf, param_grids[name], cv=StratifiedKFold(5), scoring='accuracy')
    grid_search.fit(X_normalized, y)
    optimized_classifiers.append((name, grid_search.best_estimator_))

stacking_classifier = StackingClassifier(estimators=optimized_classifiers, final_estimator=MLPClassifier(max_iter=2000,activation='relu',solver='adam'))

feature_sets = {
    "All Features": X_normalized.columns.tolist(),

    "SFS Features": [feature for feature, selected in fs_results.items() if selected],
    "SFS Features": [feature for feature, selected in be_results.items() if selected],
    "RFE Features": [feature for feature, selected in rfe_results.items() if selected],
    "Genetic Features": [feature for feature, selected in genetic_results.items() if selected],
    "Union of All Features": list(common_features),
    "Intersection Set": list(intersection_features),
    "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

results_summary = []

for feature_set_name, features in feature_sets.items():
    X_selected = X_normalized[features]
    metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
    metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for name, clf in optimized_classifiers:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

            metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics[name]["f1"].append(f1_score(y_test, y_pred))
            metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

        # Stacking Classifier
        stacking_classifier.fit(X_train, y_train)
        y_pred = stacking_classifier.predict(X_test)
        y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

        metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
        metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
        metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

    print(f"Results for {feature_set_name}:")

    for name, scores in metrics.items():
        avg_accuracy = np.mean(scores["accuracy"])
        avg_f1 = np.mean(scores["f1"])
        avg_auc = np.mean(scores["auc"])
        print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

        results_summary.append({
            "Feature Set": feature_set_name,
            "Classifier": name,
            "Accuracy": avg_accuracy,
            "F1": avg_f1,
            "AUC": avg_auc,
            "Selected Features": features
        })

    print("-" * 40)

# Convert the results summary to a DataFrame for better visualization
results_df = pd.DataFrame(results_summary)

# Display the DataFrame containing the selected features and their evaluation metrics
print(results_df)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Results for All Features:
Decision Tree: Accuracy=0.9619, F1=0.9417, AUC=0.9536
SVM: Accuracy=0.9667, F1=0.9475, AUC=0.9954
Logistic Regression: Accuracy=0.9643, F1=0.9432, AUC=0.9957
Random Forest: Accuracy=0.9786, F1=0.9682, AUC=0.9977
Linear Discriminant Analysis: Accuracy=0.9500, F1=0.9175, AUC=0.9936
Gaussian Process: Accuracy=0.9690, F1=0.9534, AUC=0.9926
Stacking Classifier: Accuracy=0.9690, F1=0.9529, AUC=0.9972
----------------------------------------


In [None]:
# main program
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as C

# Define the kernel for Gaussian Process Classifier
best_kernel = C(1.0) * Matern(length_scale=0.5, nu=1.5)

# RFECV feature selection
def select_features_by_rfecv(data, target_name='is_god_class', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFECV(estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

# Forward Selection (FS) implementation
def select_features_by_fs(data, target_name='is_god_class', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select='auto', direction='forward', scoring='accuracy', cv=StratifiedKFold(5))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

# Backward Elimination (BE) implementation
def select_features_by_be(data, target_name='is_god_class', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select='auto', direction='backward', scoring='accuracy', cv=StratifiedKFold(5))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

# RFE feature selection
def select_features_by_rfe(data, target_name='is_god_class', estimator=LogisticRegression(solver='liblinear'), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

# Genetic algorithm for feature selection
def select_features_by_custom_genetic(data, target_name='is_god_class', pop_size=20, n_generations=50, mutation_rate=0.01):
    def fitness(solution, X, y):
        if np.sum(solution) == 0:
            return 0
        X_selected = X[:, solution == 1]
        X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return accuracy_score(y_test, y_pred)

    def initialize_population(pop_size, num_features):
        return np.random.randint(2, size=(pop_size, num_features))

    def select_parents(population, fitness_scores):
        parents = []
        for _ in range(len(population)):
            i, j = np.random.choice(len(population), 2, replace=False)
            if fitness_scores[i] > fitness_scores[j]:
                parents.append(population[i])
            else:
                parents.append(population[j])
        return np.array(parents)

    def crossover(parents):
        offspring = []
        for i in range(0, len(parents), 2):
            if i + 1 < len(parents):
                crossover_point = random.randint(1, parents.shape[1] - 1)
                parent1, parent2 = parents[i], parents[i + 1]
                child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
                child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
                offspring.extend([child1, child2])
        return np.array(offspring)

    def mutate(offspring, mutation_rate=0.01):
        for individual in offspring:
            for gene in range(len(individual)):
                if random.random() < mutation_rate:
                    individual[gene] = 1 - individual[gene]
        return offspring

    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

# Majority voting for feature selection
def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

# Load and preprocess data
drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/GodClass.csv')

X = data.drop('is_god_class', axis=1)
y = data['is_god_class']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

# Perform feature selection using different methods
rfecv_results = select_features_by_rfecv(data_normalized, target_name='is_god_class')
fs_results = select_features_by_fs(data_normalized, target_name='is_god_class')
be_results = select_features_by_be(data_normalized, target_name='is_god_class')
rfe_results = select_features_by_rfe(data_normalized, target_name='is_god_class', n_features_to_select=10)
genetic_results = select_features_by_custom_genetic(data_normalized, target_name='is_god_class')

results_dicts = [rfecv_results, fs_results, be_results, rfe_results, genetic_results]
majority_voting_features = majority_voting_features(results_dicts)

# Feature sets
feature_sets = {
    "All Features": X_normalized.columns.tolist(),
    "RFECV Features": [feature for feature, selected in rfecv_results.items() if selected],
    "Forward Selection Features": [feature for feature, selected in fs_results.items() if selected],
    "Backward Elimination Features": [feature for feature, selected in be_results.items() if selected],
    "RFE Features": [feature for feature, selected in rfe_results.items() if selected],
    "Genetic Features": [feature for feature, selected in genetic_results.items() if selected],
    "Majority Voting Features": majority_voting_features,
}

# Define classifiers
base_classifiers = [
    ("Decision Tree", DecisionTreeClassifier()),
    ("SVM", SVC(probability=True)),
    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier()),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Gaussian Process", GaussianProcessClassifier()),
]

# Evaluate classifiers for each feature set
results_summary = []
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for feature_set_name, features in feature_sets.items():
    X_selected = X_normalized[features]
    metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for name, clf in base_classifiers:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

            metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics[name]["f1"].append(f1_score(y_test, y_pred))
            metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

    # Aggregate results
    for name, scores in metrics.items():
        avg_accuracy = np.mean(scores["accuracy"])
        avg_f1 = np.mean(scores["f1"])
        avg_auc = np.mean(scores["auc"])
        results_summary.append({
            "Feature Set": feature_set_name,
            "Classifier": name,
            "Accuracy": avg_accuracy,
            "F1": avg_f1,
            "AUC": avg_auc,
        })

# Convert to DataFrame and display
results_df = pd.DataFrame(results_summary)
print(results_df)

Mounted at /content/drive
                      Feature Set                    Classifier  Accuracy  \
0                    All Features                 Decision Tree  0.964286   
1                    All Features                           SVM  0.961905   
2                    All Features           Logistic Regression  0.964286   
3                    All Features                 Random Forest  0.978571   
4                    All Features  Linear Discriminant Analysis  0.950000   
5                    All Features              Gaussian Process  0.959524   
6                  RFECV Features                 Decision Tree  0.961905   
7                  RFECV Features                           SVM  0.964286   
8                  RFECV Features           Logistic Regression  0.964286   
9                  RFECV Features                 Random Forest  0.976190   
10                 RFECV Features  Linear Discriminant Analysis  0.950000   
11                 RFECV Features              Gau

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from random import random

from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as C

best_kernel = C(1.0) * Matern(length_scale=0.5, nu=1.5)

# ** توابع انتخاب ویژگی **

# SFS Forward
def select_features_by_sfs_forward(data, target_name='is_god_class', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select='auto', direction='forward', scoring='accuracy', cv=StratifiedKFold(5))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

# SFS Backward
def select_features_by_sfs_backward(data, target_name='is_god_class', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select='auto', direction='backward', scoring='accuracy', cv=StratifiedKFold(5))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_custom_genetic(data, target_name='is_long_parameters_list', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = np.random.randint(2, size=(pop_size, X.shape[1]))

    def fitness(solution):
        if np.sum(solution) == 0:
            return 0
        X_selected = X[:, solution == 1]
        if X_selected.shape[1] == 0:
            return 0
        X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return (y_test == y_pred).mean()

    best_solution = None
    best_fitness = -1
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual) for individual in population])
        if np.all(fitness_scores == 0):
            continue
        current_best = population[np.argmax(fitness_scores)]
        current_fitness = np.max(fitness_scores)
        if current_fitness > best_fitness:
            best_solution = current_best
            best_fitness = current_fitness
        parents = [current_best]
        offspring = []
        for i in range(0, len(parents), 2):
            if i + 1 < len(parents):
                crossover_point = np.random.randint(1, parents.shape[1] - 1)
                parent1, parent2 = parents[i], parents[i + 1]
                child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
                child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
                offspring.extend([child1, child2])
        population = np.array(offspring)
        for individual in population:
            for gene in range(len(individual)):
                if np.random.random() < mutation_rate:
                    individual[gene] = 1 - individual[gene]

    if best_solution is None:
        raise ValueError("No valid solution found.")

    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))
# Majority Voting
def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

# ** بارگذاری و پیش‌پردازش داده‌ها **
drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/GodClass.csv')

X = data.drop('is_god_class', axis=1)
y = data['is_god_class']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

# ** اجرای الگوریتم‌های انتخاب ویژگی **
sfs_forward_results = select_features_by_sfs_forward(data_normalized, target_name='is_god_class')
sfs_backward_results = select_features_by_sfs_backward(data_normalized, target_name='is_god_class')
genetic_results = select_features_by_custom_genetic(data_normalized, target_name='is_god_class')

results_dicts = [sfs_forward_results, sfs_backward_results, genetic_results]
majority_voting_features = majority_voting_features(results_dicts)

common_features = set()
for results in results_dicts:
    common_features.update(results.keys())

intersection_features = set.intersection(*[set(results.keys()) for results in results_dicts])

# Define parameter grids for GridSearchCV
param_grids = {
    "Decision Tree": {"criterion": ["gini", "entropy"], "max_depth": [10, 20, 30, None]},
    "SVM": {"C": [0.1, 1, 10, 100], "gamma": ["scale", "auto"], "kernel": ["linear", "rbf"]},

    "Logistic Regression": {"C": [0.01, 0.1, 1, 10, 100], "penalty": ["l1", "l2"], "solver": ["liblinear", "saga"]},
    "Random Forest": {"n_estimators": [100, 200, 300], "max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]},
    "Linear Discriminant Analysis": {"solver": ["svd", "lsqr", "eigen"]},
    "Gaussian Process": {"kernel": [best_kernel, None]}  # Example parameter grid for GPC
}

base_classifiers = [
    ("Decision Tree", DecisionTreeClassifier()),
    ("SVM", SVC(probability=True)),

    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier()),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Gaussian Process", GaussianProcessClassifier())
]

optimized_classifiers = []
for name, clf in base_classifiers:
    grid_search = GridSearchCV(clf, param_grids[name], cv=StratifiedKFold(5), scoring='accuracy')
    grid_search.fit(X_normalized, y)
    optimized_classifiers.append((name, grid_search.best_estimator_))

stacking_classifier = StackingClassifier(estimators=optimized_classifiers, final_estimator=MLPClassifier(max_iter=2000,activation='relu',solver='adam'))

feature_sets = {
    "All Features": X_normalized.columns.tolist(),
    "RFECV Features": [feature for feature, selected in rfecv_results.items() if selected],
    "SFS Features": [feature for feature, selected in sfs_results.items() if selected],
    "RFE Features": [feature for feature, selected in rfe_results.items() if selected],
    "Genetic Features": [feature for feature, selected in genetic_results.items() if selected],
    "Union of All Features": list(common_features),
    "Intersection Set": list(intersection_features),
    "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

results_summary = []

for feature_set_name, features in feature_sets.items():
    X_selected = X_normalized[features]
    metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
    metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for name, clf in optimized_classifiers:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

            metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics[name]["f1"].append(f1_score(y_test, y_pred))
            metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

        # Stacking Classifier
        stacking_classifier.fit(X_train, y_train)
        y_pred = stacking_classifier.predict(X_test)
        y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

        metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
        metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
        metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

    print(f"Results for {feature_set_name}:")

    for name, scores in metrics.items():
        avg_accuracy = np.mean(scores["accuracy"])
        avg_f1 = np.mean(scores["f1"])
        avg_auc = np.mean(scores["auc"])
        print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

        results_summary.append({
            "Feature Set": feature_set_name,
            "Classifier": name,
            "Accuracy": avg_accuracy,
            "F1": avg_f1,
            "AUC": avg_auc,
            "Selected Features": features
        })

    print("-" * 40)

# Convert the results summary to a DataFrame for better visualization
results_df = pd.DataFrame(results_summary)

# Display the DataFrame containing the selected features and their evaluation metrics
print(results_df)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# main program 11
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as C

best_kernel = C(1.0) * Matern(length_scale=0.5, nu=1.5)

def select_features_by_rfecv(data, target_name='is_long_method', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFECV(estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_long_method', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select='auto', direction='forward', scoring='accuracy', cv=StratifiedKFold(5))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_rfe(data, target_name='is_long_method', estimator=LogisticRegression(solver='liblinear'), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

def select_features_by_custom_genetic(data, target_name='is_long_method', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/LongMethod.csv')

X = data.drop('is_long_method', axis=1)
y = data['is_long_method']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

rfecv_results = select_features_by_rfecv(data_normalized, target_name='is_long_method')
sfs_results = select_features_by_sfs(data_normalized, target_name='is_long_method')
rfe_results = select_features_by_rfe(data_normalized, target_name='is_long_method', n_features_to_select=10)
genetic_results = select_features_by_custom_genetic(data_normalized, target_name='is_long_method')

results_dicts = [rfecv_results, sfs_results, rfe_results, genetic_results]
majority_voting_features = majority_voting_features(results_dicts)

common_features = set()
for results in results_dicts:
    common_features.update(results.keys())

intersection_features = set.intersection(*[set(results.keys()) for results in results_dicts])

# Define parameter grids for GridSearchCV
param_grids = {
    "Decision Tree": {"criterion": ["gini", "entropy"], "max_depth": [10, 20, 30, None]},
    "SVM": {"C": [0.1, 1, 10, 100], "gamma": ["scale", "auto"], "kernel": ["linear", "rbf"]},

    "Logistic Regression": {"C": [0.01, 0.1, 1, 10, 100], "penalty": ["l1", "l2"], "solver": ["liblinear", "saga"]},
    "Random Forest": {"n_estimators": [100, 200, 300], "max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]},
    "Linear Discriminant Analysis": {"solver": ["svd", "lsqr", "eigen"]},
    "Gaussian Process": {"kernel": [best_kernel, None]}  # Example parameter grid for GPC
}

base_classifiers = [
    ("Decision Tree", DecisionTreeClassifier()),
    ("SVM", SVC(probability=True)),

    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier()),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Gaussian Process", GaussianProcessClassifier())
]

optimized_classifiers = []
for name, clf in base_classifiers:
    grid_search = GridSearchCV(clf, param_grids[name], cv=StratifiedKFold(5), scoring='accuracy')
    grid_search.fit(X_normalized, y)
    optimized_classifiers.append((name, grid_search.best_estimator_))

stacking_classifier = StackingClassifier(estimators=optimized_classifiers, final_estimator=MLPClassifier(max_iter=2000,activation='relu',solver='adam'))

feature_sets = {
    "All Features": X_normalized.columns.tolist(),
    "RFECV Features": [feature for feature, selected in rfecv_results.items() if selected],
    "SFS Features": [feature for feature, selected in sfs_results.items() if selected],
    "RFE Features": [feature for feature, selected in rfe_results.items() if selected],
    "Genetic Features": [feature for feature, selected in genetic_results.items() if selected],
    "Union of All Features": list(common_features),
    "Intersection Set": list(intersection_features),
    "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

results_summary = []

for feature_set_name, features in feature_sets.items():
    X_selected = X_normalized[features]
    metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
    metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for name, clf in optimized_classifiers:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

            metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics[name]["f1"].append(f1_score(y_test, y_pred))
            metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

        # Stacking Classifier
        stacking_classifier.fit(X_train, y_train)
        y_pred = stacking_classifier.predict(X_test)
        y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

        metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
        metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
        metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

    print(f"Results for {feature_set_name}:")

    for name, scores in metrics.items():
        avg_accuracy = np.mean(scores["accuracy"])
        avg_f1 = np.mean(scores["f1"])
        avg_auc = np.mean(scores["auc"])
        print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

        results_summary.append({
            "Feature Set": feature_set_name,
            "Classifier": name,
            "Accuracy": avg_accuracy,
            "F1": avg_f1,
            "AUC": avg_auc,
            "Selected Features": features
        })

    print("-" * 40)

# Convert the results summary to a DataFrame for better visualization
results_df = pd.DataFrame(results_summary)

# Display the DataFrame containing the selected features and their evaluation metrics
print(results_df)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Results for All Features:
Decision Tree: Accuracy=0.9905, F1=0.9854, AUC=0.9875
SVM: Accuracy=0.9738, F1=0.9603, AUC=0.9972
Logistic Regression: Accuracy=0.9905, F1=0.9857, AUC=0.9992
Random Forest: Accuracy=0.9905, F1=0.9860, AUC=0.9997
Linear Discriminant Analysis: Accuracy=0.9500, F1=0.9241, AUC=0.9802
Gaussian Process: Accuracy=0.9833, F1=0.9754, AUC=0.9982
Stacking Classifier: Accuracy=0.9952, F1=0.9928, AUC=0.9995
----------------------------------------
Results for RFECV Features:
Decision Tree: Accuracy=0.9929, F1=0.9891, AUC=0.9911
SVM: Accuracy=0.9881, F1=0.9825, AUC=0.9990
Logistic Regression: Accuracy=0.9905, F1=0.9862, AUC=0.9985
Random Forest: Accuracy=0.9976, F1=0.9963, AUC=1.0000
Linear Discriminant Analysis: Accuracy=0.9643, F1=0.9444, AUC=0.9972
Gaussian Process: Accuracy=0.9857, F1=0.9791, AUC=0.9992
Stacking Classifier: Accuracy=0.9952, F1

In [None]:
# main program 10
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as C

best_kernel = C(1.0) * Matern(length_scale=0.5, nu=1.5)

def select_features_by_rfecv(data, target_name='is_data_class', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFECV(estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_data_class', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select='auto', direction='forward', scoring='accuracy', cv=StratifiedKFold(5))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_rfe(data, target_name='is_data_class', estimator=LogisticRegression(solver='liblinear'), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

def select_features_by_custom_genetic(data, target_name='is_data_class', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/FeatureEnvy.csv')

X = data.drop('is_feature_envy', axis=1)
y = data['is_feature_envy']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

rfecv_results = select_features_by_rfecv(data_normalized, target_name='is_feature_envy')
sfs_results = select_features_by_sfs(data_normalized, target_name='is_feature_envy')
rfe_results = select_features_by_rfe(data_normalized, target_name='is_feature_envy', n_features_to_select=10)
genetic_results = select_features_by_custom_genetic(data_normalized, target_name='is_feature_envy')

results_dicts = [rfecv_results, sfs_results, rfe_results, genetic_results]
majority_voting_features = majority_voting_features(results_dicts)

common_features = set()
for results in results_dicts:
    common_features.update(results.keys())

intersection_features = set.intersection(*[set(results.keys()) for results in results_dicts])

# Define parameter grids for GridSearchCV
param_grids = {
    "Decision Tree": {"criterion": ["gini", "entropy"], "max_depth": [10, 20, 30, None]},
    "SVM": {"C": [0.1, 1, 10, 100], "gamma": ["scale", "auto"], "kernel": ["linear", "rbf"]},

    "Logistic Regression": {"C": [0.01, 0.1, 1, 10, 100], "penalty": ["l1", "l2"], "solver": ["liblinear", "saga"]},
    "Random Forest": {"n_estimators": [100, 200, 300], "max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]},
    "Linear Discriminant Analysis": {"solver": ["svd", "lsqr", "eigen"]},
    "Gaussian Process": {"kernel": [best_kernel, None]}  # Example parameter grid for GPC
}

base_classifiers = [
    ("Decision Tree", DecisionTreeClassifier()),
    ("SVM", SVC(probability=True)),

    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier()),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Gaussian Process", GaussianProcessClassifier())
]

optimized_classifiers = []
for name, clf in base_classifiers:
    grid_search = GridSearchCV(clf, param_grids[name], cv=StratifiedKFold(5), scoring='accuracy')
    grid_search.fit(X_normalized, y)
    optimized_classifiers.append((name, grid_search.best_estimator_))

stacking_classifier = StackingClassifier(estimators=optimized_classifiers, final_estimator=MLPClassifier(max_iter=2000,activation='relu',solver='adam'))

feature_sets = {
    "All Features": X_normalized.columns.tolist(),
    "RFECV Features": [feature for feature, selected in rfecv_results.items() if selected],
    "SFS Features": [feature for feature, selected in sfs_results.items() if selected],
    "RFE Features": [feature for feature, selected in rfe_results.items() if selected],
    "Genetic Features": [feature for feature, selected in genetic_results.items() if selected],
    "Union of All Features": list(common_features),
    "Intersection Set": list(intersection_features),
    "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

results_summary = []

for feature_set_name, features in feature_sets.items():
    X_selected = X_normalized[features]
    metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
    metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for name, clf in optimized_classifiers:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

            metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics[name]["f1"].append(f1_score(y_test, y_pred))
            metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

        # Stacking Classifier
        stacking_classifier.fit(X_train, y_train)
        y_pred = stacking_classifier.predict(X_test)
        y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

        metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
        metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
        metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

    print(f"Results for {feature_set_name}:")

    for name, scores in metrics.items():
        avg_accuracy = np.mean(scores["accuracy"])
        avg_f1 = np.mean(scores["f1"])
        avg_auc = np.mean(scores["auc"])
        print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

        results_summary.append({
            "Feature Set": feature_set_name,
            "Classifier": name,
            "Accuracy": avg_accuracy,
            "F1": avg_f1,
            "AUC": avg_auc,
            "Selected Features": features
        })

    print("-" * 40)

# Convert the results summary to a DataFrame for better visualization
results_df = pd.DataFrame(results_summary)

# Display the DataFrame containing the selected features and their evaluation metrics
print(results_df)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Results for All Features:
Decision Tree: Accuracy=0.9452, F1=0.9189, AUC=0.9411
SVM: Accuracy=0.9452, F1=0.9172, AUC=0.9640
Logistic Regression: Accuracy=0.9476, F1=0.9189, AUC=0.9878
Random Forest: Accuracy=0.9548, F1=0.9337, AUC=0.9893
Linear Discriminant Analysis: Accuracy=0.8881, F1=0.8089, AUC=0.9643
Gaussian Process: Accuracy=0.9452, F1=0.9141, AUC=0.9847
Stacking Classifier: Accuracy=0.9595, F1=0.9401, AUC=0.9895
----------------------------------------
Results for RFECV Features:
Decision Tree: Accuracy=0.9571, F1=0.9346, AUC=0.9500
SVM: Accuracy=0.9595, F1=0.9382, AUC=0.9895
Logistic Regression: Accuracy=0.9429, F1=0.9107, AUC=0.9885
Random Forest: Accuracy=0.9643, F1=0.9467, AUC=0.9944
Linear Discriminant Analysis: Accuracy=0.8952, F1=0.8147, AUC=0.9916
Gaussian Process: Accuracy=0.9738, F1=0.9602, AUC=0.9918
Stacking Classifier: Accuracy=0.9714, F1

In [None]:
# main program 9
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as C

best_kernel = C(1.0) * Matern(length_scale=0.5, nu=1.5)

def select_features_by_rfecv(data, target_name='is_data_class', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFECV(estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_data_class', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select='auto', direction='forward', scoring='accuracy', cv=StratifiedKFold(5))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_rfe(data, target_name='is_data_class', estimator=LogisticRegression(solver='liblinear'), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

def select_features_by_custom_genetic(data, target_name='is_data_class', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/FeatureEnvy.csv')

X = data.drop('is_data_class', axis=1)
y = data['is_data_class']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

rfecv_results = select_features_by_rfecv(data_normalized, target_name='is_data_class')
sfs_results = select_features_by_sfs(data_normalized, target_name='is_data_class')
rfe_results = select_features_by_rfe(data_normalized, target_name='is_data_class', n_features_to_select=10)
genetic_results = select_features_by_custom_genetic(data_normalized, target_name='is_data_class')

results_dicts = [rfecv_results, sfs_results, rfe_results, genetic_results]
majority_voting_features = majority_voting_features(results_dicts)

common_features = set()
for results in results_dicts:
    common_features.update(results.keys())

intersection_features = set.intersection(*[set(results.keys()) for results in results_dicts])

# Define parameter grids for GridSearchCV
param_grids = {
    "Decision Tree": {"criterion": ["gini", "entropy"], "max_depth": [10, 20, 30, None]},
    "SVM": {"C": [0.1, 1, 10, 100], "gamma": ["scale", "auto"], "kernel": ["linear", "rbf"]},

    "Logistic Regression": {"C": [0.01, 0.1, 1, 10, 100], "penalty": ["l1", "l2"], "solver": ["liblinear", "saga"]},
    "Random Forest": {"n_estimators": [100, 200, 300], "max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]},
    "Linear Discriminant Analysis": {"solver": ["svd", "lsqr", "eigen"]},
    "Gaussian Process": {"kernel": [best_kernel, None]}  # Example parameter grid for GPC
}

base_classifiers = [
    ("Decision Tree", DecisionTreeClassifier()),
    ("SVM", SVC(probability=True)),

    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier()),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Gaussian Process", GaussianProcessClassifier())
]

optimized_classifiers = []
for name, clf in base_classifiers:
    grid_search = GridSearchCV(clf, param_grids[name], cv=StratifiedKFold(5), scoring='accuracy')
    grid_search.fit(X_normalized, y)
    optimized_classifiers.append((name, grid_search.best_estimator_))

stacking_classifier = StackingClassifier(estimators=optimized_classifiers, final_estimator=MLPClassifier(max_iter=2000,activation='relu',solver='adam'))

feature_sets = {
    "All Features": X_normalized.columns.tolist(),
    "RFECV Features": [feature for feature, selected in rfecv_results.items() if selected],
    "SFS Features": [feature for feature, selected in sfs_results.items() if selected],
    "RFE Features": [feature for feature, selected in rfe_results.items() if selected],
    "Genetic Features": [feature for feature, selected in genetic_results.items() if selected],
    "Union of All Features": list(common_features),
    "Intersection Set": list(intersection_features),
    "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

results_summary = []

for feature_set_name, features in feature_sets.items():
    X_selected = X_normalized[features]
    metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
    metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for name, clf in optimized_classifiers:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

            metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics[name]["f1"].append(f1_score(y_test, y_pred))
            metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

        # Stacking Classifier
        stacking_classifier.fit(X_train, y_train)
        y_pred = stacking_classifier.predict(X_test)
        y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

        metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
        metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
        metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

    print(f"Results for {feature_set_name}:")

    for name, scores in metrics.items():
        avg_accuracy = np.mean(scores["accuracy"])
        avg_f1 = np.mean(scores["f1"])
        avg_auc = np.mean(scores["auc"])
        print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

        results_summary.append({
            "Feature Set": feature_set_name,
            "Classifier": name,
            "Accuracy": avg_accuracy,
            "F1": avg_f1,
            "AUC": avg_auc,
            "Selected Features": features
        })

    print("-" * 40)

# Convert the results summary to a DataFrame for better visualization
results_df = pd.DataFrame(results_summary)

# Display the DataFrame containing the selected features and their evaluation metrics
print(results_df)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Results for All Features:
Decision Tree: Accuracy=0.9881, F1=0.9827, AUC=0.9893
SVM: Accuracy=0.9690, F1=0.9554, AUC=0.9916
Logistic Regression: Accuracy=0.9762, F1=0.9654, AUC=0.9939
Random Forest: Accuracy=0.9857, F1=0.9795, AUC=1.0000
Linear Discriminant Analysis: Accuracy=0.9190, F1=0.8864, AUC=0.9788
Gaussian Process: Accuracy=0.9738, F1=0.9608, AUC=0.9967
Stacking Classifier: Accuracy=0.9952, F1=0.9928, AUC=1.0000
----------------------------------------
Results for RFECV Features:
Decision Tree: Accuracy=0.9381, F1=0.9061, AUC=0.9304
SVM: Accuracy=0.9476, F1=0.9229, AUC=0.9806
Logistic Regression: Accuracy=0.9476, F1=0.9236, AUC=0.9821
Random Forest: Accuracy=0.9714, F1=0.9565, AUC=0.9967
Linear Discriminant Analysis: Accuracy=0.9119, F1=0.8788, AUC=0.9661
Gaussian Process: Accuracy=0.9643, F1=0.9468, AUC=0.9908
Stacking Classifier: Accuracy=0.9714, F1

KeyboardInterrupt: 

In [None]:
# main program 8
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as C

best_kernel = C(1.0) * Matern(length_scale=0.5, nu=1.5)

def select_features_by_rfecv(data, target_name='is_long_parameters_list', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFECV(estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_long_parameters_list', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select='auto', direction='forward', scoring='accuracy', cv=StratifiedKFold(5))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_rfe(data, target_name='is_long_parameters_list', estimator=LogisticRegression(solver='liblinear'), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

def select_features_by_custom_genetic(data, target_name='is_switch_statements', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/SwitchStatements.csv')

X = data.drop('is_switch_statements', axis=1)
y = data['is_switch_statements']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

rfecv_results = select_features_by_rfecv(data_normalized, target_name='is_switch_statements')
sfs_results = select_features_by_sfs(data_normalized, target_name='is_switch_statements')
rfe_results = select_features_by_rfe(data_normalized, target_name='is_switch_statements', n_features_to_select=10)
genetic_results = select_features_by_custom_genetic(data_normalized, target_name='is_switch_statements')

results_dicts = [rfecv_results, sfs_results, rfe_results, genetic_results]
majority_voting_features = majority_voting_features(results_dicts)

common_features = set()
for results in results_dicts:
    common_features.update(results.keys())

intersection_features = set.intersection(*[set(results.keys()) for results in results_dicts])

# Define parameter grids for GridSearchCV
param_grids = {
    "Decision Tree": {"criterion": ["gini", "entropy"], "max_depth": [10, 20, 30, None]},
    "SVM": {"C": [0.1, 1, 10, 100], "gamma": ["scale", "auto"], "kernel": ["linear", "rbf"]},
    "KNN": {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]},
    "Logistic Regression": {"C": [0.01, 0.1, 1, 10, 100], "penalty": ["l1", "l2"], "solver": ["liblinear", "saga"]},
    "MLP": {"hidden_layer_sizes": [(50,), (100,), (50,50)], "activation": ["tanh", "relu"], "solver": ["adam", "sgd"]},
    "Linear Discriminant Analysis": {"solver": ["svd", "lsqr", "eigen"]},
    "Gaussian Process": {"kernel": [best_kernel, None]}  # Example parameter grid for GPC
}

base_classifiers = [
    ("Decision Tree", DecisionTreeClassifier()),
    ("SVM", SVC(probability=True)),
    ("KNN", KNeighborsClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('MLP', MLPClassifier(max_iter=2000)),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Gaussian Process", GaussianProcessClassifier())
]

optimized_classifiers = []
for name, clf in base_classifiers:
    grid_search = GridSearchCV(clf, param_grids[name], cv=StratifiedKFold(5), scoring='accuracy')
    grid_search.fit(X_normalized, y)
    optimized_classifiers.append((name, grid_search.best_estimator_))

stacking_classifier = StackingClassifier(estimators=optimized_classifiers, final_estimator=RandomForestClassifier())

feature_sets = {
    "All Features": X_normalized.columns.tolist(),
    "RFECV Features": [feature for feature, selected in rfecv_results.items() if selected],
    "SFS Features": [feature for feature, selected in sfs_results.items() if selected],
    "RFE Features": [feature for feature, selected in rfe_results.items() if selected],
    "Genetic Features": [feature for feature, selected in genetic_results.items() if selected],
    "Union of All Features": list(common_features),
    "Intersection Set": list(intersection_features),
    "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

results_summary = []

for feature_set_name, features in feature_sets.items():
    X_selected = X_normalized[features]
    metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
    metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for name, clf in optimized_classifiers:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

            metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics[name]["f1"].append(f1_score(y_test, y_pred))
            metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

        # Stacking Classifier
        stacking_classifier.fit(X_train, y_train)
        y_pred = stacking_classifier.predict(X_test)
        y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

        metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
        metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
        metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

    print(f"Results for {feature_set_name}:")

    for name, scores in metrics.items():
        avg_accuracy = np.mean(scores["accuracy"])
        avg_f1 = np.mean(scores["f1"])
        avg_auc = np.mean(scores["auc"])
        print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

        results_summary.append({
            "Feature Set": feature_set_name,
            "Classifier": name,
            "Accuracy": avg_accuracy,
            "F1": avg_f1,
            "AUC": avg_auc,
            "Selected Features": features
        })

    print("-" * 40)

# Convert the results summary to a DataFrame for better visualization
results_df = pd.DataFrame(results_summary)

# Display the DataFrame containing the selected features and their evaluation metrics
print(results_df)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Results for All Features:
Decision Tree: Accuracy=0.8452, F1=0.7396, AUC=0.8132
SVM: Accuracy=0.8595, F1=0.7488, AUC=0.9292
KNN: Accuracy=0.7905, F1=0.6036, AUC=0.8498
Logistic Regression: Accuracy=0.9000, F1=0.8343, AUC=0.9526
MLP: Accuracy=0.8762, F1=0.8031, AUC=0.9355
Linear Discriminant Analysis: Accuracy=0.8500, F1=0.7381, AUC=0.9196
Gaussian Process: Accuracy=0.8548, F1=0.7496, AUC=0.9356
Stacking Classifier: Accuracy=0.8905, F1=0.8197, AUC=0.9451
----------------------------------------
Results for RFECV Features:
Decision Tree: Accuracy=0.8500, F1=0.7630, AUC=0.8319
SVM: Accuracy=0.8667, F1=0.7621, AUC=0.9438
KNN: Accuracy=0.8571, F1=0.7488, AUC=0.9149
Logistic Regression: Accuracy=0.8952, F1=0.8268, AUC=0.9609
MLP: Accuracy=0.8905, F1=0.8223, AUC=0.9580
Linear Discriminant Analysis: Accuracy=0.8714, F1=0.7801, AUC=0.9435
Gaussian Process: Accuracy=0.

In [None]:
#main program 7
import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as C
best_kernel = C(1.0) * Matern(length_scale=0.5, nu=1.5)

def select_features_by_rfecv(data, target_name='is_switch_statements', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFECV(estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_switch_statements', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select='auto', direction='forward', scoring='accuracy', cv=StratifiedKFold(5))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_rfe(data, target_name='is_switch_statements', estimator=LogisticRegression(solver='liblinear'), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

def select_features_by_custom_genetic(data, target_name='is_switch_statements', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/SwitchStatements.csv')

X = data.drop('is_switch_statements', axis=1)
y = data['is_switch_statements']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

rfecv_results = select_features_by_rfecv(data_normalized, target_name='is_switch_statements')
sfs_results = select_features_by_sfs(data_normalized, target_name='is_switch_statements')
rfe_results = select_features_by_rfe(data_normalized, target_name='is_switch_statements', n_features_to_select=10)
genetic_results = select_features_by_custom_genetic(data_normalized, target_name='is_switch_statements')

results_dicts = [rfecv_results, sfs_results, rfe_results, genetic_results]
majority_voting_features = majority_voting_features(results_dicts)

common_features = set()
for results in results_dicts:
    common_features.update(results.keys())

intersection_features = set.intersection(*[set(results.keys()) for results in results_dicts])

base_classifiers = [
    ("Decision Tree", DecisionTreeClassifier()),
    ("SVM", SVC(C=100, gamma='auto', kernel='rbf', probability=True)),
    ("KNN", KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski')),
    ('Logistic Regression', LogisticRegression(C=100,penalty = 'l1',solver = 'saga')),
    ('MLP', MLPClassifier(max_iter=2000)),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Gaussian Process", GaussianProcessClassifier(kernel=best_kernel, random_state=42)),
]

stacking_classifier = StackingClassifier(estimators=base_classifiers, final_estimator=RandomForestClassifier())

feature_sets = {
    "All Features": X_normalized.columns.tolist(),
    "RFECV Features": [feature for feature, selected in rfecv_results.items() if selected],
    "SFS Features": [feature for feature, selected in sfs_results.items() if selected],
    "RFE Features": [feature for feature, selected in rfe_results.items() if selected],
    "Genetic Features": [feature for feature, selected in genetic_results.items() if selected],
    "Union of All Features": list(common_features),
    "Intersection Set": list(intersection_features),
    "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

results_summary = []

for feature_set_name, features in feature_sets.items():
    X_selected = X_normalized[features]
    metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
    metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for name, clf in base_classifiers:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

            metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics[name]["f1"].append(f1_score(y_test, y_pred))
            metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

        # Stacking Classifier
        stacking_classifier.fit(X_train, y_train)
        y_pred = stacking_classifier.predict(X_test)
        y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

        metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
        metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
        metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

    print(f"Results for {feature_set_name}:")

    for name, scores in metrics.items():
        avg_accuracy = np.mean(scores["accuracy"])
        avg_f1 = np.mean(scores["f1"])
        avg_auc = np.mean(scores["auc"])
        print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

        results_summary.append({
            "Feature Set": feature_set_name,
            "Classifier": name,
            "Accuracy": avg_accuracy,
            "F1": avg_f1,
            "AUC": avg_auc,
            "Selected Features": features
        })

    print("-" * 40)

# Convert the results summary to a DataFrame for better visualization
results_df = pd.DataFrame(results_summary)

# Display the DataFrame containing the selected features and their evaluation metrics
print(results_df)

Mounted at /content/drive




Results for All Features:
Decision Tree: Accuracy=0.8429, F1=0.7445, AUC=0.8158
SVM: Accuracy=0.8595, F1=0.7528, AUC=0.9298
KNN: Accuracy=0.7690, F1=0.6040, AUC=0.8224
Logistic Regression: Accuracy=0.8548, F1=0.7552, AUC=0.9345
MLP: Accuracy=0.8738, F1=0.7926, AUC=0.9318
Linear Discriminant Analysis: Accuracy=0.8500, F1=0.7381, AUC=0.9196
Gaussian Process: Accuracy=0.8548, F1=0.7496, AUC=0.9356
Stacking Classifier: Accuracy=0.8571, F1=0.7498, AUC=0.9249
----------------------------------------




Results for RFECV Features:
Decision Tree: Accuracy=0.8357, F1=0.7412, AUC=0.8172
SVM: Accuracy=0.8929, F1=0.8229, AUC=0.9542
KNN: Accuracy=0.8357, F1=0.7227, AUC=0.8907
Logistic Regression: Accuracy=0.8810, F1=0.8022, AUC=0.9556
MLP: Accuracy=0.8905, F1=0.8242, AUC=0.9569
Linear Discriminant Analysis: Accuracy=0.8714, F1=0.7801, AUC=0.9435
Gaussian Process: Accuracy=0.8929, F1=0.8271, AUC=0.9567
Stacking Classifier: Accuracy=0.8690, F1=0.7850, AUC=0.9398
----------------------------------------




Results for SFS Features:
Decision Tree: Accuracy=0.8524, F1=0.7619, AUC=0.8315
SVM: Accuracy=0.8667, F1=0.7669, AUC=0.9440
KNN: Accuracy=0.8000, F1=0.6353, AUC=0.8464
Logistic Regression: Accuracy=0.8714, F1=0.7883, AUC=0.9468
MLP: Accuracy=0.8595, F1=0.7680, AUC=0.9346
Linear Discriminant Analysis: Accuracy=0.8595, F1=0.7624, AUC=0.9383
Gaussian Process: Accuracy=0.8690, F1=0.7791, AUC=0.9463
Stacking Classifier: Accuracy=0.8762, F1=0.7924, AUC=0.9427
----------------------------------------




Results for RFE Features:
Decision Tree: Accuracy=0.8500, F1=0.7643, AUC=0.8344
SVM: Accuracy=0.8952, F1=0.8259, AUC=0.9556
KNN: Accuracy=0.8500, F1=0.7330, AUC=0.8715
Logistic Regression: Accuracy=0.8810, F1=0.8010, AUC=0.9564
MLP: Accuracy=0.8881, F1=0.8187, AUC=0.9585
Linear Discriminant Analysis: Accuracy=0.8667, F1=0.7694, AUC=0.9444
Gaussian Process: Accuracy=0.8952, F1=0.8281, AUC=0.9561
Stacking Classifier: Accuracy=0.8857, F1=0.8145, AUC=0.9521
----------------------------------------




Results for Genetic Features:
Decision Tree: Accuracy=0.8524, F1=0.7635, AUC=0.8269
SVM: Accuracy=0.8714, F1=0.7803, AUC=0.9401
KNN: Accuracy=0.7905, F1=0.6377, AUC=0.8408
Logistic Regression: Accuracy=0.8786, F1=0.7952, AUC=0.9452
MLP: Accuracy=0.8881, F1=0.8172, AUC=0.9512
Linear Discriminant Analysis: Accuracy=0.8571, F1=0.7509, AUC=0.9395
Gaussian Process: Accuracy=0.8738, F1=0.7869, AUC=0.9465
Stacking Classifier: Accuracy=0.8833, F1=0.8091, AUC=0.9463
----------------------------------------




Results for Union of All Features:
Decision Tree: Accuracy=0.8500, F1=0.7501, AUC=0.8209
SVM: Accuracy=0.8595, F1=0.7528, AUC=0.9298
KNN: Accuracy=0.7690, F1=0.6040, AUC=0.8224
Logistic Regression: Accuracy=0.8548, F1=0.7552, AUC=0.9343
MLP: Accuracy=0.8571, F1=0.7629, AUC=0.9320
Linear Discriminant Analysis: Accuracy=0.8500, F1=0.7381, AUC=0.9196
Gaussian Process: Accuracy=0.8548, F1=0.7496, AUC=0.9356
Stacking Classifier: Accuracy=0.8524, F1=0.7421, AUC=0.9273
----------------------------------------




Results for Intersection Set:
Decision Tree: Accuracy=0.8548, F1=0.7588, AUC=0.8244
SVM: Accuracy=0.8595, F1=0.7528, AUC=0.9298
KNN: Accuracy=0.7690, F1=0.6040, AUC=0.8224
Logistic Regression: Accuracy=0.8548, F1=0.7552, AUC=0.9351
MLP: Accuracy=0.8738, F1=0.7953, AUC=0.9326
Linear Discriminant Analysis: Accuracy=0.8500, F1=0.7381, AUC=0.9196
Gaussian Process: Accuracy=0.8548, F1=0.7496, AUC=0.9356
Stacking Classifier: Accuracy=0.8667, F1=0.7674, AUC=0.9309
----------------------------------------




Results for Majority Set:
Decision Tree: Accuracy=0.8595, F1=0.7722, AUC=0.8365
SVM: Accuracy=0.8667, F1=0.7719, AUC=0.9439
KNN: Accuracy=0.8167, F1=0.6770, AUC=0.8448
Logistic Regression: Accuracy=0.8714, F1=0.7883, AUC=0.9444
MLP: Accuracy=0.8762, F1=0.7936, AUC=0.9449
Linear Discriminant Analysis: Accuracy=0.8571, F1=0.7542, AUC=0.9353
Gaussian Process: Accuracy=0.8714, F1=0.7835, AUC=0.9505
Stacking Classifier: Accuracy=0.8548, F1=0.7641, AUC=0.9359
----------------------------------------
     Feature Set                    Classifier  Accuracy        F1       AUC  \
0   All Features                 Decision Tree  0.842857  0.744463  0.815769   
1   All Features                           SVM  0.859524  0.752805  0.929761   
2   All Features                           KNN  0.769048  0.604001  0.822366   
3   All Features           Logistic Regression  0.854762  0.755226  0.934523   
4   All Features                           MLP  0.873810  0.792551  0.931782   
..           ...     

In [None]:
#main program 6
import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier

def select_features_by_rfecv(data, target_name='is_long_parameters_list', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFECV(estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_long_parameters_list', estimator=LogisticRegression(solver='liblinear')):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = SFS(estimator, n_features_to_select='auto', direction='forward', scoring='accuracy', cv=StratifiedKFold(5))
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_rfe(data, target_name='is_long_parameters_list', estimator=LogisticRegression(solver='liblinear'), n_features_to_select=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

def select_features_by_custom_genetic(data, target_name='is_long_parameters_list', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        all_features.update(results.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for feature, selected in results.items():
            if selected:
                feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/LongParameterList.csv')

X = data.drop('is_long_parameters_list', axis=1)
y = data['is_long_parameters_list']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

rfecv_results = select_features_by_rfecv(data_normalized, target_name='is_long_parameters_list')
sfs_results = select_features_by_sfs(data_normalized, target_name='is_long_parameters_list')
rfe_results = select_features_by_rfe(data_normalized, target_name='is_long_parameters_list', n_features_to_select=10)
genetic_results = select_features_by_custom_genetic(data_normalized, target_name='is_long_parameters_list')

results_dicts = [rfecv_results, sfs_results, rfe_results, genetic_results]
majority_voting_features = majority_voting_features(results_dicts)

common_features = set()
for results in results_dicts:
    common_features.update(results.keys())

intersection_features = set.intersection(*[set(results.keys()) for results in results_dicts])

base_classifiers = [
    ("Decision Tree", DecisionTreeClassifier()),
    ("SVM", SVC(C=10, gamma='scale', kernel='linear', probability=True)),
    ("KNN", KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski')),
    ('Logistic Regression', LogisticRegression()),
    ('MLP', RandomForestClassifier()),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Gaussian Process", GaussianProcessClassifier()),
]

stacking_classifier = StackingClassifier(estimators=base_classifiers, final_estimator=MLPClassifier(max_iter=2000))

feature_sets = {
    "All Features": X_normalized.columns.tolist(),
    "RFECV Features": [feature for feature, selected in rfecv_results.items() if selected],
    "SFS Features": [feature for feature, selected in sfs_results.items() if selected],
    "RFE Features": [feature for feature, selected in rfe_results.items() if selected],
    "Genetic Features": [feature for feature, selected in genetic_results.items() if selected],
    "Union of All Features": list(common_features),
    "Intersection Set": list(intersection_features),
    "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for feature_set_name, features in feature_sets.items():
    X_selected = X_normalized[features]
    metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
    metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for name, clf in base_classifiers:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

            metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics[name]["f1"].append(f1_score(y_test, y_pred))
            metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

        # Stacking Classifier
        stacking_classifier.fit(X_train, y_train)
        y_pred = stacking_classifier.predict(X_test)
        y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

        metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
        metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
        metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

    print(f"Results for {feature_set_name}:")

    for name, scores in metrics.items():
        avg_accuracy = np.mean(scores["accuracy"])
        avg_f1 = np.mean(scores["f1"])
        avg_auc = np.mean(scores["auc"])
        print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

    print("-" * 40)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Results for All Features:
Decision Tree: Accuracy=0.9190, F1=0.8766, AUC=0.9118
SVM: Accuracy=0.9048, F1=0.8485, AUC=0.9614
KNN: Accuracy=0.7690, F1=0.6258, AUC=0.8054
Logistic Regression: Accuracy=0.8690, F1=0.7559, AUC=0.9602
MLP: Accuracy=0.9405, F1=0.9013, AUC=0.9632
Linear Discriminant Analysis: Accuracy=0.8786, F1=0.7871, AUC=0.9513
Gaussian Process: Accuracy=0.8190, F1=0.6312, AUC=0.9171
Stacking Classifier: Accuracy=0.9214, F1=0.8772, AUC=0.9552
----------------------------------------
Results for RFECV Features:
Decision Tree: Accuracy=0.9071, F1=0.8572, AUC=0.8982
SVM: Accuracy=0.9190, F1=0.8567, AUC=0.9712
KNN: Accuracy=0.9310, F1=0.8924, AUC=0.9502
Logistic Regression: Accuracy=0.8857, F1=0.7808, AUC=0.9625
MLP: Accuracy=0.9405, F1=0.9029, AUC=0.9587
Linear Discriminant Analysis: Accuracy=0.9071, F1=0.8255, AUC=0.9582
Gaussian Process: Accuracy=0.

In [None]:
#main program 3
import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier

def select_features_by_rfe(data, target_name='is_long_parameters_list', n_features_list=[10, 15, 20]):
    results = {}
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    estimator = LogisticRegression(solver='liblinear')
    for n_features in n_features_list:
        selector = RFE(estimator, n_features_to_select=n_features, step=1)
        selector = selector.fit(X, y)
        results[n_features] = dict(zip(X.columns, selector.support_))
    return results

def select_features_by_sfs(data, target_name='is_long_parameters_list', n_features_list=[10, 15, 20]):
    results = {}
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    estimator = LogisticRegression(solver='liblinear')
    for n_features in n_features_list:
        selector = SFS(estimator, n_features_to_select=n_features, direction='forward')
        selector = selector.fit(X, y)
        results[n_features] = dict(zip(X.columns, selector.support_))
    return results

def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring

def select_features_by_custom_genetic(data, target_name='is_long_parameters_list', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values
    population = initialize_population(pop_size, X.shape[1])
    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)
    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

def majority_voting_features(results_dicts):
    all_features = set()
    for results in results_dicts:
        for features in results.values():
            all_features.update(features.keys())

    feature_count = {feature: 0 for feature in all_features}
    for results in results_dicts:
        for features in results.values():
            for feature, selected in features.items():
                if selected:
                    feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= len(results_dicts) // 2]
    return majority_features

def select_features_by_new_wrapper(data, target_name='is_long_parameters_list', n_features_list=[10, 15, 20]):
    results = {}
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    estimator = RandomForestClassifier()
    for n_features in n_features_list:
        selector = RFE(estimator, n_features_to_select=n_features, step=1)
        selector = selector.fit(X, y)
        results[n_features] = dict(zip(X.columns, selector.support_))
    return results

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/LongParameterList.csv')

X = data.drop('is_long_parameters_list', axis=1)
y = data['is_long_parameters_list']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

n_features_list = [10, 15, 20]

rfe_results = select_features_by_rfe(data_normalized, target_name='is_long_parameters_list', n_features_list=n_features_list)
sfs_results = select_features_by_sfs(data_normalized, target_name='is_long_parameters_list', n_features_list=n_features_list)
genetic_results = select_features_by_custom_genetic(data_normalized, target_name='is_long_parameters_list')
new_wrapper_results = select_features_by_new_wrapper(data_normalized, target_name='is_long_parameters_list', n_features_list=n_features_list)

results_dicts = [rfe_results, sfs_results, {20: genetic_results}, new_wrapper_results]
majority_voting_features = majority_voting_features(results_dicts)

common_features = set()
for results in results_dicts:
    for features in results.values():
        common_features.update(features.keys())

intersection_features = set.intersection(*[set(features.keys()) for results in results_dicts for features in results.values()])

base_classifiers = [
    ("Decision Tree", DecisionTreeClassifier()),
    ("SVM", SVC(C=10, gamma='scale', kernel='linear', probability=True)),
    ("KNN", KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski')),
    ('Logistic Regression', LogisticRegression()),
    ('MLP', RandomForestClassifier()),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Gaussian Process", GaussianProcessClassifier()),
]

stacking_classifier = StackingClassifier(estimators=base_classifiers, final_estimator=MLPClassifier(max_iter=2000))

feature_sets = {
    "All Features": X_normalized.columns.tolist(),
    "RFE Features": {n: [feature for feature, selected in features.items() if selected] for n, features in rfe_results.items()},
    "SFS Features": {n: [feature for feature, selected in features.items() if selected] for n, features in sfs_results.items()},
    "Genetic Features": [feature for feature, selected in genetic_results.items() if selected],
    "New Wrapper Features": {n: [feature for feature, selected in features.items() if selected] for n, features in new_wrapper_results.items()},
    "Union of All Features": list(common_features),
    "Intersection Set": list(intersection_features),
    "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for feature_set_name, feature_set in feature_sets.items():
    if isinstance(feature_set, dict):
        for n_features, features in feature_set.items():
            X_selected = X_normalized[features]
            metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
            metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

            for train_index, test_index in skf.split(X_selected, y):
                X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                for name, clf in base_classifiers:
                    clf.fit(X_train, y_train)
                    y_pred = clf.predict(X_test)
                    y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

                    metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
                    metrics[name]["f1"].append(f1_score(y_test, y_pred))
                    metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

                # Stacking Classifier
                stacking_classifier.fit(X_train, y_train)
                y_pred = stacking_classifier.predict(X_test)
                y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

                metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
                metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
                metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

            print(f"Results for {feature_set_name} with {n_features} features:")

            for name, scores in metrics.items():
                avg_accuracy = np.mean(scores["accuracy"])
                avg_f1 = np.mean(scores["f1"])
                avg_auc = np.mean(scores["auc"])
                print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

            print("-" * 40)
    else:
        X_selected = X_normalized[feature_set]
        metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
        metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

        for train_index, test_index in skf.split(X_selected, y):
            X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            for name, clf in base_classifiers:
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)
                y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

                metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
                metrics[name]["f1"].append(f1_score(y_test, y_pred))
                metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

            # Stacking Classifier
            stacking_classifier.fit(X_train, y_train)
            y_pred = stacking_classifier.predict(X_test)
            y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

            metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
            metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

        print(f"Results for {feature_set_name}:")

        for name, scores in metrics.items():
            avg_accuracy = np.mean(scores["accuracy"])
            avg_f1 = np.mean(scores["f1"])
            avg_auc = np.mean(scores["auc"])
            print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

        print("-" * 40)

Mounted at /content/drive
Results for All Features:
Decision Tree: Accuracy=0.9143, F1=0.8679, AUC=0.9028
SVM: Accuracy=0.9048, F1=0.8485, AUC=0.9614
KNN: Accuracy=0.7690, F1=0.6258, AUC=0.8054
Logistic Regression: Accuracy=0.8690, F1=0.7559, AUC=0.9602
MLP: Accuracy=0.9405, F1=0.9027, AUC=0.9672
Linear Discriminant Analysis: Accuracy=0.8786, F1=0.7871, AUC=0.9513
Gaussian Process: Accuracy=0.8190, F1=0.6312, AUC=0.9171
Stacking Classifier: Accuracy=0.9286, F1=0.8909, AUC=0.9603
----------------------------------------
Results for RFE Features with 10 features:
Decision Tree: Accuracy=0.9214, F1=0.8778, AUC=0.9117
SVM: Accuracy=0.9238, F1=0.8685, AUC=0.9659
KNN: Accuracy=0.9119, F1=0.8616, AUC=0.9376
Logistic Regression: Accuracy=0.8810, F1=0.7728, AUC=0.9667
MLP: Accuracy=0.9357, F1=0.8973, AUC=0.9654
Linear Discriminant Analysis: Accuracy=0.9095, F1=0.8298, AUC=0.9548
Gaussian Process: Accuracy=0.8833, F1=0.7774, AUC=0.9603
Stacking Classifier: Accuracy=0.9381, F1=0.9020, AUC=0.9699


In [None]:
#main program 2
import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier

def select_features_by_rfe(data, target_name='is_long_parameters_list', n_features=20):
  X = data.drop(target_name, axis=1)
  y = data[target_name]
  estimator = LogisticRegression(solver='liblinear')
  selector = RFE(estimator, n_features_to_select=n_features, step=1)
  selector = selector.fit(X, y)
  return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_long_parameters_list', n_features=20):
  X = data.drop(target_name, axis=1)
  y = data[target_name]
  estimator = LogisticRegression(solver='liblinear')
  selector = SFS(estimator, n_features_to_select=n_features, direction='forward')
  selector = selector.fit(X, y)
  return dict(zip(X.columns, selector.support_))

def fitness(solution, X, y):
  if np.sum(solution) == 0:
    return 0
  X_selected = X[:, solution == 1]
  X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
  model = LogisticRegression(max_iter=1000)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
  return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
  parents = []
  for _ in range(len(population)):
    i, j = np.random.choice(len(population), 2, replace=False)
    if fitness_scores[i] > fitness_scores[j]:
      parents.append(population[i])
    else:
      parents.append(population[j])
  return np.array(parents)

def crossover(parents):
  offspring = []
  for i in range(0, len(parents), 2):
    if i + 1 < len(parents):
      crossover_point = random.randint(1, parents.shape[1] - 1)
      parent1, parent2 = parents[i], parents[i + 1]
      child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
      child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
      offspring.extend([child1, child2])
  return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
  for individual in offspring:
    for gene in range(len(individual)):
      if random.random() < mutation_rate:
        individual[gene] = 1 - individual[gene]
  return offspring

def select_features_by_custom_genetic(data, target_name='is_long_parameters_list', pop_size=20, n_generations=50, mutation_rate=0.01):
  X = data.drop(target_name, axis=1).values
  y = data[target_name].values
  population = initialize_population(pop_size, X.shape[1])
  for generation in range(n_generations):
    fitness_scores = np.array([fitness(individual, X, y) for individual in population])
    parents = select_parents(population, fitness_scores)
    offspring = crossover(parents)
    population = mutate(offspring, mutation_rate)
  best_solution = population[np.argmax(fitness_scores)]
  return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

def majority_voting_features(rfe_scores, sfs_scores, genetic_scores, new_wrapper_scores):
  all_features = set(rfe_scores.keys()).union(sfs_scores.keys()).union(genetic_scores.keys()).union(new_wrapper_scores.keys())
  feature_count = {feature: 0 for feature in all_features}
  for feature in rfe_scores:
    if rfe_scores[feature]:
      feature_count[feature] += 1
  for feature in sfs_scores:
    if sfs_scores[feature]:
      feature_count[feature] += 1
  for feature in genetic_scores:
    if genetic_scores[feature]:
      feature_count[feature] += 1
  for feature in new_wrapper_scores:
    if new_wrapper_scores[feature]:
      feature_count[feature] += 1
  majority_features = [feature for feature, count in feature_count.items() if count >= 3]
  return majority_features

def select_features_by_new_wrapper(data, target_name='is_long_parameters_list', n_features=20):
  X = data.drop(target_name, axis=1)
  y = data[target_name]
  estimator = RandomForestClassifier()
  selector = RFE(estimator, n_features_to_select=n_features, step=1)
  selector = selector.fit(X, y)
  return dict(zip(X.columns, selector.support_))

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/LongParameterList.csv')

X = data.drop('is_long_parameters_list', axis=1)
y = data['is_long_parameters_list']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

rfe_scores = select_features_by_rfe(data_normalized, target_name='is_long_parameters_list', n_features=20)
sfs_scores = select_features_by_sfs(data_normalized, target_name='is_long_parameters_list', n_features=20)
genetic_scores = select_features_by_custom_genetic(data_normalized, target_name='is_long_parameters_list')
new_wrapper_scores = select_features_by_new_wrapper(data_normalized, target_name='is_long_parameters_list', n_features=20)

intersection_features = set(rfe_scores.keys()).intersection(set(sfs_scores.keys()), set(genetic_scores.keys()), set(new_wrapper_scores.keys()))
majority_voting_features = majority_voting_features(rfe_scores, sfs_scores, genetic_scores, new_wrapper_scores)

common_features = set(rfe_scores.keys()).union(set(sfs_scores.keys())).union(set(genetic_scores.keys())).union(set(new_wrapper_scores.keys()))

base_classifiers = [
  ("Decision Tree", DecisionTreeClassifier()),

  ("SVM", SVC(C= 10, gamma= 'scale', kernel= 'linear', probability=True)),
  ("KNN", KNeighborsClassifier(n_neighbors=3,weights='uniform',   algorithm='auto',  leaf_size=30,   p=2,   metric='minkowski', metric_params=None,    n_jobs=None)),
  ('Logistic Regression', LogisticRegression()),
    ('MLP', RandomForestClassifier()),
  ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
  ("Gaussian Process", GaussianProcessClassifier()),

]

stacking_classifier = StackingClassifier(estimators=base_classifiers, final_estimator=(MLPClassifier(max_iter=2000)))

feature_sets = {
  "All Features": X_normalized.columns.tolist(),
  "RFE Features": [feature for feature, selected in rfe_scores.items() if selected],
  "SFS Features": [feature for feature, selected in sfs_scores.items() if selected],
  "Genetic Features": [feature for feature, selected in genetic_scores.items() if selected],
  "New Wrapper Features": [feature for feature, selected in new_wrapper_scores.items() if selected],
  "Union of All Features": list(common_features),
  "Intersection Set": list(intersection_features),
  "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for feature_set_name, feature_set in feature_sets.items():
  X_selected = X_normalized[feature_set]
  metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
  metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

  for train_index, test_index in skf.split(X_selected, y):
    X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    for name, clf in base_classifiers:
      clf.fit(X_train, y_train)
      y_pred = clf.predict(X_test)
      y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

      metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
      metrics[name]["f1"].append(f1_score(y_test, y_pred))
      metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

    # Stacking Classifier
    stacking_classifier.fit(X_train, y_train)
    y_pred = stacking_classifier.predict(X_test)
    y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

    metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
    metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
    metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

  print(f"Results for {feature_set_name}:")

  for name, scores in metrics.items():
    avg_accuracy = np.mean(scores["accuracy"])
    avg_f1 = np.mean(scores["f1"])
    avg_auc = np.mean(scores["auc"])
    print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

  print("-" * 40)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Results for All Features:
Decision Tree: Accuracy=0.9214, F1=0.8817, AUC=0.9137
SVM: Accuracy=0.9048, F1=0.8485, AUC=0.9614
KNN: Accuracy=0.7690, F1=0.6258, AUC=0.8054
Logistic Regression: Accuracy=0.8690, F1=0.7559, AUC=0.9602
MLP: Accuracy=0.9405, F1=0.8996, AUC=0.9665
Linear Discriminant Analysis: Accuracy=0.8786, F1=0.7871, AUC=0.9513
Gaussian Process: Accuracy=0.8190, F1=0.6312, AUC=0.9171
Stacking Classifier: Accuracy=0.9381, F1=0.9028, AUC=0.9658
----------------------------------------
Results for RFE Features:
Decision Tree: Accuracy=0.9000, F1=0.8512, AUC=0.8940
SVM: Accuracy=0.9357, F1=0.8922, AUC=0.9768
KNN: Accuracy=0.8500, F1=0.7643, AUC=0.8937
Logistic Regression: Accuracy=0.8857, F1=0.7872, AUC=0.9697
MLP: Accuracy=0.9357, F1=0.8938, AUC=0.9725
Linear Discriminant Analysis: Accuracy=0.9071, F1=0.8287, AUC=0.9665
Gaussian Process: Accuracy=0.87

In [None]:
#programm 2
import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier

def select_features_by_rfe(data, target_name='is_long_parameters_list', n_features=20):
  X = data.drop(target_name, axis=1)
  y = data[target_name]
  estimator = LogisticRegression(solver='liblinear')
  selector = RFE(estimator, n_features_to_select=n_features, step=1)
  selector = selector.fit(X, y)
  return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_long_parameters_list', n_features=20):
  X = data.drop(target_name, axis=1)
  y = data[target_name]
  estimator = LogisticRegression(solver='liblinear')
  selector = SFS(estimator, n_features_to_select=n_features, direction='forward')
  selector = selector.fit(X, y)
  return dict(zip(X.columns, selector.support_))

def fitness(solution, X, y):
  if np.sum(solution) == 0:
    return 0
  X_selected = X[:, solution == 1]
  X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
  model = LogisticRegression(max_iter=1000)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
  return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
  parents = []
  for _ in range(len(population)):
    i, j = np.random.choice(len(population), 2, replace=False)
    if fitness_scores[i] > fitness_scores[j]:
      parents.append(population[i])
    else:
      parents.append(population[j])
  return np.array(parents)

def crossover(parents):
  offspring = []
  for i in range(0, len(parents), 2):
    if i + 1 < len(parents):
      crossover_point = random.randint(1, parents.shape[1] - 1)
      parent1, parent2 = parents[i], parents[i + 1]
      child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
      child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
      offspring.extend([child1, child2])
  return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
  for individual in offspring:
    for gene in range(len(individual)):
      if random.random() < mutation_rate:
        individual[gene] = 1 - individual[gene]
  return offspring

def select_features_by_custom_genetic(data, target_name='is_long_parameters_list', pop_size=20, n_generations=50, mutation_rate=0.01):
  X = data.drop(target_name, axis=1).values
  y = data[target_name].values
  population = initialize_population(pop_size, X.shape[1])
  for generation in range(n_generations):
    fitness_scores = np.array([fitness(individual, X, y) for individual in population])
    parents = select_parents(population, fitness_scores)
    offspring = crossover(parents)
    population = mutate(offspring, mutation_rate)
  best_solution = population[np.argmax(fitness_scores)]
  return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

def majority_voting_features(rfe_scores, sfs_scores, genetic_scores, new_wrapper_scores):
  all_features = set(rfe_scores.keys()).union(sfs_scores.keys()).union(genetic_scores.keys()).union(new_wrapper_scores.keys())
  feature_count = {feature: 0 for feature in all_features}
  for feature in rfe_scores:
    if rfe_scores[feature]:
      feature_count[feature] += 1
  for feature in sfs_scores:
    if sfs_scores[feature]:
      feature_count[feature] += 1
  for feature in genetic_scores:
    if genetic_scores[feature]:
      feature_count[feature] += 1
  for feature in new_wrapper_scores:
    if new_wrapper_scores[feature]:
      feature_count[feature] += 1
  majority_features = [feature for feature, count in feature_count.items() if count >= 3]
  return majority_features

def select_features_by_new_wrapper(data, target_name='is_long_parameters_list', n_features=20):
  X = data.drop(target_name, axis=1)
  y = data[target_name]
  estimator = RandomForestClassifier()
  selector = RFE(estimator, n_features_to_select=n_features, step=1)
  selector = selector.fit(X, y)
  return dict(zip(X.columns, selector.support_))

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/LongParameterList.csv')

X = data.drop('is_long_parameters_list', axis=1)
y = data['is_long_parameters_list']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

rfe_scores = select_features_by_rfe(data_normalized, target_name='is_long_parameters_list', n_features=20)
sfs_scores = select_features_by_sfs(data_normalized, target_name='is_long_parameters_list', n_features=20)
genetic_scores = select_features_by_custom_genetic(data_normalized, target_name='is_long_parameters_list')
new_wrapper_scores = select_features_by_new_wrapper(data_normalized, target_name='is_long_parameters_list', n_features=20)

intersection_features = set(rfe_scores.keys()).intersection(set(sfs_scores.keys()), set(genetic_scores.keys()), set(new_wrapper_scores.keys()))
majority_voting_features = majority_voting_features(rfe_scores, sfs_scores, genetic_scores, new_wrapper_scores)

common_features = set(rfe_scores.keys()).union(set(sfs_scores.keys())).union(set(genetic_scores.keys())).union(set(new_wrapper_scores.keys()))

base_classifiers = [
  ("Decision Tree", DecisionTreeClassifier()),
  ('Random Forest', RandomForestClassifier()),
  ("SVM", SVC(C=0.1, gamma='scale', kernel='poly', degree=3, probability=True)),
  ("KNN", KNeighborsClassifier(n_neighbors=1)),
  ('Logistic Regression', LogisticRegression()),
  ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
  ("Gaussian Process", GaussianProcessClassifier())
]

stacking_classifier = StackingClassifier(estimators=base_classifiers, final_estimator=MLPClassifier(max_iter=2000))

feature_sets = {
  "All Features": X_normalized.columns.tolist(),
  "RFE Features": [feature for feature, selected in rfe_scores.items() if selected],
  "SFS Features": [feature for feature, selected in sfs_scores.items() if selected],
  "Genetic Features": [feature for feature, selected in genetic_scores.items() if selected],
  "New Wrapper Features": [feature for feature, selected in new_wrapper_scores.items() if selected],
  "Union of All Features": list(common_features),
  "Intersection Set": list(intersection_features),
  "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for feature_set_name, feature_set in feature_sets.items():
  X_selected = X_normalized[feature_set]
  metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
  metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

  for train_index, test_index in skf.split(X_selected, y):
    X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    for name, clf in base_classifiers:
      clf.fit(X_train, y_train)
      y_pred = clf.predict(X_test)
      y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

      metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
      metrics[name]["f1"].append(f1_score(y_test, y_pred))
      metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

    # Stacking Classifier
    stacking_classifier.fit(X_train, y_train)
    y_pred = stacking_classifier.predict(X_test)
    y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

    metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
    metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
    metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

  print(f"Results for {feature_set_name}:")

  for name, scores in metrics.items():
    avg_accuracy = np.mean(scores["accuracy"])
    avg_f1 = np.mean(scores["f1"])
    avg_auc = np.mean(scores["auc"])
    print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

  print("-" * 40)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Results for All Features:
Decision Tree: Accuracy=0.9095, F1=0.8619, AUC=0.8993
Random Forest: Accuracy=0.9405, F1=0.9002, AUC=0.9693
SVM: Accuracy=0.7976, F1=0.5445, AUC=0.9068
KNN: Accuracy=0.7667, F1=0.6352, AUC=0.7284
Logistic Regression: Accuracy=0.8690, F1=0.7559, AUC=0.9602
Linear Discriminant Analysis: Accuracy=0.8786, F1=0.7871, AUC=0.9513
Gaussian Process: Accuracy=0.8190, F1=0.6312, AUC=0.9171
Stacking Classifier: Accuracy=0.9333, F1=0.8937, AUC=0.9642
----------------------------------------
Results for RFE Features:
Decision Tree: Accuracy=0.9024, F1=0.8537, AUC=0.8957
Random Forest: Accuracy=0.9381, F1=0.8999, AUC=0.9722
SVM: Accuracy=0.8357, F1=0.6592, AUC=0.9438
KNN: Accuracy=0.8429, F1=0.7566, AUC=0.8185
Logistic Regression: Accuracy=0.8857, F1=0.7872, AUC=0.9697
Linear Discriminant Analysis: Accuracy=0.9071, F1=0.8287, AUC=0.9665
Gaussian Pr

In [None]:
#main program
import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier

def select_features_by_rfe(data, target_name='is_long_parameters_list', n_features=20):
  X = data.drop(target_name, axis=1)
  y = data[target_name]
  estimator = LogisticRegression(solver='liblinear')
  selector = RFE(estimator, n_features_to_select=n_features, step=1)
  selector = selector.fit(X, y)
  return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_long_parameters_list', n_features=20):
  X = data.drop(target_name, axis=1)
  y = data[target_name]
  estimator = LogisticRegression(solver='liblinear')
  selector = SFS(estimator, n_features_to_select=n_features, direction='forward')
  selector = selector.fit(X, y)
  return dict(zip(X.columns, selector.support_))

def fitness(solution, X, y):
  if np.sum(solution) == 0:
    return 0
  X_selected = X[:, solution == 1]
  X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
  model = LogisticRegression(max_iter=1000)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
  return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
  parents = []
  for _ in range(len(population)):
    i, j = np.random.choice(len(population), 2, replace=False)
    if fitness_scores[i] > fitness_scores[j]:
      parents.append(population[i])
    else:
      parents.append(population[j])
  return np.array(parents)

def crossover(parents):
  offspring = []
  for i in range(0, len(parents), 2):
    if i + 1 < len(parents):
      crossover_point = random.randint(1, parents.shape[1] - 1)
      parent1, parent2 = parents[i], parents[i + 1]
      child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
      child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
      offspring.extend([child1, child2])
  return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
  for individual in offspring:
    for gene in range(len(individual)):
      if random.random() < mutation_rate:
        individual[gene] = 1 - individual[gene]
  return offspring

def select_features_by_custom_genetic(data, target_name='is_long_parameters_list', pop_size=20, n_generations=50, mutation_rate=0.01):
  X = data.drop(target_name, axis=1).values
  y = data[target_name].values
  population = initialize_population(pop_size, X.shape[1])
  for generation in range(n_generations):
    fitness_scores = np.array([fitness(individual, X, y) for individual in population])
    parents = select_parents(population, fitness_scores)
    offspring = crossover(parents)
    population = mutate(offspring, mutation_rate)
  best_solution = population[np.argmax(fitness_scores)]
  return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

def majority_voting_features(rfe_scores, sfs_scores, genetic_scores, new_wrapper_scores):
  all_features = set(rfe_scores.keys()).union(sfs_scores.keys()).union(genetic_scores.keys()).union(new_wrapper_scores.keys())
  feature_count = {feature: 0 for feature in all_features}
  for feature in rfe_scores:
    if rfe_scores[feature]:
      feature_count[feature] += 1
  for feature in sfs_scores:
    if sfs_scores[feature]:
      feature_count[feature] += 1
  for feature in genetic_scores:
    if genetic_scores[feature]:
      feature_count[feature] += 1
  for feature in new_wrapper_scores:
    if new_wrapper_scores[feature]:
      feature_count[feature] += 1
  majority_features = [feature for feature, count in feature_count.items() if count >= 3]
  return majority_features

def select_features_by_new_wrapper(data, target_name='is_long_parameters_list', n_features=20):
  X = data.drop(target_name, axis=1)
  y = data[target_name]
  estimator = RandomForestClassifier()
  selector = RFE(estimator, n_features_to_select=n_features, step=1)
  selector = selector.fit(X, y)
  return dict(zip(X.columns, selector.support_))

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/LongParameterList.csv')

X = data.drop('is_long_parameters_list', axis=1)
y = data['is_long_parameters_list']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

rfe_scores = select_features_by_rfe(data_normalized, target_name='is_long_parameters_list', n_features=20)
sfs_scores = select_features_by_sfs(data_normalized, target_name='is_long_parameters_list', n_features=20)
genetic_scores = select_features_by_custom_genetic(data_normalized, target_name='is_long_parameters_list')
new_wrapper_scores = select_features_by_new_wrapper(data_normalized, target_name='is_long_parameters_list', n_features=20)

intersection_features = set(rfe_scores.keys()).intersection(set(sfs_scores.keys()), set(genetic_scores.keys()), set(new_wrapper_scores.keys()))
majority_voting_features = majority_voting_features(rfe_scores, sfs_scores, genetic_scores, new_wrapper_scores)

common_features = set(rfe_scores.keys()).union(set(sfs_scores.keys())).union(set(genetic_scores.keys())).union(set(new_wrapper_scores.keys()))

base_classifiers = [
  ("Decision Tree", DecisionTreeClassifier()),
  ('Random Forest', RandomForestClassifier()),
  ("SVM", SVC(C=0.1, gamma='scale', kernel='poly', degree=3, probability=True)),
  ("KNN", KNeighborsClassifier(n_neighbors=3,weights='uniform',   algorithm='auto',  leaf_size=30,   p=2,   metric='minkowski', metric_params=None,    n_jobs=None)),
  ('Logistic Regression', LogisticRegression()),
  ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
  ("Gaussian Process", GaussianProcessClassifier()),
  ("MLP",MLPClassifier(max_iter=2000))
]

stacking_classifier = StackingClassifier(estimators=base_classifiers, final_estimator=MLPClassifier(max_iter=2000))

feature_sets = {
  "All Features": X_normalized.columns.tolist(),
  "RFE Features": [feature for feature, selected in rfe_scores.items() if selected],
  "SFS Features": [feature for feature, selected in sfs_scores.items() if selected],
  "Genetic Features": [feature for feature, selected in genetic_scores.items() if selected],
  "New Wrapper Features": [feature for feature, selected in new_wrapper_scores.items() if selected],
  "Union of All Features": list(common_features),
  "Intersection Set": list(intersection_features),
  "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for feature_set_name, feature_set in feature_sets.items():
  X_selected = X_normalized[feature_set]
  metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
  metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

  for train_index, test_index in skf.split(X_selected, y):
    X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    for name, clf in base_classifiers:
      clf.fit(X_train, y_train)
      y_pred = clf.predict(X_test)
      y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

      metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
      metrics[name]["f1"].append(f1_score(y_test, y_pred))
      metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

    # Stacking Classifier
    stacking_classifier.fit(X_train, y_train)
    y_pred = stacking_classifier.predict(X_test)
    y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

    metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
    metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
    metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

  print(f"Results for {feature_set_name}:")

  for name, scores in metrics.items():
    avg_accuracy = np.mean(scores["accuracy"])
    avg_f1 = np.mean(scores["f1"])
    avg_auc = np.mean(scores["auc"])
    print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

  print("-" * 40)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Results for All Features:
Decision Tree: Accuracy=0.9095, F1=0.8640, AUC=0.9029
Random Forest: Accuracy=0.9381, F1=0.8980, AUC=0.9665
SVM: Accuracy=0.7976, F1=0.5445, AUC=0.9068
KNN: Accuracy=0.7690, F1=0.6258, AUC=0.8054
Logistic Regression: Accuracy=0.8690, F1=0.7559, AUC=0.9602
Linear Discriminant Analysis: Accuracy=0.8786, F1=0.7871, AUC=0.9513
Gaussian Process: Accuracy=0.8190, F1=0.6312, AUC=0.9171
Stacking Classifier: Accuracy=0.9381, F1=0.9017, AUC=0.9665
----------------------------------------
Results for RFE Features:
Decision Tree: Accuracy=0.9024, F1=0.8557, AUC=0.8979
Random Forest: Accuracy=0.9357, F1=0.8948, AUC=0.9710
SVM: Accuracy=0.8357, F1=0.6592, AUC=0.9438
KNN: Accuracy=0.8500, F1=0.7643, AUC=0.8937
Logistic Regression: Accuracy=0.8857, F1=0.7872, AUC=0.9697
Linear Discriminant Analysis: Accuracy=0.9071, F1=0.8287, AUC=0.9665
Gaussian Pr

In [None]:
import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier

// 1
def entropy(target_col):
  elements, counts = np.unique(target_col, return_counts=True)
  entropy = -np.sum([(counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(elements))])
  return entropy

def InfoGain(data, split_attribute_name, target_name="class"):
  total_entropy = entropy(data[target_name])
  vals, counts = np.unique(data[split_attribute_name], return_counts=True)
  Weighted_Entropy = np.sum([(counts[i] / np.sum(counts)) * entropy(data.where(data[split_attribute_name] == vals[i]).dropna()[target_name]) for i in range(len(vals))])
  Information_Gain = total_entropy - Weighted_Entropy
  return Information_Gain

def split_info(data, split_attribute_name):
  vals, counts = np.unique(data[split_attribute_name], return_counts=True)
  split_info = -np.sum([(counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(vals))])
  return split_info

def gain_ratio(data, split_attribute_name, target_name="class"):
  gain = InfoGain(data, split_attribute_name, target_name)
  split = split_info(data, split_attribute_name)
  gain_ratio = gain / split
  return gain_ratio

def select_features_by_gain_ratio(data, target_name="class"):
  features = data.columns.drop(target_name)
  scores = {feature: gain_ratio(data, feature, target_name) for feature in features}
  return scores

def select_features_by_mutual_info(data, target_name="class"):
  features = data.columns.drop(target_name)
  scores = dict(zip(features, mutual_info_classif(data[features], data[target_name])))
  return scores

#عریف تابع برای محاسبه ضریب همبستگی Spearman
def spearman_corr(x, y):
  x_rank = x.rank()
  y_rank = y.rank()
  return ((x_rank - x_rank.mean()) * (y_rank - y_rank.mean())).mean() / (x_rank.std() * y_rank.std())

def select_features_by_spearman(data, target_name="class"):
  features = data.columns.drop(target_name)
  scores = {feature: spearman_corr(data[feature], data[target_name]) for feature in features}
  return scores
def select_features_by_rfe(data, target_name='is_long_parameters_list', n_features=10):
  X = data.drop(target_name, axis=1)
  y = data[target_name]
  estimator = LogisticRegression(solver='liblinear')
  selector = RFE(estimator, n_features_to_select=n_features, step=1)
  selector = selector.fit(X, y)
  return dict(zip(X.columns, selector.support_))

#SFS با Logistic Regression
def select_features_by_sfs(data, target_name='is_long_parameters_list', n_features=10):
  X = data.drop(target_name, axis=1)
  y = data[target_name]
  estimator = LogisticRegression(solver='liblinear')
  selector = SFS(estimator, n_features_to_select=n_features, direction='forward')
  selector = selector.fit(X, y)
  return dict(zip(X.columns, selector.support_))


def fitness(solution, X, y):
  if np.sum(solution) == 0:
    return 0


  X_selected = X[:, solution == 1]
  X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

  model = LogisticRegression(max_iter=1000)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  return accuracy_score(y_test, y_pred)
def initialize_population(pop_size, num_features):
  return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring


def select_features_by_custom_genetic(data, target_name='is_long_parameters_list', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values

    population = initialize_population(pop_size, X.shape[1])

    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)

    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

#magority feature selection
def majority_voting_features(rfe_scores, sfs_scores, genetic_scores):
    all_features = set(rfe_scores.keys()).union(sfs_scores.keys()).union(genetic_scores.keys())
    feature_count = {feature: 0 for feature in all_features}

    for feature in rfe_scores:
        if rfe_scores[feature]:
            feature_count[feature] += 1
    for feature in sfs_scores:
        if sfs_scores[feature]:
            feature_count[feature] += 1
    for feature in genetic_scores:
        if genetic_scores[feature]:
            feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= 2]
    return majority_features


drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/LongParameterList.csv')


X = data.drop('is_long_parameters_list', axis=1)
y = data['is_long_parameters_list']


scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


X_normalized.fillna(X_normalized.mean(), inplace=True)


data_normalized = pd.concat([X_normalized, y], axis=1)


gain_ratio_scores = select_features_by_gain_ratio(data_normalized, target_name='is_long_parameters_list')

mutual_info_scores = select_features_by_mutual_info(data_normalized, target_name='is_long_parameters_list')


spearman_scores = select_features_by_spearman(data_normalized, target_name='is_long_parameters_list')
rfe_scores = select_features_by_rfe(data_normalized, target_name='is_long_parameters_list', n_features=25)
sfs_scores = select_features_by_sfs(data_normalized, target_name='is_long_parameters_list', n_features=25)
genetic_scores = select_features_by_custom_genetic(data_normalized, target_name='is_long_parameters_list')
intersection_features = set(rfe_scores.keys()).intersection(set(sfs_scores.keys()), set(genetic_scores.keys()))
majority_voting_features = majority_voting_features(rfe_scores, sfs_scores, genetic_scores)


common_features = set(gain_ratio_scores.keys()).union(set(mutual_info_scores.keys())).union(set(spearman_scores.keys())).union(set(rfe_scores.keys())).union(set(sfs_scores.keys())).union(set(genetic_scores.keys()))


base_classifiers = [
("Decision Tree", DecisionTreeClassifier()),
('Rg',RandomForestClassifier()),
("SVM", SVC(probability=True)),
("KNN", KNeighborsClassifier()),
('Logistic Regression', LogisticRegression()),
("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
("GPC",GaussianProcessClassifier()),

]
stacking_classifier = StackingClassifier(estimators=base_classifiers, final_estimator=MLPClassifier(max_iter=2000))


feature_sets = {
"All Features": X_normalized.columns.tolist(), # استفاده از تمامی ویژگی‌ها
"Gain Ratio Features": [feature for feature, score in gain_ratio_scores.items() if score > 0.04],
"Mutual Information Features": [feature for feature, score in mutual_info_scores.items() if score > 0.04],
"Spearman Correlation Features": [feature for feature, score in spearman_scores.items() if score > 0.04],
"RFE Features": [feature for feature, selected in rfe_scores.items() if selected],
"SFS Features": [feature for feature, selected in sfs_scores.items() if selected],
"genetic Features": [feature for feature, selected in genetic_scores.items() if selected],
"Union of All Features": list(common_features),
"Instertion Set":list(intersection_features),
"magority set": list(majority_voting_features)
}


n_splits = 7

# StratifiedKFold
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# تعریف تعداد تکرار (fold)
n_splits = 10

# تعریف StratifiedKFold
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# اعمال الگوریتم‌های طبقه‌بندی بر روی مجموعه‌های ویژگی
for feature_set_name, feature_set in feature_sets.items():
    X_selected = X_normalized[feature_set]
    metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
    metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        print(f"\n{feature_set_name}:")
        for base_classifier_name, base_classifier in base_classifiers:
            base_classifier.fit(X_train, y_train)
            y_pred = base_classifier.predict(X_test)
            y_pred_prob = base_classifier.predict_proba(X_test)[:, 1]
            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_prob)
            metrics[base_classifier_name]["accuracy"].append(accuracy)
            metrics[base_classifier_name]["f1"].append(f1)
            metrics[base_classifier_name]["auc"].append(auc)
            print(f"{base_classifier_name} - Accuracy: {accuracy}, F1: {f1}, AUC: {auc}")

        stacking_classifier.fit(X_train, y_train)
        y_pred = stacking_classifier.predict(X_test)
        y_pred_prob = stacking_classifier.predict_proba(X_test)[:, 1]
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred_prob)
        metrics["Stacking Classifier"]["accuracy"].append(accuracy)
        metrics["Stacking Classifier"]["f1"].append(f1)
        metrics["Stacking Classifier"]["auc"].append(auc)
        print(f"Stacking Classifier - Accuracy: {accuracy}, F1: {f1}, AUC: {auc}")

    for classifier_name, metric_values in metrics.items():
        avg_accuracy = np.mean(metric_values["accuracy"])
        avg_f1 = np.mean(metric_values["f1"])
        avg_auc = np.mean(metric_values["auc"])
        print(f"\n{classifier_name} - Average Accuracy: {avg_accuracy}, Average F1: {avg_f1}, Average AUC: {avg_auc}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

All Features:
Decision Tree - Accuracy: 0.8571428571428571, F1: 0.7692307692307693, AUC: 0.8328912466843501
Rg - Accuracy: 0.9047619047619048, F1: 0.8333333333333333, AUC: 0.9442970822281167
SVM - Accuracy: 0.8571428571428571, F1: 0.7000000000000001, AUC: 0.9496021220159151
KNN - Accuracy: 0.7380952380952381, F1: 0.4761904761904762, AUC: 0.6657824933687002
Logistic Regression - Accuracy: 0.9047619047619048, F1: 0.8181818181818181, AUC: 0.9549071618037135
Linear Discriminant Analysis - Accuracy: 0.9047619047619048, F1: 0.8333333333333333, AUC: 0.9018567639257294
GPC - Accuracy: 0.8333333333333334, F1: 0.631578947368421, AUC: 0.9310344827586208
Stacking Classifier - Accuracy: 0.8809523809523809, F1: 0.8, AUC: 0.9363395225464192

All Features:
Decision Tree - Accuracy: 0.8809523809523809, F1: 0.8275862068965517, AUC: 0.8925729442970822
Rg - Accuracy: 0.97619047

In [None]:
import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier

def entropy(target_col):
  elements, counts = np.unique(target_col, return_counts=True)
  entropy = -np.sum([(counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(elements))])
  return entropy

def InfoGain(data, split_attribute_name, target_name="class"):
  total_entropy = entropy(data[target_name])
  vals, counts = np.unique(data[split_attribute_name], return_counts=True)
  Weighted_Entropy = np.sum([(counts[i] / np.sum(counts)) * entropy(data.where(data[split_attribute_name] == vals[i]).dropna()[target_name]) for i in range(len(vals))])
  Information_Gain = total_entropy - Weighted_Entropy
  return Information_Gain

def split_info(data, split_attribute_name):
  vals, counts = np.unique(data[split_attribute_name], return_counts=True)
  split_info = -np.sum([(counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(vals))])
  return split_info

def gain_ratio(data, split_attribute_name, target_name="class"):
  gain = InfoGain(data, split_attribute_name, target_name)
  split = split_info(data, split_attribute_name)
  gain_ratio = gain / split
  return gain_ratio

def select_features_by_gain_ratio(data, target_name="class"):
  features = data.columns.drop(target_name)
  scores = {feature: gain_ratio(data, feature, target_name) for feature in features}
  return scores

def select_features_by_mutual_info(data, target_name="class"):
  features = data.columns.drop(target_name)
  scores = dict(zip(features, mutual_info_classif(data[features], data[target_name])))
  return scores

#عریف تابع برای محاسبه ضریب همبستگی Spearman
def spearman_corr(x, y):
  x_rank = x.rank()
  y_rank = y.rank()
  return ((x_rank - x_rank.mean()) * (y_rank - y_rank.mean())).mean() / (x_rank.std() * y_rank.std())

def select_features_by_spearman(data, target_name="class"):
  features = data.columns.drop(target_name)
  scores = {feature: spearman_corr(data[feature], data[target_name]) for feature in features}
  return scores
def select_features_by_rfe(data, target_name='is_long_parameters_list', n_features=20):
  X = data.drop(target_name, axis=1)
  y = data[target_name]
  estimator = LogisticRegression(solver='liblinear')
  selector = RFE(estimator, n_features_to_select=n_features, step=1)
  selector = selector.fit(X, y)
  return dict(zip(X.columns, selector.support_))

#SFS با Logistic Regression
def select_features_by_sfs(data, target_name='is_long_parameters_list', n_features=20):
  X = data.drop(target_name, axis=1)
  y = data[target_name]
  estimator = LogisticRegression(solver='liblinear')
  selector = SFS(estimator, n_features_to_select=n_features, direction='forward')
  selector = selector.fit(X, y)
  return dict(zip(X.columns, selector.support_))


def fitness(solution, X, y):
  if np.sum(solution) == 0:
    return 0


  X_selected = X[:, solution == 1]
  X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

  model = LogisticRegression(max_iter=1000)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  return accuracy_score(y_test, y_pred)
def initialize_population(pop_size, num_features):
  return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores):
    parents = []
    for _ in range(len(population)):
        i, j = np.random.choice(len(population), 2, replace=False)
        if fitness_scores[i] > fitness_scores[j]:
            parents.append(population[i])
        else:
            parents.append(population[j])
    return np.array(parents)

def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        if i + 1 < len(parents):
            crossover_point = random.randint(1, parents.shape[1] - 1)
            parent1, parent2 = parents[i], parents[i + 1]
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            offspring.extend([child1, child2])
    return np.array(offspring)

def mutate(offspring, mutation_rate=0.01):
    for individual in offspring:
        for gene in range(len(individual)):
            if random.random() < mutation_rate:
                individual[gene] = 1 - individual[gene]
    return offspring


def select_features_by_custom_genetic(data, target_name='is_long_parameters_list', pop_size=20, n_generations=50, mutation_rate=0.01):
    X = data.drop(target_name, axis=1).values
    y = data[target_name].values

    population = initialize_population(pop_size, X.shape[1])

    for generation in range(n_generations):
        fitness_scores = np.array([fitness(individual, X, y) for individual in population])
        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents)
        population = mutate(offspring, mutation_rate)

    best_solution = population[np.argmax(fitness_scores)]
    return dict(zip(data.drop(target_name, axis=1).columns, best_solution))

#magority feature selection
def majority_voting_features(rfe_scores, sfs_scores, genetic_scores):
    all_features = set(rfe_scores.keys()).union(sfs_scores.keys()).union(genetic_scores.keys())
    feature_count = {feature: 0 for feature in all_features}

    for feature in rfe_scores:
        if rfe_scores[feature]:
            feature_count[feature] += 1
    for feature in sfs_scores:
        if sfs_scores[feature]:
            feature_count[feature] += 1
    for feature in genetic_scores:
        if genetic_scores[feature]:
            feature_count[feature] += 1

    majority_features = [feature for feature, count in feature_count.items() if count >= 2]
    return majority_features

drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/LongParameterList.csv')

X = data.drop('is_long_parameters_list', axis=1)
y = data['is_long_parameters_list']

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.fillna(X_normalized.mean(), inplace=True)
data_normalized = pd.concat([X_normalized, y], axis=1)

gain_ratio_scores = select_features_by_gain_ratio(data_normalized, target_name='is_long_parameters_list')
mutual_info_scores = select_features_by_mutual_info(data_normalized, target_name='is_long_parameters_list')
spearman_scores = select_features_by_spearman(data_normalized, target_name='is_long_parameters_list')
rfe_scores = select_features_by_rfe(data_normalized, target_name='is_long_parameters_list', n_features=20)
sfs_scores = select_features_by_sfs(data_normalized, target_name='is_long_parameters_list', n_features=20)
genetic_scores = select_features_by_custom_genetic(data_normalized, target_name='is_long_parameters_list')
intersection_features = set(rfe_scores.keys()).intersection(set(sfs_scores.keys()), set(genetic_scores.keys()))
majority_voting_features = majority_voting_features(rfe_scores, sfs_scores, genetic_scores)

common_features = set(gain_ratio_scores.keys()).union(set(mutual_info_scores.keys())).union(set(spearman_scores.keys())).union(set(rfe_scores.keys())).union(set(sfs_scores.keys())).union(set(genetic_scores.keys()))

base_classifiers = [
    ("Decision Tree", DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ("SVM", SVC(C=0.1, gamma='scale', kernel='poly', degree=3,probability=True)),
    ("KNN", KNeighborsClassifier(n_neighbors=1)),
    ('Logistic Regression', LogisticRegression()),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Gaussian Process", GaussianProcessClassifier())
]

stacking_classifier = StackingClassifier(estimators=base_classifiers, final_estimator=MLPClassifier(max_iter=2000))

feature_sets = {
    "All Features": X_normalized.columns.tolist(),
    "Gain Ratio Features": [feature for feature, score in gain_ratio_scores.items() if score > 0.04],
    "Mutual Information Features": [feature for feature, score in mutual_info_scores.items() if score > 0.04],
    "Spearman Correlation Features": [feature for feature, score in spearman_scores.items() if score > 0.04],
    "RFE Features": [feature for feature, selected in rfe_scores.items() if selected],
    "SFS Features": [feature for feature, selected in sfs_scores.items() if selected],
    "Genetic Features": [feature for feature, selected in genetic_scores.items() if selected],
    "Union of All Features": list(common_features),
    "Intersection Set": list(intersection_features),
    "Majority Set": list(majority_voting_features)
}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for feature_set_name, feature_set in feature_sets.items():
    X_selected = X_normalized[feature_set]
    metrics = {name: {"accuracy": [], "f1": [], "auc": []} for name, _ in base_classifiers}
    metrics["Stacking Classifier"] = {"accuracy": [], "f1": [], "auc": []}

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for name, clf in base_classifiers:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            y_prob = clf.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

            metrics[name]["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics[name]["f1"].append(f1_score(y_test, y_pred))
            metrics[name]["auc"].append(roc_auc_score(y_test, y_prob))

        # Stacking Classifier
        stacking_classifier.fit(X_train, y_train)
        y_pred = stacking_classifier.predict(X_test)
        y_prob = stacking_classifier.predict_proba(X_test)[:, 1]

        metrics["Stacking Classifier"]["accuracy"].append(accuracy_score(y_test, y_pred))
        metrics["Stacking Classifier"]["f1"].append(f1_score(y_test, y_pred))
        metrics["Stacking Classifier"]["auc"].append(roc_auc_score(y_test, y_prob))

    print(f"Results for {feature_set_name}:")

    for name, scores in metrics.items():
        avg_accuracy = np.mean(scores["accuracy"])
        avg_f1 = np.mean(scores["f1"])
        avg_auc = np.mean(scores["auc"])
        print(f"{name}: Accuracy={avg_accuracy:.4f}, F1={avg_f1:.4f}, AUC={avg_auc:.4f}")

    print("-" * 40)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Results for All Features:
Decision Tree: Accuracy=0.9190, F1=0.8756, AUC=0.9083
Random Forest: Accuracy=0.9452, F1=0.9098, AUC=0.9728
SVM: Accuracy=0.7976, F1=0.5445, AUC=0.9067
KNN: Accuracy=0.7667, F1=0.6352, AUC=0.7284
Logistic Regression: Accuracy=0.8690, F1=0.7559, AUC=0.9602
Linear Discriminant Analysis: Accuracy=0.8786, F1=0.7871, AUC=0.9513
Gaussian Process: Accuracy=0.8190, F1=0.6312, AUC=0.9171
Stacking Classifier: Accuracy=0.9286, F1=0.8887, AUC=0.9717
----------------------------------------
Results for Gain Ratio Features:
Decision Tree: Accuracy=0.8976, F1=0.8440, AUC=0.8852
Random Forest: Accuracy=0.9381, F1=0.8976, AUC=0.9653
SVM: Accuracy=0.8286, F1=0.6491, AUC=0.9261
KNN: Accuracy=0.8214, F1=0.7114, AUC=0.7836
Logistic Regression: Accuracy=0.8810, F1=0.7779, AUC=0.9509
Linear Discriminant Analysis: Accuracy=0.8881, F1=0.7934, AUC=0.9524
Gaus

In [None]:
import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier

def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = -np.sum([(counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(elements))])
    return entropy

def InfoGain(data, split_attribute_name, target_name="class"):
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    Weighted_Entropy = np.sum([(counts[i] / np.sum(counts)) * entropy(data.where(data[split_attribute_name] == vals[i]).dropna()[target_name]) for i in range(len(vals))])
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

def split_info(data, split_attribute_name):
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    split_info = -np.sum([(counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(vals))])
    return split_info

def gain_ratio(data, split_attribute_name, target_name="class"):
    gain = InfoGain(data, split_attribute_name, target_name)
    split = split_info(data, split_attribute_name)
    gain_ratio = gain / split
    return gain_ratio

def select_features_by_gain_ratio(data, target_name="class"):
    features = data.columns.drop(target_name)
    scores = {feature: gain_ratio(data, feature, target_name) for feature in features}
    return scores

def select_features_by_mutual_info(data, target_name="class"):
    features = data.columns.drop(target_name)
    scores = dict(zip(features, mutual_info_classif(data[features], data[target_name])))
    return scores

def spearman_corr(x, y):
    x_rank = x.rank()
    y_rank = y.rank()
    return ((x_rank - x_rank.mean()) * (y_rank - y_rank.mean())).mean() / (x_rank.std() * y_rank.std())

def select_features_by_spearman(data, target_name="class"):
    features = data.columns.drop(target_name)
    scores = {feature: spearman_corr(data[feature], data[target_name]) for feature in features}
    return scores

def select_features_by_rfe(data, target_name='is_long_parameters_list', n_features=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    estimator = LogisticRegression(solver='liblinear')
    selector = RFE(estimator, n_features_to_select=n_features, step=1)
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def select_features_by_sfs(data, target_name='is_long_parameters_list', n_features=10):
    X = data.drop(target_name, axis=1)
    y = data[target_name]
    estimator = LogisticRegression(solver='liblinear')
    selector = SFS(estimator, n_features_to_select=n_features, direction='forward')
    selector = selector.fit(X, y)
    return dict(zip(X.columns, selector.support_))

def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size,num_features))

def select_parents(population, fitness_scores, num_parents):
    parents = np.zeros((num_parents, population.shape[1]))
    for parent_num in range(num_parents):
        max_fitness_idx = np.argmax(fitness_scores)
        parents[parent_num, :] = population[max_fitness_idx, :]
        fitness_scores[max_fitness_idx] = -np.inf
    return parents

def crossover(parents, offspring_size):
    offspring = np.zeros(offspring_size)
    crossover_point = np.uint8(offspring_size[1] / 2)
    for k in range(offspring_size[0]):
        parent1_idx = k % parents.shape[0]
        parent2_idx = (k + 1) % parents.shape[0]
        offspring[k, 0:crossover_point] = parents[parent1_idx, 0:crossover_point]
        offspring[k, crossover_point:] = parents[parent2_idx, crossover_point:]
    return offspring

def mutation(offspring_crossover, mutation_rate):
    for idx in range(offspring_crossover.shape[0]):
        for gene_idx in range(offspring_crossover.shape[1]):
            if random.random() < mutation_rate:
                offspring_crossover[idx, gene_idx] = 1 - offspring_crossover[idx, gene_idx]
    return offspring_crossover

def genetic_algorithm(X, y, pop_size, num_generations, num_parents_mating, mutation_rate):
    num_features = X.shape[1]
    population = initialize_population(pop_size, num_features)
    for generation in range(num_generations):
        fitness_scores = np.array([fitness(ind, X, y) for ind in population])
        parents = select_parents(population, fitness_scores, num_parents_mating)
        offspring_crossover = crossover(parents, (pop_size - parents.shape[0], num_features))
        offspring_mutation = mutation(offspring_crossover, mutation_rate)
        population[0:parents.shape[0], :] = parents
        population[parents.shape[0]:, :] = offspring_mutation
    fitness_scores = np.array([fitness(ind, X, y) for ind in population])
    best_solution_idx = np.argmax(fitness_scores)
    best_solution = population[best_solution_idx]
    return best_solution

# Load data
drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/LongParameterList.csv')

# Preprocess data
scaler = MinMaxScaler()
data[data.columns[:-1]] = scaler.fit_transform(data[data.columns[:-1]])

# Split data into features and target
X = data.drop('is_long_parameters_list', axis=1).values
y = data['is_long_parameters_list'].values

# Select features using different algorithms
gain_ratio_scores = select_features_by_gain_ratio(data, target_name='is_long_parameters_list')
mutual_info_scores = select_features_by_mutual_info(data, target_name='is_long_parameters_list')
spearman_scores = select_features_by_spearman(data, target_name='is_long_parameters_list')
rfe_scores = select_features_by_rfe(data, target_name='is_long_parameters_list')
sfs_scores = select_features_by_sfs(data, target_name='is_long_parameters_list')

# Display selected features
print("Gain Ratio Selected Features:")
print([feature for feature, selected in gain_ratio_scores.items() if selected])

print("Mutual Information Selected Features:")
print([feature for feature, selected in mutual_info_scores.items() if selected])

print("Spearman Correlation Selected Features:")
print([feature for feature, selected in spearman_scores.items() if selected])

print("RFE Selected Features:")
print([feature for feature, selected in rfe_scores.items() if selected])

print("SFS Selected Features:")
print([feature for feature, selected in sfs_scores.items() if selected])

# Genetic Algorithm for feature selection
best_solution = genetic_algorithm(X, y, pop_size=20, num_generations=50, num_parents_mating=10, mutation_rate=0.1)
selected_features = [feature for feature, selected in zip(data.columns[:-1], best_solution) if selected]

print("Genetic Algorithm Selected Features:")
print(selected_features)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Gain Ratio Selected Features:
['NOP_method', 'CC_method', 'ATFD_method', 'FDP_method', 'CM_method', 'MAXNESTING_method', 'LOC_method', 'CYCLO_method', 'NMCS_method', 'NOLV_method', 'MaMCL_method', 'NOAV_method', 'LAA_method', 'FANOUT_method', 'CFNAMM_method', 'ATLD_method', 'CINT_method', 'MeMCL_method', 'CDISP_method', 'NOII_type', 'NOAM_type', 'NOCS_type', 'NOM_type', 'NMO_type', 'ATFD_type', 'FANOUT_type', 'NOMNAMM_type', 'NOA_type', 'NIM_type', 'DIT_type', 'LOC_type', 'LOCNAMM_type', 'CFNAMM_type', 'TCC_type', 'NOPA_type', 'CBO_type', 'RFC_type', 'NOC_type', 'WMC_type', 'LCOM5_type', 'WOC_type', 'WMCNAMM_type', 'AMW_type', 'AMWNAMM_type', 'NOCS_package', 'NOMNAMM_package', 'NOI_package', 'LOC_package', 'NOM_package', 'NOPK_project', 'NOCS_project', 'NOI_project', 'NOM_project', 'NOMNAMM_project', 'LOC_project']
Mutual Information Selected Features:
['NOP_

In [None]:
import pandas as pd
import numpy as np
import random
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.gaussian_process import GaussianProcessClassifier

def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = -np.sum([(counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(elements))])
    return entropy

def InfoGain(data, split_attribute_name, target_name="class"):
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    Weighted_Entropy = np.sum([(counts[i] / np.sum(counts)) * entropy(data.where(data[split_attribute_name] == vals[i]).dropna()[target_name]) for i in range(len(vals))])
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

def split_info(data, split_attribute_name):
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    split_info = -np.sum([(counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(vals))])
    return split_info

def gain_ratio(data, split_attribute_name, target_name="class"):
    gain = InfoGain(data, split_attribute_name, target_name)
    split = split_info(data, split_attribute_name)
    gain_ratio = gain / split
    return gain_ratio

def select_features_by_gain_ratio(data, target_name="class"):
    features = data.columns.drop(target_name)
    scores = {feature: gain_ratio(data, feature, target_name) for feature in features}
    return scores

def select_features_by_mutual_info(data, target_name="class"):
    features = data.columns.drop(target_name)
    scores = dict(zip(features, mutual_info_classif(data[features], data[target_name])))
    return scores

def spearman_corr(x, y):
    x_rank = x.rank()
    y_rank = y.rank()
    return ((x_rank - x_rank.mean()) * (y_rank - y_rank.mean())).mean() / (x_rank.std() * y_rank.std())

def select_features_by_spearman(data, target_name="class"):
    features = data.columns.drop(target_name)
    scores = {feature: spearman_corr(data[feature], data[target_name]) for feature in features}
    return scores



def fitness(solution, X, y):
    if np.sum(solution) == 0:
        return 0
    X_selected = X[:, solution == 1]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def initialize_population(pop_size, num_features):
    return np.random.randint(2, size=(pop_size, num_features))

def select_parents(population, fitness_scores, num_parents):
    parents = np.zeros((num_parents, population.shape[1]))
    for parent_num in range(num_parents):
        max_fitness_idx = np.argmax(fitness_scores)
        parents[parent_num, :] = population[max_fitness_idx, :]
        fitness_scores[max_fitness_idx] = -np.inf
    return parents


def crossover(parents, offspring_size):
    offspring = np.zeros(offspring_size)
    crossover_point = np.uint8(offspring_size[1] / 2)
    for k in range(offspring_size[0]):
        parent1_idx = k % parents.shape[0]
        parent2_idx = (k + 1) % parents.shape[0]
        offspring[k, 0:crossover_point] = parents[parent1_idx, 0:crossover_point]
        offspring[k, crossover_point:] = parents[parent2_idx, crossover_point:]
    return offspring

def mutation(offspring_crossover, mutation_rate):
    for idx in range(offspring_crossover.shape[0]):
        for gene_idx in range(offspring_crossover.shape[1]):
            if random.random() < mutation_rate:
                offspring_crossover[idx, gene_idx] = 1 - offspring_crossover[idx, gene_idx]
    return offspring_crossover

def genetic_algorithm(X, y, pop_size, num_generations, num_parents_mating, mutation_rate):
    num_features = X.shape[1]
    population = initialize_population(pop_size, num_features)
    for generation in range(num_generations):
        fitness_scores = np.array([fitness(ind, X, y) for ind in population])
        parents = select_parents(population, fitness_scores, num_parents_mating)
        offspring_crossover = crossover(parents, (pop_size - parents.shape[0], num_features))
        offspring_mutation = mutation(offspring_crossover, mutation_rate)
        population[0:parents.shape[0], :] = parents
        population[parents.shape[0]:, :] = offspring_mutation
    fitness_scores = np.array([fitness(ind, X, y) for ind in population])
    best_solution_idx = np.argmax(fitness_scores)
    best_solution = population[best_solution_idx]
    return best_solution

def select_features_by_sfs(X, y):
    best_score = 0
    best_features = None
    for n_features in range(1, X.shape[1] + 1):
        estimator = LogisticRegression(solver='liblinear')
        sfs = SFS(estimator, n_features_to_select=n_features, direction='forward')
        sfs = sfs.fit(X, y)
        selected_features = X[:, sfs.get_support()]
        X_train, X_test, y_train, y_test = train_test_split(selected_features, y, test_size=0.3, random_state=42)
        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        if score > best_score:
            best_score = score
            best_features = sfs.get_support()
    return best_features

def select_features_by_rfe(X, y):
    best_score = 0
    best_features = None
    for n_features in range(1, X.shape[1] + 1):
        estimator = LogisticRegression(solver='liblinear')
        rfe = RFE(estimator, n_features_to_select=n_features)
        rfe = rfe.fit(X, y)
        selected_features = X[:, rfe.get_support()]
        X_train, X_test, y_train, y_test = train_test_split(selected_features, y, test_size=0.3, random_state=42)
        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        if score > best_score:
            best_score = score
            best_features = rfe.get_support()
    return best_features





# Load data
drive.mount('/content/drive')
Path = '/content/drive/My Drive/colab'
data = pd.read_csv(Path + '/LongParameterList.csv')

# Preprocess data
scaler = MinMaxScaler()
data[data.columns[:-1]] = scaler.fit_transform(data[data.columns[:-1]])

# Split data into features and target
X = data.drop('is_long_parameters_list', axis=1).values
y = data['is_long_parameters_list'].values

# Select features using different algorithms
gain_ratio_scores = select_features_by_gain_ratio(data, target_name='is_long_parameters_list')
mutual_info_scores = select_features_by_mutual_info(data, target_name='is_long_parameters_list')
spearman_scores = select_features_by_spearman(data, target_name='is_long_parameters_list')
# Select features using SFS
sfs_selected_features = select_features_by_sfs(X,y)
print("SFS Selected Features:")
print([feature for feature, selected in zip(data.columns[:-1], sfs_selected_features) if selected])

# Select features using RFE
rfe_selected_features = select_features_by_rfe(X,y)
print("RFE Selected Features:")
print([feature for feature, selected in zip(data.columns[:-1], rfe_selected_features) if selected])

# Display selected features
print("Gain Ratio Selected Features:")
print([feature for feature, selected in gain_ratio_scores.items() if selected])

print("Mutual Information Selected Features:")
print([feature for feature, selected in mutual_info_scores.items() if selected])

print("Spearman Correlation Selected Features:")
print([feature for feature, selected in spearman_scores.items() if selected])



# Genetic Algorithm for feature selection
#best_solution = genetic_algorithm(X, y, pop_size=20, num_generations=50, num_parents_mating=10, mutation_rate=0.1)
#selected_features = [feature for feature, selected in zip(data.columns[:-1], best_solution) if selected]

#print("Genetic Algorithm Selected Features:")
#print(selected_features)

# Genetic Algorithm for feature selection
best_solution = genetic_algorithm(X, y, pop_size=20, num_generations=50, num_parents_mating=10, mutation_rate=0.1)
selected_features = [feature for feature, selected in zip(data.columns[:-1], best_solution) if selected]

print("Genetic Algorithm Selected Features:")
print(selected_features)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


ValueError: n_features_to_select must be either 'auto', 'warn', None, an integer in [1, n_features - 1] representing the absolute number of features, or a float in (0, 1] representing a percentage of features to select. Got 55