In [7]:
import pandas as pd
import joblib
import tensorflow as tf
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression, SGDClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import os
import ast
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Feature columns fed to the models, written out grouped by name prefix.
# The concatenation order below is the column order of the model input.
_OF_COLUMNS = (
    'OF2', 'OF3', 'OF4', 'OF5', 'OF6', 'OF7', 'OF8', 'OF9', 'OF10', 'OF11',
    'OF13', 'OF14', 'OF15', 'OF16', 'OF17', 'OF18', 'OF19', 'OF20', 'OF21', 'OF22',
    'OF23', 'OF24', 'OF25', 'OF26', 'OF27', 'OF28', 'OF31', 'OF33', 'OF34', 'OF37',
    'OF38',
)
_F_COLUMNS = (
    'F1', 'F2',
    'F3_a', 'F3_b', 'F3_c', 'F3_d', 'F3_e', 'F3_f', 'F3_g',
    'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10',
    'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 'F20', 'F21', 'F22', 'F23', 'F24', 'F25',
    'F28', 'F29', 'F30', 'F31', 'F32', 'F33', 'F34', 'F35', 'F36', 'F37', 'F38', 'F39', 'F40', 'F41',
    'F43', 'F44', 'F45', 'F46', 'F47', 'F48', 'F49', 'F50', 'F51', 'F52', 'F53', 'F54', 'F55', 'F56',
    'F57', 'F58', 'F59',
    'F62', 'F63', 'F64', 'F65',
    'F67', 'F68',
)
_S_COLUMNS = ('S1', 'S2', 'S4', 'S5')

data_columns = [*_OF_COLUMNS, *_F_COLUMNS, *_S_COLUMNS]

# Target column name(s); the code below only ever reads the first entry (results_columns[0]).
results_columns = ['WS']
# Table of candidate models (presumably one row per trained model — confirm against the
# training notebook). The columns read from it below are 'accuracy', 'csv_file',
# 'model_name' and 'hyperparameters'.
best_models_df = pd.read_csv("../Training/Results/best_models_infoWS.csv")
# Directory that the saved model files (.pkl / .h5) are loaded from.
model_directory = "../Training/Results/WS"

from sklearn.feature_selection import SelectPercentile, chi2

# Feature selectors to compare; each key is the label used in printed output,
# result rows and the plot legend.
feature_selection_techniques = dict([
    ("SelectKBest_f_classif", SelectKBest(score_func=f_classif)),
    ("SelectKBest_mutual_info_classif", SelectKBest(score_func=mutual_info_classif)),
    # Example threshold, adjust as needed
    ("VarianceThreshold", VarianceThreshold(threshold=0.1)),
])

# Estimator classes that can be re-instantiated from a model name; the lookup key
# is the class's own name.
_SUPPORTED_MODEL_CLASSES = (
    RidgeClassifier,
    DecisionTreeClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    KNeighborsClassifier,
    MLPClassifier,
    LogisticRegression,
    SGDClassifier,
    SVC,
    Perceptron,
    PassiveAggressiveClassifier,
)
model_mapping = {model_class.__name__: model_class for model_class in _SUPPORTED_MODEL_CLASSES}

def _select_feature_indices(selector, X_train, y_train, k):
    """Fit ``selector`` on the training data and return the chosen column indices for budget ``k``.

    SelectKBest is asked for exactly ``k`` features. VarianceThreshold has no ``k``
    parameter, so the first ``k`` of the columns it retains are used; fewer than ``k``
    indices are returned when it retains fewer than ``k`` columns.
    """
    if isinstance(selector, SelectKBest):
        selector.set_params(k=k)
        return selector.fit(X_train, y_train).get_support(indices=True)
    return selector.fit(X_train, y_train).get_support(indices=True)[:k]


def _retrain_and_predict(model_name, model_class, model_hyperparameters, hyperparameters,
                         X_train_selected, y_train, X_test_selected):
    """Train a fresh model on the selected columns and return its test-set class predictions."""
    if model_name == 'TensorFlow':
        model = tf.keras.Sequential([
            tf.keras.layers.InputLayer(input_shape=(X_train_selected.shape[1],)),
            tf.keras.layers.Dense(hyperparameters['units'], activation=hyperparameters['activation']),
            # NOTE(review): the output width is fixed at 3 classes — confirm it matches the label set.
            tf.keras.layers.Dense(3, activation='softmax')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hyperparameters['learning_rate']),
                      loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_selected, y_train, epochs=hyperparameters['epochs'], verbose=0)
        return np.argmax(model.predict(X_test_selected), axis=1)

    model = model_class(**model_hyperparameters)
    model.fit(X_train_selected, y_train)
    return model.predict(X_test_selected)


# Function to load, evaluate, and retrain the best model with feature selection
def load_evaluate_and_retrain_best_model_with_feature_selection(csv_file, model_name, model_path, hyperparameters):
    """Score a saved model, then retrain it on progressively larger selected feature subsets.

    The CSV is split 90/10 (``random_state=42``) and standardized with a scaler fitted on
    the training part. The saved model is scored on the standardized test set. Then, for
    every entry of ``feature_selection_techniques`` and every feature budget ``k`` from 2 up
    to ``len(data_columns)``, a fresh model is trained on the selected columns and scored.

    Selectors are fitted on the raw (unscaled) training values, because a variance
    threshold is meaningless on unit-variance data. The retrained models are fitted on the
    standardized columns, so they see the same kind of input the saved model is scored on.

    Args:
        csv_file: Path of the CSV holding ``data_columns`` and the target column.
        model_name: ``'TensorFlow'`` or a key of ``model_mapping``.
        model_path: Path of the saved model (Keras file for TensorFlow, joblib pickle otherwise).
        hyperparameters: Dict of hyperparameters. For TensorFlow the keys ``units``,
            ``activation``, ``learning_rate`` and ``epochs`` are read; for other models only
            keys of the form ``"<model_name.lower()>__<param>"`` are used.

    Returns:
        A tuple ``(selection_results, feature_selection_data, test_results)``:
        ``selection_results`` maps ``'Original'`` to the saved model's accuracy and each
        technique name to a list of accuracies (first entry is ``k=2``);
        ``feature_selection_data`` and ``test_results`` are lists of per-run dicts.
    """
    # Local import keeps this block self-contained; clone() gives an unfitted copy.
    from sklearn.base import clone

    # Load data from CSV
    data = pd.read_csv(csv_file)
    X = data[data_columns]
    y = data[results_columns[0]]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # Standardize the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Load the original model and calculate accuracy
    if model_name == 'TensorFlow':
        model = tf.keras.models.load_model(model_path)
        y_pred_classes = np.argmax(model.predict(X_test_scaled), axis=1)
    else:
        # NOTE: joblib.load unpickles arbitrary objects — only load model files you produced.
        model = joblib.load(model_path)
        y_pred_classes = model.predict(X_test_scaled)

    selection_results = {'Original': accuracy_score(y_test, y_pred_classes)}
    feature_selection_data = []  # To store feature selection results
    test_results = []  # To store test results (actual and predicted)

    # Resolve the estimator class and its hyperparameters once; neither depends on k.
    model_class = None
    model_hyperparameters = {}
    if model_name != 'TensorFlow':
        model_class = model_mapping.get(model_name)
        if model_class is None:
            print(f"Unknown model name: {model_name}")
            selection_results.update({name: [] for name in feature_selection_techniques})
            return selection_results, feature_selection_data, test_results
        prefix = model_name.lower() + '__'
        model_hyperparameters = {
            key[len(prefix):]: value
            for key, value in hyperparameters.items()
            if key.startswith(prefix)
        }

    actual_labels = ','.join(map(str, y_test.values))

    # Iterate over each feature selection technique
    for name, configured_selector in feature_selection_techniques.items():
        print(f"Applying {name} feature selection")
        accuracy_values = []
        selection_results[name] = accuracy_values

        if not isinstance(configured_selector, (SelectKBest, VarianceThreshold)):
            continue  # unsupported selector type: report an empty accuracy list

        # Work on a copy so the shared module-level selector is never mutated.
        selector = clone(configured_selector)

        for k in range(2, len(data_columns) + 1):  # Iterate over all possible numbers of features
            selected_features = _select_feature_indices(selector, X_train, y_train, k)
            if len(selected_features) < k:
                # The selector cannot supply k features; larger budgets would only
                # repeat the same subset under a misleading feature count.
                break

            # Retrain the model with the selected (standardized) features
            X_train_selected = X_train_scaled[:, selected_features]
            X_test_selected = X_test_scaled[:, selected_features]
            y_pred_classes = _retrain_and_predict(
                model_name, model_class, model_hyperparameters, hyperparameters,
                X_train_selected, y_train, X_test_selected,
            )

            # Calculate accuracy and store the results
            accuracy = accuracy_score(y_test, y_pred_classes)
            accuracy_values.append(accuracy)
            selected_names = [data_columns[i] for i in selected_features]

            # Store feature selection results
            feature_selection_data.append({
                'CSV File': csv_file,
                'Model': model_name,
                'Selection Method': name,
                'Number of Features': k,
                'Accuracy': accuracy,
                'Selected Features': list(selected_names)
            })

            # Store test results
            test_results.append({
                'CSV File': csv_file,
                'Model': model_name,
                'Selection Method': name,
                'Number of Features': k,
                'Actual': actual_labels,
                'Predicted': ','.join(map(str, y_pred_classes)),
                'Accuracy': accuracy,
                'Selected Features': list(selected_names)
            })

    return selection_results, feature_selection_data, test_results

# Pick the row of best_models_df with the highest recorded accuracy
best_row_label = best_models_df['accuracy'].idxmax()
best_model = best_models_df.loc[best_row_label]

# Resolve the data file, the saved-model file and the hyperparameters for that row
model_name = best_model['model_name']
csv_file = os.path.join('../Training/All_Data', best_model['csv_file'])
if model_name != 'TensorFlow':
    model_filename = f"{best_model['csv_file']}_{model_name}_model.pkl"
else:
    model_filename = f"{best_model['csv_file']}_TensorFlow_model.h5"
model_path = os.path.join(model_directory, model_filename)
# The hyperparameters cell holds a Python literal stored as text
hyperparameters = ast.literal_eval(best_model['hyperparameters'])

# Run feature selection and retraining for the chosen model
reduction_results, feature_selection_data, test_results = (
    load_evaluate_and_retrain_best_model_with_feature_selection(
        csv_file=csv_file,
        model_name=model_name,
        model_path=model_path,
        hyperparameters=hyperparameters,
    )
)

# Reload the same CSV at notebook scope and expose its features / target as X and y
data = pd.read_csv(csv_file)
X = data[data_columns]
y = data[results_columns[0]]

# One accuracy curve per selection technique; 'Original' is a single number, so it is
# drawn as a horizontal reference line instead of a curve.
fig, ax = plt.subplots(figsize=(12, 8))
accuracy_curves = {
    name: accuracy_values
    for name, accuracy_values in reduction_results.items()
    if name != 'Original'
}
for name, accuracy_values in accuracy_curves.items():
    # The first accuracy corresponds to 2 selected features
    feature_counts = range(2, 2 + len(accuracy_values))
    ax.plot(feature_counts, accuracy_values, label=name, marker='o')

# Baseline accuracy of the saved model
ax.axhline(y=reduction_results['Original'], color='gray', linestyle='--', label='Original')

# Labels, title and legend
ax.set_xlabel('Number of Selected Features')
ax.set_ylabel('Accuracy')
ax.set_title(f'Accuracy with Feature Selection for Model: {model_name} {results_columns[0]} ({best_model["csv_file"]})')
ax.legend()
ax.grid(True)
fig.tight_layout()

# Write the figure to disk, then display it
fig.savefig(f"feature_selection_accuracy_plot_{best_model['csv_file']}_{model_name}_{results_columns[0]}.png")
plt.show()

# Save feature selection results to a CSV file
feature_selection_df = pd.DataFrame(feature_selection_data)
feature_selection_df.to_csv(f"feature_selection_results_{results_columns[0]}.csv", index=False)

# Save test results to a CSV file
test_results_df = pd.DataFrame(test_results)
test_results_df.to_csv(f"test_results_{results_columns[0]}.csv", index=False)

# min()/max() raise ValueError on an empty list, which happens when no retraining run
# completed (e.g. the model name is not recognised), so only rank when there are results.
if feature_selection_data:
    # Find the method with the highest accuracy and lowest number of features
    # (ties on accuracy are broken in favour of fewer features)
    best_method = min(feature_selection_data, key=lambda x: (-x['Accuracy'], x['Number of Features']))
    print("Method with the highest accuracy and lowest number of features:")
    print(best_method)

    # Find the method with the highest accuracy proportionate to the number of features
    best_method_proportionate = max(feature_selection_data, key=lambda x: (x['Accuracy'] / x['Number of Features']))
    print("Method with the highest accuracy proportionate to the number of features:")
    print(best_method_proportionate)
else:
    print("No feature selection results were produced; nothing to rank.")


AttributeError: 'HalfMultinomialLoss' object has no attribute 'get_init_raw_predictions'

In [6]:
pip install --upgrade scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp311-cp311-win_amd64.whl.metadata (12 kB)
Downloading scikit_learn-1.5.1-cp311-cp311-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB 682.7 kB/s eta 0:00:17
   ---------------------------------------- 0.1/11.0 MB 825.8 kB/s eta 0:00:14
   - -------------------------------------- 0.3/11.0 MB 2.7 MB/s eta 0:00:04
   -- ------------------------------------- 0.6/11.0 MB 3.7 MB/s eta 0:00:03
   --- ------------------------------------ 1.0/11.0 MB 4.7 MB/s eta 0:00:03
   ---- ----------------------------------- 1.4/11.0 MB 5.4 MB/s eta 0:00:02
   ------ --------------------------------- 1.8/11.0 MB 5.7 MB/s eta 0:00:02
   ------- -------------------------------- 2.1/11.0 MB 6.1 MB/s eta 0:00:02
   --------- ------------------------------ 2.5/11.0 MB 6.4 MB/s eta 0:00:02
   ---------- ----------------------------- 3.0/11.0 MB 7.1 MB/s e

  You can safely remove it manually.
