Import all libraries that we want to use

In [1]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import accuracy_score
import warnings

Set our optimal C parameter (the inverse regularization strength passed to LogisticRegression) to 1.5

In [2]:
# Value passed as `C` to the LogisticRegression models built in
# greedy_feature_selection and evaluate_model.
# NOTE(review): how 1.5 was determined to be "optimal" is not shown in this notebook.
optimal_C = 1.5

Ignore the library warnings

In [3]:
# Install "ignore" filters for the FutureWarning and UserWarning categories.
# The filters are process-wide, so they stay in effect for every later cell.
# NOTE(review): this also hides any deprecation notices the libraries emit as
# FutureWarning -- consider narrowing the filters once the notebook is stable.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

Load the digits dataset from sklearn

In [4]:
# Fetch the bundled digits dataset and expose the feature matrix and the
# label vector under the names used by the rest of the notebook.
digits = load_digits()
X = digits.data
y = digits.target

Discretize each feature into 3 equal-width bins using KBinsDiscretizer

In [5]:
# Build a discretizer that maps every feature onto 3 equal-width bins encoded
# as ordinal codes, learn the bin edges from X, then apply them to X.
splitTo3 = KBinsDiscretizer(
    n_bins=3,
    encode='ordinal',
    strategy='uniform',
)
splitTo3.fit(X)
x_split = splitTo3.transform(X)

The Greedy Feature Selection Function

In [6]:

def greedy_feature_selection(X, y, num_features):
    """Greedy forward feature selection scored by hold-out accuracy.

    Starting from an empty set, each round tries every remaining column
    together with the columns already chosen, fits a logistic regression
    (L2 penalty, ``C=optimal_C``, liblinear, one-vs-rest) on the training part
    of a fixed 70/30 split and keeps the column that gives the highest
    accuracy on the held-out part.

    Parameters
    ----------
    X : 2-D array indexable as ``X[:, list_of_columns]``
        Feature matrix, one column per candidate feature.
    y : array-like
        Class labels, one per row of ``X``.
    num_features : int
        Number of features to select. Requests larger than the number of
        columns are capped at the number of columns; non-positive requests
        yield an empty list.

    Returns
    -------
    list of int
        Column indices in the order in which they were selected.

    Notes
    -----
    The hold-out split (``test_size=0.3, random_state=42``) is the same one
    ``evaluate_model`` uses, so an accuracy computed there for the returned
    features is measured on data that already guided the selection.
    """
    candidates = list(range(X.shape[1]))
    important_features = []

    # Cap the request at the number of available columns. Without this the
    # search would run out of candidates and fail on `remove(None)`.
    target_count = min(num_features, len(candidates))
    if target_count <= 0:
        return important_features

    # The split depends only on the number of rows and the fixed seed, so it is
    # identical for every candidate subset: compute it once and slice columns.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

    while len(important_features) < target_count:
        # Start below any reachable accuracy so that a candidate is always
        # chosen, even if every candidate scores 0.0.
        top_score = -1.0
        top_feature = None

        for feature in candidates:
            current_features = important_features + [feature]

            model = LogisticRegression(penalty='l2', C=optimal_C, solver='liblinear', multi_class='ovr')
            model.fit(X_train[:, current_features], y_train)
            score = accuracy_score(y_val, model.predict(X_val[:, current_features]))

            # Strict comparison: on ties the lowest-indexed remaining feature wins.
            if score > top_score:
                top_score = score
                top_feature = feature

        important_features.append(top_feature)
        candidates.remove(top_feature)

    return important_features

The Mutual Information Feature Selection Function

In [7]:
def mutual_Info_feature_selection(X, y, num_features):
    """Select the features with the highest mutual information with the labels.

    Parameters
    ----------
    X : 2-D array
        Feature matrix; every column is treated as discrete
        (``discrete_features=True``).
    y : array-like
        Class labels, one per row of ``X``.
    num_features : int
        Number of features to keep. Non-positive values yield an empty
        result; values larger than the number of columns return all columns.

    Returns
    -------
    numpy.ndarray of int
        Column indices of the selected features, ordered from the lowest to
        the highest mutual-information score among those selected.
    """
    # Guard the slice below: `[-0:]` is the same as `[0:]` and would return
    # every feature when zero features are requested.
    if num_features <= 0:
        return np.empty(0, dtype=np.intp)

    mutual_Info_scores = mutual_info_classif(X, y, discrete_features=True)
    # argsort is ascending, so the last `num_features` entries are the top scorers.
    top_features = np.argsort(mutual_Info_scores)[-num_features:]
    return top_features

Select Important Features using Greedy Algorithm

In [8]:
# Pick 5 features by greedy forward selection on the raw (non-discretized)
# feature matrix X.
important_features_greedy = greedy_feature_selection(X, y, 5)

Select Important Features using Mutual Information Method

In [9]:
# Rank the features of the 3-bin discretized matrix x_split by mutual
# information with y and keep the 5 highest-scoring column indices.
important_features_mi = mutual_Info_feature_selection(x_split, y, 5)

Define function to evaluate model accuracy

In [10]:
def evaluate_model(X, y, important_features):
    """Return the hold-out accuracy of a logistic regression on selected columns.

    The columns listed in ``important_features`` are taken from ``X``, the rows
    are split 70/30 with a fixed seed (``random_state=42``), an L2-penalised
    liblinear one-vs-rest logistic regression with ``C=optimal_C`` (module-level
    setting) is fitted on the training part, and the accuracy of its
    predictions on the held-out part is returned.
    """
    classifier = LogisticRegression(penalty='l2', C=optimal_C, solver='liblinear', multi_class='ovr')
    train_X, holdout_X, train_y, holdout_y = train_test_split(
        X[:, important_features], y, test_size=0.3, random_state=42
    )
    predictions = classifier.fit(train_X, train_y).predict(holdout_X)
    return accuracy_score(holdout_y, predictions)

Evaluate the greedy algorithm with selected features

In [11]:
# Hold-out accuracy of a logistic regression restricted to the greedy-selected columns.
# NOTE(review): evaluate_model uses the same split (test_size=0.3, random_state=42)
# that greedy_feature_selection used to choose these features, so this score is
# likely optimistic compared with a truly unseen test set.
greedy_score = evaluate_model(X, y, important_features_greedy)

Evaluate the mutual information method with selected features

In [12]:
# Hold-out accuracy using the MI-selected columns. The model is fitted on the
# raw X, not on the discretized x_split that was used to rank the features.
mutual_Info_score = evaluate_model(X, y, important_features_mi)

Print important features and accuracy results

In [13]:
# Report the selected feature indices and the hold-out accuracy of each method.
print("The important features by using Greedy Algorithm:", important_features_greedy)
print("Accuracy with Greedy Algorithm important features:", greedy_score)

print("The important features by using MI method:", important_features_mi)
print("Accuracy with MI method important features:",  mutual_Info_score)


# Declare a winner only when the scores actually differ; equal scores are
# reported as a tie instead of being credited to the MI method.
if greedy_score > mutual_Info_score:
    print("Greedy algorithm performed better.")
elif greedy_score < mutual_Info_score:
    print("MI method performed better.")
else:
    print("Both methods achieved the same accuracy.")

The important features by using Greedy Algorithm: [21, 42, 36, 26, 61]
Accuracy with Greedy Algorithm important features: 0.762962962962963
The important features by using MI method: [43 21 34 42 26]
Accuracy with MI method important features: 0.7129629629629629
Greedy algorithm performed better.
