# In [28]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mutual_info_score
from sklearn.neighbors import KNeighborsClassifier

def mutual_information(x, y, bins=10):
    """Estimate the mutual information between two 1-D numeric samples.

    The two samples are discretised jointly with ``np.histogram2d`` and the
    resulting table of counts is handed to ``mutual_info_score`` as a
    pre-computed contingency matrix.

    Args:
        x: 1-D array-like of numeric values.
        y: 1-D array-like of numeric values, same length as ``x``.
        bins: Bin specification forwarded unchanged to ``np.histogram2d``.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        The mutual information estimate produced by ``mutual_info_score``
        (in nats, per the scikit-learn documentation).

    Note:
        ``y`` is binned exactly like ``x``. When ``y`` holds integer class
        codes and there are more distinct codes than bins along that axis,
        several classes end up in the same bin.
    """
    # Only the count table is needed; the bin edges are discarded.
    contingency, _, _ = np.histogram2d(x, y, bins=bins)
    # With an explicit contingency table the two label arguments are unused.
    return mutual_info_score(None, None, contingency=contingency)

def CLMax(X, y, k):
    """Return the positions of the ``k`` features sharing the most information with ``y``.

    Every column of ``X`` is scored with ``mutual_information(column, y)`` and
    the columns are ranked from the highest score to the lowest.

    Args:
        X: 2-D feature table with one column per feature. Either a pandas
            ``DataFrame`` (columns are read positionally through ``.iloc``)
            or any object exposing ``.shape`` and ``X[:, j]`` indexing, such
            as a NumPy array.
        y: 1-D array-like target, one value per row of ``X``.
        k: Number of features to keep. It is applied as a slice, so a ``k``
            larger than the number of columns returns every column.

    Returns:
        A 1-D integer array of column positions, best feature first.
    """
    # DataFrames need ``.iloc`` for positional access; arrays are indexed directly.
    columns = X.iloc if hasattr(X, "iloc") else X

    # Mutual information between each feature and the target variable.
    scores = np.array(
        [mutual_information(columns[:, j], y) for j in range(X.shape[1])],
        dtype=float,
    )

    # Negate to sort in descending order. A stable sort keeps tied features in
    # ascending column order, so the ranking is reproducible across runs and
    # platforms.
    ranking = np.argsort(-scores, kind="stable")

    return ranking[:k]


# Load the data set. Column 0 is used as the class label and every other
# column as a feature.
data = pd.read_csv("data/AcousticFeatures.csv", delimiter=';')

# Reproducible 75 % / 25 % train/test split: the test set is every row that
# was not sampled into the training set.
train_data = data.sample(frac=0.75, random_state=200)
test_data = data.drop(train_data.index)
X_train = train_data.iloc[:, 1:]
X_test = test_data.iloc[:, 1:]
y_train = train_data.iloc[:, 0]
y_test = test_data.iloc[:, 0]

# Convert the target variable to a numeric form using label encoding.
# The encoder is fitted on the training labels ONLY and then reused for the
# test labels, so a given class always maps to the same integer code in both
# sets. Re-fitting on the test labels could assign different codes whenever
# the set of classes present in the two splits differs. A label that appears
# only in the test set now raises a ValueError instead of being silently
# mis-encoded.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

#3. and 4.
# The feature ranking does not depend on k, so the best 15 features are simply
# the first 15 of the best 25; there is no need to score every feature twice.
selected_features25 = CLMax(X_train, y_train, k=25)
selected_features15 = selected_features25[:15]
X_train_selected25 = X_train.iloc[:, selected_features25]
X_train_selected15 = X_train.iloc[:, selected_features15]

#5.
# Train a k-NN model using the selected features.
# To evaluate the 15-feature subset instead, fit on X_train_selected15 and
# predict on X_test.iloc[:, selected_features15].
k = 5
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train_selected25, y_train)
y_pred = knn.predict(X_test.iloc[:, selected_features25])

# Calculate the number of correctly predicted labels (element-wise comparison
# of the two label arrays).
correct_predictions = int(np.sum(y_pred == y_test))

# Calculate the accuracy of the predicted values
accuracy = correct_predictions / len(y_test)

# Print the accuracy
print("Accuracy:", accuracy)

# Captured output of the cell above (the two arrays are presumably y_test and
# y_pred; the statements that printed them are not part of the cell as shown):
# [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
#  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#  3 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# [0 2 2 2 2 2 3 0 2 0 2 1 3 3 0 0 3 2 2 2 3 2 3 1 3 3 1 1 1 1 1 0 1 1 1 1 0
#  1 0 1 1 1 1 3 1 1 1 1 3 3 1 1 1 0 0 2 3 0 3 3 2 2 2 0 2 3 2 0 3 0 2 3 0 3
#  2 0 1 0 3 3 1 2 2 0 0 1 0 0 3 0 1 2 0 0 2 2 2 1 0 0]
# Accuracy: 0.5
