In [1]:
!pip install numpy

Collecting numpy
  Downloading numpy-1.24.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.24.2


In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tzdata>=2022.1
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.0.0 tzdata-2023.3


In [3]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting scipy>=1.3.2
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.2 scipy-1.10.1 threadpoolctl-3.1.0


In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mutual_info_score, accuracy_score, precision_score, recall_score, f1_score 
from sklearn.neighbors import KNeighborsClassifier
from scipy.special import softmax
from sklearn.linear_model import LogisticRegression
#import warnings

#warnings.filterwarnings('ignore')

def select_features(X_train, y_train, k=25):
    # Initialize logistic regression model
    model = LogisticRegression(penalty=None, solver='lbfgs', multi_class='multinomial', max_iter=1000)

    # Loop over each feature and compute its score
    scores = []
    for j in range(X_train.shape[1]):
        # Select all features except for j
        X_sel = X_train.drop(X_train.columns[j], axis=1)
        # Fit the model and compute conditional probabilities
        model.fit(X_sel, y_train)
        probas = softmax(model.predict_log_proba(X_sel), axis=1)
        # Compute the conditional likelihood
        Lj = np.sum(np.log(probas[np.arange(len(y_train)), y_train]))
        # Compute the score as the negative of the conditional likelihood
        scores.append(-Lj)

    # Select the top-k features with the highest scores
    selected_features = np.argsort(scores)[:k]

    return selected_features

def mutual_information(x, y):
    c_xy = np.histogram2d(x, y, bins=10)[0]
    mi = mutual_info_score(None, None, contingency=c_xy)
    return mi

def CLMax(X, y, k):
    # Calculate the mutual information between each feature and the target variable
    mi = np.zeros(X.shape[1])
    for i in range(X.shape[1]):
        mi[i] = mutual_information(X.iloc[:,i], y)
    
    # Rank the features based on their mutual information with the target variable
    rank = np.argsort(-mi)
    
    # Select the top k features
    selected_features = rank[:k]
    
    return selected_features


data = pd.read_csv("data/AcousticFeatures.csv", delimiter=';')

train_data = data.sample(frac=0.75, random_state=200)
test_data = data.drop(train_data.index)
X_train = train_data.iloc[:, 1:]
X_test = test_data.iloc[:, 1:]
y_train = train_data.iloc[:, 0]
y_test = test_data.iloc[:, 0]

# Convert the target variable to a numeric form using label encoding
le = LabelEncoder()
classes = np.unique(le.fit_transform(data.iloc[:, 0]))
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

#3. and 4.
selected_features25 = CLMax(X_train, y_train, k=25)
selected_features15 = CLMax(X_train, y_train, k=15)
X_train_selected25 = X_train.iloc[:, selected_features25]
X_train_selected15 = X_train.iloc[:, selected_features15]

#5.
# Train a k-NN model using the selected features
k = 5
knn = KNeighborsClassifier(n_neighbors=k)
#knn.fit(X_train_selected25, y_train)
knn.fit(X_train_selected15, y_train)
#y_pred = knn.predict(X_test.iloc[:, selected_features25])
y_pred = knn.predict(X_test.iloc[:, selected_features15])
        
# Calculate the measures of performance
accuracy = 1 - np.count_nonzero(y_pred - y_test) / len(y_test)
macro_recall = recall_score(y_test, y_pred, average='macro')
macro_precision = precision_score(y_test, y_pred, average='macro')
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("accuracy_score:", accuracy)
print("precision_score:", macro_recall)
print("recall_score:", macro_precision)
print("f1_score:", macro_f1)



# Select the top 25 features
print(f'Selected features before: {selected_features25}')
print(f'Selected features before: {selected_features15}')
selected_features25 = select_features(X_train, y_train, k=25)
selected_features15 = select_features(X_train, y_train, k=15)
X_train_selected25 = X_train.iloc[:, selected_features25]
X_train_selected15 = X_train.iloc[:, selected_features15]
print(f'Selected features: {selected_features25}')
print(f'Selected features: {selected_features15}')
knn.fit(X_train_selected25, y_train)
y_pred = knn.predict(X_test.iloc[:, selected_features25])
knn.fit(X_train_selected15, y_train)
y_pred = knn.predict(X_test.iloc[:, selected_features15])

accuracy = 1 - np.count_nonzero(y_pred - y_test) / len(y_test)
macro_recall = recall_score(y_test, y_pred, average='macro')
macro_precision = precision_score(y_test, y_pred, average='macro')
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("accuracy_score:", accuracy)
print("precision_score:", macro_recall)
print("recall_score:", macro_precision)
print("f1_score:", macro_f1)


accuracy_score: 0.6
precision_score: 0.5815894746908071
recall_score: 0.5841927680162974
f1_score: 0.5782554291117777
Selected features before: [45 48 19 23 31 24 25 44  4 26 20  2  5 22 28 43 32 17 37 12  3 38 10  6
 34]
Selected features before: [45 48 19 23 31 24 25 44  4 26 20  2  5 22 28]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected features: [22 29 41  3 10 14 46 36 42 13  7 23  1 25 39 49 44 38 40 33 30 21 26 11
 15]
Selected features: [22 29 41  3 10 14 46 36 42 13  7 23  1 25 39]
accuracy_score: 0.29000000000000004
precision_score: 0.29392983127204725
recall_score: 0.30487649605296663
f1_score: 0.2881909756909756


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
