In [2]:
!pip install numpy

Collecting numpy
  Downloading numpy-1.24.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.24.2


In [3]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting tzdata>=2022.1
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.0.0 tzdata-2023.3


In [7]:
!pip install scikit-learn



In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mutual_info_score, accuracy_score, precision_score, recall_score, f1_score 
from sklearn.neighbors import KNeighborsClassifier
from scipy.special import softmax
from sklearn.linear_model import LogisticRegression
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

@ignore_warnings(category=ConvergenceWarning)
def select_features(X_train, y_train, k=25):
    # Initialize logistic regression model
    model = LogisticRegression(penalty=None, solver='lbfgs', multi_class='multinomial', max_iter=1000)

    # Loop over each feature and compute its score
    scores = []
    for j in range(X_train.shape[1]):
        print("iteration: " + str(j))
        # Select all features except for j
        X_sel = X_train.drop(X_train.columns[j], axis=1)
        # Fit the model and compute conditional probabilities
        model.fit(X_sel, y_train)
        probas = softmax(model.predict_log_proba(X_sel), axis=1)
        # Compute the conditional likelihood
        Lj = np.sum(np.log(probas[np.arange(len(y_train)), y_train]))
        # Compute the score as the negative of the conditional likelihood
        scores.append(-Lj)

    # Select the top-k features with the highest scores
    selected_features = np.argsort(scores)[:k]

    return selected_features


data = pd.read_csv("data/AcousticFeatures.csv", delimiter=';')

train_data = data.sample(frac=0.75, random_state=200)
test_data = data.drop(train_data.index)
X_train = train_data.iloc[:, 1:]
X_test = test_data.iloc[:, 1:]
y_train = train_data.iloc[:, 0]
y_test = test_data.iloc[:, 0]

# Convert the target variable to a numeric form using label encoding
le = LabelEncoder()
classes = np.unique(le.fit_transform(data.iloc[:, 0]))
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

#3. and 4.
selected_features25 = select_features(X_train, y_train, k=25)
selected_features15 = select_features(X_train, y_train, k=15)
X_train_selected25 = X_train.iloc[:, selected_features25]
X_train_selected15 = X_train.iloc[:, selected_features15]
print(f'Selected features: {selected_features25}')
print(f'Selected features: {selected_features15}')

#5.
# Train a k-NN model using the selected features
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_selected25, y_train)
y_pred = knn.predict(X_test.iloc[:, selected_features25])
#knn.fit(X_train_selected15, y_train)
#y_pred = knn.predict(X_test.iloc[:, selected_features15])

# Calculate the measures of performance
accuracy = 1 - np.count_nonzero(y_pred - y_test) / len(y_test)
macro_recall = recall_score(y_test, y_pred, average='macro')
macro_precision = precision_score(y_test, y_pred, average='macro')
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("accuracy_score:", accuracy)
print("precision_score:", macro_recall)
print("recall_score:", macro_precision)
print("f1_score:", macro_f1)


iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 10
iteration: 11
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
iteration: 30
iteration: 31
iteration: 32
iteration: 33
iteration: 34
iteration: 35
iteration: 36
iteration: 37
iteration: 38
iteration: 39
iteration: 40
iteration: 41
iteration: 42
iteration: 43
iteration: 44
iteration: 45
iteration: 46
iteration: 47
iteration: 48
iteration: 49
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 10
iteration: 11
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 2