In [None]:
!pip install numpy

In [None]:
!pip install pandas

In [None]:
!pip install scikit-learn

In [26]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mutual_info_score, accuracy_score, precision_score, recall_score, f1_score 
from sklearn.neighbors import KNeighborsClassifier
from scipy.special import softmax
from sklearn.linear_model import LogisticRegression
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler

# Define the model and approximation
def model(x, beta):
    # Multivariate Gaussian with mean and covariance parameters that depend on the selected features
    mu = np.dot(x, beta[1:])
    sigma = np.exp(beta[0])
    return np.random.normal(mu, sigma)

def approx(x, tau, beta):
    # Multivariate Gaussian with mean and covariance parameters that depend on all the features
    mu = np.dot(x, beta[1:])
    sigma = np.exp(beta[0])
    return np.random.normal(mu, sigma)

# Define the feature selection algorithm
def select_features(X, y, num_features):
    # Standardize the features
    scaler = StandardScaler()
    X_std = scaler.fit_transform(X)

    # Initialize the selected features and the beta parameters
    selected_features = []
    beta = np.zeros(X.shape[1] + 1)

    # Run the feature selection algorithm
    for k in range(num_features):
        # Compute the conditional likelihood for each feature
        conditional_likelihoods = np.zeros(X.shape[1])
        for j in range(X.shape[1]):
            if j not in selected_features:
                # Select the candidate feature to evaluate
                candidate_feature = np.hstack((X_std[:, selected_features], X_std[:, j][:, np.newaxis]))
                
                # Compute the conditional likelihood of the target variable given the selected features and the candidate feature
                conditional_likelihoods[j] = np.mean([np.log(model(candidate_feature[i], beta)) - np.log(approx(candidate_feature[i], X_std[:, selected_features], beta)) for i in range(X.shape[0]) if y[i] == y_train.values[0]])

        # Select the feature with the highest conditional likelihood
        best_feature = np.argmax(conditional_likelihoods)
        selected_features.append(best_feature)

        # Fit the model with the selected features
        X_selected = X_std[:, selected_features]
        X_selected[:, 0] = 1.0 # Add a constant term to the model
        beta = np.linalg.lstsq(X_selected, y, rcond=None)[0]

    return selected_features

def empirical_distributions(X, y):
    n_samples, n_features = X.shape
    C = len(np.unique(y))
    emp_dists = np.zeros((n_features, C), dtype=object)
    for c in range(C):
        X_c = X[y == c]
        for i in range(n_features):
            emp_dists[i, c] = norm(loc=np.mean(X_c[:, i]), scale=np.std(X_c[:, i]))
    return emp_dists

def conditional_likelihood(emp_dists, X):
    n_samples, n_features = X.shape
    C = emp_dists.shape[1]
    p = np.zeros((n_features, C))
    for c in range(C):
        for i in range(n_features):
            p[i, c] = np.mean(emp_dists[i, c].pdf(X[:, i]))
    return p

def conditional_mutual_information(p, y):
    n_samples = y.shape[0]
    C = len(np.unique(y))
    mi = np.zeros(p.shape[0])
    for i in range(p.shape[0]):
        for c in range(C):
            mi[i] += np.sum(p[i, c] * np.log(p[i, c] / np.mean(p[i, c])))
        mi[i] *= np.sum(p[i, :]) / n_samples
    return mi

def feature_selection(X, y, K):
    emp_dists = empirical_distributions(X, y)
    p = conditional_likelihood(emp_dists, X)
    mi = conditional_mutual_information(p, y)
    idx = np.argsort(mi)[::-1][:K]
    return idx



def select_features(X_train, y_train, k=25):
    # Initialize logistic regression model
    model = LogisticRegression(penalty=None, solver='lbfgs', multi_class='multinomial', max_iter=1000)

    # Loop over each feature and compute its score
    scores = []
    for j in range(X_train.shape[1]):
        # Select all features except for j
        X_sel = X_train.drop(X_train.columns[j], axis=1)
        # Fit the model and compute conditional probabilities
        model.fit(X_sel, y_train)
        probas = softmax(model.predict_log_proba(X_sel), axis=1)
        # Compute the conditional likelihood
        Lj = np.sum(np.log(probas[np.arange(len(y_train)), y_train]))
        # Compute the score as the negative of the conditional likelihood
        scores.append(-Lj)

    # Select the top-k features with the highest scores
    selected_features = np.argsort(scores)[:k]

    return selected_features

def mutual_information(x, y):
    c_xy = np.histogram2d(x, y, bins=10)[0]
    mi = mutual_info_score(None, None, contingency=c_xy)
    return mi

def CLMax(X, y, k):
    # Calculate the mutual information between each feature and the target variable
    mi = np.zeros(X.shape[1])
    for i in range(X.shape[1]):
        mi[i] = mutual_information(X.iloc[:,i], y)
    
    # Rank the features based on their mutual information with the target variable
    rank = np.argsort(-mi)
    
    # Select the top k features
    selected_features = rank[:k]
    
    return selected_features


data = pd.read_csv("data/AcousticFeatures.csv", delimiter=';')

train_data = data.sample(frac=0.75, random_state=200)
test_data = data.drop(train_data.index)
X_train = train_data.iloc[:, 1:]
X_test = test_data.iloc[:, 1:]
y_train = train_data.iloc[:, 0]
y_test = test_data.iloc[:, 0]

# Convert the target variable to a numeric form using label encoding
le = LabelEncoder()
classes = np.unique(le.fit_transform(data.iloc[:, 0]))
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

#3. and 4.
selected_features25 = CLMax(X_train, y_train, k=25)
selected_features15 = CLMax(X_train, y_train, k=15)
X_train_selected25 = X_train.iloc[:, selected_features25]
X_train_selected15 = X_train.iloc[:, selected_features15]

#5.
# Train a k-NN model using the selected features
k = 5
knn = KNeighborsClassifier(n_neighbors=k)
#knn.fit(X_train_selected25, y_train)
knn.fit(X_train_selected15, y_train)
#y_pred = knn.predict(X_test.iloc[:, selected_features25])
y_pred = knn.predict(X_test.iloc[:, selected_features15])
        
# Calculate the measures of performance
accuracy = 1 - np.count_nonzero(y_pred - y_test) / len(y_test)
macro_recall = recall_score(y_test, y_pred, average='macro')
macro_precision = precision_score(y_test, y_pred, average='macro')
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("accuracy_score:", accuracy)
print("precision_score:", macro_recall)
print("recall_score:", macro_precision)
print("f1_score:", macro_f1)



# Select the top 25 features
print(f'Selected features before: {selected_features25}')
print(f'Selected features before: {selected_features15}')
#selected_features25 = select_features(X_train, y_train, k=25)
#selected_features15 = select_features(X_train, y_train, k=15)
#X_train_selected25 = X_train.iloc[:, selected_features25]
#X_train_selected15 = X_train.iloc[:, selected_features15]
#print(f'Selected features: {selected_features25}')
#print(f'Selected features: {selected_features15}')
#knn.fit(X_train_selected25, y_train)
#y_pred = knn.predict(X_test.iloc[:, selected_features25])
#knn.fit(X_train_selected15, y_train)
#y_pred = knn.predict(X_test.iloc[:, selected_features15])

#accuracy = 1 - np.count_nonzero(y_pred - y_test) / len(y_test)
#macro_recall = recall_score(y_test, y_pred, average='macro')
#macro_precision = precision_score(y_test, y_pred, average='macro')
#macro_f1 = f1_score(y_test, y_pred, average='macro')

#print("accuracy_score:", accuracy)
#print("precision_score:", macro_recall)
#print("recall_score:", macro_precision)
#print("f1_score:", macro_f1)



selected_features25 = feature_selection(X_train.values, y_train, 25)
selected_features15 = feature_selection(X_train.values, y_train, 15)
# Print the selected features
print("Selected features:", selected_features25)
print("Selected features:", selected_features15)
X_train_selected25 = X_train.iloc[:, selected_features25]
X_train_selected15 = X_train.iloc[:, selected_features15]
knn.fit(X_train_selected25, y_train)
y_pred = knn.predict(X_test.iloc[:, selected_features25])
knn.fit(X_train_selected15, y_train)
y_pred = knn.predict(X_test.iloc[:, selected_features15])

accuracy = 1 - np.count_nonzero(y_pred - y_test) / len(y_test)
macro_recall = recall_score(y_test, y_pred, average='macro')
macro_precision = precision_score(y_test, y_pred, average='macro')
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("accuracy_score:", accuracy)
print("precision_score:", macro_recall)
print("recall_score:", macro_precision)
print("f1_score:", macro_f1)




accuracy_score: 0.6
precision_score: 0.5815894746908071
recall_score: 0.5841927680162974
f1_score: 0.5782554291117777
