In [2]:
import os
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns  # Import seaborn for enhanced visualizations

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import average_precision_score, confusion_matrix, roc_curve, precision_recall_curve

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def concatenate_numpy_arrays_from_folder(directory):
    """
    Concatenates all numpy arrays in the specified directory into a single numpy array.

    Args:
        directory (str): The path to the directory containing the numpy array files.

    Returns:
        np.ndarray: The concatenated numpy array.
    """
    # Initialize an empty list to store the arrays
    arrays_list = []

    # Loop through all files in the directory
    for filename in os.listdir(directory):
        if filename.endswith('.npy'):  # Check if the file is a numpy array file
            file_path = os.path.join(directory, filename)

            # Load the numpy array from the file
            numpy_array = np.load(file_path)

            # Print the shape of the numpy array
            #print(f'Shape of {filename}: {numpy_array.shape}')

            # Append the numpy array to the list
            arrays_list.append(numpy_array)

    # Concatenate all numpy arrays in the list into a single array
    concatenated_array = np.concatenate(arrays_list, axis=0)  # Change axis if needed

    # Print the shape of the concatenated array
    print(f'Shape of the concatenated array: {concatenated_array.shape}')

    return concatenated_array

In [4]:
# for 5 secondes samples

# test_ads_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_5sec/grouped_embeddings_ad/test_embeddings_ads'
# ad_test_embeddings = concatenate_numpy_arrays_from_folder(test_ads_directory)

# val_ads_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_5sec/grouped_embeddings_ad/val_embeddings_ads'
# ad_val_embeddings = concatenate_numpy_arrays_from_folder(val_ads_directory)

# train_ads_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_5sec/grouped_embeddings_ad/train_embeddings_ads'
# ad_train_embeddings = concatenate_numpy_arrays_from_folder(train_ads_directory)


# test_pods_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_5sec/grouped_embeddings_podcast/test_embeddings'
# pod_test_embeddings = concatenate_numpy_arrays_from_folder(test_pods_directory)

# val_pods_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_5sec/grouped_embeddings_podcast/val_embeddings'
# pod_val_embeddings = concatenate_numpy_arrays_from_folder(val_pods_directory)

# train_pod_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_5sec/grouped_embeddings_podcast/train_embeddings'
# pod_train_embeddings = concatenate_numpy_arrays_from_folder(train_pod_directory)[:507, :, :]

# print(pod_train_embeddings.shape)


In [5]:
# for 1 secondes samples

test_ads_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_1sec/grouped_embeddings_ad/test_embeddings_ads'
ad_test_embeddings = concatenate_numpy_arrays_from_folder(test_ads_directory)

val_ads_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_1sec/grouped_embeddings_ad/val_embeddings_ads'
ad_val_embeddings = concatenate_numpy_arrays_from_folder(val_ads_directory)

train_ads_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_1sec/grouped_embeddings_ad/train_embeddings_ads'
ad_train_embeddings = concatenate_numpy_arrays_from_folder(train_ads_directory)


test_pods_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_1sec/grouped_embeddings_podcast/test_embeddings_pods'
pod_test_embeddings = concatenate_numpy_arrays_from_folder(test_pods_directory)

val_pods_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_1sec/grouped_embeddings_podcast/val_embeddings_pods'
pod_val_embeddings = concatenate_numpy_arrays_from_folder(val_pods_directory)

train_pod_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_1sec/grouped_embeddings_podcast/train_embeddings_pods'
pod_train_embeddings = concatenate_numpy_arrays_from_folder(train_pod_directory)[:2597, :, :]

print(pod_train_embeddings.shape)

Shape of the concatenated array: (302, 1, 128)
Shape of the concatenated array: (319, 1, 128)
Shape of the concatenated array: (2597, 1, 128)
Shape of the concatenated array: (668, 1, 128)
Shape of the concatenated array: (1940, 1, 128)
Shape of the concatenated array: (6115, 1, 128)
(2597, 1, 128)


In [6]:
def convert_to_2d_array(three_d_array):
    # Get the dimensions of the input array
    depth, rows, cols = three_d_array.shape

    # Reshape each 2D array to 1D and concatenate them
    flattened_arrays = [matrix.flatten() for matrix in three_d_array]
    two_d_array = np.vstack(flattened_arrays)

    return two_d_array

In [7]:
pod_train_embeddings = convert_to_2d_array(pod_train_embeddings)
ad_train_embeddings = convert_to_2d_array(ad_train_embeddings)

pod_val_embeddings = convert_to_2d_array(pod_val_embeddings)
ad_val_embeddings = convert_to_2d_array(ad_val_embeddings)

pod_test_embeddings =  convert_to_2d_array(pod_test_embeddings)
ad_test_embeddings = convert_to_2d_array(ad_test_embeddings)

print(pod_train_embeddings.shape)
print(ad_train_embeddings.shape)

print(pod_val_embeddings.shape)
print(ad_val_embeddings.shape)

print(pod_test_embeddings.shape)
print(ad_test_embeddings.shape)

(2597, 128)
(2597, 128)
(1940, 128)
(319, 128)
(668, 128)
(302, 128)


In [8]:
train_embeddings = np.concatenate((pod_train_embeddings, ad_train_embeddings))
val_embeddings = np.concatenate((pod_val_embeddings, ad_val_embeddings))
test_embeddings = np.concatenate((pod_test_embeddings, ad_test_embeddings))

In [9]:
def concatenate_zeros_and_ones(podcast_length, commercials_length):
    # Create array of zeros with size of podcast array
    zeros_array = np.zeros(podcast_length, dtype=int)

    # Create array of ones with size of commercials array
    ones_array = np.ones(commercials_length, dtype=int)

    # Concatenate arrays
    concatenated_array = np.concatenate((zeros_array, ones_array))

    return concatenated_array

In [10]:
train_labels = concatenate_zeros_and_ones(pod_train_embeddings.shape[0], ad_train_embeddings.shape[0])
val_labels = concatenate_zeros_and_ones(pod_val_embeddings.shape[0], ad_val_embeddings.shape[0])
test_lables = concatenate_zeros_and_ones(pod_test_embeddings.shape[0], ad_test_embeddings.shape[0])

In [11]:
def train_knn_model(normal_embeddings, n_neighbors=11):
    """
    Trains a K-Nearest Neighbors (KNN) classifier on the provided normal embeddings.

    Parameters:
    ---------
    - normal_embeddings (numpy.ndarray): Embeddings of normal class samples for training.
    - n_neighbors (int, optional): Number of neighbors to consider. Default is 11.

    Returns:
    -------
    - sklearn.neighbors.KNeighborsClassifier: Trained KNN classifier.
    """
    # Create a KNN classifier with the specified number of neighbors
    knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

    # Label all normal embeddings as 0
    labels = np.zeros(len(normal_embeddings)) # check this line with odelia, why do we need to provied array of zeros if we already know what are the labels

    # Train the KNN classifier
    knn_classifier.fit(normal_embeddings, labels)

    return knn_classifier

In [12]:
# Example usage:
trained_knn_classifier = train_knn_model(pod_train_embeddings, n_neighbors=60)

In [13]:
def calculate_mean_distances(knn_model, embeddings):
    """
    Calculates the mean distances of embeddings using the trained KNN model.

    Parameters:
    ----------
    - knn_model (sklearn.neighbors.KNeighborsClassifier): Trained KNN model.
    - embeddings (numpy.ndarray): Embeddings to calculate distances for.

    Returns:
    -------
    - numpy.ndarray: Mean distances of embeddings.
    """
    # Find distances and indices of k-neighbors for each embedding
    distances, _ = knn_model.kneighbors(embeddings)

    # Calculate mean distances for each embedding
    mean_distances = distances.mean(axis=1)

    return mean_distances

In [14]:
def calculate_thresholds(knn_model, anomaly_embeddings_train, normal_embeddings_train):
    """
    Calculates anomaly and normal thresholds based on mean distances of embeddings using a trained KNN model.

    Parameters:
    ----------
    - knn_model (sklearn.neighbors.KNeighborsClassifier): Trained KNN model.
    - anomaly_embeddings_train (numpy.ndarray): Embeddings of anomaly class samples for training.
    - normal_embeddings_train (numpy.ndarray): Embeddings of normal class samples for training.

    Returns:
    -------
    - Anomaly threshold (float)
    - Normal threshold (float)
    """
    # Calculate mean distances for anomaly and normal embeddings
    anomaly_mean_distance = calculate_mean_distances(knn_model, anomaly_embeddings_train)
    normal_mean_distance = calculate_mean_distances(knn_model, normal_embeddings_train)

    # Determine threshold based on means and factors
    anomaly_threshold = anomaly_mean_distance.mean() + 2 * anomaly_mean_distance.std()  # Set threshold 2 standard deviations above anomaly mean
    normal_threshold = normal_mean_distance.mean() - 2 * normal_mean_distance.std()  # Set threshold 2 standard deviations below normal mean

    return anomaly_threshold, normal_threshold

In [15]:
# Call the function to calculate thresholds
anomaly_threshold, normal_threshold = calculate_thresholds(trained_knn_classifier, ad_train_embeddings, pod_train_embeddings)

# Print the calculated thresholds
print("Anomaly threshold:", anomaly_threshold)
print("Normal threshold:", normal_threshold)

Anomaly threshold: 6.1433550443179925
Normal threshold: 1.3256375932644553


In [16]:
def validate_knn_model(knn_model, validation_embeddings, anomaly_threshold, normal_threshold, true_validation_labels):
    """
    Validates the KNN model using validation embeddings.

    Parameters:
    ----------
    - knn_model (sklearn.neighbors.KNeighborsClassifier): Trained KNN model.
    - validation_embeddings (numpy.ndarray): Validation embeddings.
    - anomaly_threshold (float): Threshold for classifying anomalies.
    - normal_threshold (float): Threshold for classifying normal samples.
    - true_validation_labels (numpy.ndarray): True labels of validation samples.

    Returns:
    -------
    - Validation accuracy (float)
    - Validation predictions (numpy.ndarray)
    - Threshold used for classification (float)
    """
    # Calculate distances to nearest neighbors
    distances, _ = knn_model.kneighbors(validation_embeddings)

    # Calculate mean distances for each validation embedding
    mean_distances = distances.mean(axis=1)

    # Calculate threshold for classification
    threshold = (anomaly_threshold + normal_threshold) / 2

    # Classify validation embeddings based on mean distances
    validation_predictions = mean_distances > threshold

    # Calculate validation accuracy
    validation_accuracy = accuracy_score(true_validation_labels, validation_predictions)

    return validation_accuracy, validation_predictions, threshold

In [17]:
# Call the function and store the return values
validation_accuracy, validation_predictions, threshold = validate_knn_model(trained_knn_classifier, val_embeddings, anomaly_threshold, normal_threshold, val_labels)

# Print the different results
print("Validation Accuracy:", validation_accuracy)
print("Validation Predictions:", validation_predictions)
print("Threshold:", threshold)

Validation Accuracy: 0.8154050464807437
Validation Predictions: [False False False ... False False False]
Threshold: 3.7344963187912237


In [18]:
def test_knn_model(knn_model, test_embeddings, anomaly_threshold, normal_threshold, true_test_labels):
    """
    Tests the KNN model using test embeddings.

    Parameters:
    ---------
    - knn_model (sklearn.neighbors.KNeighborsClassifier): Trained KNN model.
    - test_embeddings (numpy.ndarray): Test embeddings.
    - anomaly_threshold (float): Threshold for classifying anomalies.
    - normal_threshold (float): Threshold for classifying normal samples.
    - true_test_labels (numpy.ndarray): True labels of test samples.

    Returns:
    -------
    - Test accuracy (float)
    - Test predictions (numpy.ndarray)
    - Threshold used for classification (float)
    """
    # Calculate distances to nearest neighbors
    distances, _ = knn_model.kneighbors(test_embeddings)

    # Calculate mean distances for each test embedding
    mean_distances = distances.mean(axis=1)

    # Calculate threshold for classification
    threshold = (anomaly_threshold + normal_threshold) / 2

    # Classify test embeddings based on mean distances
    test_predictions = mean_distances > threshold

    # Calculate test accuracy
    test_accuracy = accuracy_score(true_test_labels, test_predictions)

    return test_accuracy, test_predictions, threshold

In [19]:
# Call the function and store the return values
test_accuracy, test_predictions, threshold = test_knn_model(trained_knn_classifier, test_embeddings, anomaly_threshold, normal_threshold, test_lables)

# Print the different results
print("Test Accuracy:", test_accuracy)
print("Test Predictions:", test_predictions)
print("Threshold:", threshold)

Test Accuracy: 0.7412371134020619
Test Predictions: [False False False False False  True  True False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False  True False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False  True False False False False False False False
 False False False False False False False False False False False False

In [20]:
def calculate_confusion_matrix(actual_labels, predicted_labels):
    """
    Calculate the confusion matrix based on actual and predicted labels.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    tuple: A tuple containing true positives, false positives, true negatives, and false negatives.
    """
    true_positives = 0
    false_positives = 0
    true_negatives = 0
    false_negatives = 0

    for actual, predicted in zip(actual_labels, predicted_labels):
        if actual == 0 and predicted == 0:  # true_negatives
            true_negatives += 1
        elif actual == 1 and predicted == 0:  # false_negatives
            false_negatives += 1
        elif actual == 1 and predicted == 1:  # true_positives
            true_positives += 1
        elif actual == 0 and predicted == 1:  # false_positives
            false_positives += 1

    return true_positives, false_positives, true_negatives, false_negatives

In [21]:
# Example usage:
true_positives, false_positives, true_negatives, false_negatives = calculate_confusion_matrix(test_lables, test_predictions)
print("True Positives:", true_positives)
print("False Positives:", false_positives)
print("True Negatives:", true_negatives)
print("False Negatives:", false_negatives)

True Positives: 113
False Positives: 62
True Negatives: 606
False Negatives: 189


In [None]:
def calculate_accuracy(actual_labels, predicted_labels):
    """
    Calculate the accuracy of the predictions.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    float: Accuracy of the predictions.
    """
    correct_predictions = sum(1 for actual, predicted in zip(actual_labels, predicted_labels) if actual == predicted)
    total_predictions = len(actual_labels)
    accuracy = correct_predictions / total_predictions
    return accuracy


def calculate_precision(actual_labels, predicted_labels):
    """
    Calculate the precision of the predictions.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    float: Precision of the predictions.
    """
    true_positives, false_positives, _, _ = calculate_confusion_matrix(actual_labels, predicted_labels)
    precision = true_positives / (true_positives + false_positives)
    return precision


def calculate_recall(actual_labels, predicted_labels):
    """
    Calculate the recall of the predictions.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    float: Recall of the predictions.
    """
    true_positives, _, _, false_negatives = calculate_confusion_matrix(actual_labels, predicted_labels)
    recall = true_positives / (true_positives + false_negatives)
    return recall


def calculate_f1_score(actual_labels, predicted_labels):
    """
    Calculate the F1 score of the predictions.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    float: F1 score of the predictions.
    """
    precision = calculate_precision(actual_labels, predicted_labels)
    recall = calculate_recall(actual_labels, predicted_labels)
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

In [None]:
# Assuming you have actual labels stored in 'true_test_label' and predicted
# labels stored in 'anomaly_predictions_test'

accuracy = calculate_accuracy(test_lables, test_predictions)
precision = calculate_precision(test_lables, test_predictions)
recall = calculate_recall(test_lables, test_predictions)
f1_score = calculate_f1_score(test_lables, test_predictions)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

In [23]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def train_knn_model(normal_embeddings, n_neighbors=11):
    knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    labels = np.zeros(len(normal_embeddings))
    knn_classifier.fit(normal_embeddings, labels)
    return knn_classifier

def calculate_mean_distances(knn_model, embeddings):
    distances, _ = knn_model.kneighbors(embeddings)
    return distances.mean(axis=1)

def calculate_thresholds(knn_model, anomaly_embeddings_train, normal_embeddings_train):
    anomaly_mean_distance = calculate_mean_distances(knn_model, anomaly_embeddings_train)
    normal_mean_distance = calculate_mean_distances(knn_model, normal_embeddings_train)
    anomaly_threshold = anomaly_mean_distance.mean() + 2 * anomaly_mean_distance.std()
    normal_threshold = normal_mean_distance.mean() - 2 * normal_mean_distance.std()
    return anomaly_threshold, normal_threshold

def validate_knn_model(knn_model, validation_embeddings, anomaly_threshold, normal_threshold, true_validation_labels):
    distances, _ = knn_model.kneighbors(validation_embeddings)
    mean_distances = distances.mean(axis=1)
    threshold = (anomaly_threshold + normal_threshold) / 2
    validation_predictions = mean_distances > threshold
    validation_accuracy = accuracy_score(true_validation_labels, validation_predictions)
    return validation_accuracy, validation_predictions, threshold

def concatenate_zeros_and_ones(podcast_length, commercials_length):
    zeros_array = np.zeros(podcast_length, dtype=int)
    ones_array = np.ones(commercials_length, dtype=int)
    return np.concatenate((zeros_array, ones_array))

def calculate_confusion_matrix(actual_labels, predicted_labels):
    true_positives = false_positives = true_negatives = false_negatives = 0
    for actual, predicted in zip(actual_labels, predicted_labels):
        if actual == 0 and predicted == 0:
            true_negatives += 1
        elif actual == 1 and predicted == 0:
            false_negatives += 1
        elif actual == 1 and predicted == 1:
            true_positives += 1
        elif actual == 0 and predicted == 1:
            false_positives += 1
    return true_positives, false_positives, true_negatives, false_negatives

# Assuming embeddings and labels are defined
podcast_length = pod_train_embeddings.shape[0]
ads_length = ad_train_embeddings.shape[0]

train_labels = concatenate_zeros_and_ones(podcast_length, ads_length)
val_labels = concatenate_zeros_and_ones(pod_val_embeddings.shape[0], ad_val_embeddings.shape[0])
test_labels = concatenate_zeros_and_ones(pod_test_embeddings.shape[0], ad_test_embeddings.shape[0])

best_k = None
min_false_positives = float('inf')

# Range of k values to test
k_values = range(1, 60)

for k in k_values:
    print(f"Training KNN model with k={k}")
    trained_knn_classifier = train_knn_model(pod_train_embeddings, n_neighbors=k)
    anomaly_threshold, normal_threshold = calculate_thresholds(trained_knn_classifier, ad_train_embeddings, pod_train_embeddings)
    _, validation_predictions, _ = validate_knn_model(trained_knn_classifier, val_embeddings, anomaly_threshold, normal_threshold, val_labels)
    test_accuracy, test_predictions, threshold = test_knn_model(trained_knn_classifier, test_embeddings, anomaly_threshold, normal_threshold, test_labels)
    # _, false_positives, _, _ = calculate_confusion_matrix(val_labels, validation_predictions)
    true_positives, false_positives, true_negatives, false_negatives = calculate_confusion_matrix(test_labels, test_predictions)

    print(f"False positives for k={k}: {false_positives}")

    if false_positives < min_false_positives:
        min_false_positives = false_positives
        best_k = k

print(f"Best k with minimal false positives: {best_k}")

# Train and test the final model with the best k
trained_knn_classifier = train_knn_model(pod_train_embeddings, n_neighbors=best_k)
anomaly_threshold, normal_threshold = calculate_thresholds(trained_knn_classifier, ad_train_embeddings, pod_train_embeddings)
test_accuracy, test_predictions, threshold = test_knn_model(trained_knn_classifier, test_embeddings, anomaly_threshold, normal_threshold, test_labels)
print("Test Accuracy:", test_accuracy)
print("Test Predictions:", test_predictions)

true_positives, false_positives, true_negatives, false_negatives = calculate_confusion_matrix(test_labels, test_predictions)
print("True Positives:", true_positives)
print("False Positives:", false_positives)
print("True Negatives:", true_negatives)
print("False Negatives:", false_negatives)

Training KNN model with k=1
False positives for k=1: 116
Training KNN model with k=2
False positives for k=2: 86
Training KNN model with k=3
False positives for k=3: 72
Training KNN model with k=4
False positives for k=4: 68
Training KNN model with k=5
False positives for k=5: 65
Training KNN model with k=6
False positives for k=6: 65
Training KNN model with k=7
False positives for k=7: 65
Training KNN model with k=8
False positives for k=8: 65
Training KNN model with k=9
False positives for k=9: 63
Training KNN model with k=10
False positives for k=10: 63
Training KNN model with k=11
False positives for k=11: 63
Training KNN model with k=12
False positives for k=12: 63
Training KNN model with k=13
False positives for k=13: 63
Training KNN model with k=14
False positives for k=14: 62
Training KNN model with k=15
False positives for k=15: 63
Training KNN model with k=16
False positives for k=16: 62
Training KNN model with k=17
False positives for k=17: 62
Training KNN model with k=18
Fa

In [24]:
def train_knn_model(normal_embeddings, n_neighbors=11):
    knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    labels = np.zeros(len(normal_embeddings))
    knn_classifier.fit(normal_embeddings, labels)
    return knn_classifier

def calculate_mean_distances(knn_model, embeddings):
    distances, _ = knn_model.kneighbors(embeddings)
    return distances.mean(axis=1)

def calculate_thresholds(knn_model, anomaly_embeddings_train, normal_embeddings_train):
    anomaly_mean_distance = calculate_mean_distances(knn_model, anomaly_embeddings_train)
    normal_mean_distance = calculate_mean_distances(knn_model, normal_embeddings_train)
    return anomaly_mean_distance, normal_mean_distance

def validate_knn_model(knn_model, validation_embeddings, threshold, true_validation_labels):
    distances, _ = knn_model.kneighbors(validation_embeddings)
    mean_distances = distances.mean(axis=1)
    validation_predictions = mean_distances > threshold
    validation_accuracy = accuracy_score(true_validation_labels, validation_predictions)
    return validation_accuracy, validation_predictions

def test_knn_model(knn_model, test_embeddings, threshold, true_test_labels):
    distances, _ = knn_model.kneighbors(test_embeddings)
    mean_distances = distances.mean(axis=1)
    test_predictions = mean_distances > threshold
    test_accuracy = accuracy_score(true_test_labels, test_predictions)
    return test_accuracy, test_predictions

def calculate_confusion_matrix(actual_labels, predicted_labels):
    true_positives = false_positives = true_negatives = false_negatives = 0
    for actual, predicted in zip(actual_labels, predicted_labels):
        if actual == 0 and predicted == 0:
            true_negatives += 1
        elif actual == 1 and predicted == 0:
            false_negatives += 1
        elif actual == 1 and predicted == 1:
            true_positives += 1
        elif actual == 0 and predicted == 1:
            false_positives += 1
    return true_positives, false_positives, true_negatives, false_negatives

def concatenate_zeros_and_ones(podcast_length, commercials_length):
    zeros_array = np.zeros(podcast_length, dtype=int)
    ones_array = np.ones(commercials_length, dtype=int)
    return np.concatenate((zeros_array, ones_array))

# Assuming embeddings and labels are defined
podcast_length = pod_train_embeddings.shape[0]
ads_length = ad_train_embeddings.shape[0]

train_labels = concatenate_zeros_and_ones(podcast_length, ads_length)
val_labels = concatenate_zeros_and_ones(pod_val_embeddings.shape[0], ad_val_embeddings.shape[0])
test_labels = concatenate_zeros_and_ones(pod_test_embeddings.shape[0], ad_test_embeddings.shape[0])

best_k = None
best_threshold = None
min_false_positives = float('inf')

# Range of k values to test
k_values = range(1, 60)

for k in k_values:
    print(f"Training KNN model with k={k}")
    trained_knn_classifier = train_knn_model(pod_train_embeddings, n_neighbors=k)
    anomaly_mean_distance, normal_mean_distance = calculate_thresholds(trained_knn_classifier, ad_train_embeddings, pod_train_embeddings)

    # Define a range of thresholds to test
    min_distance = min(anomaly_mean_distance.min(), normal_mean_distance.min())
    max_distance = max(anomaly_mean_distance.max(), normal_mean_distance.max())
    thresholds = np.linspace(min_distance, max_distance, 100)

    for threshold in thresholds:
        _, validation_predictions = validate_knn_model(trained_knn_classifier, val_embeddings, threshold, val_labels)
        test_accuracy, test_predictions = test_knn_model(trained_knn_classifier, test_embeddings, threshold, test_labels)
        true_positives, false_positives, true_negatives, false_negatives = calculate_confusion_matrix(test_labels, test_predictions)

        if false_positives < min_false_positives:
            min_false_positives = false_positives
            best_k = k
            best_threshold = threshold

print(f"Best k with minimal false positives: {best_k}")
print(f"Best threshold with minimal false positives: {best_threshold}")

# Train and test the final model with the best k and threshold
trained_knn_classifier = train_knn_model(pod_train_embeddings, n_neighbors=best_k)
test_accuracy, test_predictions = test_knn_model(trained_knn_classifier, test_embeddings, best_threshold, test_labels)
print("Test Accuracy:", test_accuracy)
print("Test Predictions:", test_predictions)

true_positives, false_positives, true_negatives, false_negatives = calculate_confusion_matrix(test_labels, test_predictions)
print("True Positives:", true_positives)
print("False Positives:", false_positives)
print("True Negatives:", true_negatives)
print("False Negatives:", false_negatives)

Training KNN model with k=1
Training KNN model with k=2
Training KNN model with k=3
Training KNN model with k=4
Training KNN model with k=5
Training KNN model with k=6
Training KNN model with k=7
Training KNN model with k=8
Training KNN model with k=9
Training KNN model with k=10
Training KNN model with k=11
Training KNN model with k=12
Training KNN model with k=13
Training KNN model with k=14
Training KNN model with k=15
Training KNN model with k=16
Training KNN model with k=17
Training KNN model with k=18
Training KNN model with k=19
Training KNN model with k=20
Training KNN model with k=21
Training KNN model with k=22
Training KNN model with k=23
Training KNN model with k=24
Training KNN model with k=25
Training KNN model with k=26
Training KNN model with k=27
Training KNN model with k=28
Training KNN model with k=29
Training KNN model with k=30
Training KNN model with k=31
Training KNN model with k=32
Training KNN model with k=33
Training KNN model with k=34
Training KNN model with