# ***Establishing Connection to Google Drive***

To initiate the project, the primary step entails establishing a seamless connection to Google Drive. This connection is pivotal for accessing and utilizing the requisite files and datasets essential for the project's execution.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# ***Importing Essential Libraries***

Following the establishment of the Google Drive connection, the subsequent step involves importing the essential libraries necessary for executing the code. These libraries serve as the foundational framework, providing the functionality and tools required to implement various tasks and analyses within the project.

In [2]:
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns  # Import seaborn for enhanced visualizations

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import average_precision_score, confusion_matrix, roc_curve, precision_recall_curve

# ***Loading Data for Processing and Testing***

In this pivotal stage, we load all pertinent data into the project environment for comprehensive processing and testing. By importing the datasets integral to our analysis, we ensure a robust foundation for conducting experiments and evaluations crucial to the project's objectives.

In [None]:
# for 1 second vggish
pod_train_embeddings = np.load("/content/drive/MyDrive/AD-Blocker Project/embeddings/OpenL3_5sec/OpenL3_5sec_pod_train_embeddings.npy")[:502, :]
ad_train_embeddings = np.load("/content/drive/MyDrive/AD-Blocker Project/embeddings/OpenL3_5sec/OpenL3_5sec_ad_train_embeddings.npy")

pod_val_embeddings = np.load("/content/drive/MyDrive/AD-Blocker Project/embeddings/OpenL3_5sec/OpenL3_5sec_pod_val_embeddings.npy")
ad_val_embeddings = np.load("/content/drive/MyDrive/AD-Blocker Project/embeddings/OpenL3_5sec/OpenL3_5sec_ad_val_embeddings.npy")

pod_test_embeddings = np.load("/content/drive/MyDrive/AD-Blocker Project/embeddings/OpenL3_5sec/OpenL3_5sec_pod_test_embeddings.npy")
ad_test_embeddings = np.load("/content/drive/MyDrive/AD-Blocker Project/embeddings/OpenL3_5sec/OpenL3_5sec_ad_test_embeddings.npy")

# load all relevant data for the ads
#ad_train_embeddings = np.load('need to add the actual path to the relevant')
#ad_train_labels = np.load('need to add the actual path to the relevant')
#ad_val_embeddings = np.load('need to add the actual path to the relevant')
#ad_val_labels = np.load('need to add the actual path to the relevant')
#ad_test_embeddings = np.load('need to add the actual path to the relevant')
#ad_ test_labels = np.load('need to add the actual path to the relevant')

In [None]:
# load all relevant data for the Podcasts
#pod_train_embeddings = np.load('need to add the actual path to the relevant')
#pod_train_labels = np.load('need to add the actual path to the relevant')

#pod_val_embeddings = np.load('need to add the actual path to the relevant')
#pod_val_labels = np.load('need to add the actual path to the relevant')

#pod_test_embeddings = np.load('need to add the actual path to the relevant')
#pod_ test_labels = np.load('need to add the actual path to the relevant')

In [None]:
print(pod_train_embeddings.shape)
print(ad_train_embeddings.shape)

print(pod_val_embeddings.shape)
print(ad_val_embeddings.shape)

print(pod_test_embeddings.shape)
print(ad_test_embeddings.shape)

(502, 23552)
(502, 23552)
(410, 23552)
(62, 23552)
(412, 23552)
(64, 23552)


In [None]:
# def convert_to_2d_array(three_d_array):
#     # Get the dimensions of the input array
#     depth, rows, cols = three_d_array.shape

#     # Reshape each 2D array to 1D and concatenate them
#     flattened_arrays = [matrix.flatten() for matrix in three_d_array]
#     two_d_array = np.vstack(flattened_arrays)

#     return two_d_array

In [None]:
# pod_train_embeddings = convert_to_2d_array(pod_train_embeddings)
# ad_train_embeddings = convert_to_2d_array(ad_train_embeddings)

# pod_val_embeddings = convert_to_2d_array(pod_val_embeddings)
# ad_val_embeddings = convert_to_2d_array(ad_val_embeddings)

# pod_test_embeddings =  convert_to_2d_array(pod_test_embeddings)
# ad_test_embeddings = convert_to_2d_array(ad_test_embeddings)

# print(pod_train_embeddings.shape)
# print(ad_train_embeddings.shape)

# print(pod_val_embeddings.shape)
# print(ad_val_embeddings.shape)

# print(pod_test_embeddings.shape)
# print(ad_test_embeddings.shape)

(240, 1280)
(240, 1280)
(204, 1280)
(30, 1280)
(206, 1280)
(30, 1280)


In [None]:
train_embeddings = np.concatenate((pod_train_embeddings, ad_train_embeddings))
val_embeddings = np.concatenate((pod_val_embeddings, ad_val_embeddings))
test_embeddings = np.concatenate((pod_test_embeddings, ad_test_embeddings))

# ***Training KNN Model***

In this critical phase, we commence the training process for our KNN model by executing the data allocated to the training set. Prior to this, we crafted a function designed to furnish us with the requisite model. Subsequently, invoking this function enables us to obtain the model tailored to our specifications, facilitating the subsequent stages of our analysis.

In [None]:
def train_knn_model(normal_embeddings, n_neighbors=11):
    """
    Trains a K-Nearest Neighbors (KNN) classifier on the provided normal embeddings.

    Parameters:
    ---------
    - normal_embeddings (numpy.ndarray): Embeddings of normal class samples for training.
    - n_neighbors (int, optional): Number of neighbors to consider. Default is 11.

    Returns:
    -------
    - sklearn.neighbors.KNeighborsClassifier: Trained KNN classifier.
    """
    # Create a KNN classifier with the specified number of neighbors
    knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

    # Label all normal embeddings as 0
    labels = np.zeros(len(normal_embeddings)) # check this line with odelia, why do we need to provied array of zeros if we already know what are the labels

    # Train the KNN classifier
    knn_classifier.fit(normal_embeddings, labels)

    return knn_classifier

In [None]:
# Example usage:
trained_knn_classifier = train_knn_model(pod_train_embeddings, n_neighbors=17)

# ***Function Definition for Mean Distance Calculation***

In this section, we define a crucial function tasked with computing the mean distances for both anomaly and normal embeddings. By encapsulating this functionality within a dedicated function, we streamline the process of deriving insightful metrics crucial for evaluating the model's performance. This function serves as a cornerstone in our analysis, facilitating the comparison and interpretation of distances between embeddings corresponding to anomalies and normal samples.

In [None]:
def calculate_mean_distances(knn_model, embeddings):
    """
    Calculates the mean distances of embeddings using the trained KNN model.

    Parameters:
    ----------
    - knn_model (sklearn.neighbors.KNeighborsClassifier): Trained KNN model.
    - embeddings (numpy.ndarray): Embeddings to calculate distances for.

    Returns:
    -------
    - numpy.ndarray: Mean distances of embeddings.
    """
    # Find distances and indices of k-neighbors for each embedding
    distances, _ = knn_model.kneighbors(embeddings)

    # Calculate mean distances for each embedding
    mean_distances = distances.mean(axis=1)

    return mean_distances

In [None]:
def calculate_thresholds(knn_model, anomaly_embeddings_train, normal_embeddings_train):
    """
    Calculates anomaly and normal thresholds based on mean distances of embeddings using a trained KNN model.

    Parameters:
    ----------
    - knn_model (sklearn.neighbors.KNeighborsClassifier): Trained KNN model.
    - anomaly_embeddings_train (numpy.ndarray): Embeddings of anomaly class samples for training.
    - normal_embeddings_train (numpy.ndarray): Embeddings of normal class samples for training.

    Returns:
    -------
    - Anomaly threshold (float)
    - Normal threshold (float)
    """
    # Calculate mean distances for anomaly and normal embeddings
    anomaly_mean_distance = calculate_mean_distances(knn_model, anomaly_embeddings_train)
    normal_mean_distance = calculate_mean_distances(knn_model, normal_embeddings_train)

    # Determine threshold based on means and factors
    anomaly_threshold = anomaly_mean_distance.mean() + 2 * anomaly_mean_distance.std()  # Set threshold 2 standard deviations above anomaly mean
    normal_threshold = normal_mean_distance.mean() - 2 * normal_mean_distance.std()  # Set threshold 2 standard deviations below normal mean

    return anomaly_threshold, normal_threshold

In [None]:
# Call the function to calculate thresholds
anomaly_threshold, normal_threshold = calculate_thresholds(trained_knn_classifier, ad_train_embeddings, pod_train_embeddings)

# Print the calculated thresholds
print("Anomaly threshold:", anomaly_threshold)
print("Normal threshold:", normal_threshold)

Anomaly threshold: 130.86699024351654
Normal threshold: 59.30972263135324


# ***Model Validation and Adjustment***

In this pivotal stage, we evaluate the performance of our model by running the data assigned to the validation set. This step enables us to conduct a thorough examination of the model's efficacy and identify any necessary adjustments. By scrutinizing the model's performance against validation data, we iteratively refine its parameters to enhance its accuracy and robustness.

In [None]:
def validate_knn_model(knn_model, validation_embeddings, anomaly_threshold, normal_threshold, true_validation_labels):
    """
    Validates the KNN model using validation embeddings.

    Parameters:
    ----------
    - knn_model (sklearn.neighbors.KNeighborsClassifier): Trained KNN model.
    - validation_embeddings (numpy.ndarray): Validation embeddings.
    - anomaly_threshold (float): Threshold for classifying anomalies.
    - normal_threshold (float): Threshold for classifying normal samples.
    - true_validation_labels (numpy.ndarray): True labels of validation samples.

    Returns:
    -------
    - Validation accuracy (float)
    - Validation predictions (numpy.ndarray)
    - Threshold used for classification (float)
    """
    # Calculate distances to nearest neighbors
    distances, _ = knn_model.kneighbors(validation_embeddings)

    # Calculate mean distances for each validation embedding
    mean_distances = distances.mean(axis=1)

    # Calculate threshold for classification
    threshold = (anomaly_threshold + normal_threshold) / 2

    # Classify validation embeddings based on mean distances
    validation_predictions = mean_distances > threshold

    # Calculate validation accuracy
    validation_accuracy = accuracy_score(true_validation_labels, validation_predictions)

    return validation_accuracy, validation_predictions, threshold

In [None]:
def concatenate_zeros_and_ones(podcast_length, commercials_length):
    # Create array of zeros with size of podcast array
    zeros_array = np.zeros(podcast_length, dtype=int)

    # Create array of ones with size of commercials array
    ones_array = np.ones(commercials_length, dtype=int)

    # Concatenate arrays
    concatenated_array = np.concatenate((zeros_array, ones_array))

    return concatenated_array


podcast_length = pod_train_embeddings.shape[0]
ads_length = ad_train_embeddings.shape[0]

In [None]:
train_labels = concatenate_zeros_and_ones(pod_train_embeddings.shape[0], ad_train_embeddings.shape[0])
val_labels = concatenate_zeros_and_ones(pod_val_embeddings.shape[0], ad_val_embeddings.shape[0])
test_lables = concatenate_zeros_and_ones(pod_test_embeddings.shape[0], ad_test_embeddings.shape[0])

In [None]:
# Call the function and store the return values
validation_accuracy, validation_predictions, threshold = validate_knn_model(trained_knn_classifier, val_embeddings, anomaly_threshold, normal_threshold, val_labels)

# Print the different results
print("Validation Accuracy:", validation_accuracy)
print("Validation Predictions:", validation_predictions)
print("Threshold:", threshold)

Validation Accuracy: 0.9385593220338984
Validation Predictions: [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False  True False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False  True False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False  True False False False False False False

# ***Final Model Evaluation with Test Data***

In this culminating phase, we subject the test data to our final model for comprehensive evaluation of its performance on real-world datasets. This step allows us to ascertain the effectiveness and generalizability of our model beyond the training and validation stages. By rigorously scrutinizing the model's performance against unseen data, we derive insights into its real-world applicability and overall efficacy.

In [None]:
def test_knn_model(knn_model, test_embeddings, anomaly_threshold, normal_threshold, true_test_labels):
    """
    Tests the KNN model using test embeddings.

    Parameters:
    ---------
    - knn_model (sklearn.neighbors.KNeighborsClassifier): Trained KNN model.
    - test_embeddings (numpy.ndarray): Test embeddings.
    - anomaly_threshold (float): Threshold for classifying anomalies.
    - normal_threshold (float): Threshold for classifying normal samples.
    - true_test_labels (numpy.ndarray): True labels of test samples.

    Returns:
    -------
    - Test accuracy (float)
    - Test predictions (numpy.ndarray)
    - Threshold used for classification (float)
    """
    # Calculate distances to nearest neighbors
    distances, _ = knn_model.kneighbors(test_embeddings)

    # Calculate mean distances for each test embedding
    mean_distances = distances.mean(axis=1)

    # Calculate threshold for classification
    threshold = (anomaly_threshold + normal_threshold) / 2

    # Classify test embeddings based on mean distances
    test_predictions = mean_distances > threshold

    # Calculate test accuracy
    test_accuracy = accuracy_score(true_test_labels, test_predictions)

    return test_accuracy, test_predictions, threshold

In [None]:
# Call the function and store the return values
test_accuracy, test_predictions, threshold = test_knn_model(trained_knn_classifier, test_embeddings, anomaly_threshold, normal_threshold, test_lables)

# Print the different results
print("Test Accuracy:", test_accuracy)
print("Test Predictions:", test_predictions)
print("Threshold:", threshold)

Test Accuracy: 0.9180672268907563
Test Predictions: [False False  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False  True False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False  True False False False False False False False False
 False False  True  True False False False False False False False False
 False False False False False False False False False False False False

# ***Presentation of Model Evaluation Metrics***

In this critical juncture, we showcase a comprehensive array of evaluation metrics meticulously designed to gauge the efficacy and performance of our model. Through the presentation of these metrics, including but not limited to accuracy, precision, recall, and F1-score, we offer a nuanced understanding of the model's strengths and limitations. This holistic evaluation serves to validate the model's efficacy and aids in informing future iterations or enhancements.

In [None]:
def calculate_confusion_matrix(actual_labels, predicted_labels):
    """
    Calculate the confusion matrix based on actual and predicted labels.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    tuple: A tuple containing true positives, false positives, true negatives, and false negatives.
    """
    true_positives = 0
    false_positives = 0
    true_negatives = 0
    false_negatives = 0

    for actual, predicted in zip(actual_labels, predicted_labels):
        if actual == 0 and predicted == 0:  # true_negatives
            true_negatives += 1
        elif actual == 1 and predicted == 0:  # false_negatives
            false_negatives += 1
        elif actual == 1 and predicted == 1:  # true_positives
            true_positives += 1
        elif actual == 0 and predicted == 1:  # false_positives
            false_positives += 1

    return true_positives, false_positives, true_negatives, false_negatives

In [None]:
# Example usage:
true_positives, false_positives, true_negatives, false_negatives = calculate_confusion_matrix(test_lables, test_predictions)
print("True Positives:", true_positives)
print("False Positives:", false_positives)
print("True Negatives:", true_negatives)
print("False Negatives:", false_negatives)

True Positives: 38
False Positives: 13
True Negatives: 399
False Negatives: 26


In [None]:
def calculate_accuracy(actual_labels, predicted_labels):
    """
    Calculate the accuracy of the predictions.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    float: Accuracy of the predictions.
    """
    correct_predictions = sum(1 for actual, predicted in zip(actual_labels, predicted_labels) if actual == predicted)
    total_predictions = len(actual_labels)
    accuracy = correct_predictions / total_predictions
    return accuracy


def calculate_precision(actual_labels, predicted_labels):
    """
    Calculate the precision of the predictions.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    float: Precision of the predictions.
    """
    true_positives, false_positives, _, _ = calculate_confusion_matrix(actual_labels, predicted_labels)
    precision = true_positives / (true_positives + false_positives)
    return precision


def calculate_recall(actual_labels, predicted_labels):
    """
    Calculate the recall of the predictions.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    float: Recall of the predictions.
    """
    true_positives, _, _, false_negatives = calculate_confusion_matrix(actual_labels, predicted_labels)
    recall = true_positives / (true_positives + false_negatives)
    return recall


def calculate_f1_score(actual_labels, predicted_labels):
    """
    Calculate the F1 score of the predictions.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    float: F1 score of the predictions.
    """
    precision = calculate_precision(actual_labels, predicted_labels)
    recall = calculate_recall(actual_labels, predicted_labels)
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

In [None]:
# Assuming you have actual labels stored in 'true_test_label' and predicted
# labels stored in 'anomaly_predictions_test'

accuracy = calculate_accuracy(test_lables, test_predictions)
precision = calculate_precision(test_lables, test_predictions)
recall = calculate_recall(test_lables, test_predictions)
f1_score = calculate_f1_score(test_lables, test_predictions)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Accuracy: 0.9180672268907563
Precision: 0.7450980392156863
Recall: 0.59375
F1 Score: 0.6608695652173913


In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def train_knn_model(normal_embeddings, n_neighbors=11):
    knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    labels = np.zeros(len(normal_embeddings))
    knn_classifier.fit(normal_embeddings, labels)
    return knn_classifier

def calculate_mean_distances(knn_model, embeddings):
    distances, _ = knn_model.kneighbors(embeddings)
    return distances.mean(axis=1)

def calculate_thresholds(knn_model, anomaly_embeddings_train, normal_embeddings_train):
    anomaly_mean_distance = calculate_mean_distances(knn_model, anomaly_embeddings_train)
    normal_mean_distance = calculate_mean_distances(knn_model, normal_embeddings_train)
    anomaly_threshold = anomaly_mean_distance.mean() + 2 * anomaly_mean_distance.std()
    normal_threshold = normal_mean_distance.mean() - 2 * normal_mean_distance.std()
    return anomaly_threshold, normal_threshold

def validate_knn_model(knn_model, validation_embeddings, anomaly_threshold, normal_threshold, true_validation_labels):
    distances, _ = knn_model.kneighbors(validation_embeddings)
    mean_distances = distances.mean(axis=1)
    threshold = (anomaly_threshold + normal_threshold) / 2
    validation_predictions = mean_distances > threshold
    validation_accuracy = accuracy_score(true_validation_labels, validation_predictions)
    return validation_accuracy, validation_predictions, threshold

def concatenate_zeros_and_ones(podcast_length, commercials_length):
    zeros_array = np.zeros(podcast_length, dtype=int)
    ones_array = np.ones(commercials_length, dtype=int)
    return np.concatenate((zeros_array, ones_array))

def calculate_confusion_matrix(actual_labels, predicted_labels):
    true_positives = false_positives = true_negatives = false_negatives = 0
    for actual, predicted in zip(actual_labels, predicted_labels):
        if actual == 0 and predicted == 0:
            true_negatives += 1
        elif actual == 1 and predicted == 0:
            false_negatives += 1
        elif actual == 1 and predicted == 1:
            true_positives += 1
        elif actual == 0 and predicted == 1:
            false_positives += 1
    return true_positives, false_positives, true_negatives, false_negatives

# Assuming embeddings and labels are defined
podcast_length = pod_train_embeddings.shape[0]
ads_length = ad_train_embeddings.shape[0]

train_labels = concatenate_zeros_and_ones(podcast_length, ads_length)
val_labels = concatenate_zeros_and_ones(pod_val_embeddings.shape[0], ad_val_embeddings.shape[0])
test_labels = concatenate_zeros_and_ones(pod_test_embeddings.shape[0], ad_test_embeddings.shape[0])

best_k = None
min_false_positives = float('inf')

# Range of k values to test
k_values = range(1, 100)

for k in k_values:
    print(f"Training KNN model with k={k}")
    trained_knn_classifier = train_knn_model(pod_train_embeddings, n_neighbors=k)
    anomaly_threshold, normal_threshold = calculate_thresholds(trained_knn_classifier, ad_train_embeddings, pod_train_embeddings)
    _, validation_predictions, _ = validate_knn_model(trained_knn_classifier, val_embeddings, anomaly_threshold, normal_threshold, val_labels)
    test_accuracy, test_predictions, threshold = test_knn_model(trained_knn_classifier, test_embeddings, anomaly_threshold, normal_threshold, test_labels)
    # _, false_positives, _, _ = calculate_confusion_matrix(val_labels, validation_predictions)
    true_positives, false_positives, true_negatives, false_negatives = calculate_confusion_matrix(test_labels, test_predictions)

    print(f"False positives for k={k}: {false_positives}")

    if false_positives < min_false_positives:
        min_false_positives = false_positives
        best_k = k

print(f"Best k with minimal false positives: {best_k}")

# Train and test the final model with the best k
trained_knn_classifier = train_knn_model(pod_train_embeddings, n_neighbors=best_k)
anomaly_threshold, normal_threshold = calculate_thresholds(trained_knn_classifier, ad_train_embeddings, pod_train_embeddings)
test_accuracy, test_predictions, threshold = test_knn_model(trained_knn_classifier, test_embeddings, anomaly_threshold, normal_threshold, test_labels)
print("Test Accuracy:", test_accuracy)
print("Test Predictions:", test_predictions)

true_positives, false_positives, true_negatives, false_negatives = calculate_confusion_matrix(test_labels, test_predictions)
print("True Positives:", true_positives)
print("False Positives:", false_positives)
print("True Negatives:", true_negatives)
print("False Negatives:", false_negatives)

Training KNN model with k=1
False positives for k=1: 404
Training KNN model with k=2
False positives for k=2: 81
Training KNN model with k=3
False positives for k=3: 36
Training KNN model with k=4
False positives for k=4: 24
Training KNN model with k=5
False positives for k=5: 20
Training KNN model with k=6
False positives for k=6: 18
Training KNN model with k=7
False positives for k=7: 16
Training KNN model with k=8
False positives for k=8: 16
Training KNN model with k=9
False positives for k=9: 16
Training KNN model with k=10
False positives for k=10: 15
Training KNN model with k=11
False positives for k=11: 14
Training KNN model with k=12
False positives for k=12: 14
Training KNN model with k=13
False positives for k=13: 14
Training KNN model with k=14
False positives for k=14: 14
Training KNN model with k=15
False positives for k=15: 14
Training KNN model with k=16
False positives for k=16: 14
Training KNN model with k=17
False positives for k=17: 13
Training KNN model with k=18
Fa