# ***Establishing Connection to Google Drive***

To initiate the project, the primary step entails establishing a seamless connection to Google Drive. This connection is pivotal for accessing and utilizing the requisite files and datasets essential for the project's execution.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# ***Importing Essential Libraries***

Following the establishment of the Google Drive connection, the subsequent step involves importing the essential libraries necessary for executing the code. These libraries serve as the foundational framework, providing the functionality and tools required to implement various tasks and analyses within the project.

In [2]:
import os
import librosa
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import average_precision_score, confusion_matrix

# ***Loading Data for Processing and Testing***

In this pivotal stage, we load all pertinent data into the project environment for comprehensive processing and testing. By importing the datasets integral to our analysis, we ensure a robust foundation for conducting experiments and evaluations crucial to the project's objectives.

In [110]:
# for 1 second vggish
pod_train_embeddings = np.load("/content/drive/MyDrive/AD-Blocker Project/embeddings/vggish_10sec/vggish_10sec_pod_train_embeddings.npy")
# [:502, :, :]
ad_train_embeddings = np.load("/content/drive/MyDrive/AD-Blocker Project/embeddings/vggish_10sec/vggish_10sec_ad_train_embeddings.npy")

pod_val_embeddings = np.load("/content/drive/MyDrive/AD-Blocker Project/embeddings/vggish_10sec/vggish_10sec_pod_val_embeddings.npy")
ad_val_embeddings = np.load("/content/drive/MyDrive/AD-Blocker Project/embeddings/vggish_10sec/vggish_10sec_ad_val_embeddings.npy")

pod_test_embeddings = np.load("/content/drive/MyDrive/AD-Blocker Project/embeddings/vggish_10sec/vggish_10sec_pod_test_embeddings.npy")
ad_test_embeddings = np.load("/content/drive/MyDrive/AD-Blocker Project/embeddings/vggish_10sec/vggish_10sec_ad_test_embeddings.npy")

# load all relevant data for the ads
#ad_train_embeddings = np.load('need to add the actual path to the relevant')
#ad_train_labels = np.load('need to add the actual path to the relevant')
#ad_val_embeddings = np.load('need to add the actual path to the relevant')
#ad_val_labels = np.load('need to add the actual path to the relevant')
#ad_test_embeddings = np.load('need to add the actual path to the relevant')
#ad_ test_labels = np.load('need to add the actual path to the relevant')

In [111]:
print(pod_train_embeddings.shape)
print(ad_train_embeddings.shape)

print(pod_val_embeddings.shape)
print(ad_val_embeddings.shape)

print(pod_test_embeddings.shape)
print(ad_test_embeddings.shape)

(1639, 10, 128)
(240, 10, 128)
(204, 10, 128)
(30, 10, 128)
(206, 10, 128)
(30, 10, 128)


In [112]:
def convert_to_2d_array(three_d_array):
    # Get the dimensions of the input array
    depth, rows, cols = three_d_array.shape

    # Reshape each 2D array to 1D and concatenate them
    flattened_arrays = [matrix.flatten() for matrix in three_d_array]
    two_d_array = np.vstack(flattened_arrays)

    return two_d_array

In [113]:
pod_train_embeddings = convert_to_2d_array(pod_train_embeddings)
ad_train_embeddings = convert_to_2d_array(ad_train_embeddings)

pod_val_embeddings = convert_to_2d_array(pod_val_embeddings)
ad_val_embeddings = convert_to_2d_array(ad_val_embeddings)

pod_test_embeddings =  convert_to_2d_array(pod_test_embeddings)
ad_test_embeddings = convert_to_2d_array(ad_test_embeddings)

print(pod_train_embeddings.shape)
print(ad_train_embeddings.shape)

print(pod_val_embeddings.shape)
print(ad_val_embeddings.shape)

print(pod_test_embeddings.shape)
print(ad_test_embeddings.shape)

(1639, 1280)
(240, 1280)
(204, 1280)
(30, 1280)
(206, 1280)
(30, 1280)


In [114]:
train_embeddings = np.concatenate((pod_train_embeddings, ad_train_embeddings))
val_embeddings = np.concatenate((pod_val_embeddings, ad_val_embeddings))
test_embeddings = np.concatenate((pod_test_embeddings, ad_test_embeddings))

In [115]:
print(train_embeddings.shape)
print(val_embeddings.shape)
print(test_embeddings.shape)

(1879, 1280)
(234, 1280)
(236, 1280)


# ***Training Isolation Forest Model***

In this critical phase, we commence the training process for our  Isolation Forest model by executing the data allocated to the training set. Prior to this, we crafted a function designed to furnish us with the requisite model. Subsequently, invoking this function enables us to obtain the model tailored to our specifications, facilitating the subsequent stages of our analysis.

In [116]:
def train_isolation_forest_model(normal_embeddings, contamination_rate=0.05):
    """
    Train an Isolation Forest model on normal data points.

    Parameters:
    ----------
    normal_embeddings (numpy.ndarray): Array containing normal data points.
    contamination_rate (float, optional): Contamination rate for the Isolation
    Forest model.  Defaults to 0.05 (5%).

    Returns:
    -------
    sklearn.ensemble.IsolationForest: Trained Isolation Forest model.
    """
    # Initialize Isolation Forest model with the specified contamination rate
    contamination='auto'
    isolation_forest_model = IsolationForest(contamination='auto')
    # isolation_forest_model = IsolationForest(contamination=contamination_rate)

    # Fit Isolation Forest model to normal data points
    isolation_forest_model.fit(normal_embeddings)

    return isolation_forest_model

In [117]:
Isolation_Forest_trained_model = train_isolation_forest_model(train_embeddings, contamination_rate=0.13) # can Change the contamination rate if needed

# ***Model Validation and Adjustment***

In this pivotal stage, we evaluate the performance of our model by running the data assigned to the validation set. This step enables us to conduct a thorough examination of the model's efficacy and identify any necessary adjustments. By scrutinizing the model's performance against validation data, we iteratively refine its parameters to enhance its accuracy and robustness.

In [118]:
def predict_anomalies(isolation_forest_model, data_points):
    """
    Predict anomalies using the trained Isolation Forest model.

    Parameters:
    ---------
    isolation_forest_model (sklearn.ensemble.IsolationForest): Trained Isolation Forest model.
    data_points (numpy.ndarray): Array containing data points to be predicted.

    Returns:
    -------
    numpy.ndarray: Predicted labels indicating anomalies (1 for anomalies, -1 for normal data points).
    """
    # Predict anomalies using Isolation Forest model
    anomaly_labels = isolation_forest_model.predict(data_points)

    return anomaly_labels

In [119]:
# Assuming you have a trained Isolation Forest model named 'trained_model' and data points stored in a numpy array named 'data_points'

# Call the predict_anomalies function
val_predicted_labels = predict_anomalies(Isolation_Forest_trained_model, val_embeddings)
print(val_predicted_labels.shape)

(234,)


# ***Final Model Evaluation with Test Data***

In this culminating phase, we subject the test data to our final model for comprehensive evaluation of its performance on real-world datasets. This step allows us to ascertain the effectiveness and generalizability of our model beyond the training and validation stages. By rigorously scrutinizing the model's performance against unseen data, we derive insights into its real-world applicability and overall efficacy.

In [120]:
test_predicted_labels =  predict_anomalies(Isolation_Forest_trained_model, test_embeddings)
print(test_predicted_labels.shape)

(236,)


# ***Label Transformation for Model Evaluation***

At this critical stage, we undertake the transformation of label arrays to facilitate the evaluation process of our model. Specifically, we convert the labels representing podcasts into "-1" and those denoting anomalies (advertisements) into "1". This transformation enables a standardized evaluation framework, enhancing the model's interpretability and facilitating the assessment of its performance metrics with clarity and precision.


In [121]:
# def concatenate_ones_and_minus_one(podcast_length, commercials_length):
#     # Create array of zeros with size of podcast array
#     zeros_array = np.ones(podcast_length, dtype=int)

#     # Create array of ones with size of commercials array
#     # ones_array = np.ones(commercials_length, dtype=int)
#     minus_ones_array = np.full(commercials_length, -1, dtype=int)
#     # Concatenate arrays
#     concatenated_array = np.concatenate((zeros_array, minus_ones_array))

#     return concatenated_array


def concatenate_zeros_and_ones(podcast_length, commercials_length):
    # Create array of zeros with size of podcast array
    zeros_array = np.full(podcast_length, 0, dtype=int)

    # Create array of ones with size of commercials array
    # ones_array = np.ones(commercials_length, dtype=int)
    ones_array = np.ones(commercials_length, dtype=int)
    # Concatenate arrays
    concatenated_array = np.concatenate((zeros_array, ones_array))

    return concatenated_array




podcast_length = pod_train_embeddings.shape[0]
ads_length = ad_train_embeddings.shape[0]

In [122]:
train_labels = concatenate_zeros_and_ones(pod_train_embeddings.shape[0], ad_train_embeddings.shape[0])
val_labels = concatenate_zeros_and_ones(pod_val_embeddings.shape[0], ad_val_embeddings.shape[0])
test_lables = concatenate_zeros_and_ones(pod_test_embeddings.shape[0], ad_test_embeddings.shape[0])

In [None]:
# def convert_normal_to_minus_one(samples):
#     """
#     Convert elements equal to 0 to -1 in the samples.

#     Parameters:
#     samples (list of numpy.ndarray): List of one-dimensional numpy arrays containing zeros or ones.

#     Returns:
#     list of numpy.ndarray: List of samples with 0 replaced by -1.
#     """
#     converted_samples = []
#     for sample in samples:
#         converted_sample = np.where(sample == 1, -1, sample)
#         converted_samples.append(converted_sample)
#     return converted_samples

# def convert_anomalies_from_1_to_minus_1(samples):
#     """
#     Convert elements equal to 0 to -1 in the samples.

#     Parameters:
#     samples (list of numpy.ndarray): List of one-dimensional numpy arrays containing zeros or ones.

#     Returns:
#     list of numpy.ndarray: List of samples with 0 replaced by -1.
#     """
#     converted_samples = []
#     for sample in samples:
#         converted_sample = np.where(sample == 1, -1, sample)
#         converted_samples.append(converted_sample)
#     return converted_samples

# def convert_podcasts_from_0_to_1(samples):
#     """
#     Convert elements equal to 0 to -1 in the samples.

#     Parameters:
#     samples (list of numpy.ndarray): List of one-dimensional numpy arrays containing zeros or ones.

#     Returns:
#     list of numpy.ndarray: List of samples with 0 replaced by -1.
#     """
#     converted_samples = []
#     for sample in samples:
#         converted_sample = np.where(sample == 1, -1, sample)
#         converted_samples.append(converted_sample)
#     return converted_samples

In [None]:
# Call the function to convert the val_labels and test_labels to be array pf
# 1 and -1 so we can evaluate the model
#val_converted_labels = convert_normal_to_minus_one(val_labels) # validation labels
#test_converted_labels = convert_normal_to_minus_one(test_lables) # test lables

# val_converted_labels = convert_anomalies_from_1_to_minus_1(val_labels) # validation labels
# test_converted_labels = convert_anomalies_from_1_to_minus_1(test_lables) # test lables

# val_converted_labels = convert_podcasts_from_0_to_1(val_labels) # validation labels
# test_converted_labels = convert_podcasts_from_0_to_1(test_lables) # test lables


# ***Presentation of Model Evaluation Metrics***

In this critical juncture, we showcase a comprehensive array of evaluation metrics meticulously designed to gauge the efficacy and performance of our model. Through the presentation of these metrics, including but not limited to accuracy, precision, recall, and F1-score, we offer a nuanced understanding of the model's strengths and limitations. This holistic evaluation serves to validate the model's efficacy and aids in informing future iterations or enhancements.

In [123]:
# Convert predictions: -1 (anomaly) to 1, 1 (normal) to 0
test_predicted_labels_binary = np.where(test_predicted_labels == -1, 1, 0)

# Compute the confusion matrix
cm = confusion_matrix(test_lables, test_predicted_labels_binary)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[204   2]
 [ 29   1]]


In [124]:
def calculate_confusion_matrix(actual_labels, predicted_labels):
    """
    Calculate the confusion matrix based on actual and predicted labels.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    tuple: A tuple containing true positives, false positives, true negatives, and false negatives.
    """
    true_positives = 0
    false_positives = 0
    true_negatives = 0
    false_negatives = 0

    for actual, predicted in zip(actual_labels, predicted_labels):
        if actual == 0 and predicted == 0:  # true_negatives
            true_negatives += 1
        elif actual == 1 and predicted == 0:  # false_negatives
            false_negatives += 1
        elif actual == 1 and predicted == 1:  # true_positives
            true_positives += 1
        elif actual == 0 and predicted == 1:  # false_positives
            false_positives += 1

    return true_positives, false_positives, true_negatives, false_negatives
# Example usage:
true_positives, false_positives, true_negatives, false_negatives = calculate_confusion_matrix(test_lables, test_predicted_labels_binary)
print("True Positives:", true_positives)
print("False Positives:", false_positives)
print("True Negatives:", true_negatives)
print("False Negatives:", false_negatives)

True Positives: 1
False Positives: 2
True Negatives: 204
False Negatives: 29


In [76]:
# def calculate_confusion_matrix(actual_labels, predicted_labels):
#     """
#     Calculate the confusion matrix based on actual and predicted labels.

#     Parameters:
#     actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
#     predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

#     Returns:
#     tuple: A tuple containing true positives, false positives, true negatives, and false negatives.
#     """
#     true_positives = 0
#     false_positives = 0
#     true_negatives = 0
#     false_negatives = 0

#     for actual, predicted in zip(actual_labels, predicted_labels):
#         if actual == -1 and predicted == -1:  # True positive
#             true_positives += 1
#         elif actual == 1 and predicted == -1:  # False positive
#             false_positives += 1
#         elif actual == 1 and predicted == 1:  # True negative
#             true_negatives += 1
#         elif actual == -1 and predicted == 1:  # False negative
#            false_negatives += 1
#
#    return true_positives, false_positives, true_negatives, false_negatives

def calculate_confusion_matrix(actual_labels, predicted_labels):
    """
    Calculate the confusion matrix based on actual and predicted labels.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    tuple: A tuple containing true positives, false positives, true negatives, and false negatives.
    """
    true_positives = 0
    false_positives = 0
    true_negatives = 0
    false_negatives = 0

    for actual, predicted in zip(actual_labels, predicted_labels):
        if actual == 1 and predicted == 1:  # True positive
            true_positives += 1
        elif actual == -1 and predicted == 1:  # False positive
            false_positives += 1
        elif actual == -1 and predicted == -1:  # True negative
            true_negatives += 1
        elif actual == 1 and predicted == -1:  # False negative
            false_negatives += 1

    return true_positives, false_positives, true_negatives, false_negatives

# Example usage:
true_positives, false_positives, true_negatives, false_negatives = calculate_confusion_matrix(test_lables, test_predicted_labels)
print("True Positives:", true_positives)
print("False Positives:", false_positives)
print("True Negatives:", true_negatives)
print("False Negatives:", false_negatives)

# Compute the confusion matrix
cm = confusion_matrix(test_lables, test_predicted_labels)
print("Confusion Matrix:\n", cm)

True Positives: 1997
False Positives: 297
True Negatives: 26
False Negatives: 60
Confusion Matrix:
 [[  26  297]
 [  60 1997]]


In [74]:
def calculate_accuracy(actual_labels, predicted_labels):
    """
    Calculate the accuracy of the predictions.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    float: Accuracy of the predictions.
    """
    correct_predictions = sum(1 for actual, predicted in zip(actual_labels, predicted_labels) if actual == predicted)
    total_predictions = len(actual_labels)
    accuracy = correct_predictions / total_predictions
    return accuracy


def calculate_precision(actual_labels, predicted_labels):
    """
    Calculate the precision of the predictions.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    float: Precision of the predictions.
    """
    true_positives, false_positives, _, _ = calculate_confusion_matrix(actual_labels, predicted_labels)
    precision = true_positives / (true_positives + false_positives)
    return precision


def calculate_recall(actual_labels, predicted_labels):
    """
    Calculate the recall of the predictions.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    float: Recall of the predictions.
    """
    true_positives, _, _, false_negatives = calculate_confusion_matrix(actual_labels, predicted_labels)
    recall = true_positives / (true_positives + false_negatives)
    return recall


def calculate_f1_score(actual_labels, predicted_labels):
    """
    Calculate the F1 score of the predictions.

    Parameters:
    actual_labels (list): List of actual labels (-1 for ads, 1 for podcasts).
    predicted_labels (list): List of predicted labels (-1 for ads, 1 for podcasts).

    Returns:
    float: F1 score of the predictions.
    """
    precision = calculate_precision(actual_labels, predicted_labels)
    recall = calculate_recall(actual_labels, predicted_labels)
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

In [75]:
# Assuming you have actual labels stored in 'true_test_label' and predicted
# labels stored in 'anomaly_predictions_test'

accuracy = calculate_accuracy(test_lables, test_predicted_labels)
precision = calculate_precision(test_lables, test_predicted_labels)
recall = calculate_recall(test_lables, test_predicted_labels)
f1_score = calculate_f1_score(test_lables, test_predicted_labels)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Accuracy: 0.85
Precision: 0.8705318221447254
Recall: 0.9708313077297035
F1 Score: 0.9179498965754999
