# ***Establishing Connection to Google Drive***

To initiate the project, the primary step entails establishing a seamless connection to Google Drive. This connection is pivotal for accessing and utilizing the requisite files and datasets essential for the project's execution.

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive; the embedding
# files loaded later in this notebook are read from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# ***Importing Essential Libraries***

Following the establishment of the Google Drive connection, the subsequent step involves importing the essential libraries necessary for executing the code. These libraries serve as the foundational framework, providing the functionality and tools required to implement various tasks and analyses within the project.

In [2]:
import os
import librosa
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

from sklearn.model_selection import train_test_split

# ***Loading Data for Processing and Testing***

In this pivotal stage, we load all pertinent data into the project environment for comprehensive processing and testing. By importing the datasets integral to our analysis, we ensure a robust foundation for conducting experiments and evaluations crucial to the project's objectives.

In [3]:

# Folder on the mounted Drive that holds one sub-folder per embedding set.
EMBEDDINGS_ROOT = "/content/drive/MyDrive/AD-Blocker Project/embeddings"

# Embedding set to load. Change this single value to switch experiments.
# Sets this notebook has been pointed at:
#   "vggish_1sec", "vggish_3sec", "vggish_5sec", "OpenL3_1sec", "OpenL3_5sec"
EMBEDDING_SET = "vggish_3sec"

# Cap on the number of podcast training rows, keyed by embedding set.
# Only the vggish_1sec run truncated its podcast training data (first 2574 rows);
# sets without an entry are loaded in full.
POD_TRAIN_ROW_LIMITS = {"vggish_1sec": 2574}


def load_embeddings(source, split, embedding_set=None):
    """
    Load one saved embedding array from Drive.

    Parameters:
    ----------
    - source (str): "pod" for the podcast files or "ad" for the ad files.
    - split (str): "train", "val" or "test".
    - embedding_set (str, optional): Name of the embedding set. Defaults to EMBEDDING_SET.

    Returns:
    -------
    - (np.ndarray): Contents of
      <EMBEDDINGS_ROOT>/<set>/<set>_<source>_<split>_embeddings.npy.
    """
    if embedding_set is None:
        embedding_set = EMBEDDING_SET
    file_name = "{}_{}_{}_embeddings.npy".format(embedding_set, source, split)
    return np.load(os.path.join(EMBEDDINGS_ROOT, embedding_set, file_name))


# Slicing with [:None] keeps every row, so sets without a configured cap are untouched.
pod_train_embeddings = load_embeddings("pod", "train")[:POD_TRAIN_ROW_LIMITS.get(EMBEDDING_SET)]
ad_train_embeddings = load_embeddings("ad", "train")

pod_val_embeddings = load_embeddings("pod", "val")
ad_val_embeddings = load_embeddings("ad", "val")

pod_test_embeddings = load_embeddings("pod", "test")
ad_test_embeddings = load_embeddings("ad", "test")


In [4]:
def convert_to_2d_array(three_d_array):
    """
    Flatten every 2D slice of a 3D array into a single row.

    Parameters:
    ----------
    - three_d_array (array-like): Input of shape (depth, rows, cols).

    Returns:
    -------
    - two_d_array (np.ndarray): New array of shape (depth, rows * cols). Row i is
      three_d_array[i] flattened in row-major order. The result does not share
      memory with the input. An input with depth 0 yields an empty
      (0, rows * cols) array.

    Raises:
    ------
    - ValueError: If the input does not have exactly three dimensions.
    """
    stacked = np.asarray(three_d_array)
    if stacked.ndim != 3:
        raise ValueError(
            "expected a 3D array of shape (depth, rows, cols), got {} dimension(s)".format(stacked.ndim)
        )

    depth, rows, cols = stacked.shape

    # Spell out rows * cols rather than -1: NumPy cannot infer the -1 dimension
    # of a zero-size array, and an empty split should still flatten cleanly.
    # The copy keeps the result independent of the caller's array.
    two_d_array = stacked.reshape(depth, rows * cols).copy()

    return two_d_array

In [5]:
# Flatten every loaded embedding array to 2D (one row per sample).
# NOTE: the results are rebound to the original names, so this cell expects the
# freshly loaded 3D arrays; re-running it on already flattened arrays fails.
(
    pod_train_embeddings,
    ad_train_embeddings,
    pod_val_embeddings,
    ad_val_embeddings,
    pod_test_embeddings,
    ad_test_embeddings,
) = [
    convert_to_2d_array(embeddings)
    for embeddings in (
        pod_train_embeddings,
        ad_train_embeddings,
        pod_val_embeddings,
        ad_val_embeddings,
        pod_test_embeddings,
        ad_test_embeddings,
    )
]

In [6]:
# Sanity check: one shape per line, podcast then ad, for train / val / test.
print(
    *(
        embeddings.shape
        for embeddings in (
            pod_train_embeddings,
            ad_train_embeddings,
            pod_val_embeddings,
            ad_val_embeddings,
            pod_test_embeddings,
            ad_test_embeddings,
        )
    ),
    sep="\n",
)

(5260, 384)
(852, 384)
(795, 384)
(101, 384)
(796, 384)
(103, 384)


In [None]:
# pod_train_embeddings = pod_train_embeddings[:240, :]

In [7]:
# Build one feature matrix per split: podcast rows first, ad rows after them.
# The label arrays built further down rely on this ordering.
train_embeddings = np.concatenate([pod_train_embeddings, ad_train_embeddings], axis=0)
val_embeddings = np.concatenate([pod_val_embeddings, ad_val_embeddings], axis=0)
test_embeddings = np.concatenate([pod_test_embeddings, ad_test_embeddings], axis=0)

In [8]:
def concatenate_zeros_and_ones(podcast_length, commercials_length):
    """
    Build a label vector: `podcast_length` zeros followed by `commercials_length` ones.

    Parameters:
    ----------
    - podcast_length (int): Number of leading 0 labels (podcast samples).
    - commercials_length (int): Number of trailing 1 labels (commercial samples).

    Returns:
    -------
    - (np.ndarray): 1D integer array of length podcast_length + commercials_length.
    """
    # enumerate pairs each segment length with its label: 0 for podcasts, 1 for commercials.
    segments = [
        np.full(segment_length, label, dtype=int)
        for label, segment_length in enumerate((podcast_length, commercials_length))
    ]
    return np.concatenate(segments)

In [9]:
# Labels follow the row order of the concatenated feature matrices:
# 0 for every podcast row, then 1 for every ad row.
train_labels = concatenate_zeros_and_ones(len(pod_train_embeddings), len(ad_train_embeddings))
val_labels = concatenate_zeros_and_ones(len(pod_val_embeddings), len(ad_val_embeddings))
test_labels = concatenate_zeros_and_ones(len(pod_test_embeddings), len(ad_test_embeddings))

# ***Training Random Forest Model***

In this phase, we train our Random Forest model on the data allocated to the training set. We first define a function that builds and fits the model with the chosen hyperparameters, and then call it to obtain the trained classifier used in the subsequent stages of our analysis.

In [40]:
def train_random_forest_classifier(train_embeddings, train_labels, n_estimators=12, random_state=4, **forest_params):
    """
    Train a Random Forest Classifier on the given training data.

    Parameters:
    ----------
    - train_embeddings (array-like): Feature vectors or embeddings of the training data.
    - train_labels (array-like): Labels corresponding to the training data.
    - n_estimators (int, optional): Number of trees in the forest. Default is 12.
    - random_state (int, optional): Seed for random number generation. Default is 4.
    - **forest_params: Any further keyword arguments are forwarded unchanged to
      RandomForestClassifier (for example class_weight or n_jobs).

    Returns:
    -------
    - clf (RandomForestClassifier): Trained Random Forest Classifier model.
    """

    # Input validation: check_X_y validates X and y together and returns the
    # checked arrays that are used for fitting.
    train_embeddings, train_labels = check_X_y(train_embeddings, train_labels)

    # Initialize Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state, **forest_params)

    # Train the classifier
    clf.fit(train_embeddings, train_labels)

    return clf

In [41]:
# Fit the forest on the combined training split using the function's default hyperparameters.
clf = train_random_forest_classifier(
    train_embeddings=train_embeddings,
    train_labels=train_labels,
)

# ***Model Validation and Adjustment***

In this stage, we evaluate the performance of our model on the validation set. This step lets us examine the model's efficacy and identify any necessary adjustments. By checking the model's performance against the validation data, we iteratively refine its hyperparameters to enhance its accuracy and robustness.

In [42]:
from sklearn.metrics import accuracy_score

def evaluate_classifier(clf, validation_embeddings, validation_labels):
    """
    Score a fitted classifier on the validation split.

    Parameters:
    ----------
    - clf (object): Fitted classifier; its predict method is called on the embeddings.
    - validation_embeddings (array-like): Feature vectors of the validation samples.
    - validation_labels (array-like): True labels of the validation samples.

    Returns:
    -------
    - (float): Accuracy of the predictions against validation_labels.
    """
    # Predict and score in one step; accuracy_score takes (true, predicted).
    return accuracy_score(validation_labels, clf.predict(validation_embeddings))

In [43]:
# Score the fitted forest on the validation split and report the accuracy.
val_accuracy = evaluate_classifier(
    clf,
    validation_embeddings=val_embeddings,
    validation_labels=val_labels,
)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.9274553571428571


# ***Final Model Evaluation with Test Data***

In this culminating phase, we subject the test data to our final model for comprehensive evaluation of its performance on real-world datasets. This step allows us to ascertain the effectiveness and generalizability of our model beyond the training and validation stages. By rigorously scrutinizing the model's performance against unseen data, we derive insights into its real-world applicability and overall efficacy.

In [44]:
def evaluate_test_data(clf, test_embeddings, test_labels):
    """
    Predict the test split with a fitted classifier and score the predictions.

    Parameters:
    - clf (object): Fitted classifier; its predict method is called on the embeddings.
    - test_embeddings (array-like): Feature vectors of the test samples.
    - test_labels (array-like): True labels of the test samples.

    Returns:
    - (tuple): (predicted labels for the test samples, accuracy of those
      predictions against test_labels).
    """
    predicted = clf.predict(test_embeddings)

    # The predictions are returned alongside the accuracy so callers can
    # compute further metrics from them.
    return predicted, accuracy_score(test_labels, predicted)


In [45]:
# Predict the test split once; the predictions are reused below for the confusion matrix.
test_predictions, test_accuracy = evaluate_test_data(
    clf,
    test_embeddings=test_embeddings,
    test_labels=test_labels,
)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9332591768631813


# ***Presentation of Model Evaluation Metrics***

In this critical juncture, we showcase a comprehensive array of evaluation metrics meticulously designed to gauge the efficacy and performance of our model. Through the presentation of these metrics, including but not limited to accuracy, precision, recall, and F1-score, we offer a nuanced understanding of the model's strengths and limitations. This holistic evaluation serves to validate the model's efficacy and aids in informing future iterations or enhancements.

In [46]:
def calculate_confusion_matrix(actual_labels, predicted_labels):
    """
    Count the binary confusion-matrix cells from actual and predicted labels.

    Label 1 is the positive class and label 0 the negative class (in this
    notebook: 1 for ads, 0 for podcasts). Pairs containing any other value are
    not counted. If the two sequences differ in length, the surplus items of
    the longer one are ignored.

    Parameters:
    actual_labels (iterable): Actual labels (0 for podcasts, 1 for ads).
    predicted_labels (iterable): Predicted labels (0 for podcasts, 1 for ads).

    Returns:
    tuple: A tuple containing true positives, false positives, true negatives, and false negatives.
    """
    # Imported locally so the function does not rely on a notebook-level import.
    from collections import Counter

    # Tally every (actual, predicted) pair once; a missing pair counts as 0.
    pair_counts = Counter(zip(actual_labels, predicted_labels))

    true_positives = pair_counts[(1, 1)]
    false_positives = pair_counts[(0, 1)]
    true_negatives = pair_counts[(0, 0)]
    false_negatives = pair_counts[(1, 0)]

    return true_positives, false_positives, true_negatives, false_negatives

In [47]:
# Count the confusion-matrix cells on the test split (label 1 is the positive class).
true_positives, false_positives, true_negatives, false_negatives = calculate_confusion_matrix(test_labels, test_predictions)

# Print the results
print("True Positives:", true_positives)
print("False Positives:", false_positives)
print("True Negatives:", true_negatives)
print("False Negatives:", false_negatives)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Calculate evaluation values. A metric whose denominator is zero (for example
# precision when the model predicts no positives at all) is reported as 0.0
# instead of raising ZeroDivisionError.
precision = _safe_ratio(true_positives, true_positives + false_positives)
recall = _safe_ratio(true_positives, true_positives + false_negatives)
specificity = _safe_ratio(true_negatives, true_negatives + false_positives)
f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

# Print the evaluation values
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", specificity)
print("F1 Score:", f1_score)

True Positives: 62
False Positives: 19
True Negatives: 777
False Negatives: 41
Precision: 0.7654320987654321
Recall: 0.6019417475728155
Specificity: 0.9761306532663316
F1 Score: 0.6739130434782608


In [48]:
import joblib

# Save the trained model to a file in the runtime's current working directory.
# NOTE: the file name hard-codes "vggish_3sec"; keep it in sync with the
# embedding set that was actually loaded and trained on.
# joblib.dump is the last expression of the cell, so its return value (a list
# with the written file name) is shown as the cell output.
joblib_file = "random_forest_model_vggish_3sec.pkl"
joblib.dump(clf, joblib_file)

['random_forest_model_vggish_3sec.pkl']

In [49]:
from google.colab import files

# Download the saved model file (joblib_file) from the Colab runtime through the browser.
files.download(joblib_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>