In [1]:
import os
import librosa
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

from sklearn.model_selection import train_test_split

In [2]:
# Mount Google Drive inside the Colab runtime. The embedding directories read
# by the loading cells below are all located under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def concatenate_numpy_arrays_from_folder(directory):
    """
    Load every ``.npy`` file in a directory and concatenate them along axis 0.

    Files are visited in sorted filename order so that the row order of the
    result is reproducible from run to run; ``os.listdir`` on its own returns
    entries in arbitrary order.

    Args:
        directory (str): The path to the directory containing the numpy array files.

    Returns:
        np.ndarray: The concatenated numpy array.

    Raises:
        ValueError: If the directory contains no ``.npy`` files.
    """
    # Only files with the .npy extension are considered; everything else is skipped.
    npy_filenames = sorted(
        filename for filename in os.listdir(directory) if filename.endswith('.npy')
    )

    # Fail with a message that names the folder instead of numpy's generic
    # "need at least one array to concatenate".
    if not npy_filenames:
        raise ValueError(f'No .npy files found in directory: {directory}')

    arrays_list = [
        np.load(os.path.join(directory, filename)) for filename in npy_filenames
    ]

    # Stack along the first axis, so every file contributes its rows in order.
    concatenated_array = np.concatenate(arrays_list, axis=0)

    # Print the shape of the concatenated array
    print(f'Shape of the concatenated array: {concatenated_array.shape}')

    return concatenated_array

In [4]:
# Embeddings computed on 5-second samples.

# Advertisement split folders
test_ads_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_5sec/grouped_embeddings_ad/test_embeddings_ads'
val_ads_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_5sec/grouped_embeddings_ad/val_embeddings_ads'
train_ads_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_5sec/grouped_embeddings_ad/train_embeddings_ads'

# Podcast split folders
test_pods_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_5sec/grouped_embeddings_podcast/test_embeddings'
val_pods_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_5sec/grouped_embeddings_podcast/val_embeddings'
train_pod_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_5sec/grouped_embeddings_podcast/train_embeddings'

# Load ads first (test, val, train) and then podcasts (test, val, train);
# the loader prints one shape line per call, in this order.
ad_test_embeddings = concatenate_numpy_arrays_from_folder(test_ads_directory)
ad_val_embeddings = concatenate_numpy_arrays_from_folder(val_ads_directory)
ad_train_embeddings = concatenate_numpy_arrays_from_folder(train_ads_directory)

pod_test_embeddings = concatenate_numpy_arrays_from_folder(test_pods_directory)
pod_val_embeddings = concatenate_numpy_arrays_from_folder(val_pods_directory)
pod_train_embeddings = concatenate_numpy_arrays_from_folder(train_pod_directory)

Shape of the concatenated array: (59, 5, 128)
Shape of the concatenated array: (62, 5, 128)
Shape of the concatenated array: (507, 5, 128)
Shape of the concatenated array: (625, 5, 128)
Shape of the concatenated array: (621, 5, 128)
Shape of the concatenated array: (2860, 5, 128)


In [None]:
# Alternative configuration: embeddings computed on 1-second samples.
# The loading code below is commented out; uncomment it (and skip the
# 5-second loading cell) to run the rest of the notebook on the 1-second data.

# test_ads_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_1sec/grouped_embeddings_ad/test_embeddings_ads'
# ad_test_embeddings = concatenate_numpy_arrays_from_folder(test_ads_directory)

# val_ads_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_1sec/grouped_embeddings_ad/val_embeddings_ads'
# ad_val_embeddings = concatenate_numpy_arrays_from_folder(val_ads_directory)

# train_ads_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_1sec/grouped_embeddings_ad/train_embeddings_ads'
# ad_train_embeddings = concatenate_numpy_arrays_from_folder(train_ads_directory)


# test_pods_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_1sec/grouped_embeddings_podcast/test_embeddings_pods'
# pod_test_embeddings = concatenate_numpy_arrays_from_folder(test_pods_directory)

# val_pods_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_1sec/grouped_embeddings_podcast/val_embeddings_pods'
# pod_val_embeddings = concatenate_numpy_arrays_from_folder(val_pods_directory)

# train_pod_directory = '/content/drive/MyDrive/AD-Blocker Project/grouped_embeddings/vggish_1sec/grouped_embeddings_podcast/train_embeddings_pods'
# pod_train_embeddings = concatenate_numpy_arrays_from_folder(train_pod_directory)

# With the loading code above commented out, this prints the shape of whatever
# pod_train_embeddings is already in memory from an earlier cell.
print(pod_train_embeddings.shape)

Shape of the concatenated array: (302, 1, 128)
Shape of the concatenated array: (319, 1, 128)
Shape of the concatenated array: (2597, 1, 128)
Shape of the concatenated array: (668, 1, 128)
Shape of the concatenated array: (1940, 1, 128)
Shape of the concatenated array: (6115, 1, 128)
(6115, 1, 128)


In [5]:
def convert_to_2d_array(three_d_array):
    """
    Flatten each 2-D slice of a 3-D array into one row of a 2-D array.

    An input of shape ``(depth, rows, cols)`` becomes ``(depth, rows * cols)``,
    with every slice laid out in row-major order. An empty first axis is
    supported and yields an array of shape ``(0, rows * cols)``.

    Args:
        three_d_array (array-like): Three-dimensional input.

    Returns:
        np.ndarray: A new 2-D array (never a view of the input).

    Raises:
        ValueError: If the input is not three-dimensional.
    """
    three_d_array = np.asarray(three_d_array)
    if three_d_array.ndim != 3:
        raise ValueError(
            f'Expected a 3-D array, got an array with shape {three_d_array.shape}'
        )

    depth, rows, cols = three_d_array.shape

    # The target width is spelled out (rather than -1) because numpy cannot
    # infer a dimension when the array is empty. The copy keeps the result
    # independent of the input.
    return three_d_array.reshape(depth, rows * cols).copy()

In [6]:
# Flatten every split in place (same variable names), podcast first and ad
# second within each split. The names are rebound to the 2-D result, so
# re-running this cell without reloading the data fails: the flattening helper
# expects a 3-D input.
pod_train_embeddings, ad_train_embeddings = (
    convert_to_2d_array(pod_train_embeddings),
    convert_to_2d_array(ad_train_embeddings),
)
pod_val_embeddings, ad_val_embeddings = (
    convert_to_2d_array(pod_val_embeddings),
    convert_to_2d_array(ad_val_embeddings),
)
pod_test_embeddings, ad_test_embeddings = (
    convert_to_2d_array(pod_test_embeddings),
    convert_to_2d_array(ad_test_embeddings),
)

# Report the flattened shapes: train, val, test, podcast before ad.
for flattened_split in (
    pod_train_embeddings,
    ad_train_embeddings,
    pod_val_embeddings,
    ad_val_embeddings,
    pod_test_embeddings,
    ad_test_embeddings,
):
    print(flattened_split.shape)

(2860, 640)
(507, 640)
(621, 640)
(62, 640)
(625, 640)
(59, 640)


In [7]:
# Build one feature matrix per split: podcast rows first, ad rows after them.
train_embeddings = np.concatenate([pod_train_embeddings, ad_train_embeddings], axis=0)
val_embeddings = np.concatenate([pod_val_embeddings, ad_val_embeddings], axis=0)
test_embeddings = np.concatenate([pod_test_embeddings, ad_test_embeddings], axis=0)

In [8]:
def concatenate_zeros_and_ones(podcast_length, commercials_length):
    """
    Build an integer label vector: zeros for podcasts followed by ones for commercials.

    Args:
        podcast_length: Size of the leading block of zeros.
        commercials_length: Size of the trailing block of ones.

    Returns:
        np.ndarray: Integer array made of the zeros block followed by the ones block.
    """
    label_blocks = [
        np.zeros(podcast_length, dtype=int),    # podcast block
        np.ones(commercials_length, dtype=int),  # commercials block
    ]
    return np.concatenate(label_blocks)

In [9]:
# Label vectors for each split: 0 for every podcast row, 1 for every ad row,
# in the same podcast-then-ad order used to build the *_embeddings matrices.
train_labels = concatenate_zeros_and_ones(pod_train_embeddings.shape[0], ad_train_embeddings.shape[0])
val_labels = concatenate_zeros_and_ones(pod_val_embeddings.shape[0], ad_val_embeddings.shape[0])
test_labels = concatenate_zeros_and_ones(pod_test_embeddings.shape[0], ad_test_embeddings.shape[0])

# Backward-compatible alias: later cells refer to the misspelled name.
test_lables = test_labels

In [10]:
def train_random_forest_classifier(train_embeddings, train_labels, n_estimators=8, random_state=2, **forest_kwargs):
    """
    Train a Random Forest Classifier on the given training data.

    Parameters:
    ----------
    - train_embeddings (array-like): Feature vectors or embeddings of the training data.
    - train_labels (array-like): Labels corresponding to the training data.
    - n_estimators (int, optional): Number of trees in the forest. Default is 8.
    - random_state (int, optional): Seed for random number generation. Default is 2.
    - **forest_kwargs: Any additional keyword arguments are forwarded unchanged to
      ``RandomForestClassifier`` (for example ``class_weight="balanced"``).

    Returns:
    -------
    - clf (RandomForestClassifier): Trained Random Forest Classifier model.
    """

    # Input validation (scikit-learn's check_X_y) before the forest is built
    train_embeddings, train_labels = check_X_y(train_embeddings, train_labels)

    # Initialize Random Forest Classifier
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        **forest_kwargs,
    )

    # Train the classifier
    clf.fit(train_embeddings, train_labels)

    return clf

In [11]:
clf = train_random_forest_classifier(train_embeddings, train_labels)

In [12]:
from sklearn.metrics import accuracy_score

def evaluate_classifier(clf, validation_embeddings, validation_labels):
    """
    Score a trained classifier on a validation split.

    Parameters:
    ----------
    - clf (object): Trained classifier; its ``predict`` method is called on the embeddings.
    - validation_embeddings (array-like): Feature vectors or embeddings of the validation data.
    - validation_labels (array-like): Labels corresponding to the validation data.

    Returns:
    -------
    - val_accuracy (float): ``accuracy_score`` of the predictions against ``validation_labels``.
    """
    # Predict first, then compare the predictions with the reference labels.
    return accuracy_score(validation_labels, clf.predict(validation_embeddings))

In [13]:
# Validation accuracy of the classifier trained above.
val_accuracy = evaluate_classifier(
    clf,
    validation_embeddings=val_embeddings,
    validation_labels=val_labels,
)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.9385065885797951


In [14]:
def evaluate_test_data(clf, test_embeddings, test_labels):
    """
    Predict on the test split and score the predictions.

    Parameters:
    - clf (object): Trained classifier; its ``predict`` method is called on the embeddings.
    - test_embeddings (array-like): Feature vectors or embeddings of the test data.
    - test_labels (array-like): Labels corresponding to the test data.

    Returns:
    - test_predictions (array-like): Predicted labels for the test data.
    - test_accuracy (float): ``accuracy_score`` of the predictions against ``test_labels``.
    """
    predicted = clf.predict(test_embeddings)

    # The predictions are returned alongside the score so callers can build
    # further metrics from them.
    return predicted, accuracy_score(test_labels, predicted)


In [15]:
# Test-split predictions and accuracy of the classifier trained above.
# (test_lables is the label vector's name as defined in the labelling cell.)
test_predictions, test_accuracy = evaluate_test_data(
    clf,
    test_embeddings=test_embeddings,
    test_labels=test_lables,
)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9312865497076024


In [16]:
def calculate_confusion_matrix(actual_labels, predicted_labels):
    """
    Count the four confusion-matrix cells for binary 0/1 labels.

    Label 1 is the positive class and label 0 is the negative class. In this
    notebook the label vectors are built with 0 for podcasts and 1 for ads.
    Pairs in which either value is neither 0 nor 1 are not counted in any cell.
    The two inputs are paired element by element; if they differ in length the
    surplus elements of the longer one are ignored.

    Parameters:
    actual_labels (iterable): Ground-truth labels (0 or 1).
    predicted_labels (iterable): Predicted labels (0 or 1).

    Returns:
    tuple: A tuple containing true positives, false positives, true negatives, and false negatives.
    """
    # Materialize the pairs once so they can be scanned for each cell.
    pairs = list(zip(actual_labels, predicted_labels))

    def count_pairs(actual_value, predicted_value):
        """Number of pairs equal to (actual_value, predicted_value)."""
        return sum(
            1
            for actual, predicted in pairs
            if actual == actual_value and predicted == predicted_value
        )

    true_positives = count_pairs(1, 1)
    false_positives = count_pairs(0, 1)
    true_negatives = count_pairs(0, 0)
    false_negatives = count_pairs(1, 0)

    return true_positives, false_positives, true_negatives, false_negatives

In [17]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Call the function by passing the actual labels and predicted labels
true_positives, false_positives, true_negatives, false_negatives = calculate_confusion_matrix(test_lables, test_predictions)

# Print the results
print("True Positives:", true_positives)
print("False Positives:", false_positives)
print("True Negatives:", true_negatives)
print("False Negatives:", false_negatives)

# Calculate evaluation values. Each ratio falls back to 0.0 when its
# denominator is zero (e.g. the classifier predicted no positives at all),
# instead of stopping the notebook with a ZeroDivisionError.
precision = _safe_ratio(true_positives, true_positives + false_positives)
recall = _safe_ratio(true_positives, true_positives + false_negatives)
specificity = _safe_ratio(true_negatives, true_negatives + false_positives)
f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

# Print the evaluation values
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", specificity)
print("F1 Score:", f1_score)

True Positives: 21
False Positives: 9
True Negatives: 616
False Negatives: 38
Precision: 0.7
Recall: 0.3559322033898305
Specificity: 0.9856
F1 Score: 0.4719101123595506


In [None]:
# Second experiment: the same pipeline with a larger forest and a different seed.
#
# The helper functions defined in the cells above (train_random_forest_classifier,
# evaluate_classifier, evaluate_test_data, calculate_confusion_matrix) are
# reused here instead of being redefined. The hyperparameters are passed
# explicitly, so this cell no longer changes the defaults seen by other cells.
N_ESTIMATORS_EXPERIMENT = 12
RANDOM_STATE_EXPERIMENT = 6

clf = train_random_forest_classifier(
    train_embeddings,
    train_labels,
    n_estimators=N_ESTIMATORS_EXPERIMENT,
    random_state=RANDOM_STATE_EXPERIMENT,
)

# Validation accuracy
val_accuracy = evaluate_classifier(clf, val_embeddings, val_labels)
print("Validation Accuracy:", val_accuracy)

# Test predictions and accuracy
test_predictions, test_accuracy = evaluate_test_data(clf, test_embeddings, test_lables)
print("Test Accuracy:", test_accuracy)

# Confusion-matrix counts; only the false-positive count is reported for this run.
true_positives, false_positives, true_negatives, false_negatives = calculate_confusion_matrix(test_lables, test_predictions)
print("False Positives:", false_positives)





Validation Accuracy: 0.8654271801682161
Test Accuracy: 0.8288659793814434
False Positives: 56
