In [None]:
import os
from kaggle_secrets import UserSecretsClient

# Copy the "WANDB_API_KEY" Kaggle secret into the process environment.
# NOTE(review): presumably this lets the `wandb login` cell below run non-interactively — confirm.
os.environ['WANDB_API_KEY'] =  UserSecretsClient().get_secret("WANDB_API_KEY")

In [None]:
# Install dependencies
!pip install wandb scikit-multilearn-ng scikit-learn

# Login to wandb
!wandb login

In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
import gzip
from skmultilearn.adapt import MLkNN
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
import wandb

In [None]:
# Setup WANDB
# The resulting `run` object is the global that the metric helpers below write to.
run = wandb.init(project='pms', job_type='zip-test', name='zip-10', save_code=True, config={})

In [None]:
from sklearn.metrics import accuracy_score


# Per-instance metrics: the micro_* functions below score each sample separately and average the per-sample scores

def exact_match_ratio(y_true, y_pred, epoch):
    """Print and record the exact match ratio for one epoch.

    The score is the value returned by ``accuracy_score(y_true, y_pred)``.
    It is stored in the W&B run summary under
    ``exact_match_ratio_epoch_{epoch}``.
    """
    score = accuracy_score(y_true, y_pred)
    summary_key = f"exact_match_ratio_epoch_{epoch}"

    print(f"Epoch {epoch} - Exact Match Ratio: {score}")
    run.summary[summary_key] = score


def micro_accuracy(y_true, y_pred, epoch):
    """Print and record the instance-averaged accuracy for one epoch.

    For every sample the score is |Y ∩ Z| / |Y ∪ Z| (true labels Y, predicted
    labels Z); the per-sample scores are then averaged. A sample with neither
    true nor predicted labels has an empty union; it scores 0 instead of
    turning the whole average into NaN.

    Args:
        y_true: 2-D binary indicator array, one row per sample.
        y_pred: 2-D binary indicator array with the same shape as ``y_true``.
        epoch: Value used in the printed message and the summary key.

    The result is stored under ``overall_accuracy_epoch_{epoch}``.
    """
    intersection_sizes = np.logical_and(y_true, y_pred).sum(axis=1)  # |Y ∩ Z| per sample
    union_sizes = np.logical_or(y_true, y_pred).sum(axis=1)  # |Y ∪ Z| per sample

    with np.errstate(divide='ignore', invalid='ignore'):  # 0/0 is handled just below
        instance_acc = np.true_divide(intersection_sizes, union_sizes)
    # Empty union: no true and no predicted labels for that sample.
    instance_acc = np.where(union_sizes == 0, 0.0, instance_acc)

    average_accuracy = np.average(instance_acc)
    print(f"Epoch {epoch} - Overall Accuracy: {average_accuracy}")
    run.summary[f"overall_accuracy_epoch_{epoch}"] = average_accuracy


def micro_precision(y_true, y_pred, epoch):
    """Print and record the instance-averaged precision for one epoch.

    Per sample the score is |Y ∩ Z| / |Z| (true labels Y, predicted labels Z).
    Samples with no predicted labels score 0. The mean over samples is stored
    under ``overall_precision_epoch_{epoch}``.
    """
    hits = np.logical_and(y_true, y_pred).sum(axis=1)  # |Y ∩ Z|
    predicted_counts = y_pred.sum(axis=1)  # |Z|

    with np.errstate(divide='ignore', invalid='ignore'):  # silence x/0 warnings
        raw_ratio = hits / predicted_counts
    # Precision is defined as 0 for samples where nothing was predicted.
    per_instance = np.where(predicted_counts == 0, 0, raw_ratio)

    mean_precision = np.average(per_instance)
    print(f"Epoch {epoch} - Overall Precision: {mean_precision}")
    run.summary[f"overall_precision_epoch_{epoch}"] = mean_precision


def micro_recall(y_true, y_pred, epoch):
    """Print and record the instance-averaged recall for one epoch.

    Per sample the score is |Y ∩ Z| / |Y| (true labels Y, predicted labels Z).
    Samples with no true labels score 0. The mean over samples is stored
    under ``overall_recall_epoch_{epoch}``.
    """
    hits = np.logical_and(y_true, y_pred).sum(axis=1)  # |Y ∩ Z|
    actual_counts = y_true.sum(axis=1)  # |Y|

    with np.errstate(divide='ignore', invalid='ignore'):  # silence x/0 warnings
        raw_ratio = hits / actual_counts
    # Recall is defined as 0 for samples that have no true labels.
    per_instance = np.where(actual_counts == 0, 0, raw_ratio)

    mean_recall = np.average(per_instance)
    print(f"Epoch {epoch} - Overall Recall: {mean_recall}")
    run.summary[f"overall_recall_epoch_{epoch}"] = mean_recall


def micro_f1_score(y_true, y_pred, epoch):
    """Print and record the instance-averaged F1 score for one epoch.

    Per sample the score is 2·|Y ∩ Z| / (|Y| + |Z|). NaN results (0/0, i.e.
    neither true nor predicted labels) are replaced by 0. The mean over
    samples is stored under ``overall_f1_score_epoch_{epoch}``.
    """
    overlap = np.logical_and(y_true, y_pred).sum(axis=1)
    actual_counts = y_true.sum(axis=1)
    predicted_counts = y_pred.sum(axis=1)

    with np.errstate(divide='ignore', invalid='ignore'):
        raw_f1 = 2 * overlap / (actual_counts + predicted_counts)
    # NaN marks the 0/0 case; count it as an F1 of 0.
    per_instance = np.where(np.isnan(raw_f1), 0, raw_f1)

    mean_f1 = np.average(per_instance)

    print(f"Epoch {epoch} - Overall F1 Score: {mean_f1}")
    run.summary[f"overall_f1_score_epoch_{epoch}"] = mean_f1


# Hamming loss shows error, lower is better
def instance_hamming_loss(y_true, y_pred, epoch):
    """Print and record the Hamming loss for one epoch.

    The loss is the number of label slots where ``y_true`` and ``y_pred``
    disagree divided by the total number of slots in ``y_true``. It is stored
    under ``hamming_loss_epoch_{epoch}``.
    """
    mismatch_mask = np.logical_xor(y_true, y_pred)  # True wherever the prediction is wrong

    wrong_count = np.sum(mismatch_mask)
    print(f"Total incorrect predictions: {wrong_count}")

    slot_count = np.size(y_true)
    print(f"Total labels: {slot_count}")

    # np.average of the scalar ratio returns the same value.
    loss = np.average(wrong_count / slot_count)

    print(f"Epoch {epoch} - Hamming Loss: {loss}")
    run.summary[f"hamming_loss_epoch_{epoch}"] = loss


# Macro averaged metrics, or per label metrics

def log_per_label_metric(metric_values, metric_name, genres, epoch):
    """Print and record one metric value per genre, then log them as a table.

    ``metric_values[i]`` is taken as the value for ``genres[i]``. Each value
    is written to the run summary under
    ``per_label_{metric_name}_{genre}_epoch_{epoch}``; the full set is then
    passed on to ``log_per_label_metric_table``.
    """
    for position, genre_name in enumerate(genres):
        value = metric_values[position]
        summary_key = f"per_label_{metric_name}_{genre_name}_epoch_{epoch}"
        print(f"Epoch {epoch} - Per Label {metric_name} {genre_name}: {value:.4f}")
        run.summary[summary_key] = value

    log_per_label_metric_table(metric_values, metric_name, genres, epoch)


def log_per_label_metric_table(metric_values, metric_name, genres, epoch):
    """Log per-genre metric values to the run as a ``wandb.Table``.

    The table has one row per genre with the columns "Epoch", "Metric Name",
    "Genre" and "Metric Value", and is logged under
    ``per_label_{metric_name}_epoch_{epoch}``.
    """
    table_rows = [
        [epoch, metric_name, genre_name, metric_values[position]]
        for position, genre_name in enumerate(genres)
    ]
    column_names = ["Epoch", "Metric Name", "Genre", "Metric Value"]

    table = wandb.Table(data=table_rows, columns=column_names)
    log_key = f"per_label_{metric_name}_epoch_{epoch}"

    run.log({log_key: table})


def log_per_label_average(metric_values, metric_name, epoch):
    """Print and record the mean of a list of per-label metric values.

    The mean is written to the run summary twice: under
    ``per_label_average_{metric_name}_epoch_{epoch}`` and under the
    epoch-independent key ``per_label_average_{metric_name}``.
    """
    mean_value = np.average(metric_values)
    print(f"Per Label Average {metric_name} epoch {epoch}: {mean_value:.4f}")

    summary_keys = (
        f"per_label_average_{metric_name}_epoch_{epoch}",
        f"per_label_average_{metric_name}",
    )
    for summary_key in summary_keys:
        run.summary[summary_key] = mean_value


def label_based_accuracy(y_true, y_pred, epoch):
    """Compute and log the accuracy of every label column.

    For each column the accuracy is the share of rows where ``y_true`` and
    ``y_pred`` agree (0 when there are no rows). The values are logged per
    genre (names from the global ``INCLUDED_GENRES``) and as an average.
    """
    sample_count = y_true.shape[0]
    accuracies = []

    for column in range(y_true.shape[1]):
        if sample_count > 0:
            agreeing = np.sum(y_true[:, column] == y_pred[:, column])
            accuracies.append(agreeing / sample_count)
        else:
            accuracies.append(0)

    log_per_label_metric(accuracies, 'Accuracy', INCLUDED_GENRES, epoch)
    log_per_label_average(accuracies, 'Accuracy', epoch)


def precision_per_label(y_true, y_pred, epoch):
    """Compute and log the precision of every label column.

    Per column: TP / (TP + FP), or 0 when the label was never predicted.
    The values are logged per genre (names from the global
    ``INCLUDED_GENRES``) and as an average.
    """
    precisions = []
    for column in range(y_true.shape[1]):
        predicted_positive = y_pred[:, column] == 1
        tp = np.sum(predicted_positive & (y_true[:, column] == 1))
        fp = np.sum(predicted_positive & (y_true[:, column] == 0))

        predicted_total = tp + fp
        if predicted_total > 0:
            precisions.append(tp / predicted_total)
        else:
            precisions.append(0)

    log_per_label_metric(precisions, 'Precision', INCLUDED_GENRES, epoch)
    log_per_label_average(precisions, 'Precision', epoch)


def recall_per_label(y_true, y_pred, epoch):
    """Compute and log the recall of every label column.

    Per column: TP / (TP + FN), or 0 when the label never occurs in
    ``y_true``. The values are logged per genre (names from the global
    ``INCLUDED_GENRES``) and as an average.
    """
    recalls = []
    for column in range(y_true.shape[1]):
        actually_positive = y_true[:, column] == 1
        tp = np.sum((y_pred[:, column] == 1) & actually_positive)
        fn = np.sum((y_pred[:, column] == 0) & actually_positive)

        actual_total = tp + fn
        if actual_total > 0:
            recalls.append(tp / actual_total)
        else:
            recalls.append(0)

    log_per_label_metric(recalls, 'Recall', INCLUDED_GENRES, epoch)
    log_per_label_average(recalls, 'Recall', epoch)


def f1_score_per_label(y_true, y_pred, epoch):
    """Compute and log the F1 score of every label column.

    Per column, precision and recall are computed from TP/FP/FN (each 0 when
    its denominator is 0) and combined as 2·P·R / (P + R), or 0 when P + R is
    0. The values are logged per genre (names from the global
    ``INCLUDED_GENRES``) and as an average.
    """
    f1_values = []
    for column in range(y_true.shape[1]):
        predicted_positive = y_pred[:, column] == 1
        actually_positive = y_true[:, column] == 1

        tp = np.sum(predicted_positive & actually_positive)
        fp = np.sum(predicted_positive & (y_true[:, column] == 0))
        fn = np.sum((y_pred[:, column] == 0) & actually_positive)

        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0

        if (precision + recall) > 0:
            f1_values.append(2 * (precision * recall) / (precision + recall))
        else:
            f1_values.append(0)

    log_per_label_metric(f1_values, 'F1', INCLUDED_GENRES, epoch)
    log_per_label_average(f1_values, 'F1', epoch)


# Calculate and log all metrics

def calculate_and_log_metrics(true_values, predicted_values, _epoch):
    """Run every metric helper on one set of predictions.

    Each helper is called with ``(true_values, predicted_values, _epoch)``,
    in this order: exact match ratio, the per-instance metrics and Hamming
    loss, then the per-label metrics.
    """
    metric_args = (true_values, predicted_values, _epoch)

    # Exact match ratio
    exact_match_ratio(*metric_args)

    # Per-instance metrics and Hamming loss
    micro_accuracy(*metric_args)
    micro_precision(*metric_args)
    micro_recall(*metric_args)
    micro_f1_score(*metric_args)
    instance_hamming_loss(*metric_args)

    # Per-label metrics
    label_based_accuracy(*metric_args)
    precision_per_label(*metric_args)
    recall_per_label(*metric_args)
    f1_score_per_label(*metric_args)


In [None]:

file_path = '/kaggle/input/movie-dataset-filtered/movies_metadata_filtered.csv'

data = pd.read_csv(file_path, low_memory=False)

# `transformed_genres` is stored as text; parse each cell back into a Python list.
data['genres'] = data['transformed_genres'].apply(ast.literal_eval)

# First split: 80% train, 10% val, 10% test. `test` is held out and not used below.
train, temp = train_test_split(data, test_size=0.2, random_state=23)
val, test = train_test_split(temp, test_size=0.5, random_state=23)

# Everything except the held-out test rows is pooled again and re-split after filtering.
data = pd.concat([train, val])

# Keep only the genres of interest and drop movies left without any of them.
GENRES_2 = [
    'Comedy', 'Drama', 'Documentary', 'Romance', 'Horror', 'Action',
    'Thriller', 'Family', 'Adventure', 'Crime', 'Science Fiction',
]
data['genres'] = data['genres'].apply(lambda x: [genre for genre in x if genre in GENRES_2])
data = data[data['genres'].apply(len) > 0]
train, val = train_test_split(data, test_size=0.2, random_state=42)

# A later cell adds a column to `train`; explicit copies keep that assignment
# from operating on a slice of `data` (pandas may warn about that).
train, val = train.copy(), val.copy()

In [None]:
# Genre names used by the per-label metric helpers; this is the same list object as GENRES_2.
INCLUDED_GENRES = GENRES_2

In [None]:
# Cache the gzip-compressed byte length of every training overview so it is not
# recomputed for each comparison in get_multilabel_data.
train['overview_compressed'] = train['overview'].map(lambda text: len(gzip.compress(text.encode())))

In [None]:
def get_multilabel_data(overview):
    """Compute the gzip-based distance from ``overview`` to every training overview.

    Reads the global ``train`` frame and uses its ``overview``,
    ``overview_compressed`` (cached compressed length) and ``genres`` columns.
    The distance is the normalized compression distance

        (C(x1 + " " + x2) - min(C(x1), C(x2))) / max(C(x1), C(x2))

    where C is the length of the gzip-compressed UTF-8 bytes.

    Args:
        overview: Text of the movie to compare against the training set.

    Returns:
        Three parallel lists in ``train`` row order: the distances, the
        ``train`` index labels, and the genre lists of the training rows.
    """
    x1 = overview
    cx1 = len(gzip.compress(x1.encode()))

    distance_from_x1 = []
    training_set_idx = []
    genres_idx = []

    # Walk the needed columns directly; iterrows() would build a Series for
    # every training row on every call without changing the result.
    training_rows = zip(
        train.index, train['overview'], train['overview_compressed'], train['genres']
    )
    for train_index, x2, cx2, train_genres in training_rows:
        x1x2 = " ".join([x1, x2])
        cx1x2 = len(gzip.compress(x1x2.encode()))
        ncd = (cx1x2 - min(cx1, cx2)) / max(cx1, cx2)

        distance_from_x1.append(ncd)
        training_set_idx.append(train_index)
        genres_idx.append(train_genres)

    return distance_from_x1, training_set_idx, genres_idx

In [None]:
import time

In [None]:
def balanced_genre_sampling(df, first_target, second_target):
    """Select a genre-balanced subset of ``df``.

    First pass: genres are visited from the least to the most frequent. A row
    with more than one genre is selected when every one of its genres still
    has fewer than ``first_target`` selected rows. Each row is selected and
    counted at most once, even though it shows up under each of its genres.

    Second pass: for every genre with fewer than ``second_target`` first-pass
    rows, up to the missing number of rows whose genre list is exactly that
    one genre are drawn with ``random_state=42``.

    Args:
        df: DataFrame with a ``genres`` column holding a list of genre names per row.
        first_target: Per-genre cap used in the first pass.
        second_target: Per-genre minimum that the second pass tries to reach.

    Returns:
        ``df.loc[...]`` for the selected index labels, without duplicates,
        first-pass rows first.
    """
    genre_counts = df.explode('genres')['genres'].value_counts().sort_values()
    print(genre_counts)
    genre_to_indices = {genre: [] for genre in genre_counts.keys()}

    sampled_indices = []
    already_sampled = set()  # O(1) membership test for rows taken in the first pass

    for genre in genre_counts.index:
        filtered_df = df[df['genres'].apply(lambda genres: genre in genres)]
        print("Genre: ", genre)
        for index, genres in filtered_df['genres'].items():
            if len(genres) <= 1:
                continue
            # A multi-genre row is reached once per genre it carries; counting
            # it again would fill the quotas with copies of the same row.
            if index in already_sampled:
                continue

            if all(len(genre_to_indices[g]) < first_target for g in genres):
                for g in genres:
                    if len(genre_to_indices[g]) < first_target:
                        genre_to_indices[g].append(index)
                already_sampled.add(index)
                sampled_indices.append(index)

    second_pass_samples = []
    for genre, indices in genre_to_indices.items():
        if len(indices) < second_target:
            is_only_this_genre = df['genres'].apply(lambda row_genres: row_genres == [genre])
            single_genre_df = df[is_only_this_genre]
            additional_samples = single_genre_df.sample(
                min(second_target - len(indices), len(single_genre_df)), random_state=42
            )
            second_pass_samples.extend(additional_samples.index.tolist())

    # Combine and remove duplicates, keeping first-seen order so the result
    # order does not depend on set iteration order.
    final_indices = list(dict.fromkeys(sampled_indices + second_pass_samples))
    final_sampled_df = df.loc[final_indices]

    return final_sampled_df


In [None]:
# Draw the evaluation subset from the validation split (first_target=35, second_target=30).
sampled_df = balanced_genre_sampling(val, 35, 30)

In [None]:
# Display the sampled rows.
sampled_df

In [None]:
# Number of sampled rows per genre (a row with several genres counts once for each).
sampled_df.explode('genres')['genres'].value_counts()

In [None]:
val_predictions = []
val_predictions_binary = []
counter = 0
start_time = time.time()

# Every use has the same fixed class list, so one binarizer serves them all.
mlb = MultiLabelBinarizer(classes=GENRES_2)

for idx, row in sampled_df.iterrows():
    # calculate distances between current movie and all movies in training set
    distances, rows, genres = get_multilabel_data(row.overview)

    # prepare data: the only feature of a training movie is its distance to the current movie
    y_train_binarized = mlb.fit_transform(genres)
    x_train = np.array(distances).reshape(-1, 1)

    # use scikit multilearn
    mlknn = MLkNN(k=10)
    mlknn.fit(x_train, y_train_binarized)

    # The current movie sits at distance 0 from itself, so query the model at 0.
    X_test = np.array([[0]])
    y_pred = mlknn.predict(X_test)

    predicted_genres = mlb.inverse_transform(y_pred)

    # .toarray() densifies the prediction; [0] takes the single predicted row
    y_pred = y_pred.toarray()[0]

    val_predictions.append(predicted_genres)
    val_predictions_binary.append(y_pred)

    counter += 1

    if counter == 1:
        process_time = time.time() - start_time
        print(f"Time taken for 1 row: {process_time}")

    if counter % 10 == 0:
        print("processed 10th row")
    if counter == 100:
        print("processed 100th row")
    if counter % 1000 == 0:
        # Stops after 1000 rows; any remaining rows are not predicted.
        print(f"Processed {counter} rows")
        break

# Binarize the ground truth of the rows that were actually predicted. The
# predictions follow sampled_df row order, so the labels must come from
# sampled_df (not from the full `val` frame) for row i of both arrays to
# describe the same movie.
val_genres_binarized = mlb.fit_transform(sampled_df['genres'])

# print first val genres binarized
print(val_genres_binarized[0])
print(val_predictions_binary[0])

In [None]:
# Stack the per-row binary predictions into one array (one row per predicted movie).
val_predictions_binary1 = np.array(val_predictions_binary)

In [None]:
# ensure same length: keep only the first len(predictions) ground-truth rows.
# This truncation is positional, so the ground-truth rows must already be in
# the same order as the predictions.
val_genres_binarized1 = val_genres_binarized[:len(val_predictions_binary1)]

In [None]:
# Compute all metrics and log them under epoch 0.
calculate_and_log_metrics(val_genres_binarized1, val_predictions_binary1, 0)

In [None]:
# Close the W&B run started by wandb.init above.
wandb.finish()