In [None]:
# Setup environment
import os
from kaggle_secrets import UserSecretsClient

# Set before ktrain is installed and imported in the cells below.
os.environ['TF_USE_LEGACY_KERAS'] = '1'  # To fix ktrain installation
# Read the W&B API key from the Kaggle secret store and expose it through the
# environment, so the key itself never appears in the notebook.
os.environ['WANDB_API_KEY'] =  UserSecretsClient().get_secret("WANDB_API_KEY")

In [None]:
# Install dependencies
!pip install ktrain wandb

# Login to wandb
!wandb login

In [None]:
# Standard imports
import pandas as pd
import numpy as np

# ktrain imports
import ktrain

# wandb import
import wandb

In [None]:
# Hyperparameter values
# Full list of genre labels; filter_data uses it to find the genres that lie
# outside the selected subset.
ALL_GENRES = [
    'Action',
    'Adventure',
    'Animation',
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Family',
    'Fantasy',
    'History',
    'Horror',
    'Music',
    'Mystery',
    'Romance',
    'Science Fiction',
    'TV Movie',
    'Thriller',
    'War',
    'Western',
]

# GENRES
# Candidate genre subsets for INCLUDED_GENRES.
# GENRES_1 holds every genre (an independent copy of ALL_GENRES).
GENRES_1 = list(ALL_GENRES)
# GENRES_2 is a reduced subset of eleven genres.
GENRES_2 = [
    'Comedy',
    'Drama',
    'Documentary',
    'Romance',
    'Horror',
    'Action',
    'Thriller',
    'Family',
    'Adventure',
    'Crime',
    'Science Fiction',
]

In [None]:
 # HYPERPARAMETER CONSTANTS

# Genre subset to evaluate on; its entries are later used as the label columns.
INCLUDED_GENRES = GENRES_2
# Strategy name handed to filter_data. Supported values are
# "remove_only_labels" and "remove_movies"; any other value raises ValueError there.
FILTERING_STRATEGY_FOR_GENRES = "remove_only_labels"  # remove_only_labels, remove_movies

In [None]:
# NON HYPERPARAMETER CONSTANTS

# Probability cut-off: predictions strictly above this value are binarized to 1.
THRESHOLD = 0.5

In [None]:
# Setup WANDB
# Start the W&B run used below for artifact access and metric logging.
# The evaluation settings are stored in the run config so the logged metrics
# can be traced back to the genre subset, filtering strategy and decision
# threshold that produced them.
run = wandb.init(
    project='test-pms',
    job_type='test',
    save_code=True,
    config={
        'included_genres': INCLUDED_GENRES,
        'filtering_strategy_for_genres': FILTERING_STRATEGY_FOR_GENRES,
        'threshold': THRESHOLD,
    },
)

Filtering strategies (`FILTERING_STRATEGY_FOR_GENRES`):

- `remove_only_labels`: drops the labels that are not in the included genres, then removes the samples that have no labels left
- `remove_movies`: removes every sample that has at least one label outside the included genres

In [None]:
# Load data
# Location of the test split inside the Kaggle input directory.
test_path = '/kaggle/input/movie-genre/test.csv'

# presumably one row per movie with an 'overview' text column and one 0/1
# column per genre — TODO confirm against the CSV.
test_data = pd.read_csv(test_path)

In [None]:
# Column holding the text that is fed to the model.
feature_column = 'overview'
# The label columns are the selected genres (same list object as INCLUDED_GENRES).
label_columns = INCLUDED_GENRES

In [None]:
from sklearn.model_selection import train_test_split


def split_data(_data, test_size=.2, random_state=42):
    """Split ``_data`` into a train part and a test part.

    Thin wrapper around ``train_test_split``.

    Args:
        _data: The dataset to split.
        test_size: Forwarded to ``train_test_split`` (defaults to 0.2).
        random_state: Seed forwarded to ``train_test_split`` (defaults to 42).

    Returns:
        A ``(train, test)`` tuple.
    """
    train_part, test_part = train_test_split(
        _data,
        test_size=test_size,
        random_state=random_state,
    )
    return train_part, test_part


def filter_data(_data, _label_columns, strategy, all_genres=None):
    """Drop the samples that do not fit the selected genre subset.

    Only rows are removed; the columns of ``_data`` are left untouched.

    Args:
        _data: DataFrame expected to hold one numeric indicator column per genre.
        _label_columns: Genre columns that are kept as labels.
        strategy: ``'remove_only_labels'`` keeps the rows that have at least
            one label among ``_label_columns``. ``'remove_movies'`` keeps the
            rows whose value is 0 in every genre column outside
            ``_label_columns``.
        all_genres: Complete list of genre columns. Defaults to the
            notebook-level ``ALL_GENRES``. Only used by ``'remove_movies'``.

    Returns:
        The filtered DataFrame.

    Raises:
        ValueError: If ``strategy`` is not one of the two supported names.
    """
    if strategy == 'remove_only_labels':
        has_included_label = _data[_label_columns].sum(axis=1) > 0
        return _data[has_included_label]

    if strategy == 'remove_movies':
        if all_genres is None:
            all_genres = ALL_GENRES

        included = set(_label_columns)
        excluded_genres = [genre for genre in all_genres if genre not in included]

        # The mask is built from the frame passed in. The previous version
        # indexed the notebook global `data` here, which is undefined on the
        # first call and stale on later ones.
        has_no_excluded_label = (_data[excluded_genres] == 0).all(axis=1)
        return _data[has_no_excluded_label]

    raise ValueError(f"Unknown strategy: {strategy}")

In [None]:
filtering_strategy = FILTERING_STRATEGY_FOR_GENRES

# Filter the data
# Remember the unfiltered row count so the number of dropped samples can be reported below.
initial_number_of_samples = test_data.shape[0]
data = filter_data(test_data, label_columns, filtering_strategy)

# Reorder the columns: label columns first, then every remaining column.
data = data[label_columns + [col for col in data.columns if col not in label_columns]]

# Print the number of excluded samples
excluded_samples = initial_number_of_samples - data.shape[0]
print("Number of excluded samples:", excluded_samples)

In [None]:
# Prepare data format for keras use
# Build the model inputs from the filtered frame `data`. Reading from the raw
# `test_data` here would silently discard the filtering applied above and
# evaluate on samples that were reported as excluded.
X_test = data[feature_column].tolist()
Y_test = data[label_columns].to_numpy()

In [None]:
# Calculate class weights
# Needed for weighted average evaluation, not for training

genre_counts = np.sum(Y_test, axis=0)
total_samples = len(Y_test)

# Share of samples that carry each genre.
genre_freq = genre_counts / total_samples

# Inverse-frequency weight per label index; a genre that never occurs gets weight 0.
class_weights = {i: (1 / freq) if freq > 0 else 0 for i, freq in enumerate(genre_freq)}

# Normalize class weights to make the minimum weight 1.0
# https://www.analyticsvidhya.com/blog/2020/10/improve-class-imbalance-class-weights/
# Only positive weights take part in the minimum. A genre that is absent from
# the test set has weight 0, which would otherwise make the minimum 0 and turn
# the normalization into a division by zero.
positive_weights = [weight for weight in class_weights.values() if weight > 0]
if positive_weights:
    min_weight = min(positive_weights)
    class_weights = {i: weight / min_weight for i, weight in class_weights.items()}

print(class_weights)

In [None]:
# Load the trained predictor and its preprocessor from a W&B artifact

# Declare the model artifact as an input of the current run
artifact = run.use_artifact('pms/model_epoch_4:latest')

# Download the artifact; the resulting path is handed to ktrain below
artifact_dir = artifact.download()

# Restore the ktrain predictor from the downloaded files
predictor = ktrain.load_predictor(artifact_dir)

# The preprocessor is the one attached to the predictor
preprocessor = predictor.preproc

In [None]:
# Run the test texts and labels through the predictor's own preprocessor.
# NOTE(review): val_data is not referenced again in the visible cells — confirm it is still needed.
val_data = preprocessor.preprocess_test(X_test, Y_test)

# Evaluation preparation

# FUNCTIONS FOR METRICS EVALUATION AND LOGGING

In [None]:
from sklearn.metrics import accuracy_score


# Micro averaged metrics, or per instance metrics

def exact_match_ratio(y_true, y_pred, epoch):
    """Compute ``accuracy_score`` for the label matrices, print it and store it.

    Args:
        y_true: Ground-truth labels.
        y_pred: Binarized predictions.
        epoch: Epoch tag used in the printed line and in the summary key.

    The score is written to ``run.summary`` under
    ``exact_match_ratio_epoch_<epoch>``. Nothing is returned.
    """
    score = accuracy_score(y_true, y_pred)
    summary_key = f"exact_match_ratio_epoch_{epoch}"

    print(f"Epoch {epoch} - Exact Match Ratio: {score}")
    run.summary[summary_key] = score


def micro_accuracy(y_true, y_pred, epoch):
    """Log the per-sample accuracy |Y ∩ Z| / |Y ∪ Z| averaged over all samples.

    Args:
        y_true: 2-D indicator array of ground-truth labels (samples x labels).
        y_pred: 2-D indicator array of binarized predictions, same shape.
        epoch: Epoch tag used in the printed line and in the summary key.

    A sample with neither true nor predicted labels has an empty union. Its
    accuracy is set to 0, matching how the precision, recall and F1 helpers
    treat an empty denominator, so a single such sample can no longer turn the
    average into NaN.

    The average is written to ``run.summary`` under
    ``overall_accuracy_epoch_<epoch>``. Nothing is returned.
    """
    intersection = np.logical_and(y_true, y_pred).sum(axis=1)  # | Y ∩ Z |
    union = np.logical_or(y_true, y_pred).sum(axis=1)  # | Y ∪ Z |

    with np.errstate(divide='ignore', invalid='ignore'):  # 0/0 is replaced right below
        ratios = intersection / union
    instance_acc = np.where(union == 0, 0.0, ratios)

    average_accuracy = np.average(instance_acc)
    print(f"Epoch {epoch} - Overall Accuracy: {average_accuracy}")
    run.summary[f"overall_accuracy_epoch_{epoch}"] = average_accuracy


def micro_precision(y_true, y_pred, epoch):
    """Log the per-sample precision |Y ∩ Z| / |Z| averaged over all samples.

    Args:
        y_true: 2-D indicator array of ground-truth labels (samples x labels).
        y_pred: 2-D indicator array of binarized predictions, same shape.
        epoch: Epoch tag used in the printed line and in the summary key.

    The average is written to ``run.summary`` under
    ``overall_precision_epoch_<epoch>``. Nothing is returned.
    """
    hits = np.logical_and(y_true, y_pred).sum(axis=1)  # | Y ∩ Z |
    predicted_count = y_pred.sum(axis=1)  # | Z |

    with np.errstate(divide='ignore', invalid='ignore'):  # zero denominators are handled below
        ratios = hits / predicted_count

    # A sample without any predicted label gets precision 0.
    precision_per_instance = np.where(predicted_count == 0, 0, ratios)

    average_precision = np.average(precision_per_instance)
    print(f"Epoch {epoch} - Overall Precision: {average_precision}")
    run.summary[f"overall_precision_epoch_{epoch}"] = average_precision


def micro_recall(y_true, y_pred, epoch):
    """Log the per-sample recall |Y ∩ Z| / |Y| averaged over all samples.

    Args:
        y_true: 2-D indicator array of ground-truth labels (samples x labels).
        y_pred: 2-D indicator array of binarized predictions, same shape.
        epoch: Epoch tag used in the printed line and in the summary key.

    The average is written to ``run.summary`` under
    ``overall_recall_epoch_<epoch>``. Nothing is returned.
    """
    hits = np.logical_and(y_true, y_pred).sum(axis=1)  # | Y ∩ Z |
    actual_count = y_true.sum(axis=1)  # | Y |

    with np.errstate(divide='ignore', invalid='ignore'):  # zero denominators are handled below
        ratios = hits / actual_count

    # A sample without any true label gets recall 0.
    recall_per_instance = np.where(actual_count == 0, 0, ratios)

    average_recall = np.average(recall_per_instance)
    print(f"Epoch {epoch} - Overall Recall: {average_recall}")
    run.summary[f"overall_recall_epoch_{epoch}"] = average_recall


def micro_f1_score(y_true, y_pred, epoch):
    """Log the per-sample F1 score 2|Y ∩ Z| / (|Y| + |Z|) averaged over all samples.

    Args:
        y_true: 2-D indicator array of ground-truth labels (samples x labels).
        y_pred: 2-D indicator array of binarized predictions, same shape.
        epoch: Epoch tag used in the printed line and in the summary key.

    The average is written to ``run.summary`` under
    ``overall_f1_score_epoch_<epoch>``. Nothing is returned.
    """
    hits = np.logical_and(y_true, y_pred).sum(axis=1)
    label_total = y_true.sum(axis=1) + y_pred.sum(axis=1)

    with np.errstate(divide='ignore', invalid='ignore'):
        raw_f1 = 2 * hits / label_total

    # NaN marks a sample with no actual and no predicted positives; score it as 0.
    f1_per_instance = np.where(np.isnan(raw_f1), 0, raw_f1)

    average_f1 = np.average(f1_per_instance)

    print(f"Epoch {epoch} - Overall F1 Score: {average_f1}")
    run.summary[f"overall_f1_score_epoch_{epoch}"] = average_f1


# Hamming loss shows error, lower is better
def instance_hamming_loss(y_true, y_pred, epoch):
    """Log the Hamming loss: wrongly predicted label slots divided by all label slots.

    Args:
        y_true: Indicator array of ground-truth labels.
        y_pred: Indicator array of binarized predictions, same shape.
        epoch: Epoch tag used in the printed line and in the summary key.

    The loss is written to ``run.summary`` under
    ``hamming_loss_epoch_<epoch>``. Nothing is returned.
    """
    mismatch_mask = np.logical_xor(y_true, y_pred)  # True wherever truth and prediction differ

    incorrect_predictions = mismatch_mask.sum()
    print(f"Total incorrect predictions: {incorrect_predictions}")

    total_labels = np.size(y_true)  # every (sample, label) slot counts once
    print(f"Total labels: {total_labels}")

    average_hamming_loss = incorrect_predictions / total_labels

    print(f"Epoch {epoch} - Hamming Loss: {average_hamming_loss}")
    run.summary[f"hamming_loss_epoch_{epoch}"] = average_hamming_loss


# Macro averaged metrics, or per label metrics

def log_per_label_metric(metric_values, metric_name, genres, epoch):
    """Print and store one metric value per genre, then log the same values as a table.

    Args:
        metric_values: Sequence of metric values, indexed in the order of ``genres``.
        metric_name: Name of the metric, used in the printed lines and summary keys.
        genres: Genre names, one per metric value.
        epoch: Epoch tag used in the printed lines and summary keys.

    Each value is written to ``run.summary`` under
    ``per_label_<metric_name>_<genre>_epoch_<epoch>``.
    """
    for position, genre_name in enumerate(genres):
        value = metric_values[position]
        print(f"Epoch {epoch} - Per Label {metric_name} {genre_name}: {value:.4f}")
        run.summary[f"per_label_{metric_name}_{genre_name}_epoch_{epoch}"] = value

    # The table variant of the same data is logged by the sibling helper.
    log_per_label_metric_table(metric_values, metric_name, genres, epoch)


def log_per_label_metric_table(metric_values, metric_name, genres, epoch):
    """Log the per-genre values of one metric as a ``wandb.Table``.

    Args:
        metric_values: Sequence of metric values, indexed in the order of ``genres``.
        metric_name: Name of the metric; stored in every row and used in the log key.
        genres: Genre names, one per metric value.
        epoch: Epoch tag; stored in every row and used in the log key.

    The table is sent through ``run.log`` under
    ``per_label_<metric_name>_epoch_<epoch>``.
    """
    column_names = ["Epoch", "Metric Name", "Genre", "Metric Value"]
    rows = [
        [epoch, metric_name, genre_name, metric_values[position]]
        for position, genre_name in enumerate(genres)
    ]

    metric_table = wandb.Table(data=rows, columns=column_names)
    run.log({f"per_label_{metric_name}_epoch_{epoch}": metric_table})


def log_per_label_average(metric_values, metric_name, epoch):
    """Log the unweighted mean of a per-label metric, then its weighted mean.

    Args:
        metric_values: One metric value per label.
        metric_name: Name of the metric, used in the printed line and summary keys.
        epoch: Epoch tag used in the printed line and in one of the summary keys.

    The mean is written to ``run.summary`` twice: under
    ``per_label_average_<metric_name>_epoch_<epoch>`` and under
    ``per_label_average_<metric_name>``.
    """
    mean_value = np.average(metric_values)
    print(f"Per Label Average {metric_name} epoch {epoch}: {mean_value:.4f}")

    summary_keys = (
        f"per_label_average_{metric_name}_epoch_{epoch}",
        f"per_label_average_{metric_name}",
    )
    for key in summary_keys:
        run.summary[key] = mean_value

    log_per_label_weighted_average(metric_values, metric_name, epoch)


def log_per_label_weighted_average(metric_values, metric_name, epoch):
    """Log the class-weighted mean of a per-label metric.

    The weight of label ``i`` is read from the notebook-level ``class_weights``
    dictionary, which is keyed by label index.

    Args:
        metric_values: One metric value per label, in label-index order.
        metric_name: Name of the metric, used in the printed line and summary keys.
        epoch: Epoch tag used in the printed line and in one of the summary keys.

    The mean is written to ``run.summary`` twice: under
    ``per_label_weighted_average_<metric_name>_epoch_<epoch>`` and under
    ``per_label_weighted_average_<metric_name>``.
    """
    weights = [class_weights[i] for i in range(len(metric_values))]
    weighted_average = np.average(metric_values, weights=weights)

    print(f"Per Label Weighted Average {metric_name} epoch {epoch}: {weighted_average:.4f}")
    run.summary[f"per_label_weighted_average_{metric_name}_epoch_{epoch}"] = weighted_average
    run.summary[f"per_label_weighted_average_{metric_name}"] = weighted_average


def label_based_accuracy(y_true, y_pred, epoch):
    """Compute the accuracy of every label column and pass the values to the loggers.

    Args:
        y_true: 2-D array of ground-truth labels (samples x labels).
        y_pred: 2-D array of binarized predictions, same shape.
        epoch: Epoch tag forwarded to the logging helpers.

    The per-label values are labelled with ``INCLUDED_GENRES`` and logged under
    the metric name ``'Accuracy'``. A label scores 0 when there are no samples.
    """
    sample_count = y_true.shape[0]

    def _accuracy_of(column):
        # Share of samples where truth and prediction agree for this label.
        agreeing = np.sum(y_true[:, column] == y_pred[:, column])
        return agreeing / sample_count if sample_count > 0 else 0

    accuracy_values = [_accuracy_of(column) for column in range(y_true.shape[1])]

    log_per_label_metric(accuracy_values, 'Accuracy', INCLUDED_GENRES, epoch)
    log_per_label_average(accuracy_values, 'Accuracy', epoch)


def precision_per_label(y_true, y_pred, epoch):
    """Compute the precision TP / (TP + FP) of every label column and log it.

    Args:
        y_true: 2-D array of 0/1 ground-truth labels (samples x labels).
        y_pred: 2-D array of 0/1 binarized predictions, same shape.
        epoch: Epoch tag forwarded to the logging helpers.

    The per-label values are labelled with ``INCLUDED_GENRES`` and logged under
    the metric name ``'Precision'``. A label that is never predicted scores 0.
    """

    def _precision_of(column):
        predicted_positive = y_pred[:, column] == 1
        hits = np.sum(predicted_positive & (y_true[:, column] == 1))
        false_alarms = np.sum(predicted_positive & (y_true[:, column] == 0))

        predicted_total = hits + false_alarms
        if predicted_total > 0:
            return hits / predicted_total
        return 0

    precision_values = [_precision_of(column) for column in range(y_true.shape[1])]

    log_per_label_metric(precision_values, 'Precision', INCLUDED_GENRES, epoch)
    log_per_label_average(precision_values, 'Precision', epoch)


def recall_per_label(y_true, y_pred, epoch):
    """Compute the recall TP / (TP + FN) of every label column and log it.

    Args:
        y_true: 2-D array of 0/1 ground-truth labels (samples x labels).
        y_pred: 2-D array of 0/1 binarized predictions, same shape.
        epoch: Epoch tag forwarded to the logging helpers.

    The per-label values are labelled with ``INCLUDED_GENRES`` and logged under
    the metric name ``'Recall'``. A label without any true positive sample scores 0.
    """

    def _recall_of(column):
        actual_positive = y_true[:, column] == 1
        hits = np.sum((y_pred[:, column] == 1) & actual_positive)
        misses = np.sum((y_pred[:, column] == 0) & actual_positive)

        actual_total = hits + misses
        if actual_total > 0:
            return hits / actual_total
        return 0

    recall_values = [_recall_of(column) for column in range(y_true.shape[1])]

    log_per_label_metric(recall_values, 'Recall', INCLUDED_GENRES, epoch)
    log_per_label_average(recall_values, 'Recall', epoch)


def f1_score_per_label(y_true, y_pred, epoch):
    """Compute the F1 score of every label column and log it.

    Args:
        y_true: 2-D array of 0/1 ground-truth labels (samples x labels).
        y_pred: 2-D array of 0/1 binarized predictions, same shape.
        epoch: Epoch tag forwarded to the logging helpers.

    F1 is the harmonic mean of the label's precision and recall. Precision,
    recall and F1 each fall back to 0 when their denominator is 0. The
    per-label values are labelled with ``INCLUDED_GENRES`` and logged under
    the metric name ``'F1'``.
    """

    def _f1_of(column):
        predicted = y_pred[:, column]
        actual_positive = y_true[:, column] == 1

        hits = np.sum((predicted == 1) & actual_positive)
        false_alarms = np.sum((predicted == 1) & (y_true[:, column] == 0))
        misses = np.sum((predicted == 0) & actual_positive)

        precision = hits / (hits + false_alarms) if hits + false_alarms > 0 else 0
        recall = hits / (hits + misses) if hits + misses > 0 else 0

        if (precision + recall) > 0:
            return 2 * (precision * recall) / (precision + recall)
        return 0

    f1_values = [_f1_of(column) for column in range(y_true.shape[1])]

    log_per_label_metric(f1_values, 'F1', INCLUDED_GENRES, epoch)
    log_per_label_average(f1_values, 'F1', epoch)


# Calculate and log all metrics

def calculate_and_log_metrics(true_values, predicted_values, _epoch):
    """Run every metric logger on one set of labels and predictions.

    Args:
        true_values: Ground-truth label matrix.
        predicted_values: Binarized prediction matrix.
        _epoch: Epoch tag forwarded to every logger.

    Each logger is called as ``logger(true_values, predicted_values, _epoch)``
    in the order listed below.
    """
    metric_loggers = (
        # Exact match ratio
        exact_match_ratio,
        # Per-instance metrics
        micro_accuracy,
        micro_precision,
        micro_recall,
        micro_f1_score,
        instance_hamming_loss,
        # Per-label metrics
        label_based_accuracy,
        precision_per_label,
        recall_per_label,
        f1_score_per_label,
    )

    for log_metric in metric_loggers:
        log_metric(true_values, predicted_values, _epoch)


# Evaluate the model

In [None]:
def predictions_to_probability_array(Y_pred):
    """Turn per-sample (genre, probability) pairs into an ordered probability array.

    Args:
        Y_pred: Iterable with one entry per sample. Each entry is an iterable
            of pairs whose first item is the genre name and whose second item
            is convertible to ``float``.

    Returns:
        A NumPy array with one row per sample. The columns follow the order of
        ``INCLUDED_GENRES``.

    Raises:
        KeyError: If a sample has no pair for one of the ``INCLUDED_GENRES``.
    """

    def _ordered_row(prediction_set):
        # Look the probabilities up by name so the order of the pairs does not matter.
        probability_of = {pair[0]: float(pair[1]) for pair in prediction_set}
        return np.array([probability_of[genre] for genre in INCLUDED_GENRES])

    return np.array([_ordered_row(prediction_set) for prediction_set in Y_pred])

In [None]:
# NOTE(review): total_training_time is not read anywhere in the visible cells — confirm before removing.
total_training_time = 0

# Predict the test set
Y_pred = predictor.predict(X_test)
# Convert the per-sample (genre, probability) pairs into an array whose
# columns follow INCLUDED_GENRES.
Y_pred = predictions_to_probability_array(Y_pred)

In [None]:
# Binarize the predictions based on a threshold
threshold = THRESHOLD
# Strict comparison: a probability exactly equal to the threshold becomes 0.
Y_pred_binarized = (Y_pred > threshold).astype(int)

# NOTE(review): the metrics are logged under epoch 0 although the loaded
# artifact is named model_epoch_4 — confirm the intended epoch label.
calculate_and_log_metrics(Y_test, Y_pred_binarized, 0)

In [None]:
# End the W&B run that was started with wandb.init above.
wandb.finish()