In [None]:
# Setup environment
import os
from kaggle_secrets import UserSecretsClient

# Switch TensorFlow to legacy Keras; works around the ktrain installation issue.
os.environ.update(TF_USE_LEGACY_KERAS='1')
# Hand the W&B API key kept in Kaggle secrets to wandb through the environment.
os.environ.update(WANDB_API_KEY=UserSecretsClient().get_secret("WANDB_API_KEY"))

In [None]:
# Install dependencies
!pip install ktrain wandb

# Login to wandb
!wandb login

In [None]:
# Standard imports
import pandas as pd
import numpy as np

import ast
from sklearn.preprocessing import MultiLabelBinarizer


In [None]:
# ktrain imports
import ktrain

# wandb import
import wandb

In [None]:
# Hyperparameter values

# Genre labels. Their order fixes the column order of the label matrix,
# because `label_columns` is set to this list further down.
ALL_GENRES = [
    'Comedy',
    'Drama',
    'Documentary',
    'Romance',
    'Horror',
    'Action',
    'Thriller',
    'Family',
    'Adventure',
    'Crime',
    'Science Fiction',
]

In [None]:
# NON HYPERPARAMETER CONSTANTS

# NOTE(review): presumably the probability cut-off for turning predicted scores
# into labels; no other cell visible here reads it — confirm before relying on it.
THRESHOLD = 0.5

- `remove_only_labels`: removes any labels that are not in the included genres from each sample, then removes samples that have no labels left
- `remove_movies`: removes every sample that has at least one label not in the included genres

In [None]:
# Load data
# Path of the filtered movie metadata CSV inside the Kaggle input directory.
test_path = '/kaggle/input/movie-dataset-filtered/movies_metadata_filtered.csv'

# Read the whole CSV; later cells use its 'transformed_genres' and 'overview' columns.
data = pd.read_csv(test_path)

In [None]:
# 'transformed_genres' stores Python literals as text; parse each entry into a
# real Python object and keep the result in a new 'genres' column.
data['genres'] = data['transformed_genres'].map(ast.literal_eval)

In [None]:
# Display the frame to check the parsed 'genres' column.
data

In [None]:
# Turn the per-movie genre lists into one 0/1 indicator column per genre.
# `pop` removes the 'genres' column from `data`, so this cell is not re-runnable
# without first re-creating that column.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(data.pop('genres'))
genre_indicators = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=data.index)
data = data.join(genre_indicators)

In [None]:
# Column holding the text that is passed to the predictor.
feature_column = 'overview'
# Label columns, in ALL_GENRES order; this fixes the column order of Y_test.
label_columns = ALL_GENRES

In [None]:
# Move the label columns to the front; every other column keeps its relative order.
remaining_columns = [name for name in data.columns if name not in label_columns]
data = data[label_columns + remaining_columns]

In [None]:
# Model input: the feature column converted to a plain Python list.
X_test = data.loc[:, feature_column].tolist()

In [None]:
# Label matrix as a numpy array: one row per sample, one column per entry of
# label_columns, in that order.
Y_test = data[label_columns].to_numpy()

In [None]:
# Start a W&B run with default settings (no project or run name is passed);
# the model artifact is fetched in the next cell.
wandb.init()

In [None]:
# Restore the trained predictor from its W&B artifact

# Reference the latest version of the stored model artifact
artifact = wandb.use_artifact('pms/model_epoch_4:latest')

# Download the artifact; the returned location is passed to ktrain below
artifact_dir = artifact.download()

# Rebuild the ktrain predictor from the downloaded files
predictor = ktrain.load_predictor(artifact_dir)

# Keep a handle on the predictor's preprocessing object
preprocessor = predictor.preproc

In [None]:
def predictions_to_probability_array(Y_pred, genres=None):
    """Convert per-sample (genre, probability) pairs into a 2-D probability array.

    Args:
        Y_pred: iterable with one entry per sample; each entry is an iterable of
            pairs whose first item is the genre name and whose second item is
            its probability.
        genres: genre names that fix the column order of the result. Defaults
            to the notebook-level ``ALL_GENRES``.

    Returns:
        A float numpy array of shape ``(n_samples, len(genres))``; row ``i``
        holds the probabilities of sample ``i`` in ``genres`` order.

    Raises:
        KeyError: if a sample has no entry for one of the requested genres.
    """
    if genres is None:
        # The previous version read INCLUDED_GENRES, which this notebook never
        # defines; ALL_GENRES is the genre list the rest of the notebook uses.
        genres = ALL_GENRES
    genres = list(genres)

    rows = []
    for prediction_set in Y_pred:
        genre_to_prob = {pair[0]: float(pair[1]) for pair in prediction_set}
        rows.append([genre_to_prob[genre] for genre in genres])

    # The reshape keeps the result 2-D even when there are no samples.
    return np.array(rows, dtype=float).reshape(len(rows), len(genres))

In [None]:
# example prediction on a single made-up overview, as a quick check of the
# loaded predictor; the result is overwritten by the full prediction in the next cell
Y_pred = predictor.predict(["satan goes to town to play violin"])

In [None]:
# Predict for every overview in X_test. The cells below treat each element of
# the result as a sequence of (genre, probability) pairs.
Y_pred = predictor.predict(X_test)

In [None]:
# Show only the first few predictions; echoing the predictions for the whole
# test set would flood the notebook output.
Y_pred[:5]

In [None]:
# One {genre: probability} dict per sample, built from its (genre, probability) pairs.
data_dicts = [dict(genre_prob_pairs) for genre_prob_pairs in Y_pred]

In [None]:
# Show only the first few dicts; echoing one dict per test sample would flood
# the notebook output.
data_dicts[:5]

In [None]:
# Probability table: one row per sample, one column per genre key in the dicts.
df = pd.DataFrame(data_dicts)

In [None]:
# Save only the predicted probabilities (no movie metadata), without the index column.
df.to_csv('/kaggle/working/genre_probabilities.csv', index=False)

In [None]:
# Put the predicted probabilities next to the movie data; rows are aligned on the index.
# NOTE(review): `data` already has one indicator column per genre, and `df` is
# presumably keyed by the same genre names, so `merged_df` likely ends up with
# duplicate column names — confirm, and consider suffixing the probability columns.
merged_df = pd.concat([data, df], axis=1)

In [None]:
# Display the merged frame to check the result of the concatenation.
merged_df

In [None]:
# Save the movie data together with the predicted probabilities.
# This previously wrote `df` again, so the file was a duplicate of
# genre_probabilities.csv and `merged_df` was never saved.
merged_df.to_csv('/kaggle/working/movie_probabilities.csv', index=False)

In [None]:
# Mark the W&B run started by wandb.init() as finished.
wandb.finish()