In [None]:
# Setup environment
import os
from kaggle_secrets import UserSecretsClient

# Set before any later cell imports ktrain. The original author added this to
# fix the ktrain installation; presumably it switches TensorFlow to its legacy
# Keras code path — TODO confirm against the installed TensorFlow version.
os.environ['TF_USE_LEGACY_KERAS'] = '1' # To fix ktrain installation
# Read the W&B API key from Kaggle's secret store and expose it through the
# environment, so the key itself never appears in the notebook.
os.environ['WANDB_API_KEY'] =  UserSecretsClient().get_secret("WANDB_API_KEY")

In [None]:
# Install dependencies
!pip install ktrain wandb

# Login to wandb
!wandb login

In [None]:
# Standard imports
import pandas as pd
import numpy as np

# ktrain imports
import ktrain
from ktrain import text

# wandb import
import wandb

In [None]:
# Hyperparameter values
# Candidate values for every tunable setting. The HYPERPARAMETERS cell below
# selects the ones used for the current run.

# Full genre vocabulary; the 'remove_movies' filtering strategy uses it to
# work out which genres are excluded.
ALL_GENRES = [
    'Action',
    'Adventure',
    'Animation',
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Family',
    'Fantasy',
    'History',
    'Horror',
    'Music',
    'Mystery',
    'Romance',
    'Science Fiction',
    'TV Movie',
    'Thriller',
    'War',
    'Western',
]

# MODEL
# Hugging Face model identifiers.
MODEL_1 = 'distilbert/distilbert-base-uncased'
MODEL_2 = 'google-bert/bert-base-uncased'
MODEL_3 = 'distilbert/distilroberta-base'

# BATCH_SIZE
BATCH_SIZE_1, BATCH_SIZE_2, BATCH_SIZE_3, BATCH_SIZE_4 = 16, 32, 64, 128

# LEARNING_RATE
LEARNING_RATE_1, LEARNING_RATE_2, LEARNING_RATE_3 = 3.4e-5, 1e-5, 5e-5

# GENRES
# GENRES_1: every genre (an independent copy of ALL_GENRES).
GENRES_1 = list(ALL_GENRES)
# GENRES_2: a reduced set of 11 genres.
GENRES_2 = [
    'Comedy',
    'Drama',
    'Documentary',
    'Romance',
    'Horror',
    'Action',
    'Thriller',
    'Family',
    'Adventure',
    'Crime',
    'Science Fiction',
]

In [None]:
# HYPERPARAMETERS
# Active configuration for this run. Each value is either one of the
# candidates defined in the previous cell or set directly; all of them are
# logged to W&B as the run config.

BATCH_SIZE = BATCH_SIZE_3
# In this notebook EPOCHS is only logged to W&B; no training loop reads it.
EPOCHS = 3
# Same value as LEARNING_RATE_1.
LEARNING_RATE = 3.4e-5 # not in use for lr_estimate job type
MODEL = MODEL_3
INCLUDED_GENRES = GENRES_2
# Must be one of the strategies handled by filter_data().
FILTERING_STRATEGY_FOR_GENRES = "remove_only_labels" # remove_only_labels, remove_movies

In [None]:
# Short alias for the selected checkpoint. Any model other than MODEL_1 or
# MODEL_2 falls back to "distilroberta".
# NOTE(review): model_short_name is not used in run_name below (the full MODEL
# id is used instead) — confirm whether that is intended.
short_name_by_model = {MODEL_1: "distilbert", MODEL_2: "bert"}
model_short_name = short_name_by_model.get(MODEL, "distilroberta")

# W&B run name that encodes the configuration of this run.
run_name = f"model={MODEL}_genresCount={len(INCLUDED_GENRES)}_filtering={FILTERING_STRATEGY_FOR_GENRES}_bs={BATCH_SIZE}"

print(run_name)

In [None]:
# Setup WANDB
# Everything that defines this experiment is logged as the run config.
run_config = {
    "batch_size": BATCH_SIZE,
    "epochs": EPOCHS,
    "learning_rate": LEARNING_RATE,
    "model": MODEL,
    "included_genres": INCLUDED_GENRES,
    "filtering_strategy_for_genres": FILTERING_STRATEGY_FOR_GENRES,
    # Hard-coded accelerator name; update by hand when the runtime changes.
    "GPU_USED": "P100",
}

# Start the run; the handle is used later for logging and for run.finish().
run = wandb.init(
    project='pms',
    job_type='lr_estimate',
    name=run_name,
    config=run_config,
    save_code=True,
)

- `remove_only_labels`: keeps only the labels of the included genres and removes samples that have no labels left
- `remove_movies`: removes every sample that has at least one label outside the included genres

In [None]:
# Load data
train_path = '/kaggle/input/train.csv'
val_path = '/kaggle/input/val.csv'

train_data = pd.read_csv(train_path)
val_data = pd.read_csv(val_path)

# Join train and val before filtering
# ignore_index=True renumbers the rows; without it the combined frame would
# carry the row labels of both CSVs, i.e. duplicated index values.
data = pd.concat([train_data, val_data], ignore_index=True)

In [None]:
# Name of the column holding the input text.
feature_column = 'overview'
# NOTE(review): assumes the label columns start at position 13 of the CSV —
# TODO confirm. This variable is not read anywhere else in the visible notebook.
all_label_columns = train_data.columns[13:]
# Labels used for filtering and training: the genres selected in the
# HYPERPARAMETERS cell. They are used as DataFrame column names below.
label_columns = INCLUDED_GENRES

In [None]:
from sklearn.model_selection import train_test_split

def split_data(data, test_size=.2, random_state=42):
    """Split ``data`` into a train part and a test part.

    Thin wrapper around scikit-learn's ``train_test_split``.

    Args:
        data: The dataset to split.
        test_size: Passed through to ``train_test_split`` (default ``.2``).
        random_state: Seed passed through to ``train_test_split``.

    Returns:
        tuple: ``(train, test)``.
    """
    train_part, test_part = train_test_split(
        data,
        test_size=test_size,
        random_state=random_state,
    )
    return train_part, test_part

def filter_data(data, label_columns, strategy):
    """Drop the rows of ``data`` that do not fit the selected genres.

    Args:
        data: DataFrame with one column per genre.
        label_columns: Names of the genre columns that are kept as labels.
        strategy: ``'remove_only_labels'`` keeps the rows whose
            ``label_columns`` sum to more than zero.
            ``'remove_movies'`` keeps the rows in which every genre of
            ``ALL_GENRES`` that is not in ``label_columns`` equals 0.

    Returns:
        The filtered DataFrame. Columns are never dropped.

    Raises:
        ValueError: If ``strategy`` is not one of the two names above.
    """
    if strategy == 'remove_only_labels':
        has_included_label = data[label_columns].sum(axis=1) > 0
        return data[has_included_label]

    if strategy == 'remove_movies':
        # TODO: think about this one
        excluded_genres = sorted(set(ALL_GENRES).difference(label_columns))
        if not excluded_genres:
            # Nothing is excluded, so there is nothing to filter on.
            return data
        # A row survives only if all excluded genre columns are 0.
        only_included_genres = (data[excluded_genres] == 0).all(axis=1)
        return data[only_included_genres]

    raise ValueError(f"Unknown strategy: {strategy}")

In [None]:
filtering_strategy = FILTERING_STRATEGY_FOR_GENRES

# Filter the data
# The result gets its own name so that `data` is not overwritten by this cell;
# otherwise a second run of the cell would compare the filtered frame with
# itself and report 0 excluded samples.
initial_number_of_samples = data.shape[0]
filtered_data = filter_data(data, label_columns, filtering_strategy)

excluded_samples = initial_number_of_samples - filtered_data.shape[0]
print("Number of excluded samples:", excluded_samples)

# Split the data again
# The fixed random_state keeps the split identical between runs.
train_data, val_data = split_data(filtered_data, test_size=.2, random_state=42)

print("Train data shape:", train_data.shape)
print("Validation data shape:", val_data.shape)

In [None]:
# Prepare data format for ktrain use
def _to_texts_and_labels(frame):
    """Return ``(texts, labels)`` for one split.

    ``texts`` is the feature column as a Python list, ``labels`` is the
    selected label columns as a NumPy array.
    """
    texts = frame[feature_column].tolist()
    labels = frame[label_columns].to_numpy()
    return texts, labels


X_train, Y_train = _to_texts_and_labels(train_data)
X_val, Y_val = _to_texts_and_labels(val_data)

In [None]:
# Calculate class weights
# Each genre is weighted by the inverse of its frequency in the training
# labels, so that rarer genres get larger weights.

genre_counts = np.sum(Y_train, axis=0)
total_samples = len(Y_train)

genre_freq = genre_counts / total_samples

# A genre without any positive training sample gets weight 0.
class_weights = {i: (1 / freq) if freq > 0 else 0 for i, freq in enumerate(genre_freq)}

# Normalize class weights to make the minimum weight 1.0
# https://www.analyticsvidhya.com/blog/2020/10/improve-class-imbalance-class-weights/
# Only positive weights take part in the minimum. A genre with weight 0 would
# otherwise make min_weight 0 and the division below fail.
positive_weights = [weight for weight in class_weights.values() if weight > 0]
min_weight = min(positive_weights) if positive_weights else 1
class_weights = {i: weight / min_weight for i, weight in class_weights.items()}

print(class_weights)

In [None]:
# Build the ktrain Transformer preprocessor for the selected checkpoint, with
# label_columns passed as the class names.
transformer = text.Transformer(MODEL, maxlen=180, class_names=label_columns, batch_size=BATCH_SIZE)
# TODO: emphasize max len is chosen based on data

# NOTE: train_data / val_data are rebound here, from the pandas DataFrames of
# the split to the objects returned by the ktrain preprocessor. Cells above
# that expect DataFrames under these names cannot be re-run after this cell.
train_data = transformer.preprocess_train(X_train, Y_train)
val_data = transformer.preprocess_test(X_val, Y_val)

In [None]:
# Create a model and learner
# The classifier comes from the transformer configured above; the learner
# bundles it with the preprocessed train/validation data and the batch size.
model = transformer.get_classifier()
learner = ktrain.get_learner(model, train_data=train_data, val_data=val_data, batch_size=BATCH_SIZE)

In [None]:
# Run ktrain's learning-rate finder with the class weights computed above.
# Plotting is switched off here; the curve is drawn by lr_plot() in the next cell.
learner.lr_find(show_plot=False, class_weight=class_weights)

In [None]:
# Plot the result of the lr_find() run above; suggest=True asks ktrain to
# include its suggested learning rates.
learner.lr_plot(suggest=True)

In [None]:
# Log the learning-rate finder curve to W&B as a line plot.
losses = learner.lr_finder.losses
lrs = learner.lr_finder.lrs

# One [learning rate, loss] row per recorded step. The list has its own name
# so that it does not overwrite the `data` DataFrame created in the load cell.
lr_loss_rows = [[lr, loss] for lr, loss in zip(lrs, losses)]
table = wandb.Table(data=lr_loss_rows, columns=["Learning Rate", "Loss"])

run.log({
    "lr_vs_loss": wandb.plot.line(table, "Learning Rate", "Loss", title="Learning Rate vs Loss Plot")
})

About `learner.lr_estimate()` from the [documentation](https://amaiya.github.io/ktrain/core.html#ktrain.core.Learner.lr_estimate):

Return numerical estimates of lr using three different methods:
  1. lr associated with minimum numerical gradient (None if gradient computation fails)
  2. lr associated with minimum loss divided by 10
  3. lr associated with longest valley
Since none of these methods are fool-proof and can
potentially return bad estimates, it is recommended that you
examine the plot generated by lr_plot to estimate the learning rate.

Returns:
  tuple: tuple of the form (float, float, float)


In [None]:
lr_estimates = learner.lr_estimate()

# Summary keys, listed in the order in which lr_estimates is indexed below.
estimate_keys = (
    "lr_estimate_min_gradient",
    "lr_estimate_min_loss_div_10",
    "lr_estimate_longest_valley",
)

# Store every estimate in the W&B run summary.
for position, key in enumerate(estimate_keys):
    run.summary[key] = lr_estimates[position]

# Print estimates so it is saved with the run
for position, key in enumerate(estimate_keys):
    print(f"{key}:", lr_estimates[position])

In [None]:
# Close the W&B run that was started by wandb.init() above.
run.finish()