In [27]:
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
# %pip install transformers[torch] datasets accelerate>=0.26.0
%pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [28]:
import random

# Generate train data

In [29]:
# import labels and relationships from the json files
import json
with open('./../../useful_dataset/graph/unique_movies.json') as f:
    file = json.load(f)
    entities = file

In [30]:
len(entities)

25523

In [31]:
templates = [
    # Director-related queries
    "Who directed {}",
    "Can you tell me who the director of {} is",
    "Who is behind the camera for '{}'",

    # Screenwriter-related queries
    "Who wrote the screenplay for {}",
    "Which writer worked on {}",
    "Who is credited as the screenwriter of {}",

    # Actor-related queries
    "Which actors starred in {}",
    "Who are the main actors in '{}'",
    "Who played in {}",
    "Can you name any actors from {}",

    # Recommendations based on movies
    "If I enjoyed {}, {}, {}, and {} what other movies might I like",
    "If I enjoyed {}, {}, {}, {}, and {} what other movies might I like",
    "What are some movies similar to {}",
    "Can you recommend films like {}, {}, {}, and {}",
    "Can you recommend films like {}, {}, {}, {}, and {}",
    "I loved {}, {}, {}, and {}. Any recommendations for similar movies",
    "I loved {}, {}, {}, {}, and {}. Any recommendations for similar movies",
    "What movies are in the same genre as {}",

    # Plot-related queries
    "What is {} about",
    "Can you summarize the plot of {}",
    "What happens in {}",

    # Miscellaneous
    "What genre is \"{}\"",
    "Who composed the music for {}",
    "When was {} released",
    "What awards has \"{}\" won",
    "Is {} part of a series or franchise",
    "Are there any sequels or prequels to {}",
    "Where can I watch {}"
]


In [32]:
def generate_examples(movies, templates, num_examples=100):
    sentences = []
    labels = []

    for _ in range(num_examples):
        # Randomly pick a relationship and a movie title
        # Randomly pick a template
        template = random.choice(templates)
        
        # Determine the number of placeholders in the template
        num_placeholders = template.count('{}')

        # Randomly pick the required number of movies
        selected_movies = random.sample(movies, num_placeholders)

        # remove ", -, ., : ! ? ;  from the movie" in a simple way
        # selected_movies = [movie.translate(str.maketrans('', '', ',-.:!?;')) for movie in selected_movies]


        
        # Format the template with the selected movies
        question = template.format(*selected_movies)

        # Tokenize question manually
        tokens = question.split()

        # Create labels for each token
        label_seq = []
        for token in tokens:
            labeled = False
            for movie in selected_movies:
                if token == movie.split(" ")[0]:
                    label_seq.append("B-MOVIE")
                    labeled = True
                    break
                elif token in movie.split()[1:]:
                    label_seq.append("I-MOVIE")
                    labeled = True
                    break
            if not labeled:
                label_seq.append("O")

        # tokens to lower
        tokens = [token.lower() for token in tokens]

        # Append to the list
        sentences.append(tokens)
        labels.append(label_seq)

    return sentences, labels

# Generate 200 examples
tokens, ner_tags = generate_examples(entities, templates, num_examples=3000)

In [40]:
id = 132
print(tokens[id])
print(ner_tags[id])

['can', 'you', 'name', 'any', 'actors', 'from', 'naruto', 'shippuden', 'the', 'movie:', 'the', 'lost', 'tower']
['O', 'O', 'O', 'O', 'O', 'O', 'B-MOVIE', 'I-MOVIE', 'I-MOVIE', 'I-MOVIE', 'I-MOVIE', 'I-MOVIE', 'I-MOVIE']


In [34]:
# create a checkpoint of the created dataset with pickle?
import pickle

with open('data_for_ner.pkl', 'wb') as f:
    pickle.dump((tokens, ner_tags), f)


In [35]:
import torch
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import random
from transformers import DataCollatorForTokenClassification

# Load your dataset in the correct format
train_data = {
    'tokens': tokens,
    'ner_tags': ner_tags
}

# Map NER tags to labels (B-RELATION, B-MOVIE, etc.)
label_list = ["O", "B-MOVIE", "I-MOVIE"]

# Create the mapping from label to index
label_map = {label: i for i, label in enumerate(label_list)}

# Convert ner_tags to numerical values based on label_map
def convert_labels_to_ids(ner_tags):
    converted_tags = []
    for labels in ner_tags:
        converted_tags.append([label_map[label] for label in labels])
    return converted_tags

# Apply label conversion
train_data['ner_tags'] = convert_labels_to_ids(train_data['ner_tags'])

# Convert the dataset into Hugging Face Dataset format
dataset = Dataset.from_dict(train_data)

# Split the dataset into training and validation sets (80/20 split)
train_test_split = dataset.train_test_split(test_size=0.2)
datasets = DatasetDict({"train": train_test_split["train"], "test": train_test_split["test"]})

# Load a pre-trained NER-finetuned tokenizer and model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    ignore_mismatched_sizes=True  # Ignore mismatched sizes and initialize the classifier head for your label set
)

# Tokenize the dataset
# Function to tokenize and align labels
def tokenize_and_align_labels(examples, tokenizer):
    tokenized_inputs = tokenizer(
        examples['tokens'], 
        truncation=True,  # Activate truncation
        # padding=True,     # Activate padding
        is_split_into_words=True  # Ensure input is treated as pre-tokenized (word level)
    )

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get word IDs for each token
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignored label for special tokens and padding
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # Assign the label to the first token of a word
            else:
                label_ids.append(-100)  # Assign ignored label to sub-tokens
            previous_word_idx = word_idx
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization and label alignment to the dataset
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True, fn_kwargs={'tokenizer': tokenizer})

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

# Train Dataset

In [36]:
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score
import numpy as np

# Map label IDs back to label names
id_to_label = {v: k for k, v in label_map.items()}

# Function to align predictions and compute metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [id_to_label[label_id] for label_id in label_seq if label_id != -100]  # Ignore padding tokens
        for label_seq in labels
    ]
    pred_labels = [
        [id_to_label[pred_id] for pred_id, label_id in zip(pred_seq, label_seq) if label_id != -100]
        for pred_seq, label_seq in zip(predictions, labels)
    ]

    # Use seqeval to calculate metrics
    precision = precision_score(true_labels, pred_labels)
    recall = recall_score(true_labels, pred_labels)
    f1 = f1_score(true_labels, pred_labels)
    
    print("\nClassification Report:\n", classification_report(true_labels, pred_labels))

    return {"precision": precision, "recall": recall, "f1": f1}

# Define training arguments
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)
model.to(device)

data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",  # Save model checkpoints for each epoch
    evaluation_strategy="epoch",  # Evaluate after each epoch
    metric_for_best_model="f1",  # Select the best model based on F1-score
    load_best_model_at_end=True,  # Automatically load the best model
)

# Create Trainer instance with custom compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained("./custom_ner_model")
tokenizer.save_pretrained("./custom_ner_model")


Using device: cpu




  0%|          | 0/750 [00:00<?, ?it/s]

{'loss': 0.8788, 'grad_norm': 2.002763509750366, 'learning_rate': 1.9733333333333336e-05, 'epoch': 0.07}
{'loss': 0.7064, 'grad_norm': 1.829643964767456, 'learning_rate': 1.9466666666666668e-05, 'epoch': 0.13}
{'loss': 0.5752, 'grad_norm': 1.1827261447906494, 'learning_rate': 1.9200000000000003e-05, 'epoch': 0.2}
{'loss': 0.4684, 'grad_norm': 1.119017481803894, 'learning_rate': 1.8933333333333334e-05, 'epoch': 0.27}
{'loss': 0.4225, 'grad_norm': 1.4572210311889648, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.33}
{'loss': 0.2958, 'grad_norm': 1.1690351963043213, 'learning_rate': 1.8400000000000003e-05, 'epoch': 0.4}
{'loss': 0.203, 'grad_norm': 2.503082752227783, 'learning_rate': 1.8133333333333335e-05, 'epoch': 0.47}
{'loss': 0.1727, 'grad_norm': 3.6340677738189697, 'learning_rate': 1.7866666666666666e-05, 'epoch': 0.53}
{'loss': 0.1042, 'grad_norm': 2.7655110359191895, 'learning_rate': 1.76e-05, 'epoch': 0.6}
{'loss': 0.1055, 'grad_norm': 1.9958871603012085, 'learning_rate': 1.

  0%|          | 0/38 [00:00<?, ?it/s]


Classification Report:
               precision    recall  f1-score   support

       MOVIE       0.93      0.89      0.91       990

   micro avg       0.93      0.89      0.91       990
   macro avg       0.93      0.89      0.91       990
weighted avg       0.93      0.89      0.91       990

{'eval_loss': 0.0769214853644371, 'eval_precision': 0.933972310969116, 'eval_recall': 0.8858585858585859, 'eval_f1': 0.9092794193882842, 'eval_runtime': 30.6911, 'eval_samples_per_second': 19.55, 'eval_steps_per_second': 1.238, 'epoch': 1.0}
{'loss': 0.0847, 'grad_norm': 1.6641005277633667, 'learning_rate': 1.5733333333333334e-05, 'epoch': 1.07}
{'loss': 0.0849, 'grad_norm': 2.902892827987671, 'learning_rate': 1.546666666666667e-05, 'epoch': 1.13}
{'loss': 0.0917, 'grad_norm': 2.683138370513916, 'learning_rate': 1.5200000000000002e-05, 'epoch': 1.2}
{'loss': 0.0649, 'grad_norm': 1.7959421873092651, 'learning_rate': 1.4933333333333335e-05, 'epoch': 1.27}
{'loss': 0.0872, 'grad_norm': 1.60936486

  0%|          | 0/38 [00:00<?, ?it/s]


Classification Report:
               precision    recall  f1-score   support

       MOVIE       0.95      0.89      0.92       990

   micro avg       0.95      0.89      0.92       990
   macro avg       0.95      0.89      0.92       990
weighted avg       0.95      0.89      0.92       990

{'eval_loss': 0.06668711453676224, 'eval_precision': 0.9463519313304721, 'eval_recall': 0.8909090909090909, 'eval_f1': 0.9177939646201873, 'eval_runtime': 25.7396, 'eval_samples_per_second': 23.31, 'eval_steps_per_second': 1.476, 'epoch': 2.0}
{'loss': 0.0419, 'grad_norm': 0.6722897887229919, 'learning_rate': 1.1733333333333335e-05, 'epoch': 2.07}
{'loss': 0.0387, 'grad_norm': 1.158530831336975, 'learning_rate': 1.1466666666666668e-05, 'epoch': 2.13}
{'loss': 0.0279, 'grad_norm': 0.5009326338768005, 'learning_rate': 1.1200000000000001e-05, 'epoch': 2.2}
{'loss': 0.0468, 'grad_norm': 5.206861972808838, 'learning_rate': 1.0933333333333334e-05, 'epoch': 2.27}
{'loss': 0.0711, 'grad_norm': 3.22708

  0%|          | 0/38 [00:00<?, ?it/s]


Classification Report:
               precision    recall  f1-score   support

       MOVIE       0.94      0.91      0.93       990

   micro avg       0.94      0.91      0.93       990
   macro avg       0.94      0.91      0.93       990
weighted avg       0.94      0.91      0.93       990

{'eval_loss': 0.048049092292785645, 'eval_precision': 0.9433962264150944, 'eval_recall': 0.9090909090909091, 'eval_f1': 0.9259259259259259, 'eval_runtime': 19.2555, 'eval_samples_per_second': 31.16, 'eval_steps_per_second': 1.973, 'epoch': 3.0}
{'loss': 0.0446, 'grad_norm': 1.9249995946884155, 'learning_rate': 7.733333333333334e-06, 'epoch': 3.07}
{'loss': 0.045, 'grad_norm': 2.155670404434204, 'learning_rate': 7.4666666666666675e-06, 'epoch': 3.13}
{'loss': 0.0317, 'grad_norm': 2.195035457611084, 'learning_rate': 7.2000000000000005e-06, 'epoch': 3.2}
{'loss': 0.0272, 'grad_norm': 0.9700429439544678, 'learning_rate': 6.9333333333333344e-06, 'epoch': 3.27}
{'loss': 0.0342, 'grad_norm': 4.487111

  0%|          | 0/38 [00:00<?, ?it/s]


Classification Report:
               precision    recall  f1-score   support

       MOVIE       0.94      0.92      0.93       990

   micro avg       0.94      0.92      0.93       990
   macro avg       0.94      0.92      0.93       990
weighted avg       0.94      0.92      0.93       990

{'eval_loss': 0.046344298869371414, 'eval_precision': 0.9392378990731205, 'eval_recall': 0.9212121212121213, 'eval_f1': 0.9301376848546661, 'eval_runtime': 14.8093, 'eval_samples_per_second': 40.515, 'eval_steps_per_second': 2.566, 'epoch': 4.0}
{'loss': 0.0361, 'grad_norm': 0.6429855823516846, 'learning_rate': 3.7333333333333337e-06, 'epoch': 4.07}
{'loss': 0.0229, 'grad_norm': 1.1059170961380005, 'learning_rate': 3.4666666666666672e-06, 'epoch': 4.13}
{'loss': 0.0172, 'grad_norm': 0.8281331062316895, 'learning_rate': 3.2000000000000003e-06, 'epoch': 4.2}
{'loss': 0.0376, 'grad_norm': 0.08562179654836655, 'learning_rate': 2.9333333333333338e-06, 'epoch': 4.27}
{'loss': 0.0438, 'grad_norm': 2.

  0%|          | 0/38 [00:00<?, ?it/s]


Classification Report:
               precision    recall  f1-score   support

       MOVIE       0.95      0.93      0.94       990

   micro avg       0.95      0.93      0.94       990
   macro avg       0.95      0.93      0.94       990
weighted avg       0.95      0.93      0.94       990

{'eval_loss': 0.04551567882299423, 'eval_precision': 0.9463364293085655, 'eval_recall': 0.9262626262626262, 'eval_f1': 0.9361919346605411, 'eval_runtime': 17.5028, 'eval_samples_per_second': 34.28, 'eval_steps_per_second': 2.171, 'epoch': 5.0}
{'train_runtime': 1489.2611, 'train_samples_per_second': 8.058, 'train_steps_per_second': 0.504, 'train_loss': 0.09490114943186442, 'epoch': 5.0}


SafetensorError: Error while serializing: IoError(Os { code: 1224, kind: Uncategorized, message: "The requested operation cannot be performed on a file with a user-mapped section open." })

In [37]:
model.save_pretrained("./custom_ner_model1")
tokenizer.save_pretrained("./custom_ner_model1")

('./custom_ner_model1\\tokenizer_config.json',
 './custom_ner_model1\\special_tokens_map.json',
 './custom_ner_model1\\vocab.txt',
 './custom_ner_model1\\added_tokens.json',
 './custom_ner_model1\\tokenizer.json')

# Test a sentence

In [43]:
def predict_entities(input_text, model, tokenizer):
    # Define the label list (same as the training script)
    label_list = ["O", "B-MOVIE", "I-MOVIE"]
    
    # Split the input text into words
    words = input_text.split()
    
    # Tokenize the input with `is_split_into_words=True`
    tokenized_input = tokenizer(words, return_tensors="pt", is_split_into_words=True, truncation=True)
    
    # Move inputs to GPU if available
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    inputs = {key: value.to(device) for key, value in tokenized_input.items()}

    # Run the model and get predictions
    with torch.no_grad():
        logits = model(**inputs).logits
    
    predictions = torch.argmax(logits, dim=2)

    # Get the word IDs to map tokens back to words
    word_ids = tokenized_input.word_ids(batch_index=0)
    
    # Initialize variables
    previous_word_idx = None
    labels = []
    
    for word_idx, pred_id in zip(word_ids, predictions[0]):
        if word_idx is None:
            continue
        elif word_idx != previous_word_idx:
            # Start of a new word
            label = label_list[pred_id.item()]
            labels.append(label)
            previous_word_idx = word_idx
    
    # Combine words and their predicted labels
    output_text = ''
    for word, label in zip(words, labels):
        if label != 'O':
            output_text += f"[{word} ({label})] "
        else:
            output_text += f"{word} "
    
    print(output_text)


In [45]:
import torch

# Assuming the model and tokenizer are already loaded into memory
# Remove loading code and pass the model and tokenizer as parameters


predict_entities("Who is the director of star wars jedi - 4?", model, tokenizer)
predict_entities("Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies", model, tokenizer)

Who is the director of [star (B-MOVIE)] [wars (I-MOVIE)] [jedi (I-MOVIE)] [- (I-MOVIE)] [4? (I-MOVIE)] 
Given that I like [The (B-MOVIE)] [Lion (I-MOVIE)] King, Pocahontas, [and (I-MOVIE)] [The (B-MOVIE)] [Beauty (I-MOVIE)] [and (I-MOVIE)] [the (I-MOVIE)] Beast, can you recommend some movies 
