<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/Natural-Language-Processing/NER-MIT-Movie-Corpus/ner_mit_movie_corpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [5]:
!pip install -q datasets

In [6]:
import tensorflow as tf
import numpy as np
import datasets
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.utils import pad_sequences

# Pipeline

In [16]:
def load_movie_data():
    """Load the "tner/mit_movie_trivia" dataset and return its splits and tag names.

    Returns:
        A tuple ``(train_data, test_data, label_names)`` where the first two
        are the dataset's "train" and "test" splits and ``label_names`` is a
        list of BIO tag names indexed by tag id.

    NOTE(review): ``label_names`` is written by hand here, not read from the
    dataset (its 'tags' feature is a plain int sequence, not a ClassLabel).
    prepare_data() remaps tag ids beyond this list to 'O', which suggests the
    dataset may define more ids than are listed -- confirm these names and
    their order against the dataset card's label2id mapping.
    """
    print("[INFO] Loading MIT Movie Corpus...")
    corpus = datasets.load_dataset("tner/mit_movie_trivia")
    train_split, test_split = corpus["train"], corpus["test"]

    # Dump the feature schema so the tag encoding can be inspected.
    print("[DEBUG] Available features in train_data:", train_split.features)

    # Build "O" followed by a B-/I- pair for every entity type, in id order.
    entity_types = ("MOV", "ACT", "DIR", "GEN", "RAT", "COM")
    label_names = ["O"]
    for entity in entity_types:
        label_names += [f"B-{entity}", f"I-{entity}"]

    print(f"[INFO] {len(label_names)} Found, Tags: {label_names}")

    return train_split, test_split, label_names

def prepare_data(data_split, label_names):
    """Convert a dataset split into sentences and shifted tag-id sequences.

    Every tag id is shifted up by one so that 0 is free to act as the padding
    label. Shifted ids larger than ``len(label_names)`` have no name in
    ``label_names``; they are remapped to 1 (the shifted id of the first
    label, assumed to be 'O'), and a warning reporting how many were remapped
    is printed so the substitution is not silent.

    Args:
        data_split: Iterable of items exposing ``item["tokens"]`` (sequence
            of strings) and ``item["tags"]`` (sequence of integer tag ids).
        label_names: List of tag names; only its length is used here.

    Returns:
        A tuple ``(sentences, labels)``: ``sentences`` holds each item's
        tokens joined by single spaces, ``labels`` holds the matching list of
        shifted tag ids.
    """
    o_tag_id = 1  # assumes label_names[0] is 'O', i.e. id 0 before the shift
    max_valid_shifted_label_id = len(label_names)

    sentences = []
    labels = []
    remapped_count = 0

    for item in data_split:
        sentences.append(" ".join(item["tokens"]))

        shifted_tags = []
        for tag in item["tags"]:
            # Shift by 1: 0 becomes padding, real tags start from 1.
            shifted_tag = tag + 1
            if shifted_tag > max_valid_shifted_label_id:
                # No name exists for this id; fall back to 'O' but keep count.
                remapped_count += 1
                shifted_tag = o_tag_id
            shifted_tags.append(shifted_tag)
        labels.append(shifted_tags)

    if remapped_count:
        print(
            f"[WARN] {remapped_count} tag(s) had ids outside the "
            f"{len(label_names)} known labels and were remapped to 'O'; "
            "check label_names against the dataset's label mapping."
        )

    return sentences, labels

def create_pipeline(
        train_texts, train_labels, test_texts, test_labels, max_tokens=10_000, seq_len=40,
        batch_size=64):
    """Vectorize the texts, pad the labels, and build batched tf.data datasets.

    Args:
        train_texts: Training sentences (tokens joined by single spaces).
        train_labels: Per-sentence lists of integer tag ids for training.
        test_texts: Test sentences.
        test_labels: Per-sentence lists of integer tag ids for testing.
        max_tokens: Vocabulary size cap passed to ``TextVectorization``.
        seq_len: Fixed length that both token ids and labels are padded or
            truncated to.
        batch_size: Number of examples per batch in both datasets.

    Returns:
        A tuple ``(train_ds, test_ds, vectorizer)``: the shuffled training
        dataset, the test dataset, and the adapted ``TextVectorization``
        layer.
    """
    print("[INFO] Adapting TextVectorizatoin...")
    # standardize="lower" (instead of the default, which also strips
    # punctuation) keeps every whitespace-separated token, so the number of
    # token ids stays equal to the number of tags per sentence.
    # NOTE(review): assumes no source token is empty or contains whitespace.
    vectorizer = TextVectorization(
        max_tokens=max_tokens,
        standardize="lower",
        output_mode="int",
        output_sequence_length=seq_len,
    )
    vectorizer.adapt(train_texts)

    # Convert texts to fixed-length sequences of integer token ids.
    X_train = vectorizer(train_texts)
    X_test = vectorizer(test_texts)

    print("[INFO] Padding labels...")
    # truncating="post" drops labels from the END of over-long sentences,
    # matching TextVectorization, which keeps the first `seq_len` tokens.
    # The pad_sequences default ("pre") would drop labels from the start and
    # misalign tokens and tags for every sentence longer than `seq_len`.
    y_train = pad_sequences(
        train_labels, maxlen=seq_len, padding="post", truncating="post", value=0)
    y_test = pad_sequences(
        test_labels, maxlen=seq_len, padding="post", truncating="post", value=0)

    print("[INFO] Creating TF Datasets...")
    # Training data is shuffled before batching; test data keeps its order.
    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_ds = train_ds.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
    test_ds = test_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    print("[INFO] Text corpus trained, padded and tensorflow dataset has been created.")
    return train_ds, test_ds, vectorizer


def build_ner_model(vocab_size, num_classes, embedding_dim=128,
                    lstm_units=128, dropout_rate=0.3):
    """Build an (uncompiled) sequential BiLSTM token-classification model.

    Layers, in order: Embedding -> Bidirectional(LSTM, return_sequences=True)
    -> Dropout -> Dense(softmax), so one class distribution is produced for
    every input position.

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        num_classes: Number of output units of the final softmax layer.
        embedding_dim: ``output_dim`` of the embedding layer.
        lstm_units: Units of the LSTM wrapped by ``Bidirectional``.
        dropout_rate: Rate of the dropout layer placed after the BiLSTM.

    Returns:
        The ``tf.keras.Sequential`` model; compiling is left to the caller.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        # return_sequences=True keeps one output per position, as required
        # for per-token tagging.
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(lstm_units, return_sequences=True)),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(num_classes, activation="softmax"),
    ])
    return model


def predict_movie_query(model, vectorizer, query, label_names):
    """Predict a tag for each word of ``query``, print a table, and return it.

    Args:
        model: Object whose ``predict(inputs, verbose=0)`` returns class
            scores; the first element of the result is used and the argmax
            over its last axis gives one class id per position.
        vectorizer: Callable that converts ``[query]`` into the model input.
        query: Text to tag; it is split on whitespace for display.
        label_names: Tag names. Class id ``k >= 1`` maps to
            ``label_names[k - 1]`` (undoing the +1 shift applied during data
            preparation); class id 0 is reported as "PAD".

    Returns:
        A list of ``(word, tag_name)`` tuples in query order. Words beyond
        the number of positions the model produced are omitted.

    NOTE(review): word *i* is paired with prediction *i*, which is only
    correct if the vectorizer tokenizes exactly like ``str.split`` -- confirm
    against the vectorizer's standardization settings.
    """
    # Prepare input query for prediction.
    input_ids = vectorizer([query])

    # Class scores per position for the single query in the batch.
    predictions = model.predict(input_ids, verbose=0)[0]
    # Id of the most probable tag at each position.
    predicted_ids = np.argmax(predictions, axis=-1)

    words = query.split()
    print(f"\nQUERY: {query}")
    print("=" * 50)
    print(f"{'TOKEN':<20} {'PREDICTED TAG':<20}")
    print("-" * 50)

    tagged = []
    # zip stops at the shorter input, so a query LONGER than the model's
    # sequence length is cut off at the last predicted position.
    for word, tag_id in zip(words, predicted_ids):
        tag_id = int(tag_id)
        if tag_id == 0:
            tag_name = "PAD"  # Padding class
        elif tag_id <= len(label_names):
            tag_name = label_names[tag_id - 1]
        else:
            tag_name = "UNKNOWN_TAG"  # Fallback for unexpected class ids

        print(f"{word:<20} {tag_name:<20}")
        tagged.append((word, tag_name))

    return tagged

def main():
    """Run the whole demo: load data, train the BiLSTM tagger, tag sample queries."""
    # Raw splits plus the tag vocabulary.
    raw_train, raw_test, label_names = load_movie_data()

    # Sentences and shifted tag ids for each split.
    train_sentences, train_tags = prepare_data(raw_train, label_names)
    test_sentences, test_tags = prepare_data(raw_test, label_names)

    # One extra class is reserved for the padding label (id 0).
    class_count = 1 + len(label_names)
    print(f"[INFO] Number of classes (Including PAD): {class_count}")

    # Text-processing hyperparameters: vocabulary cap and sequence length.
    train_ds, test_ds, vectorizer = create_pipeline(
        train_sentences,
        train_tags,
        test_sentences,
        test_tags,
        max_tokens=10_000,
        seq_len=30,
    )

    print("[INFO] Initializing BiLSTM Model....")
    vocab_size = len(vectorizer.get_vocabulary())
    model = build_ner_model(vocab_size, class_count)

    # Integer targets, hence the sparse variant of categorical crossentropy.
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    print("\n[INFO] Model initialized. Model Summary:\n")
    model.summary()

    print("\n[INFO] Training Model...")
    # The test split doubles as validation data during training.
    model.fit(train_ds, epochs=5, validation_data=test_ds)

    print("\n[INFO] Testing Model...")
    sample_queries = (
        "play the dark knight by christopher nolan",
        "show me funny movies with jim carrey",
        "i want to watch a horror movie from the 90s",
    )
    for sample in sample_queries:
        predict_movie_query(model, vectorizer, sample, label_names)

# Run the full pipeline when this cell executes: data loading, model
# training, and tagging of the sample queries.
main()


[INFO] Loading MIT Movie Corpus...
[DEBUG] Available features in train_data: {'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'tags': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}
[INFO] 13 Found, Tags: ['O', 'B-MOV', 'I-MOV', 'B-ACT', 'I-ACT', 'B-DIR', 'I-DIR', 'B-GEN', 'I-GEN', 'B-RAT', 'I-RAT', 'B-COM', 'I-COM']
[INFO] Number of classes (Including PAD): 14
[INFO] Adapting TextVectorizatoin...
[INFO] Padding labels...
[INFO] Creating TF Datasets...
[INFO] Text corpus trained, padded and tensorflow dataset has been created.
[INFO] Initializing BiLSTM Model....

[INFO] Model initialized. Model Summary:




[INFO] Training Model...
Epoch 1/5
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 156ms/step - accuracy: 0.6629 - loss: 1.2851 - val_accuracy: 0.8168 - val_loss: 0.5938
Epoch 2/5
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 143ms/step - accuracy: 0.8276 - loss: 0.5543 - val_accuracy: 0.8626 - val_loss: 0.4289
Epoch 3/5
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 142ms/step - accuracy: 0.8800 - loss: 0.3817 - val_accuracy: 0.8872 - val_loss: 0.3579
Epoch 4/5
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 142ms/step - accuracy: 0.9079 - loss: 0.2919 - val_accuracy: 0.8939 - val_loss: 0.3377
Epoch 5/5
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 148ms/step - accuracy: 0.9217 - loss: 0.2463 - val_accuracy: 0.8942 - val_loss: 0.3356

[INFO] Testing Model...

QUERY: play the dark knight by christopher nolan
TOKEN                PREDICTED TAG       
-------------------------------------