# Import Necessary Libraries


In [5]:
import pyconll
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
import os
import warnings
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import TFAutoModelForTokenClassification, create_optimizer
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

# Data Preprocessing

## Load Data

In [6]:
def load_conllu_data(file_path):
    """Load and parse the CoNLLU file to extract sentences and their POS tags."""
    sentences = []
    pos_tags = []

    # Load the data file
    data = pyconll.load_from_file(file_path)

    for sentence in data:
        words = []
        tags = []

        for token in sentence:
            # Skip empty tokens or tokens without form/UPOS
            if not token.form or not token.upos:
                continue

            words.append(token.form)
            tags.append(token.upos)

        if words and tags:  # Only add non-empty sentences
            sentences.append(words)
            pos_tags.append(tags)

    return sentences, pos_tags

# Load the dataset
file_path = "Arabic_POS.conllu"
sentences, pos_tags = load_conllu_data(file_path)

print(f"Loaded {len(sentences)} sentences")

# Display a sample sentence with its tags
if sentences:
    sample_idx = 0
    print("\nSample sentence:")
    for word, tag in zip(sentences[sample_idx], pos_tags[sample_idx]):
        print(f"{word} → {tag}")

# Create dictionary mapping of UPOS tags to IDs
unique_tags = sorted(list(set(tag for sent_tags in pos_tags for tag in sent_tags)))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

print(f"\nUnique UPOS tags ({len(unique_tags)}):")
print(unique_tags)

Loaded 1904 sentences

Sample sentence:
برلين → X
ترفض → VERB
حصول → NOUN
شركة → NOUN
اميركية → ADJ
على → ADP
رخصة → NOUN
تصنيع → NOUN
دبابة → NOUN
" → PUNCT
ليوبارد → X
" → PUNCT
الالمانية → ADJ

Unique UPOS tags (16):
['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']


## Split The Data

In [7]:
# Split the data into train, validation, and test sets (70%, 15%, 15%)
train_sentences, temp_sentences, train_pos, temp_pos = train_test_split(
    sentences, pos_tags, test_size=0.3, random_state=42)
val_sentences, test_sentences, val_pos, test_pos = train_test_split(
    temp_sentences, temp_pos, test_size=0.5, random_state=42)

print(f"Train set: {len(train_sentences)} sentences")
print(f"Validation set: {len(val_sentences)} sentences")
print(f"Test set: {len(test_sentences)} sentences")

Train set: 1332 sentences
Validation set: 286 sentences
Test set: 286 sentences


## Tokenization and Alignment

In [8]:
def encode_tags(tags, tag2id, tokenized_input, max_length):
    """Encode tags and align with wordpiece tokens."""
    encoded_tags = np.ones(max_length, dtype=int) * -100  # Initialize with -100 (ignored in loss)

    # Get word_ids from the tokenized input
    word_ids = tokenized_input.word_ids()

    # Map tags to IDs and align with wordpiece tokens
    previous_word_idx = None
    for i, word_idx in enumerate(word_ids):
        # Special tokens have word_idx set to None
        if word_idx is None:
            continue

        # If it's the first token of the word, assign the tag
        if word_idx != previous_word_idx and word_idx < len(tags):
            encoded_tags[i] = tag2id[tags[word_idx]]

        previous_word_idx = word_idx

    return encoded_tags

def prepare_tf_dataset(sentences, tags, tokenizer, tag2id, batch_size=16, shuffle=True, max_length=128):
    """Prepare a TensorFlow Dataset for token classification."""
    input_ids_list = []
    attention_mask_list = []
    labels_list = []

    for sentence_tokens, sentence_tags in zip(sentences, tags):
        # Tokenize the sentence
        encodings = tokenizer(
            sentence_tokens,
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_attention_mask=True,
            return_offsets_mapping=True,  # This helps track original word positions
        )

        # Encode and align tags
        encoded_tags = encode_tags(
            sentence_tags,
            tag2id,
            encodings,
            max_length
        )

        input_ids_list.append(encodings['input_ids'])
        attention_mask_list.append(encodings['attention_mask'])
        labels_list.append(encoded_tags)

    # Convert to TensorFlow tensors
    input_ids = tf.convert_to_tensor(input_ids_list)
    attention_mask = tf.convert_to_tensor(attention_mask_list)
    labels = tf.convert_to_tensor(labels_list)

    # Create TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices(({
        'input_ids': input_ids,
        'attention_mask': attention_mask
    }, labels))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(sentences))

    dataset = dataset.batch(batch_size)

    return dataset

# Initialize tokenizer with a pre-trained Arabic BERT model
model_checkpoint = "asafaya/bert-base-arabic"  # You can change this to other models
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Create TensorFlow datasets
train_dataset = prepare_tf_dataset(train_sentences, train_pos, tokenizer, tag2id)
val_dataset = prepare_tf_dataset(val_sentences, val_pos, tokenizer, tag2id, shuffle=False)
test_dataset = prepare_tf_dataset(test_sentences, test_pos, tokenizer, tag2id, shuffle=False)

# Get a batch to inspect
for batch in train_dataset.take(1):
    inputs, labels = batch
    print("\nSample batch:")
    print(f"Input IDs shape: {inputs['input_ids'].shape}")
    print(f"Attention mask shape: {inputs['attention_mask'].shape}")
    print(f"Labels shape: {labels.shape}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/334k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]


Sample batch:
Input IDs shape: (16, 128)
Attention mask shape: (16, 128)
Labels shape: (16, 128)


# 2. Model Building

In [9]:
def build_model(model_name, num_labels):
    """Initialize a pre-trained transformer model for token classification."""
    # Load pre-trained model with a token classification head
    model = TFAutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        from_pt=True
    )

    return model

def align_predictions(predictions, labels):
    """Convert predictions and labels to format suitable for seqeval."""
    preds = np.argmax(predictions, axis=2)

    batch_size, seq_len = preds.shape

    pred_list = [[] for _ in range(batch_size)]
    label_list = [[] for _ in range(batch_size)]

    for i in range(batch_size):
        for j in range(seq_len):
            if labels[i, j] != -100:
                pred_list[i].append(id2tag[preds[i, j]])
                label_list[i].append(id2tag[labels[i, j]])

    return pred_list, label_list

def compute_metrics(predictions, labels):
    """Compute evaluation metrics."""
    pred_tags, true_tags = align_predictions(predictions, labels)

    accuracy = accuracy_score(true_tags, pred_tags)
    f1 = f1_score(true_tags, pred_tags)
    precision = precision_score(true_tags, pred_tags)
    recall = recall_score(true_tags, pred_tags)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

def create_custom_loss():
    """Create a custom loss function that ignores padding tokens."""
    def custom_sparse_categorical_crossentropy(y_true, y_pred):
        # Create mask to ignore padded tokens (labels = -100)
        mask = tf.not_equal(y_true, -100)

        # Adjust labels to be non-negative for loss calculation
        y_true_adjusted = tf.where(mask, y_true, 0)

        # Convert mask to float
        mask_float = tf.cast(mask, dtype=tf.float32)

        # Calculate loss
        loss = tf.keras.losses.sparse_categorical_crossentropy(
            y_true_adjusted, y_pred, from_logits=True
        )

        # Apply mask and calculate mean
        masked_loss = loss * mask_float
        total_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(mask_float)

        return total_loss

    return custom_sparse_categorical_crossentropy

# 3. Training and Evaluation

In [10]:
# Suppress TensorFlow logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # 1: INFO, 2: WARNING, 3: ERROR

# Create a filter function for seqeval warnings
def filter_seqeval_warnings(message, category, filename, lineno, file=None, line=None):
    if category == UserWarning and "seems not to be NE tag" in str(message):
        return None  # Suppress the warning
    return True  # Show all other warnings

# Set the warning filter
warnings.filterwarnings("always")  # Reset all filters
warnings.showwarning = filter_seqeval_warnings

# Disable tqdm progress bars
tqdm.pandas = lambda *args, **kwargs: lambda x: x

def train_and_evaluate(model_name, train_dataset, val_dataset, test_dataset, epochs=3, learning_rate=5e-5):
    """Train and evaluate a model for POS tagging."""
    # Build the model
    model = build_model(model_name, len(tag2id))

    # Create optimizer with learning rate schedule
    steps_per_epoch = len(list(train_dataset))
    total_train_steps = steps_per_epoch * epochs
    optimizer, lr_schedule = create_optimizer(
        init_lr=learning_rate,
        num_train_steps=total_train_steps,
        num_warmup_steps=int(0.1 * total_train_steps)
    )

    # Compile the model
    model.compile(
        optimizer=optimizer,
        loss=create_custom_loss()
    )

    # Define callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=2,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ModelCheckpoint(
            filepath=f'./checkpoints/{model_name.split("/")[-1]}',
            monitor='val_loss',
            save_best_only=True
        )
    ]

    # Train the model
    print(f"\n{'='*50}")
    print(f"Training model: {model_name}")
    print(f"{'='*50}\n")

    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=epochs,
        callbacks=callbacks
    )

    # Evaluate on validation set
    print(f"\n{'='*50}")
    print("EVALUATION RESULTS")
    print(f"{'='*50}\n")

    print("Evaluating on validation set:")
    val_predictions = model.predict(val_dataset)
    val_labels = np.concatenate([labels.numpy() for _, labels in val_dataset], axis=0)
    val_metrics = compute_metrics(val_predictions.logits, val_labels)

    print("\nValidation metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric}: {value:.4f}")

    # Evaluate on test set
    print("\nEvaluating on test set:")
    test_predictions = model.predict(test_dataset)
    test_labels = np.concatenate([labels.numpy() for _, labels in test_dataset], axis=0)
    test_metrics = compute_metrics(test_predictions.logits, test_labels)

    print("\nTest metrics:")
    for metric, value in test_metrics.items():
        print(f"{metric}: {value:.4f}")

    return model, history, val_metrics, test_metrics

In [None]:
# Train and evaluate the model
model, history, val_metrics, test_metrics = train_and_evaluate(
    model_checkpoint,
    train_dataset,
    val_dataset,
    test_dataset
)

# Save the model
model.save_pretrained("./tf_arabic_pos_model")
tokenizer.save_pretrained("./tf_arabic_pos_tokenizer")

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training model: asafaya/bert-base-arabic

Epoch 1/3
Epoch 2/3
Epoch 3/3

EVALUATION RESULTS

Evaluating on validation set:

Validation metrics:
accuracy: 0.9714
f1: 0.9591
precision: 0.9604
recall: 0.9577

Evaluating on test set:

Test metrics:
accuracy: 0.9704
f1: 0.9580
precision: 0.9597
recall: 0.9562


('./tf_arabic_pos_tokenizer/tokenizer_config.json',
 './tf_arabic_pos_tokenizer/special_tokens_map.json',
 './tf_arabic_pos_tokenizer/vocab.txt',
 './tf_arabic_pos_tokenizer/added_tokens.json',
 './tf_arabic_pos_tokenizer/tokenizer.json')

# 4. Model Comparison

In [11]:
def compare_models(models_to_compare, epochs=3):
    """Compare different models for Arabic POS tagging."""
    results = []

    for model_name in models_to_compare:
        print(f"\n===== Training and evaluating {model_name} =====")

        # Initialize tokenizer for the current model
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Prepare datasets with the current tokenizer
        train_dataset = prepare_tf_dataset(train_sentences, train_pos, tokenizer, tag2id)
        val_dataset = prepare_tf_dataset(val_sentences, val_pos, tokenizer, tag2id, shuffle=False)
        test_dataset = prepare_tf_dataset(test_sentences, test_pos, tokenizer, tag2id, shuffle=False)

        # Train and evaluate the model
        model, history, val_metrics, test_metrics = train_and_evaluate(
            model_name,
            train_dataset,
            val_dataset,
            test_dataset,
            epochs=epochs
        )

        # Store results
        results.append({
            'model': model_name,
            'val_accuracy': val_metrics['accuracy'],
            'test_accuracy': test_metrics['accuracy'],
            'test_f1': test_metrics['f1']
        })

        # Save the model
        model.save_pretrained(f"./tf_{model_name.split('/')[-1]}")
        tokenizer.save_pretrained(f"./tf_{model_name.split('/')[-1]}_tokenizer")

    # Display comparison results
    print("\n===== Model Comparison =====")
    print(f"{'Model':<25} {'Val Accuracy':<15} {'Test Accuracy':<15} {'Test F1':<15}")
    print("-" * 70)
    for result in results:
        print(f"{result['model'].split('/')[-1]:<25} {result['val_accuracy']:.4f}{' '*10} {result['test_accuracy']:.4f}{' '*10} {result['test_f1']:.4f}")

    return results

In [None]:
# Models to compare
models_to_compare = [
    "asafaya/bert-base-arabic",           # Arabic BERT
    "bert-base-multilingual-cased",       # Multilingual BERT
    "aubmindlab/bert-base-arabertv02"     # AraBERT v2
]

# Compare models (uncomment to run)
model_comparison = compare_models(models_to_compare)


===== Training and evaluating asafaya/bert-base-arabic =====


All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training model: asafaya/bert-base-arabic

Epoch 1/3
Epoch 2/3
Epoch 3/3

EVALUATION RESULTS

Evaluating on validation set:

Validation metrics:
accuracy: 0.9720
f1: 0.9601
precision: 0.9621
recall: 0.9581

Evaluating on test set:

Test metrics:
accuracy: 0.9720
f1: 0.9601
precision: 0.9625
recall: 0.9577

===== Training and evaluating bert-base-multilingual-cased =====


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training model: bert-base-multilingual-cased

Epoch 1/3
Epoch 2/3
Epoch 3/3

EVALUATION RESULTS

Evaluating on validation set:

Validation metrics:
accuracy: 0.9686
f1: 0.9554
precision: 0.9582
recall: 0.9526

Evaluating on test set:

Test metrics:
accuracy: 0.9682
f1: 0.9552
precision: 0.9570
recall: 0.9534

===== Training and evaluating aubmindlab/bert-base-arabertv02 =====


tokenizer_config.json:   0%|          | 0.00/381 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/825k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForTokenClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training model: aubmindlab/bert-base-arabertv02

Epoch 1/3
Epoch 2/3
Epoch 3/3

EVALUATION RESULTS

Evaluating on validation set:

Validation metrics:
accuracy: 0.9709
f1: 0.9587
precision: 0.9609
recall: 0.9564

Evaluating on test set:

Test metrics:
accuracy: 0.9704
f1: 0.9577
precision: 0.9601
recall: 0.9554

===== Model Comparison =====
Model                     Val Accuracy    Test Accuracy   Test F1        
----------------------------------------------------------------------
bert-base-arabic          0.9720           0.9720           0.9601
bert-base-multilingual-cased 0.9686           0.9682           0.9552
bert-base-arabertv02      0.9709           0.9704           0.9577


# 5. Hyperparameter Tuning

In [12]:
def hyperparameter_tuning(model_name):
    """Perform hyperparameter tuning for a specific model."""
    print(f"\n===== Hyperparameter Tuning for {model_name} =====")

    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Hyperparameter grid
    learning_rates = [2e-5, 3e-5]
    batch_sizes = [16, 32]
    epochs_list = [3]

    best_val_accuracy = 0
    best_params = {}

    for lr in learning_rates:
        for bs in batch_sizes:
            for epochs in epochs_list:
                print(f"\nTrying: lr={lr}, batch_size={bs}, epochs={epochs}")

                # Prepare datasets with current batch size
                train_dataset = prepare_tf_dataset(train_sentences, train_pos, tokenizer, tag2id, batch_size=bs)
                val_dataset = prepare_tf_dataset(val_sentences, val_pos, tokenizer, tag2id, batch_size=bs, shuffle=False)
                test_dataset = prepare_tf_dataset(test_sentences, test_pos, tokenizer, tag2id, batch_size=bs, shuffle=False)

                # Train and evaluate with current hyperparameters
                _, _, val_metrics, _ = train_and_evaluate(
                    model_name,
                    train_dataset,
                    val_dataset,
                    test_dataset,
                    epochs=epochs,
                    learning_rate=lr
                )

                # Check if this is the best configuration
                if val_metrics['accuracy'] > best_val_accuracy:
                    best_val_accuracy = val_metrics['accuracy']
                    best_params = {
                        'learning_rate': lr,
                        'batch_size': bs,
                        'epochs': epochs,
                        'val_accuracy': val_metrics['accuracy']
                    }

    print("\n===== Best Hyperparameters =====")
    print(f"Learning Rate: {best_params['learning_rate']}")
    print(f"Batch Size: {best_params['batch_size']}")
    print(f"Epochs: {best_params['epochs']}")
    print(f"Validation Accuracy: {best_params['val_accuracy']:.4f}")

    return best_params



In [13]:
best_model =  "asafaya/bert-base-arabic"
best_params = hyperparameter_tuning(best_model)


===== Hyperparameter Tuning for asafaya/bert-base-arabic =====

Trying: lr=2e-05, batch_size=16, epochs=3


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training model: asafaya/bert-base-arabic

Epoch 1/3
Epoch 2/3
Epoch 3/3

EVALUATION RESULTS

Evaluating on validation set:

Validation metrics:
accuracy: 0.9519
f1: 0.9322
precision: 0.9348
recall: 0.9297

Evaluating on test set:

Test metrics:
accuracy: 0.9528
f1: 0.9345
precision: 0.9385
recall: 0.9304

Trying: lr=2e-05, batch_size=32, epochs=3


All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training model: asafaya/bert-base-arabic

Epoch 1/3
Epoch 2/3
Epoch 3/3

EVALUATION RESULTS

Evaluating on validation set:

Validation metrics:
accuracy: 0.9212
f1: 0.8881
precision: 0.8946
recall: 0.8817

Evaluating on test set:

Test metrics:
accuracy: 0.9197
f1: 0.8888
precision: 0.8990
recall: 0.8788

Trying: lr=3e-05, batch_size=16, epochs=3


All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training model: asafaya/bert-base-arabic

Epoch 1/3
Epoch 2/3
Epoch 3/3

EVALUATION RESULTS

Evaluating on validation set:

Validation metrics:
accuracy: 0.9612
f1: 0.9458
precision: 0.9467
recall: 0.9449

Evaluating on test set:

Test metrics:
accuracy: 0.9618
f1: 0.9474
precision: 0.9509
recall: 0.9439

Trying: lr=3e-05, batch_size=32, epochs=3


All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training model: asafaya/bert-base-arabic

Epoch 1/3
Epoch 2/3
Epoch 3/3

EVALUATION RESULTS

Evaluating on validation set:

Validation metrics:
accuracy: 0.9498
f1: 0.9291
precision: 0.9328
recall: 0.9254

Evaluating on test set:

Test metrics:
accuracy: 0.9469
f1: 0.9274
precision: 0.9328
recall: 0.9221

===== Best Hyperparameters =====
Learning Rate: 3e-05
Batch Size: 16
Epochs: 3
Validation Accuracy: 0.9612


## 6. Train Final Model with Best Parameters

In [14]:
def train_final_model(model_name, best_params):
    """Train the final model with the best hyperparameters."""
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Prepare datasets with the best batch size
    train_dataset = prepare_tf_dataset(
        train_sentences, train_pos, tokenizer, tag2id,
        batch_size=best_params['batch_size']
    )
    val_dataset = prepare_tf_dataset(
        val_sentences, val_pos, tokenizer, tag2id,
        batch_size=best_params['batch_size'], shuffle=False
    )
    test_dataset = prepare_tf_dataset(
        test_sentences, test_pos, tokenizer, tag2id,
        batch_size=best_params['batch_size'], shuffle=False
    )

    # Train and evaluate the final model
    final_model, _, _, test_metrics = train_and_evaluate(
        model_name,
        train_dataset,
        val_dataset,
        test_dataset,
        epochs=best_params['epochs'],
        learning_rate=best_params['learning_rate']
    )

    # Save the final model
    final_model.save_pretrained("./tf_arabic_pos_final_model")
    tokenizer.save_pretrained("./tf_arabic_pos_final_tokenizer")

    return final_model, test_metrics



In [15]:
Final_model,test_metrics = train_final_model(best_model,best_params)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training model: asafaya/bert-base-arabic

Epoch 1/3
Epoch 2/3
Epoch 3/3

EVALUATION RESULTS

Evaluating on validation set:

Validation metrics:
accuracy: 0.9596
f1: 0.9441
precision: 0.9458
recall: 0.9425

Evaluating on test set:

Test metrics:
accuracy: 0.9604
f1: 0.9448
precision: 0.9478
recall: 0.9418


In [16]:
print(test_metrics)

{'accuracy': 0.9604430379746836, 'f1': np.float64(0.9448158502491795), 'precision': np.float64(0.9478112425313986), 'recall': np.float64(0.9418393311523082)}


# After tuning the hyperparameters, the accuracy decreased because it needed more hyperparameters to tunn So, I saved the model that trained before hyperparameter tuning.