In [None]:
!pip3 install torch torchvision torchaudio

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

# A linguistically-informed comparison between multilingual BERT and language-specific BERT models: The case of differential object marking in Romanian

This notebook accompanies my research master's thesis at the University of Amsterdam, which can be found here: [link to thesis repository](https://scripties.uba.uva.nl/). If run from start to finish, this code should replicate my described experiments and thus improve their replicability and the transparency of my reported results.

The repository additionally provides the test sentence files, as well as the results I have obtained and the corresponding judgements.

In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
from transformers import AutoModel, AutoTokenizer, AutoModel, AutoTokenizer, AutoModelForMaskedLM

## **Google's multilingual BERT model**

In [None]:
# Load the cased multilingual BERT checkpoint together with its matching tokenizer.
mbert_checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(mbert_checkpoint)
model = BertForMaskedLM.from_pretrained(mbert_checkpoint)

# Put the model in evaluation mode; as the last expression of the cell,
# this call also displays the model architecture.
model.eval()

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

Read test sentences from files:

In [None]:
# Locations of the two test-sentence files (each line is treated as one sentence)
sentences_A1 = 'test_sentences_A1.txt'
sentences_A2 = 'test_sentences_A2.txt'

# Read every line and trim surrounding whitespace, including the trailing newline
with open(sentences_A1, 'r', encoding='utf-8') as file:
    test_sentences_A1 = [line.strip() for line in file]

with open(sentences_A2, 'r', encoding='utf-8') as file:
    test_sentences_A2 = [line.strip() for line in file]

# A.1. sentences first, followed by the A.2. sentences
all_test_sentences = [*test_sentences_A1, *test_sentences_A2]


### 1. Handling A.1. sentences

The sentences corresponding to templates 1 through 16 will be handled first. For these sentences, it is the marking that is masked and the noun that is given. This means that (1) there are five different sentences for each template and (2) most of these test sentences contain two [MASK] tokens instead of just one. For each of these sentences, we need to extract the first predicted tokens and check their grammaticality.

Tokenize the sentences and create input tensors:

In [None]:
# Encode all A.1. sentences as one padded batch of PyTorch tensors
inputs = tokenizer(test_sentences_A1, padding=True, return_tensors='pt')

# Locate every [MASK] token: torch.where yields a (row indices, column indices) pair
is_mask = inputs['input_ids'] == tokenizer.mask_token_id
masked_indices = torch.where(is_mask)

Get predictions for the positions with the [MASK] tokens:

In [None]:
# Forward pass through the model (gradients are not needed for inference)
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

# Softmax over the vocabulary dimension converts logits to probabilities
softmax = torch.nn.Softmax(dim=-1)

# Mode 'w' rather than 'a': re-running this cell replaces the results file instead of
# appending a second copy of every completed sentence to it.
with open('predicted_results.txt', 'w', encoding='utf-8') as results_file:
    for i in range(len(test_sentences_A1)):
        # Work on a copy so the batch tensor in `inputs` is left untouched
        input_ids = inputs.input_ids[i].clone()
        # Column positions of the [MASK] tokens that belong to sentence i
        mask_indices = masked_indices[1][masked_indices[0] == i]

        # Fill each masked position with its single most probable token
        for mask_index in mask_indices:
            masked_probs = softmax(predictions[i, mask_index, :])
            input_ids[mask_index] = torch.argmax(masked_probs).item()

        # Decode the completed sentence (special tokens skipped) and store it, one per line
        decoded_sentence = tokenizer.decode(input_ids, skip_special_tokens=True)
        results_file.write(decoded_sentence + '\n')

### 2. Handling A.2. sentences

For sentences corresponding to templates 17 through 22, the approach is slightly different. For A.1., there is a set of 5 sentences for each template. For A.2., there is only one sentence per template, and the goal is to predict a noun following each type of marking, so we look at the top 5 predictions for the [MASK] token (the code below saves the top 10 candidates for each sentence). Each sentence contains a single [MASK] token (for the noun).

In [None]:
# Encode the A.2. sentences as one padded batch and locate the [MASK] tokens
inputs = tokenizer(test_sentences_A2, return_tensors='pt', padding=True)
masked_indices = torch.where(inputs.input_ids == tokenizer.mask_token_id)

with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

top_k = 10  # Number of top predictions written per [MASK] position

# NOTE(review): the file name says "uncased" although the checkpoint loaded for mBERT
# is the cased one; the name is kept unchanged so existing result files still match.
# Opened with an explicit UTF-8 encoding (Romanian diacritics) and in 'w' mode so that
# re-running the cell does not append duplicate results.
with open("mbert.uncased_results_A2.txt", "w", encoding="utf-8") as file:
    for i, input_id in enumerate(inputs.input_ids):
        # Look up the mask positions of row i explicitly rather than zipping rows with
        # the flat list of mask columns, which misaligns as soon as one sentence has
        # zero or several [MASK] tokens.
        mask_positions = masked_indices[1][masked_indices[0] == i]

        # Token strings of the (padded) input row; identical for every candidate
        # NOTE(review): special tokens are not skipped here, unlike in the A.1. cells —
        # confirm this output format is intended.
        sentence_tokens = tokenizer.convert_ids_to_tokens(input_id.tolist())

        for masked_index in mask_positions:
            masked_index = int(masked_index)
            masked_probs = torch.softmax(predictions[i, masked_index, :], dim=-1)

            # Ids of the top_k most probable tokens, most probable first
            top_k_indices = torch.topk(masked_probs, top_k).indices
            predicted_tokens = tokenizer.convert_ids_to_tokens(top_k_indices.tolist())

            # Write one completed sentence per candidate token
            for token in predicted_tokens:
                candidate_tokens = list(sentence_tokens)
                candidate_tokens[masked_index] = token
                predicted_sentence = tokenizer.convert_tokens_to_string(candidate_tokens)
                file.write(predicted_sentence + '\n')

## **RoBERT** (Masala et al. 2020)

Mihai Masala, Stefan Ruseti, and Mihai Dascalu. (2020). RoBERT – A Romanian BERT Model. In *Proceedings of the 28th International Conference on Computational Linguistics*, pages 6626–6637, Barcelona, Spain (Online). International Committee on Computational Linguistics.

HuggingFace: [here](https://huggingface.co/readerbench/RoBERT-base)

Github: [here](https://github.com/dumitrescustefan/Romanian-Transformers)

In [None]:
# Load the RoBERT tokenizer AND the RoBERT masked-LM model. With the model load
# commented out, a fresh "Run All" would feed RoBERT token ids into the previously
# loaded multilingual BERT model and produce meaningless predictions.
# NOTE(review): the text above links to RoBERT-base and the A.1. results file is named
# "ro.bert.base", but the checkpoint loaded here is RoBERT-large — confirm which is intended.
robert_checkpoint = "readerbench/RoBERT-large"
tokenizer = AutoTokenizer.from_pretrained(robert_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(robert_checkpoint)
model.eval()

In [None]:
# Re-encode the A.1. sentences with the tokenizer that is currently loaded
inputs = tokenizer(test_sentences_A1, padding=True, return_tensors='pt')

# (row, column) index tensors of all [MASK] positions in the padded batch
mask_token_hits = inputs['input_ids'] == tokenizer.mask_token_id
masked_indices = torch.where(mask_token_hits)

### Handling the A.1. examples

In [None]:
# Forward pass through the model (gradients are not needed for inference)
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

# Softmax over the vocabulary dimension converts logits to probabilities
softmax = torch.nn.Softmax(dim=-1)

# Mode 'w' rather than 'a': re-running this cell replaces the results file instead of
# appending a second copy of every completed sentence to it.
with open('ro.bert.base_results_A1.txt', 'w', encoding='utf-8') as results_file:
    for i in range(len(test_sentences_A1)):
        # Work on a copy so the batch tensor in `inputs` is left untouched
        input_ids = inputs.input_ids[i].clone()
        # Column positions of the [MASK] tokens that belong to sentence i
        mask_indices = masked_indices[1][masked_indices[0] == i]

        # Fill each masked position with its single most probable token
        for mask_index in mask_indices:
            masked_probs = softmax(predictions[i, mask_index, :])
            input_ids[mask_index] = torch.argmax(masked_probs).item()

        # Decode the completed sentence (special tokens skipped) and store it, one per line
        decoded_sentence = tokenizer.decode(input_ids, skip_special_tokens=True)
        results_file.write(decoded_sentence + '\n')

### Handling the A.2. examples

In [None]:
# Function to get mask predictions
def get_mask_predictions(sentence, top_k=10):
    """Return the top-k predictions for the first mask token in `sentence`.

    Uses the module-level `tokenizer` and `model` that are loaded at call time.

    Args:
        sentence: Text containing at least one mask token of the current tokenizer.
        top_k: Number of candidates to return.

    Returns:
        A list of `(decoded_token, logit)` tuples, highest logit first. The scores
        are raw logits, not probabilities. If the sentence contains several mask
        tokens, only the first one is scored.

    Raises:
        ValueError: If the tokenized sentence contains no mask token.
    """
    # Tokenize the input text
    inputs = tokenizer(sentence, return_tensors="pt")
    mask_positions = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

    # Fail with a clear message instead of an opaque IndexError further down
    if mask_positions.numel() == 0:
        raise ValueError(f"No mask token found in sentence: {sentence!r}")

    with torch.no_grad():
        outputs = model(**inputs)

    # Logits over the vocabulary at the first masked position
    first_mask_logits = outputs.logits[0, mask_positions[0], :]

    # Get the top k predictions (values and vocabulary ids, sorted descending)
    top_k_logits, top_k_tokens = torch.topk(first_mask_logits, top_k)

    # Decode the top k tokens and pair each with its logit
    return [
        (tokenizer.decode([token]), logit.item())
        for token, logit in zip(top_k_tokens, top_k_logits)
    ]

# Get predictions for each sentence.
# The results file is opened once (not once per sentence) and in 'w' mode, so that
# re-running the cell replaces the file instead of appending duplicate lines.
with open('robert_large_results_A2.txt', 'w', encoding='utf-8') as file:
    for sentence in test_sentences_A2:
        top_predictions = get_mask_predictions(sentence)
        for pred, score in top_predictions:
            # Replace the [MASK] token with the prediction
            completed_sentence = sentence.replace("[MASK]", pred)
            # Write the completed sentence and its score (tab-separated) to the file
            file.write(f"{completed_sentence}\t{score}\n")


## **Romanian BERT** (Dumitrescu et al. 2020)

Stefan Dumitrescu, Andrei-Marius Avram, and Sampo Pyysalo. (2020). The birth of Romanian BERT. In *Findings of the Association for Computational Linguistics: EMNLP 2020*, pages 4324–4328, Online. Association for Computational Linguistics.

HuggingFace: [here](https://huggingface.co/dumitrescustefan/bert-base-romanian-uncased-v1)

Github: [here](https://github.com/dumitrescustefan/Romanian-Transformers)

In [None]:
# Load the Romanian BERT tokenizer AND its masked-LM model. With the model load
# commented out, a fresh "Run All" would keep using the previously loaded model with
# this tokenizer's ids and produce meaningless predictions.
romanian_bert_checkpoint = "dumitrescustefan/bert-base-romanian-cased-v1"
tokenizer = AutoTokenizer.from_pretrained(romanian_bert_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(romanian_bert_checkpoint)
model.eval()

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/397k [00:00<?, ?B/s]

In [None]:
# Re-encode the A.1. sentences with the tokenizer that is currently loaded
inputs = tokenizer(test_sentences_A1, padding=True, return_tensors='pt')

# (row, column) index tensors of all [MASK] positions in the padded batch
mask_token_hits = inputs['input_ids'] == tokenizer.mask_token_id
masked_indices = torch.where(mask_token_hits)

### Handling the A.1. examples

In [None]:
# Forward pass through the model (gradients are not needed for inference)
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

# Softmax over the vocabulary dimension converts logits to probabilities
softmax = torch.nn.Softmax(dim=-1)

# Mode 'w' rather than 'a': re-running this cell replaces the results file instead of
# appending a second copy of every completed sentence to it.
with open('romanian.bert.cased_results_A1.txt', 'w', encoding='utf-8') as results_file:
    for i in range(len(test_sentences_A1)):
        # Work on a copy so the batch tensor in `inputs` is left untouched
        input_ids = inputs.input_ids[i].clone()
        # Column positions of the [MASK] tokens that belong to sentence i
        mask_indices = masked_indices[1][masked_indices[0] == i]

        # Fill each masked position with its single most probable token
        for mask_index in mask_indices:
            masked_probs = softmax(predictions[i, mask_index, :])
            input_ids[mask_index] = torch.argmax(masked_probs).item()

        # Decode the completed sentence (special tokens skipped) and store it, one per line
        decoded_sentence = tokenizer.decode(input_ids, skip_special_tokens=True)
        results_file.write(decoded_sentence + '\n')

### Handle A.2. examples

In [None]:
# Same function as in the RoBERT section, repeated so this section can be run on its own
def get_mask_predictions(sentence, top_k=10):
    """Return the top-k predictions for the first mask token in `sentence`.

    Uses the module-level `tokenizer` and `model` that are loaded at call time.

    Args:
        sentence: Text containing at least one mask token of the current tokenizer.
        top_k: Number of candidates to return.

    Returns:
        A list of `(decoded_token, logit)` tuples, highest logit first. The scores
        are raw logits, not probabilities. If the sentence contains several mask
        tokens, only the first one is scored.

    Raises:
        ValueError: If the tokenized sentence contains no mask token.
    """
    # Tokenize the input text
    inputs = tokenizer(sentence, return_tensors="pt")
    mask_positions = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

    # Fail with a clear message instead of an opaque IndexError further down
    if mask_positions.numel() == 0:
        raise ValueError(f"No mask token found in sentence: {sentence!r}")

    with torch.no_grad():
        outputs = model(**inputs)

    # Logits over the vocabulary at the first masked position
    first_mask_logits = outputs.logits[0, mask_positions[0], :]

    # Get the top k predictions (values and vocabulary ids, sorted descending)
    top_k_logits, top_k_tokens = torch.topk(first_mask_logits, top_k)

    # Decode the top k tokens and pair each with its logit
    return [
        (tokenizer.decode([token]), logit.item())
        for token, logit in zip(top_k_tokens, top_k_logits)
    ]

# Get predictions for each sentence.
# The results file is opened once (not once per sentence) and in 'w' mode, so that
# re-running the cell replaces the file instead of appending duplicate lines.
with open('romanian.bert.cased_results_A2.txt', 'w', encoding='utf-8') as file:
    for sentence in test_sentences_A2:
        top_predictions = get_mask_predictions(sentence)
        for pred, score in top_predictions:
            # Replace the [MASK] token with the prediction
            completed_sentence = sentence.replace("[MASK]", pred)
            # Write the completed sentence and its score (tab-separated) to the file
            file.write(f"{completed_sentence}\t{score}\n")

## Exploring the tokenization

In [None]:
# Romanian sentences containing hyphenated forms (e.g. "M-a", "văzut-o"),
# used to inspect how the tokenizer splits them
sentences = [
    "M-a văzut Maria.",
    "Te-am văzut ieri.",
    "Nu ne-a văzut.",
    "Am văzut-o pe Maria.",
    "L-am văzut alaltăieri.",
    "Nu le-am văzut pe Ileana și pe mama ei."
]

# Print the word pieces produced by whichever tokenizer is currently bound to `tokenizer`.
# The id round trip (tokens -> ids -> tokens) was computed but never displayed, so it
# has been dropped; the printed output is unchanged.
for sentence in sentences:
    tokens = tokenizer.tokenize(sentence)

    print(f"Original sentence: {sentence}")
    print(f"Tokens: {tokens}")

Original sentence: M-a văzut Maria.
Tokens: ['M', '-', 'a', 'văzut', 'Maria', '.']
Original sentence: Te-am văzut ieri.
Tokens: ['Te', '-', 'am', 'văzut', 'ieri', '.']
Original sentence: Nu ne-a văzut.
Tokens: ['Nu', 'ne', '-', 'a', 'văzut', '.']
Original sentence: Am văzut-o pe Maria.
Tokens: ['Am', 'văzut', '-', 'o', 'pe', 'Maria', '.']
Original sentence: L-am văzut alaltăieri.
Tokens: ['L', '-', 'am', 'văzut', 'ala', '##lt', '##ăi', '##er', '##i', '.']
Original sentence: Nu le-am văzut pe Ileana și pe mama ei.
Tokens: ['Nu', 'le', '-', 'am', 'văzut', 'pe', 'Ileana', 'și', 'pe', 'mama', 'ei', '.']
