In [None]:
from datasets import load_dataset, Dataset, DatasetDict
import torch
import numpy as np
from transformers import AutoModelForMaskedLM, BertTokenizer, DataCollatorForLanguageModeling
from tokenizers import BertWordPieceTokenizer
import os



# Preprocessing First

In [None]:
# Load the raw WikiText-103 corpus; printing the result lists every split
# together with its columns and row count.
dataset = load_dataset(path="wikitext", name="wikitext-103-raw-v1")
print(dataset)

README.md: 0.00B [00:00, ?B/s]

wikitext-103-raw-v1/test-00000-of-00001.(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00000-of-00002(…):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00001-of-00002(…):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/validation-00000-of-(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


In [None]:
# Untrained WordPiece tokenizer: text is cleaned, lowercased and stripped of
# accents; Chinese characters get no special handling.
tokenizer = BertWordPieceTokenizer(
    lowercase=True,
    strip_accents=True,
    clean_text=True,
    handle_chinese_chars=False,
)

# Keep only the training rows that contain something besides whitespace.
train_lines = [text for text in dataset["train"]["text"] if text.strip()]

# Dump the kept rows to a plain-text file, each followed by a newline, so the
# tokenizer trainer can read the corpus from disk.
with open("wikitext_train.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(text + "\n" for text in train_lines)


In [None]:
# Learn the WordPiece vocabulary from the text file written in the previous cell.
tokenizer.train(
    files=["wikitext_train.txt"],
    # BERT-style special tokens, listed first so they are part of the vocabulary.
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    # Target vocabulary size and the minimum frequency passed to the trainer.
    vocab_size=16384,
    min_frequency=2,
)

In [None]:
# Directory that receives the trained vocabulary file (vocab.txt).
output_dir = "custom-ltg-tokenizer"

# exist_ok=True replaces the separate existence check: the cell can be re-run
# safely and there is no window between "check" and "create".
os.makedirs(output_dir, exist_ok=True)

# Last expression of the cell, so the list of written files is displayed.
tokenizer.save_model(output_dir)

['custom-ltg-tokenizer/vocab.txt']

In [None]:
# Reload the saved vocabulary as a transformers BertTokenizer; from here on
# `tokenizer` refers to this object, not the BertWordPieceTokenizer trained above.
# NOTE(review): the vocabulary was trained with handle_chinese_chars=False;
# BertTokenizer presumably defaults to splitting Chinese characters — confirm
# whether tokenize_chinese_chars=False should be passed here to match.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="custom-ltg-tokenizer"
)
print(tokenizer.vocab_size)

16384


In [None]:
def tokenize_function(examples):
    """Tokenize a batch of raw text rows for masked-language-model training.

    Args:
        examples: Batch handed over by ``dataset.map(..., batched=True)``;
            its ``"text"`` entry holds the raw strings to encode.

    Returns:
        The tokenizer output for the batch, with every sequence truncated to
        at most 512 tokens. Sequences are left unpadded.
    """
    # Deliberately no padding="max_length": padding every row (many are short
    # or empty) to 512 tokens inflates the stored dataset and makes each
    # training step process mostly [PAD] tokens. The
    # DataCollatorForLanguageModeling used for training pads each batch to the
    # length of its longest sequence instead.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
    )


In [None]:
# Tokenize every split in batches; the raw "text" column is dropped so only
# the tokenizer outputs remain.
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)


Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [None]:
# Sanity check: show the columns and row counts of each tokenized split.
print(tokenized_dataset)

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3760
    })
})


In [None]:
# Collator for masked-language-model training: builds batches with the custom
# tokenizer and uses a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    mlm=True,
    tokenizer=tokenizer,
)

In [None]:
from transformers import BertConfig, BertForMaskedLM

# Configuration of a small BERT encoder sized for the custom vocabulary.
config = BertConfig(
    # Vocabulary-dependent settings come straight from the loaded tokenizer.
    vocab_size=tokenizer.vocab_size,
    pad_token_id=tokenizer.pad_token_id,
    # Encoder shape.
    hidden_size=192,
    num_hidden_layers=12,
    num_attention_heads=3,  # 192 / 3 = 64 dimensions per head
    intermediate_size=512,
    max_position_embeddings=512,  # same limit as max_length in tokenize_function
    # Regularisation and numerical settings.
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    layer_norm_eps=1e-7,
    # `position_bucket_size=32` used to be passed here. It is not a BertConfig
    # parameter: it was only stored as an extra attribute and never read by
    # BertForMaskedLM, so it wrongly suggested relative-position bucketing.
)

# Freshly initialised (untrained) masked-language model built from the config.
model = BertForMaskedLM(config)

In [None]:
from google.colab import drive

# Mount Google Drive so the checkpoint paths under /content/drive resolve.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # --- OUTPUT ---
    # Checkpoint directory on the mounted Google Drive.
    output_dir="/content/drive/MyDrive/ltgbert-wikitext103-(4e-5)-checkpoints",
    overwrite_output_dir=True,
    report_to="none",

    # --- OPTIMISATION ---
    num_train_epochs=10,
    learning_rate=4e-5,
    warmup_steps=1000,
    weight_decay=0.01,

    # --- BATCHING / PRECISION ---
    # 16 sequences per device, gradients accumulated over 2 steps.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    fp16=True,

    # --- CHECKPOINTING ---
    # Checkpoints are written by step count: one every 10,000 steps.
    save_strategy="steps",
    save_steps=10000,
)


In [None]:
from transformers import Trainer
from transformers import TrainerCallback

class EpochCheckpointCallback(TrainerCallback):
    """Request a checkpoint at the end of every epoch.

    This complements the step-based saving configured in ``training_args``.
    """

    def on_epoch_end(self, args, state, control, **kwargs):
        # Setting the flag asks the Trainer to save; the control object is
        # handed back so the change is picked up.
        control.should_save = True
        return control


# Trainer over the tokenized training split, using the MLM collator.
trainer = Trainer(
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    args=training_args,
    model=model,
)
trainer.add_callback(EpochCheckpointCallback)

# Store the model as it is before any training step has been run.
trainer.save_model("/content/drive/MyDrive/ltgbert-wikitext103-(4e-5)-checkpoints/checkpoint-initial")

In [None]:
# Single source for the checkpoint location: it is used both to reload the
# weights and to resume the training run.
resume_checkpoint = "/content/drive/MyDrive/ltgbert-wikitext103-(4e-5)-checkpoints/checkpoint-370000"


class EpochCheckpointCallback(TrainerCallback):
    """Request a checkpoint at the end of every epoch (same as the earlier cell)."""

    def on_epoch_end(self, args, state, control, **kwargs):
        control.should_save = True
        return control


# Rebind `model` to the weights stored in the checkpoint.
model = BertForMaskedLM.from_pretrained(resume_checkpoint)

# Fresh Trainer wrapped around the reloaded model.
trainer = Trainer(
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    args=training_args,
    model=model,
)
trainer.add_callback(EpochCheckpointCallback)

# Continue the run from the checkpoint; the returned TrainOutput is displayed.
trainer.train(resume_from_checkpoint=resume_checkpoint)

There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


Step,Training Loss
370500,3.241
371000,3.2286
371500,3.2128
372000,3.1943
372500,3.1777
373000,3.1736
373500,3.1663
374000,3.1722
374500,3.1333
375000,3.1697


Step,Training Loss
370500,3.241
371000,3.2286
371500,3.2128
372000,3.1943
372500,3.1777
373000,3.1736
373500,3.1663
374000,3.1722
374500,3.1333
375000,3.1697


TrainOutput(global_step=562930, training_loss=1.0407410703418525, metrics={'train_runtime': 32285.1662, 'train_samples_per_second': 557.95, 'train_steps_per_second': 17.436, 'total_flos': 2.32962788524032e+17, 'train_loss': 1.0407410703418525, 'epoch': 10.0})

In [None]:
# NOTE(review): the recorded output of this cell is a NameError ("name 'trainer'
# is not defined"), so it was last executed before `trainer` existed in the
# kernel. Run top to bottom, it would call train() again right after the
# resumed run in the previous cell — confirm whether this cell is still wanted.
trainer.train()

NameError: name 'trainer' is not defined

In [None]:
# Write the final model to its own Drive folder, separate from the checkpoints.
trainer.save_model(output_dir="/content/drive/MyDrive/XS-ltgbert-wikitext103")

## Extracting the static embeddings + specific token embedding

In [None]:
# Pull the input embedding matrix out of the model as a NumPy array
# (detached from autograd and moved to the CPU first).
embedding_layer = model.get_input_embeddings()
static_embs = embedding_layer.weight.detach().cpu().numpy()
print(static_embs.shape)

In [None]:
# Split the word into tokens with the custom tokenizer.
tokens = tokenizer.tokenize("clouds")
print(tokens)

# Map each token to its vocabulary id.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

# Look up the static embedding row of every token id and show its shape.
vectors = [static_embs[token_id] for token_id in token_ids]
for vector in vectors:
    print(vector.shape)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Nearest neighbours of the token "cloud" in the static embedding space.
if "cloud" in tokenizer.vocab:
    cloud_token_id = tokenizer.convert_tokens_to_ids("cloud")
    cloud_embedding = static_embs[cloud_token_id].reshape(1, -1)

    # Cosine similarity between the "cloud" embedding and every embedding row
    similarities = cosine_similarity(cloud_embedding, static_embs)[0]

    # Rank all token ids from most to least similar, then drop "cloud" itself
    # by id. Skipping position 0 would silently assume the query always ranks
    # first, which is not guaranteed when scores tie.
    ranked_indices = np.argsort(similarities)[::-1]
    most_similar_indices = ranked_indices[ranked_indices != cloud_token_id][:10]

    # Get the corresponding tokens and their similarity scores
    # (.tolist() hands plain Python ints to the tokenizer)
    most_similar_tokens = tokenizer.convert_ids_to_tokens(most_similar_indices.tolist())
    most_similar_scores = [similarities[i] for i in most_similar_indices]

    print("10 nearest neighbors of 'cloud':")
    for token, score in zip(most_similar_tokens, most_similar_scores):
        print(f"{token}: {score:.4f}")
else:
    print("'cloud' is not in the vocabulary.")

In [None]:
from google.colab import drive

# Same mount as the earlier Drive cell, repeated here before the per-epoch
# extraction section.
drive.mount(mountpoint='/content/drive')

Extracting epoch by epoch

In [None]:
# Load one model per saved checkpoint, in order, and bind them to
# model_epoch1..3.
# NOTE(review): these paths point at "ltgbert-wikitext2-checkpoints", not the
# WikiText-103 checkpoint directory used for training above — confirm they are
# the intended checkpoints.
model_epoch1, model_epoch2, model_epoch3 = (
    BertForMaskedLM.from_pretrained(f"ltgbert-wikitext2-checkpoints/checkpoint-{step}")
    for step in (1148, 2296, 3444)
)

Ready to test!