In [None]:
#!pip install transformers==4.40.0 
!pip install -U git+https://github.com/huggingface/transformers
!pip install datasets # 2.15.0
!pip install portalocker>=2/0.0
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install torch==2.3.0
!pip install -U torchvision
!pip install protobuf==3.20.*

In [None]:
# Install the pinned datasets release (2.15.0) used by the rest of the notebook
!pip install datasets==2.15.0

In [None]:
import inspect
import math
import os
import time

import torch
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoConfig,AutoModelForCausalLM,AutoModelForSequenceClassification,BertConfig,BertForMaskedLM,TrainingArguments, Trainer, TrainingArguments
from transformers import AutoTokenizer,BertTokenizerFast,TextDataset,DataCollatorForLanguageModeling
from transformers import pipeline
from datasets import load_dataset

from tqdm.auto import tqdm


# Silence warnings raised by the code in this notebook.
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Replace the library-wide warn hook with the no-op above, and additionally
# install an 'ignore' filter for anything that still goes through the filters.
warnings.warn = warn
warnings.filterwarnings('ignore')

In [None]:
# Export TOKENIZERS_PARALLELISM='false' into this process's environment
os.environ.update({'TOKENIZERS_PARALLELISM': 'false'})

In [None]:
# Smoke test: load the pretrained OPT-350m causal LM and its tokenizer, then
# generate a continuation for a short prompt.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

# Pipeline device ordinals: 0 is the first CUDA GPU, -1 is the CPU. Fall back
# to the CPU so this cell also runs on machines without a GPU.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
print(pipe("This movie was really")[0]["generated_text"])

In [None]:
# Fetch the "wikitext-2-raw-v1" configuration of the "wikitext" dataset
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")

In [None]:
# Print the loaded dataset object for a quick overview
print(dataset)

In [None]:
# Check a sample record: the item at index 400 of the "train" split
dataset["train"][400]

In [None]:
# Destination text files for the train and test splits
output_file_train = "wikitext_dataset_train.txt"
output_file_test = "wikitext_dataset_test.txt"

# Dump each split to its own UTF-8 file (train first, then test), writing the
# "text" field of every example followed by a newline.
for split_name, split_path in (("train", output_file_train), ("test", output_file_test)):
    with open(split_path, "w", encoding="utf-8") as f:
        for example in dataset[split_name]:
            f.write(example["text"] + "\n")

In [None]:
# Create the tokenizer from the existing "bert-base-uncased" checkpoint so its
# special tokens can be re-used instead of being defined from scratch
bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [None]:
model_name = 'bert-base-uncased'

# `is_decoder` is a model-configuration option, not a tokenizer option: it has
# to be given to the model's from_pretrained call to configure BERT as a
# decoder for causal language modelling. Passed to the tokenizer it had no
# effect on the model.
model = AutoModelForCausalLM.from_pretrained(model_name, is_decoder=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Define the BERT configuration used to build the model that will be pre-trained
config = BertConfig(
    vocab_size=len(bert_tokenizer.get_vocab()),  # Vocabulary size, read from bert_tokenizer so the model and tokenizer agree
    hidden_size=768,  # Hidden size
    num_hidden_layers=12,  # Number of layers
    num_attention_heads=12,  # Number of attention heads
    intermediate_size=3072,  # Intermediate size
)

In [None]:
# Create the BERT masked-LM model for pre-training. It is constructed directly
# from `config` (not via from_pretrained) and rebinds the name `model`.
model = BertForMaskedLM(config)

In [None]:
# Check the model: evaluating the bare name shows its repr as the cell output
model

In [None]:
# Wrap the exported text files as TextDataset objects, tokenized with
# bert_tokenizer. The paths re-use output_file_train / output_file_test (the
# variables the files were written with) so the two cells cannot drift apart.
train_dataset = TextDataset(
    tokenizer=bert_tokenizer,
    file_path=output_file_train,  # Training split text file
    block_size=128,  # Block size requested for each training example
)
test_dataset = TextDataset(
    tokenizer=bert_tokenizer,
    file_path=output_file_test,  # Test split text file, used for evaluation
    block_size=128,  # Same block size as the training set
)

In [None]:
# Inspect the first item of the training dataset
train_dataset[0]

In [None]:
# Prepare the data collator for language modeling.
# mlm=True selects masked-language-model mode and mlm_probability=0.15 is the
# masking probability handed to the collator; it uses bert_tokenizer.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=bert_tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
# Check how the collator transforms a sample input: pass it a one-element batch
# holding the first training item and display the result
data_collator([train_dataset[0]])

In [None]:
# Device string for the current machine. It is not passed to the Trainer below.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Turn off Weights & Biases reporting for this run
os.environ["WANDB_DISABLED"] = "true"

# The evaluation-schedule argument is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Because the
# install cell pulls transformers from git, use whichever name the installed
# version actually accepts.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./trained_model",  # Output directory for the trained model
    overwrite_output_dir=True,
    do_eval=True,
    learning_rate=5e-5,
    num_train_epochs=10,  # Number of training epochs
    per_device_train_batch_size=2,  # Batch size for training
    save_total_limit=2,  # Limit the total number of saved checkpoints
    logging_steps=20,
    fp16=torch.cuda.is_available(),  # Mixed precision only when a CUDA GPU is present
    **{eval_strategy_arg: "epoch"},  # Evaluate at the end of every epoch
)

# Instantiate the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Start the pre-training
trainer.train()