In [1]:
pip install transformers datasets accelerate


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
from sklearn.decomposition import PCA

file_path = "/content/drive/MyDrive/merged-dataset.txt"
with open(file_path, "r") as file:
    data = [line.strip() for line in file if line.strip()]

train_ratio = 0.1
train_size = int(len(data) * train_ratio)

# Split the data
train_data = data[train_size:]
validation_data = data[:train_size]

# Save the splits
with open("train_dataset.txt", "w", encoding="utf-8") as train_file:
    train_file.writelines(train_data)

with open("validation_dataset.txt", "w", encoding="utf-8") as validation_file:
    validation_file.writelines(validation_data)

In [5]:
import numpy as np

# Loading Assignment#2 embeddings
embedding_file = "/content/drive/MyDrive/Colab Notebooks/merged-embeddings.txt"
merge_embeddings = {}
with open(embedding_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip() # Strip leading/trailing whitespace
        if line: # Check if line is not empty after stripping
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype="float32")
            merge_embeddings[word] = vector


In [6]:
!pip install torch # Install the torch module

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Loading GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Mapping GloVe embeddings to GPT-2 vocabulary
embedding_dim = model.transformer.wte.weight.shape[1]

embedding_matrix = np.zeros((len(tokenizer), embedding_dim))
for i, token in enumerate(tokenizer.get_vocab()):
    word = tokenizer.decode([i]).strip()
    if word in merge_embeddings:
        # Projecting GloVe embeddings to the correct dimension using linear projection
        merge_embeddings = merge_embeddings[word]

        projection_matrix = np.random.randn(len(merge_embeddings), embedding_dim)
        projected_embedding = merge_embeddings @ projection_matrix  # Project the GloVe embedding
        embedding_matrix[i] = projected_embedding
    else:
        embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

# Replacing the model's embedding layer
model.resize_token_embeddings(len(tokenizer))
with torch.no_grad():
    model.transformer.wte.weight = torch.nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
from datasets import Dataset
def tokenize_function(examples):
    tokenizer.pad_token = tokenizer.eos_token
    # Tokenize the examples
    tokenized_examples = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_special_tokens_mask=True # Return a special tokens mask
    )

    # Creating the attention mask from the special tokens mask
    tokenized_examples["attention_mask"] = [
        [1 if token != tokenizer.pad_token_id else 0 for token in ids]
        for ids in tokenized_examples["input_ids"]
    ]
    return tokenized_examples

train_dataset = Dataset.from_dict({"text": train_data})
tokenized_dataset = train_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:

from transformers import TrainingArguments, Trainer



# Define a function to compute the loss
def compute_loss(model, inputs, return_outputs=False):
    """Computes the language modeling loss."""
    labels = inputs.get("labels")
    # forward pass
    outputs = model(**inputs)
    # If `return_outputs=True` return the predictions and the labels
    if return_outputs:
        return (outputs.loss, outputs.logits)
    return outputs.loss  # Return the loss

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    num_train_epochs= 20,
    save_strategy="epoch",
    logging_dir="./logs",
    save_total_limit=2,
    fp16=True,
    logging_steps=10,
    report_to="none",
)
# Creating a validation dataset similar to the training dataset
validation_dataset = Dataset.from_dict({"text": validation_data})
validation_dataset = validation_dataset.map(tokenize_function, batched=True) # Tokenize validation dataset

# Including the labels in the training dataset
tokenized_dataset = tokenized_dataset.map(lambda examples: {'labels': examples['input_ids']}, batched=True)
validation_dataset = validation_dataset.map(lambda examples: {'labels': examples['input_ids']}, batched=True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,

)


def train_step(model, inputs):
    """Custom training step to use the compute_loss function."""
    loss = compute_loss(model, inputs)
    loss.backward()
    return {"loss": loss.item()}

trainer.train_step = train_step



trainer.train()






Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,8.141575
2,9.031600,6.727363
3,9.031600,5.960961
4,7.054900,5.477588
5,7.054900,5.308753
6,6.025000,5.02072
7,6.025000,4.568008
8,5.231400,4.086609
9,5.231400,3.685771
10,4.516600,3.515598


TrainOutput(global_step=100, training_loss=4.660591011047363, metrics={'train_runtime': 1202.1571, 'train_samples_per_second': 0.166, 'train_steps_per_second': 0.083, 'total_flos': 18993158553600.0, 'train_loss': 4.660591011047363, 'epoch': 20.0})

In [None]:
trainer.save_model("/content/drive/MyDrive/fine_tunedwith_gpt2")
tokenizer.save_pretrained("/content/drive/MyDrive/fine_tunedwith_gpt2")


('/content/drive/MyDrive/fine_tunedwith_gpt2/tokenizer_config.json',
 '/content/drive/MyDrive/fine_tunedwith_gpt2/special_tokens_map.json',
 '/content/drive/MyDrive/fine_tunedwith_gpt2/vocab.json',
 '/content/drive/MyDrive/fine_tunedwith_gpt2/merges.txt',
 '/content/drive/MyDrive/fine_tunedwith_gpt2/added_tokens.json')

In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Loading trained model
model_path = "/content/drive/MyDrive/fine_tuned_gpt2"
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# seed text
input_text = "National Institutes of Health NIH Turning Discovery"
input_ids = tokenizer.encode(input_text, return_tensors="pt")
attention_mask = torch.ones_like(input_ids)
output = model.generate(input_ids,attention_mask=attention_mask, max_length=50,no_repeat_ngram_size=2, num_return_sequences=1)
print("Input Text:",input_text)
print("Generated Text:")
generated_text=tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input Text: National Institutes of Health NIH Turning Discovery
Generated Text:
National Institutes of Health NIH Turning Discovery of of impending impending809809 Infantry Infantrytubtub Craft Craftilil Craft expulsion expulsion CraftUniversal of Drive expulsiontracktrack100100 Craft wasteland 311 expulsion Zeal Zeal Craft Zeal Birmingham Birmingham expulsion 96 96 expulsion ze ze expulsion


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
references = "".join(validation_data).split()
candidate = generated_text.split()
smoothing_function = SmoothingFunction().method1
bleu_score1 = sentence_bleu(references, generated_text,weights=(1, 0, 0, 0), smoothing_function=smoothing_function)
bleu_score2 = sentence_bleu(references, generated_text,weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing_function)
bleu_score3 = sentence_bleu(references, generated_text,weights=(0.3, 0.3, 0.3, 0), smoothing_function=smoothing_function)
bleu_score4 = sentence_bleu(references, generated_text,weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing_function)
print(f"BLEU Score 1: {bleu_score1}")
print(f"BLEU Score 2: {bleu_score2}")
print(f"BLEU Score 3: {bleu_score3}")
print(f"BLEU Score 4: {bleu_score4}")

BLEU Score 1: 0.21656050955414013
BLEU Score 2: 0.2425084782425038
BLEU Score 3: 0.2511149323833971
BLEU Score 4: 0.18311364006086758


In [None]:
!pip install rouge-score
from rouge_score import rouge_scorer
references = "".join(validation_data)
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
scores = scorer.score(references , generated_text)
print("ROUGE Scores:", scores)

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=5777fb9bab76de4f6091e68c81bda409e0b62ab6e73001b8b4640472ca82587f
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
ROUGE Scores: {'rouge1': Score(precision=0.28205128205128205, recall=0.013189448441247002, fmeasure=0.025200458190148916), 'rouge2': Score(precision=0.15789473684210525, recall=0.007202881152460984, fmeasure=0.013777267508610792), 'rougeL': Score(precision=0.28205128205128205, recall=0.013189448441247002, fmeasure=0.025200458190148916)}
