In [2]:
from huggingface_hub import login


login("hugging_face_token")  # Paste your Hugging Face API token here

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
!pip install datasets evaluate 

In [5]:
!pip install rouge_score bleu 

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.8.3 requires cubinlinker, which is not installed.
cudf 24.8.3 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.8.3 requires ptxcompiler, which is not installed.
cuml 24.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 24.8.3 requires cupy-cuda11x>=12.0.0, which is not installed.
apache-beam 2.46.0 requires cloudpickle~=2.2.1, but you have cloudpickle 3.0.0 which is incompatible.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 15.0.2 which is incompatible.
beatrix-jupyterlab 2024.66.154055 requires jupyterlab~=3.6.0, but you have jupyterlab 4.2.5 whic

In [9]:
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments, default_data_collator
from datasets import load_dataset
import evaluate
import numpy as np
import tqdm


# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load the model and tokenizer
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Freeze all layers except the final linear layers (qa_outputs)
for param in model.base_model.parameters():
    param.requires_grad = False  # Freeze transformer layers

# Unfreeze the final linear layer (qa_outputs)
if hasattr(model, "qa_outputs"):
    for param in model.qa_outputs.parameters():
        param.requires_grad = True  # Unfreeze the final linear layer

# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Load the SQuAD v2 dataset
dataset = load_dataset("squad_v2")

# Preprocess function
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=128,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length"
    )
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context
            context_start = sequence_ids.index(1)
            context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

            # If the answer is out of the context, label it (0, 0)
            if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Otherwise, label the start and end positions
                start_idx = context_start
                end_idx = context_end
                while start_idx < len(offset) and offset[start_idx][0] <= start_char:
                    start_idx += 1
                while end_idx >= 0 and offset[end_idx][1] >= end_char:
                    end_idx -= 1
                start_positions.append(start_idx - 1)
                end_positions.append(end_idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True, num_proc=10)

Some weights of LlamaForQuestionAnswering were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['embed_tokens.weight', 'layers.0.input_layernorm.weight', 'layers.0.mlp.down_proj.weight', 'layers.0.mlp.gate_proj.weight', 'layers.0.mlp.up_proj.weight', 'layers.0.post_attention_layernorm.weight', 'layers.0.self_attn.k_proj.weight', 'layers.0.self_attn.o_proj.weight', 'layers.0.self_attn.q_proj.weight', 'layers.0.self_attn.v_proj.weight', 'layers.1.input_layernorm.weight', 'layers.1.mlp.down_proj.weight', 'layers.1.mlp.gate_proj.weight', 'layers.1.mlp.up_proj.weight', 'layers.1.post_attention_layernorm.weight', 'layers.1.self_attn.k_proj.weight', 'layers.1.self_attn.o_proj.weight', 'layers.1.self_attn.q_proj.weight', 'layers.1.self_attn.v_proj.weight', 'layers.10.input_layernorm.weight', 'layers.10.mlp.down_proj.weight', 'layers.10.mlp.gate_proj.weight', 'layers.10.mlp.up_proj.weight', 'layers.10.post_attention_layernorm.weight', 'la

In [10]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in {model_name}: {total_params}")

Total parameters in meta-llama/Llama-3.2-1B-Instruct: 1235820546


In [11]:
# Model Evaluation Function
def evaluate_model(tokenized_datasets, model, tokenizer, batch_size=16):

  model.eval()
  model.half()
  def postprocess_predictions(examples, features, raw_predictions):
      all_start_logits, all_end_logits = raw_predictions
      predictions = []

      for i in range(len(features["input_ids"])):
          # Use the logits to get predicted start and end indices
          start_idx = np.argmax(all_start_logits[i])
          end_idx = np.argmax(all_end_logits[i])

          # Default to no answer
          prediction_text = ""
          no_answer_probability = 1.0

          # Valid indices ensure the prediction range is valid
          if start_idx <= end_idx and end_idx < len(examples["context"][i]):
              start_char = features["start_positions"][i]
              end_char = features["end_positions"][i]
              prediction_text = examples["context"][i][start_char:end_char + 1]
              no_answer_probability = 0.0  # Predicted an answer

          predictions.append({
              "id": examples["id"][i],  # Use `examples["id"]` for IDs
              "prediction_text": prediction_text,
              "no_answer_probability": no_answer_probability
          })

      return predictions

  # Chunk inference to process batches
  all_start_logits = []
  all_end_logits = []

  for start_idx in tqdm.tqdm(range(0, len(tokenized_datasets), batch_size), desc="Evaluating"):
      end_idx = min(start_idx + batch_size, len(tokenized_datasets))
      batch = tokenized_datasets[start_idx:end_idx]

      # Prepare inputs
      input_ids = torch.tensor(batch["input_ids"]).to(device)
      attention_mask = torch.tensor(batch["attention_mask"]).to(device)

      with torch.no_grad():
          outputs = model(input_ids=input_ids, attention_mask=attention_mask)
          all_start_logits.append(outputs.start_logits.cpu().numpy())
          all_end_logits.append(outputs.end_logits.cpu().numpy())

  # Concatenate logits
  all_start_logits = np.concatenate(all_start_logits, axis=0)
  all_end_logits = np.concatenate(all_end_logits, axis=0)

  # Post-process predictions
  predictions = postprocess_predictions(
      tokenized_datasets,
      tokenized_datasets,
      (all_start_logits, all_end_logits)
  )

  # Prepare references in the correct format
  references = [
      {
          "id": example["id"],
          "answers": example["answers"]
      }
      for example in tokenized_datasets
  ]

  pred_texts = [pred["prediction_text"] for pred in predictions]

  # Initialize ref_texts as a list of lists
  ref_texts = []

  for ref in references:
      if ref["answers"]["text"]:
          ref_texts.append([ans for ans in ref["answers"]["text"]])
      else:
          ref_texts.append([""])  # Default for no answers

  # Evaluate metrics
  squad_metric = evaluate.load("squad_v2")
  bleu_metric = evaluate.load("bleu")
  rouge_metric = evaluate.load("rouge")
  # meteor_metric = evaluate.load("meteor")

  squad_results = squad_metric.compute(predictions=predictions, references=references)
  bleu_results = bleu_metric.compute(predictions=pred_texts, references=ref_texts)
  rouge_results = rouge_metric.compute(predictions=pred_texts, references=ref_texts)
 

  # Consolidate results
  results = {
      "Exact Match (EM)": squad_results["exact"],
      "F1 Score": squad_results["f1"],
      "BLEU": bleu_results["bleu"],
      "ROUGE": rouge_results,
  }

  return results

In [12]:
results = evaluate_model(
    tokenized_datasets=tokenized_datasets["validation"],
    model=model,
    tokenizer=tokenizer,
    batch_size=64
)

print("Zero-shot predictions:")
results

Evaluating: 100%|██████████| 186/186 [06:57<00:00,  2.24s/it]


Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Zero-shot predictions:


{'Exact Match (EM)': 26.27810999747326,
 'F1 Score': 26.44984257208284,
 'BLEU': 0.004509031635948375,
 'ROUGE': {'rouge1': 0.0031690130494331204,
  'rouge2': 0.00013101621793612026,
  'rougeL': 0.0031646960770456856,
  'rougeLsum': 0.0031605142684630006}}

In [13]:

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=4,   
    num_train_epochs=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./logs',

    logging_steps=20,
    fp16=True
)



In [15]:

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=default_data_collator,
    tokenizer=tokenizer,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [16]:
# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112676366666265, max=1.0…

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
1,3.1878,2.684629




TrainOutput(global_step=8145, training_loss=3.416555714592603, metrics={'train_runtime': 7146.3161, 'train_samples_per_second': 18.236, 'train_steps_per_second': 1.14, 'total_flos': 9.739773098264832e+16, 'train_loss': 3.416555714592603, 'epoch': 1.0})

In [18]:
# Save the model
trainer.save_model("./fine_tuned_llama_on_squad")
tokenizer.save_pretrained("./fine-tuned-llama_sqaud")

('./fine-tuned-llama_sqaud/tokenizer_config.json',
 './fine-tuned-llama_sqaud/special_tokens_map.json',
 './fine-tuned-llama_sqaud/tokenizer.json')

In [19]:
results = evaluate_model(
    tokenized_datasets=tokenized_datasets["validation"],
    model=model,
    tokenizer=tokenizer,
    batch_size=64
)

print("Fine-tuned evaludeted scores:")
results

Evaluating: 100%|██████████| 186/186 [06:57<00:00,  2.25s/it]


Fine-tuned evaludeted scores:


{'Exact Match (EM)': 4.977680451444454,
 'F1 Score': 5.3672533937792855,
 'BLEU': 0.01829738879544512,
 'ROUGE': {'rouge1': 0.006728270222788735,
  'rouge2': 0.0006139184051240415,
  'rougeL': 0.006727793819852609,
  'rougeLsum': 0.00670582432438536}}

In [20]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in {model_name} after fine-tuning: {total_params}")

Total parameters in meta-llama/Llama-3.2-1B-Instruct after fine-tuning: 1235820546


In [17]:
from huggingface_hub import HfApi

# Push the model to Hugging Face Model Hub
model.push_to_hub("llama_fine_tuning_squad")  # Replace with your model's name on Hugging Face Hub
tokenizer.push_to_hub("tokenizer_squad")

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/munikumar4689/tokenizer_squad/commit/00d1bf42fce80b9f08faa160c5af56bf5417c17b', commit_message='Upload tokenizer', commit_description='', oid='00d1bf42fce80b9f08faa160c5af56bf5417c17b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/munikumar4689/tokenizer_squad', endpoint='https://huggingface.co', repo_type='model', repo_id='munikumar4689/tokenizer_squad'), pr_revision=None, pr_num=None)