In [1]:
import os
import json 
import logging

# Configure file logging. logging.FileHandler does not create missing parent
# directories, so make sure the log directory exists before basicConfig opens
# the file (otherwise a fresh checkout fails with FileNotFoundError).
log_path = 'log/app.log'
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logging.basicConfig(
    filename=log_path,                 # Log file name
    level=logging.DEBUG,               # Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format='%(asctime)s - %(levelname)s - %(message)s'  # Log record format
)

# Load the environment configuration JSON data.
# Expected keys (both read below): 'HF_HOME' and 'access_token'.
json_path = 'env_config.json'
with open(json_path, 'r', encoding='utf-8') as file:
    env_config = json.load(file)

# Point the Hugging Face cache at the configured directory. This has to happen
# before any Hugging Face library is imported, which is why it lives in the
# first cell.
hf_home = env_config['HF_HOME']
os.environ['HF_HOME'] = hf_home

# Expose the Hub access token to the Hugging Face libraries.
# `HF_TOKEN` is the variable huggingface_hub actually reads; the original
# `HUGGINGFACE_HUB_TOKEN` name is kept as well so that anything else relying
# on it keeps working.
access_token = env_config['access_token']
os.environ['HUGGINGFACE_HUB_TOKEN'] = access_token
os.environ['HF_TOKEN'] = access_token

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer


# Load the SQuAD question-answering dataset by its Hub repository id.
ds = load_dataset(path="rajpurkar/squad")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Display one raw training record to see which fields are available.
ds['train'][1]

{'id': '5733be284776f4190066117f',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'What is in front of the Notre Dame Main Building?',
 'answers': {'text': ['a copper statue of Christ'], 'answer_start': [188]}}

In [6]:
# Sanity check: slicing the context at the record's own `answer_start` offset
# should reproduce the answer text. The offset is read from the record instead
# of being hard-coded, so the check works for any example index.
sample = ds['train'][0]
answer_text = sample['answers']['text'][0]
answer_start = sample['answers']['answer_start'][0]
answer_len = len(answer_text)
sample['context'][answer_start:answer_start + answer_len]

'Saint Bernadette Soubirous'

In [12]:
import torch

class LlmExpHelper:
    """Collate question-answering examples into chat-templated, tokenized batches.

    Each example's ``context`` and ``question`` are merged into a single user
    message, wrapped together with a fixed system prompt by the tokenizer's
    chat template, and tokenized with padding. The returned batch also carries
    a ``context_mask`` tensor that is 1 on the tokens of the user message text
    and 0 everywhere else (template tokens and padding).

    Args:
        tokenizer: Object providing ``apply_chat_template``, ``encode`` and
            ``__call__`` (presumably a Hugging Face chat tokenizer -- confirm
            against the cell that creates it).
        max_context_words: Contexts with more whitespace-separated words than
            this are truncated to their first ``max_context_words`` words.
        template_suffix_len: Number of tokens the chat template emits between
            the end of the user text and the end of the prompt. The default of
            5 is the value this class used to hard-code.
            NOTE(review): this is tokenizer/template specific -- verify it for
            the tokenizer in use.
    """

    SYSTEM_PROMPT = "Answer the question based on the context."

    def __init__(self, tokenizer, max_context_words=1024, template_suffix_len=5):
        self.tokenizer = tokenizer
        self.max_context_words = max_context_words
        self.template_suffix_len = template_suffix_len

    def get_collate_fun(self):
        """Return a callable ``examples -> batch`` suitable as a collate function."""
        return self.collate_fn

    @staticmethod
    def _clip_words(text, max_words):
        """Return ``text`` unchanged, or its first ``max_words`` words if it has more."""
        words = text.split()
        if len(words) <= max_words:
            return text
        return ' '.join(words[:max_words])

    def collate_fn(self, examples):
        """Build one padded batch from an iterable of example dicts.

        Args:
            examples: Iterable of mappings that each have a ``'context'`` and
                a ``'question'`` string.

        Returns:
            The tokenizer output (``input_ids``, ``attention_mask``, ...) with
            an extra ``'context_mask'`` entry of the same shape and dtype as
            ``input_ids``.
        """
        tokenizer = self.tokenizer

        # The clip limit counts whitespace-separated words, not characters or tokens.
        texts = [
            f"Context: {self._clip_words(example['context'], self.max_context_words)}\n Question: {example['question']}"
            for example in examples
        ]

        messages = [
            [
                {"role": "system", "content": self.SYSTEM_PROMPT},
                {"role": "user", "content": text},
            ]
            for text in texts
        ]

        messages_with_template_applied = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        batch = tokenizer(
            messages_with_template_applied,
            add_special_tokens=False,  # the chat template already inserted them
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,  # needed below to locate the padding
        )

        # Token count of each bare user text. Special tokens are disabled
        # explicitly so the count does not depend on how many BOS/EOS tokens
        # a particular tokenizer would add.
        text_lens = [
            len(tokenizer.encode(text, add_special_tokens=False)) for text in texts
        ]

        # Mark the user text inside each row. The span is anchored on the last
        # real (non-padding) token reported by the attention mask, not on the
        # end of the padded row, so it is correct for both left- and
        # right-padding tokenizers.
        attention_mask = batch['attention_mask']
        context_mask = torch.zeros_like(batch['input_ids'])
        for row, text_len in enumerate(text_lens):
            real_positions = torch.nonzero(attention_mask[row]).flatten()
            if real_positions.numel() == 0:
                continue  # row is all padding; nothing to mark
            seq_end = int(real_positions[-1]) + 1
            span_stop = max(seq_end - self.template_suffix_len, 0)
            span_start = max(span_stop - text_len, 0)
            context_mask[row, span_start:span_stop] = 1

        batch['context_mask'] = context_mask

        return batch

# NOTE(review): `tokenizer` is not created in any cell visible in this notebook
# section -- it must already exist in the kernel for this to run on a fresh
# Restart & Run All. Confirm where it is defined.
llm_exp_helper = LlmExpHelper(tokenizer)
# Callable that turns a list/dataset of examples into one padded batch.
collate_fn = llm_exp_helper.get_collate_fun()

In [13]:
# Smoke test: collate validation rows 0-9 into a single batch.
test_ds = ds['validation'].select(range(10))
test_batch = collate_fn(test_ds)

In [17]:
# Shape of the padded token-id tensor produced by the collate function.
test_batch['input_ids'].shape

torch.Size([10, 225])

In [19]:
# Decode the first row back to text to inspect the applied chat template and padding.
tokenizer.decode(test_batch['input_ids'][0])

'<|im_start|>system\nAnswer the question based on the context.<|im_end|>\n<|im_start|>user\nContext: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.\n Question: Which NFL team represented the AFC at Super Bowl 50?<|im_end|>\n<|im_start|>assistant\n<|end_of_text|><|end_of