In [66]:
pip install transformers datasets torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [67]:
from transformers import BertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch

In [68]:
# Load training utterances and answers
with open("data/train_utt.txt", "r") as f:
    utterances = f.readlines()

with open("./data/train_ans.txt", "r") as f:
    slots = f.readlines()

# Strip whitespace
utterances = [utt.strip() for utt in utterances]
slots = [slot.strip() for slot in slots]


Parsing slot annotations into dictionaries and pairing them with their respective utterances. This part was intended to help us understanding the parsing process for our utterances. 

Future implementation: for higher-level intent, we should do use_cases and do single canonical use cases. For example "school", "work". 

In [69]:
def parse_slots(slot):
    pairs = slot.split("|")[1:] 
    slot_dict = {}
    for pair in pairs:
        key, value = pair.split("=")
        slot_dict[key] = value
    return slot_dict

# Convert slot strings to dictionaries
slot_dicts = [parse_slots(slot) for slot in slots]

# Combine utterances with their parsed slot dictionaries
utterance_slot_pairs = list(zip(utterances, slot_dicts))

# Display the parsed dictionaries
for i, (utterance, slot_dict) in enumerate(utterance_slot_pairs[:10]):  # Limit to first 10
    print(f"Utterance {i+1}: {utterance}")
    print(f"{slot_dict}")
    print()

Utterance 1: Can you show me laptops from a specific brand, like Apple?
{'brand': 'Apple'}

Utterance 2: I need a budget-friendly laptop. What’s the cheapest one you have?
{'price': 'cheap'}

Utterance 3: I need a laptop with a powerful processor for heavy tasks. Any suggestions?
{'processor_tier': 'high'}

Utterance 4: Can you recommend a laptop with at least a Core i5 processor?
{'processor_tier': 'i5+'}

Utterance 5: I need a compact laptop with a smaller display. What are my options?
{'display_size': 'small'}

Utterance 6: What’s the most affordable laptop with a 15.6-inch display?
{'display_size': '15.6', 'price': 'cheap'}

Utterance 7: I am trying to find a laptop that has a core i9 processor and display size of 15.6, what are my options?
{'processor_tier': 'i9', 'display_size': '15.6'}

Utterance 8: I am looking specifically for a Samsung laptop.
{'brand': 'Samsung'}

Utterance 9: I want a laptop that has at least 16GB RAM and doesn’t cost more than $500. Any options?
{'ram_memo

In [70]:
from collections import defaultdict
intent_dic = {'find_laptop':defaultdict(set)}
intent_dic

{'find_laptop': defaultdict(set, {})}

In [71]:
from collections import defaultdict

# Initialize dictionary for 'find_laptop' intent
intent_dic = {'find_laptop': defaultdict(set)}

# Placeholder for context and answers related to laptops
laptop_context = []
laptop_answers = []

# File paths to the dataset
utt_file_path = "data/train_utt.txt"
ans_file_path = "data/train_ans.txt"

# Process dataset to extract laptop-related information
with open(utt_file_path, 'r') as f1:
    with open(ans_file_path, 'r') as f2:
        utterances = f1.readlines()
        for i, line in enumerate(f2.readlines()):
            a_line = line.strip()
            ans = a_line.split('|')  
            intent = ans[0]  # Extract the intent
            if intent == 'find_laptop':  # Only process 'find_laptop' intent
                for a in ans[1:]:
                    if '=' in a:
                        slot, value = a.split('=')  # Split slot-value pair
                        laptop_context.append(utterances[i].strip())  # Save context
                        laptop_answers.append(value)  # Save the value
                        intent_dic[intent][slot].add(value)  # Add slot-value pair to intent_dic

# Output the updated intent dictionary
print(intent_dic)

{'find_laptop': defaultdict(<class 'set'>, {'brand': {'ASUS', 'lenovo', 'Apple', 'Microsoft', 'Samsung', 'HP', 'Dell'}, 'price': {'700', '500', 'mid-range', 'expensive', 'cheap'}, 'processor_tier': {'high', 'm2', 'decent', 'core i7', 'i7', 'i9', 'm1', 'i5+', 'latest'}, 'display_size': {'15', 'small', '13', '14', '15.6'}, 'ram_memory': {'16GB+', '16GB', 'max', '8GB', '32GB', '8GB+', '256GB+'}, 'brand!': {'Apple'}})}


In [73]:
laptop_context[:3]

['Can you show me laptops from a specific brand, like Apple?',
 'I need a budget-friendly laptop. What’s the cheapest one you have?',
 'I need a laptop with a powerful processor for heavy tasks. Any suggestions?']

In [72]:
laptop_answers[:3]

['Apple', 'cheap', 'high']

Edge Case: Negation for specific 

In [74]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [78]:
def get_answer_span_tensor(question, context, answer):
    # Print inputs before tokenization
    print("Question:", question)
    print("Context:", context)
    print("Answer:", answer)

    # Tokenize inputs
    ques = tokenizer.tokenize(question)
    context = tokenizer.tokenize(context)
    answer = tokenizer.tokenize(answer)

    # Print tokenized inputs
    print("Tokenized Question:", ques)
    print("Tokenized Context:", context)
    print("Tokenized Answer:", answer)

    # Combine tokens with special tokens
    s = ["[CLS]"] + ques + ["[SEP]"] + context + ["[SEP]"]
    print("Combined Tokens (with special tokens):", s)

    # Initialize start and end
    start, end = 0, 0

    # Match tokens to find the answer span
    for id_a, token_a in enumerate(answer):
        for id_s, token_s in enumerate(s):
            if token_a == token_s:
                if answer == s[id_s:id_s + len(answer)]:
                    start = id_s
                    end = id_s + len(answer) - 1
                    break
                elif len(s) - id_s + 1 <= len(answer):
                    break

    # Print start and end positions
    print("Start:", start)
    print("End:", end)

    return torch.tensor([start, end])



In [79]:
# Example inputs
test_question = "What are you looking for in a computer?"
test_context = "I'm looking for laptops like Apple that have an M1 chip."
test_answer = "Apple"

# Call function
tensor = get_answer_span_tensor(question, context, answer)
print("Answer Span Tensor:", tensor)

Question: What specifications?
Context: I'm looking for laptops like Apple that have an M1 chip.
Answer: Apple
Tokenized Question: ['what', 'specifications', '?']
Tokenized Context: ['i', "'", 'm', 'looking', 'for', 'laptop', '##s', 'like', 'apple', 'that', 'have', 'an', 'm1', 'chip', '.']
Tokenized Answer: ['apple']
Combined Tokens (with special tokens): ['[CLS]', 'what', 'specifications', '?', '[SEP]', 'i', "'", 'm', 'looking', 'for', 'laptop', '##s', 'like', 'apple', 'that', 'have', 'an', 'm1', 'chip', '.', '[SEP]']
Start: 13
End: 13
Answer Span Tensor: tensor([13, 13])


convert_to_BERT_tensors function tokenizes input questions and contexts into BERT's tensor format, including input_ids and attention_mask. These inputs are used for token-level slot labeling.

In [80]:
def convert_to_BERT_tensors(questions, contexts):
    '''takes a parallel list of question strings and answer strings'''
    #your code here
    result = tokenizer(questions,contexts,  return_tensors="pt", padding='max_length', truncation=True,  max_length=512)
    return result['input_ids'], result['attention_mask']

In [None]:
test_questions = ["Why?", "How?"]
test_contexts = ["I think it is because we can bluminate", "It was done"" ".join(["very"]*1000) + " well"]

ids, mask = convert_to_BERT_tensors(test_questions,test_contexts)
assert ids.shape == (2,512) # 512 because that's the max allowed sequence length for BERT
assert ids[0][3] == 102 # fourth token is separator
assert list(ids[0][-100:]) == [0]*100 # first row is mostly padding
assert list(ids[1][-100:]) != [0]*100 # second row is not
assert list(mask[0][-100:]) == [0]*100 # first row padding is masked
assert list(mask[1][-100:]) != [0]*100 # second row is not padding, no mask
print("Success!")

Success!


In [45]:
a = [1,2]
b = [3,4]
a.extend(b)
a.extend([3]*4)
a

[1, 2, 3, 4, 3, 3, 3, 3]

In [81]:
laptop_question = ["What laptop specifics?"]*len(laptop_context)

len(laptop_context)

84

In [None]:
#provided code
batch_size = 16

class QAdataset(Dataset):
    '''A dataset for housing QA data, including input_data, output_data, and padding mask'''
    def __init__(self, input_data, output_data,mask):
        self.input_data = input_data
        self.output_data = output_data
        self.mask = mask
        
    def __len__(self):
        return len(self.input_data)
    
    def __getitem__(self, index):
        target = self.output_data[index]
        data_val = self.input_data[index]
        mask = self.mask[index]
        return data_val,target,mask 

In [None]:
import torch
import torch.nn as nn  # Ensure nn is imported

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
criterion = nn.CrossEntropyLoss()

# Create checkpoint directory
import os
if not os.path.exists("checkpoints"):
    os.makedirs("checkpoints")


In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer