In [12]:
pip install transformers datasets torch

Note: you may need to restart the kernel to use updated packages.


In [4]:
from transformers import BertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch

In [5]:
# Load training utterances and answers
with open("data/train_utt.txt", "r") as f:
    utterances = f.readlines()

with open("./data/train_ans.txt", "r") as f:
    slots = f.readlines()

# Strip whitespace
utterances = [utt.strip() for utt in utterances]
slots = [slot.strip() for slot in slots]


Parsing slot annotations into dictionaries and pairing them with their respective utterances. This part was intended to help us understanding the parsing process for our utterances. 

Future implementation: for higher-level intent, we should do use_cases and do single canonical use cases. For example "school", "work". 

In [6]:
def parse_slots(slot):
    pairs = slot.split("|")[1:] 
    slot_dict = {}
    for pair in pairs:
        key, value = pair.split("=")
        slot_dict[key] = value
    return slot_dict

# Convert slot strings to dictionaries
slot_dicts = [parse_slots(slot) for slot in slots]

# Combine utterances with their parsed slot dictionaries
utterance_slot_pairs = list(zip(utterances, slot_dicts))

# Display the parsed dictionaries
for i, (utterance, slot_dict) in enumerate(utterance_slot_pairs[:10]):  # Limit to first 10
    print(f"Utterance {i+1}: {utterance}")
    print(f"{slot_dict}")
    print()

Utterance 1: Can you show me laptops from a specific brand, like Apple?
{'brand': 'Apple'}

Utterance 2: I need a budget-friendly laptop. What’s the cheapest one you have?
{'price': 'cheap'}

Utterance 3: I need a laptop with a powerful processor for heavy tasks. Any suggestions?
{'processor_tier': 'high'}

Utterance 4: Can you recommend a laptop with at least a Core i5 processor?
{'processor_tier': 'i5+'}

Utterance 5: I need a compact laptop with a smaller display. What are my options?
{'display_size': 'small'}

Utterance 6: What’s the most affordable laptop with a 15.6-inch display?
{'display_size': '15.6', 'price': 'cheap'}

Utterance 7: I am trying to find a laptop that has a core i9 processor and display size of 15.6, what are my options?
{'processor_tier': 'i9', 'display_size': '15.6'}

Utterance 8: I am looking specifically for a Samsung laptop.
{'brand': 'Samsung'}

Utterance 9: I want a laptop that has at least 16GB RAM and doesn’t cost more than $500. Any options?
{'ram_memo

In [7]:
from collections import defaultdict
intent_dic = {'find_laptop':defaultdict(set)}
intent_dic

{'find_laptop': defaultdict(set, {})}

In [8]:
from collections import defaultdict

# Initialize dictionary for 'find_laptop' intent
intent_dic = {'find_laptop': defaultdict(set)}

# Placeholder for context and answers related to laptops
laptop_context = []
laptop_answers = []

# File paths to the dataset
utt_file_path = "data/train_utt.txt"
ans_file_path = "data/train_ans.txt"

# Process dataset to extract laptop-related information
with open(utt_file_path, 'r') as f1:
    with open(ans_file_path, 'r') as f2:
        utterances = f1.readlines()
        for i, line in enumerate(f2.readlines()):
            a_line = line.strip()
            ans = a_line.split('|')  
            intent = ans[0]  # Extract the intent
            if intent == 'find_laptop':  # Only process 'find_laptop' intent
                for a in ans[1:]:
                    if '=' in a:
                        slot, value = a.split('=')  # Split slot-value pair
                        laptop_context.append(utterances[i].strip())  # Save context
                        laptop_answers.append(value)  # Save the value
                        intent_dic[intent][slot].add(value)  # Add slot-value pair to intent_dic

# Output the updated intent dictionary
print(intent_dic)

{'find_laptop': defaultdict(<class 'set'>, {'brand': {'Dell', 'lenovo', 'ASUS', 'Microsoft', 'HP', 'Apple', 'Samsung'}, 'price': {'cheap', '700', 'expensive', 'mid-range', '500'}, 'processor_tier': {'decent', 'high', 'm1', 'i7', 'i5+', 'i9', 'core i7', 'm2', 'latest'}, 'display_size': {'15', '14', '15.6', 'small', '13'}, 'ram_memory': {'16GB', '256GB+', '16GB+', 'max', '8GB', '8GB+', '32GB'}, 'brand!': {'Apple'}})}


In [9]:
laptop_context[:3]

['Can you show me laptops from a specific brand, like Apple?',
 'I need a budget-friendly laptop. What’s the cheapest one you have?',
 'I need a laptop with a powerful processor for heavy tasks. Any suggestions?']

In [10]:
laptop_answers[:3]

['Apple', 'cheap', 'high']

Edge Case: Negation for specific 

In [11]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

get_answer_span_tensor identifies the span of a given answer within a tokenized context when paired with a question. used in question-answering tasks.

span based approach to find the start and end positions of the answer within the combined tokenized sequence of question and context. will help with superised training for BERT to allow the model to pinpoint the slot values directly in the context.

In [13]:
def get_answer_span_tensor(question, context, answer):
    # Print inputs before tokenization
    print("Question:", question)
    print("Context:", context)
    print("Answer:", answer)

    # Tokenize inputs
    ques = tokenizer.tokenize(question)
    context = tokenizer.tokenize(context)
    answer = tokenizer.tokenize(answer)

    # Print tokenized inputs
    print("Tokenized Question:", ques)
    print("Tokenized Context:", context)
    print("Tokenized Answer:", answer)

    # Combine tokens with special tokens
    s = ["[CLS]"] + ques + ["[SEP]"] + context + ["[SEP]"]
    print("Combined Tokens (with special tokens):", s)

    # Initialize start and end
    start, end = 0, 0

    # Match tokens to find the answer span
    for id_a, token_a in enumerate(answer):
        for id_s, token_s in enumerate(s):
            if token_a == token_s:
                if answer == s[id_s:id_s + len(answer)]:
                    start = id_s
                    end = id_s + len(answer) - 1
                    break
                elif len(s) - id_s + 1 <= len(answer):
                    break

    # Print start and end positions
    print("Start:", start)
    print("End:", end)

    return torch.tensor([start, end])



In [14]:
# Example inputs
test_question = "Can you suggest a laptop with a Core i7 processor and 16GB of RAM?"
test_context = """
Available laptops:
1. Dell Inspiron: Core i5, 8GB RAM, $800.
2. HP Spectre x360: Core i7, 16GB RAM, $1500.
3. Apple MacBook Pro: M1 chip, 16GB RAM, $2000.
4. Lenovo ThinkPad: AMD Ryzen 5, 8GB RAM, $1000.
"""
test_answer = "HP Spectre x360"
bad_answer = "Dell Inspiron"  

# Call function
tensor = get_answer_span_tensor(test_question, test_context, test_answer)
print("Answer Span Tensor:", tensor)

Question: Can you suggest a laptop with a Core i7 processor and 16GB of RAM?
Context: 
Available laptops:
1. Dell Inspiron: Core i5, 8GB RAM, $800.
2. HP Spectre x360: Core i7, 16GB RAM, $1500.
3. Apple MacBook Pro: M1 chip, 16GB RAM, $2000.
4. Lenovo ThinkPad: AMD Ryzen 5, 8GB RAM, $1000.

Answer: HP Spectre x360
Tokenized Question: ['can', 'you', 'suggest', 'a', 'laptop', 'with', 'a', 'core', 'i', '##7', 'processor', 'and', '16', '##gb', 'of', 'ram', '?']
Tokenized Context: ['available', 'laptop', '##s', ':', '1', '.', 'dell', 'ins', '##pi', '##ron', ':', 'core', 'i', '##5', ',', '8', '##gb', 'ram', ',', '$', '800', '.', '2', '.', 'hp', 'spec', '##tre', 'x', '##36', '##0', ':', 'core', 'i', '##7', ',', '16', '##gb', 'ram', ',', '$', '1500', '.', '3', '.', 'apple', 'mac', '##book', 'pro', ':', 'm1', 'chip', ',', '16', '##gb', 'ram', ',', '$', '2000', '.', '4', '.', 'len', '##ovo', 'think', '##pad', ':', 'am', '##d', 'ry', '##zen', '5', ',', '8', '##gb', 'ram', ',', '$', '1000', '.']
T

The ## prefix in tokenized words (e.g., ##tre in the tokenized word "spectre") is a characteristic of subword tokenization used in models like BERT. Helps handle rare words or words that were not part of the model's training data. Instead of ignoring or failing to recognize unknown words, BERT splits them into subword tokens

In [26]:
span = get_answer_span_tensor(test_question,test_context,test_answer)
# print(span)
assert span.shape == (2,)
# uncomment to double check
# assert list(span) == [43,48]

print("\nBad Answer Tensor")
span = get_answer_span_tensor(test_question,test_context,bad_answer)
# uncomment to double check, if the code runs and the assertion error appears then success shouldn't pop up
assert list(span) == [25,28]
print('Success!')

Question: Can you suggest a laptop with a Core i7 processor and 16GB of RAM?
Context: 
Available laptops:
1. Dell Inspiron: Core i5, 8GB RAM, $800.
2. HP Spectre x360: Core i7, 16GB RAM, $1500.
3. Apple MacBook Pro: M1 chip, 16GB RAM, $2000.
4. Lenovo ThinkPad: AMD Ryzen 5, 8GB RAM, $1000.

Answer: HP Spectre x360
Tokenized Question: ['can', 'you', 'suggest', 'a', 'laptop', 'with', 'a', 'core', 'i', '##7', 'processor', 'and', '16', '##gb', 'of', 'ram', '?']
Tokenized Context: ['available', 'laptop', '##s', ':', '1', '.', 'dell', 'ins', '##pi', '##ron', ':', 'core', 'i', '##5', ',', '8', '##gb', 'ram', ',', '$', '800', '.', '2', '.', 'hp', 'spec', '##tre', 'x', '##36', '##0', ':', 'core', 'i', '##7', ',', '16', '##gb', 'ram', ',', '$', '1500', '.', '3', '.', 'apple', 'mac', '##book', 'pro', ':', 'm1', 'chip', ',', '16', '##gb', 'ram', ',', '$', '2000', '.', '4', '.', 'len', '##ovo', 'think', '##pad', ':', 'am', '##d', 'ry', '##zen', '5', ',', '8', '##gb', 'ram', ',', '$', '1000', '.']
T

convert_to_BERT_tensors function tokenizes input questions and contexts into BERT's tensor format, including input_ids and attention_mask. These inputs are used for token-level slot labeling.

In [19]:
def convert_to_BERT_tensors(questions, contexts):
    '''takes a parallel list of question strings and answer strings'''
    #your code here
    result = tokenizer(questions,contexts,  return_tensors="pt", padding='max_length', truncation=True,  max_length=512)
    return result['input_ids'], result['attention_mask']

In [20]:
test_questions = ["Why?", "How?"]
test_contexts = ["I think it is because we can bluminate", "It was done"" ".join(["very"]*1000) + " well"]

ids, mask = convert_to_BERT_tensors(test_questions,test_contexts)
assert ids.shape == (2,512) # 512 because that's the max allowed sequence length for BERT
assert ids[0][3] == 102 # fourth token is separator
assert list(ids[0][-100:]) == [0]*100 # first row is mostly padding
assert list(ids[1][-100:]) != [0]*100 # second row is not
assert list(mask[0][-100:]) == [0]*100 # first row padding is masked
assert list(mask[1][-100:]) != [0]*100 # second row is not padding, no mask
print("Success!")

Success!


In [21]:
ids, mask = convert_to_BERT_tensors(test_questions, test_contexts)

In [None]:
#provided code
batch_size = 16

class QAdataset(Dataset):
    '''A dataset for housing QA data, including input_data, output_data, and padding mask'''
    def __init__(self, input_data, output_data,mask):
        self.input_data = input_data
        self.output_data = output_data
        self.mask = mask
        
    def __len__(self):
        return len(self.input_data)
    
    def __getitem__(self, index):
        target = self.output_data[index]
        data_val = self.input_data[index]
        mask = self.mask[index]
        return data_val,target,mask 

In [None]:
import torch
import torch.nn as nn  

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
criterion = nn.CrossEntropyLoss()

# Create checkpoint directory
import os
if not os.path.exists("checkpoints"):
    os.makedirs("checkpoints")


The imports and downloads in the two cells below were intended to use for intent classification prediction but because we only have one intent to work with, we don't have to do the extra work with training a classifier to predict the intent. The intent is already fixed, we can focus entirely on slot extraction (e.g., extracting price, brand, RAM, etc.) from the user utterances.

In [None]:
# Load user utterances for training, development, and test
with open("data/train_utt.txt") as f:
    train = f.read().splitlines()

with open("data/dev_utt.txt") as f:
    dev = f.read().splitlines()

with open("data/test_utt.txt") as f:
    test = f.read().splitlines()

# Initialize empty lists for category and facility (slots)
train_category = []
train_facility = []

# Load answers (slots and categories) for training data
with open("data/train_ans.txt") as f:
    for line in f:
        splitting = line.strip().split('|')
        # The intent (e.g., find_laptop)
        train_category.append(splitting[0])  
        train_facility.append(splitting[1:])  # Slot-value pairs (e.g., brand=Apple, price=cheap)

# Load answers for development data
dev_category = []
dev_facility = []
with open("data/dev_ans.txt") as f:
    for line in f:
        splitting = line.strip().split('|')
        dev_category.append(splitting[0])  
        dev_facility.append(splitting[1:])  # Slot-value pairs


In [None]:
def predict_display_size(utterance):
    display_sizes = ["13-inch", "14-inch", "15-inch", "15.6-inch", "17-inch"]
    utterance = utterance.lower()

    for size in display_sizes:
        if size in utterance:
            return f"display_size={size}"
    return "display_size=unknown"


In [None]:
def predict_price(utterance, establishment, filter=[]):
    utterance = utterance.lower()
    for f in filter:
        utterance = utterance.replace(f, "XXXX")
    
    cheap_keywords = ["cheap", "inexpensive", "affordable", "budget-friendly", "low price"]
    expensive_keywords = ["expensive", "high-end", "costly", "premium", "upscale"]
    midrange_keywords = ["moderate", "mid-range", "middle price", "mid price"]
    dontcare_keywords = ["price doesn't matter", "any price", "doesn't care about cost"]

    # Check if the utterance indicates a price range
    if any(keyword in utterance for keyword in cheap_keywords):
        return f"{establishment}-price=cheap"
    if any(keyword in utterance for keyword in expensive_keywords):
        return f"{establishment}-price=expensive"
    if any(keyword in utterance for keyword in midrange_keywords):
        return f"{establishment}-price=mid-range"
    if any(keyword in utterance for keyword in dontcare_keywords):
        return f"{establishment}-price=dontcare"
    
    return ""

In [None]:
def predict_ram(utterance, establishment, filter=[]):
    utterance = utterance.lower()
    for f in filter:
        utterance = utterance.replace(f, "XXXX")
    
    ram_sizes = ["8gb", "16gb", "32gb", "64gb"]
    dontcare_keywords = ["don't care about memory", "any memory size", "no preference for RAM"]

    # Check if the utterance specifies a RAM size
    for ram in ram_sizes:
        if ram in utterance:
            return f"{establishment}-ram_memory={ram.upper()}"
    
    # Check if the user doesn't care about RAM
    if any(keyword in utterance for keyword in dontcare_keywords):
        return f"{establishment}-ram_memory=dontcare"
    
    return ""

In [None]:
def predict_brand(utterance, establishment, filter=[]):
    utterance = utterance.lower()
    for f in filter:
        utterance = utterance.replace(f, "XXXX")
    
    brands = ["apple", "dell", "hp", "samsung", "lenovo", "asus", "microsoft"]

    for brand in brands:
        if brand in utterance:
            return f"{establishment}-brand={brand.capitalize()}"
    
    return ""


In [None]:
def predict_processor_tier(utterance):
    tiers = {
        "high": ["high-end", "powerful", "m1", "i9", "ryzen 9"],
        "mid": ["mid-range", "balanced", "i7", "ryzen 7"],
        "low": ["low-end", "budget", "i5", "ryzen 5"]
    }
    utterance = utterance.lower()

    for tier, keywords in tiers.items():
        if any(keyword in utterance for keyword in keywords):
            return f"processor_tier={tier}"
    return "processor_tier=unknown"


In [None]:
# process the utterance through all slot-prediction functions and combine the results
def extract_slots(utterance):
    slots = []
    slots.append(predict_price(utterance))
    slots.append(predict_display_size(utterance))
    slots.append(predict_processor_tier(utterance))
    slots.append(predict_brand(utterance))
    return [slot for slot in slots if "unknown" not in slot]  # Filter out unknown slots

In [None]:
correct = 0
incorrect = 0

for utt, ans, cat in list(zip(train, train_facility, train_category)):
    # Example: Predict price
    pred_price = predict_price(utt, "laptop")
    if ("-price" in " ".join(ans) and pred_price in " ".join(ans)) or ("-price" not in " ".join(ans) and pred_price == ""):
        correct += 1
    else:
        print(f"Utterance: {utt}")
        print(f"Expected: {ans}")
        print(f"Prediction: {pred_price}")
        incorrect += 1

print(f"Correct: {correct}")
print(f"Incorrect: {incorrect}")


Utterance: I need a budget-friendly laptop. What’s the cheapest one you have?
Expected: ['price=cheap']
Prediction: laptop-price=cheap
Utterance: What’s the most affordable laptop with a 15.6-inch display?
Expected: ['display_size=15.6', 'price=cheap']
Prediction: laptop-price=cheap
Utterance: Can you find me a Dell laptop that’s cheap?
Expected: ['brand=Dell', 'price=500']
Prediction: laptop-price=cheap
Utterance: Can you find a laptop with a 15.6-inch screen and under $700, but with a high-end processor?
Expected: ['display_size=15.6', 'price=700', 'processor_tier=high']
Prediction: laptop-price=expensive
Utterance: Can you help me find a good and cheap laptop that is suitable for machine learning?
Expected: ['ram_memory=16GB', 'processor_tier=m1', 'price=cheap']
Prediction: laptop-price=cheap
Utterance: I’m a college student and I’m looking for an affordable laptop
Expected: ['brand=lenovo', 'price=cheap']
Prediction: laptop-price=cheap
Utterance: Can you help me find a cheap HP lap