In [34]:
# Use the %pip magic explicitly so the install targets this kernel's environment
# without relying on IPython's automagic being enabled.
%pip install transformers datasets torch

Python(71079) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Note: you may need to restart the kernel to use updated packages.


In [35]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch

In [36]:
# Load training utterances and their slot annotations, one entry per line.
# The encoding is explicit because the utterances contain non-ASCII
# punctuation (e.g. ’) that would be mis-decoded wherever the platform
# default encoding is not UTF-8.
with open("data/train_utt.txt", "r", encoding="utf-8") as f:
    # Strip surrounding whitespace (including the trailing newline) while reading.
    utterances = [utt.strip() for utt in f]

with open("data/train_ans.txt", "r", encoding="utf-8") as f:
    slots = [slot.strip() for slot in f]


Parsing slot annotations into dictionaries and pairing them with their respective utterances. This part is intended to help us understand the parsing process for our utterances.

In [37]:
def parse_slots(slot):
    """Parse one annotation line into a ``{slot_name: slot_value}`` dict.

    The line is ``|``-separated: the first field is the intent and is skipped;
    every following field is ``name=value`` or, for a negated slot,
    ``name!=value``.

    Args:
        slot: Annotation string, e.g. ``"find_laptop|brand=Apple|price=cheap"``.

    Returns:
        dict mapping slot name to its value string. For a negated slot the
        ``!`` is removed from the name, so ``brand!=Apple`` is stored as
        ``{"brand": "Apple"}``; the polarity itself is not recorded, and a
        later field with the same name overwrites an earlier one.

    Raises:
        ValueError: If a non-empty field contains no ``=``.
    """
    slot_dict = {}
    for pair in slot.split("|")[1:]:
        if not pair:
            # Empty field, e.g. produced by a trailing "|": nothing to parse.
            continue
        # Split on the first "=" only, so a value may itself contain "=".
        key, separator, value = pair.partition("=")
        if not separator:
            raise ValueError(f"Malformed slot annotation (missing '='): {pair!r}")
        if key.endswith("!"):
            # "name!=value": the "!" belongs to the operator, not to the name.
            key = key[:-1]
        slot_dict[key] = value
    return slot_dict

# Turn every annotation line into a {slot_name: slot_value} dictionary.
slot_dicts = list(map(parse_slots, slots))

# Pair each utterance with the dictionary parsed from the same line number.
utterance_slot_pairs = list(zip(utterances, slot_dicts))

# Preview the first 10 pairs as a quick sanity check of the parsing.
for number, (text, parsed) in enumerate(utterance_slot_pairs[:10], start=1):
    # One print call emitting the same three lines: header, dict, blank line.
    print(f"Utterance {number}: {text}", f"{parsed}", "", sep="\n")

Utterance 1: Can you show me laptops from a specific brand, like Apple?
{'brand': 'Apple'}

Utterance 2: I need a budget-friendly laptop. What’s the cheapest one you have?
{'price': 'cheap'}

Utterance 3: I need a laptop with a powerful processor for heavy tasks. Any suggestions?
{'processor_tier': 'high'}

Utterance 4: Can you recommend a laptop with at least a Core i5 processor?
{'processor_tier': 'i5+'}

Utterance 5: I need a compact laptop with a smaller display. What are my options?
{'display_size': 'small'}

Utterance 6: What’s the most affordable laptop with a 15.6-inch display?
{'display_size': '15.6', 'price': 'cheap'}

Utterance 7: I am trying to find a laptop that has a core i9 processor and display size of 15.6, what are my options?
{'processor_tier': 'i9', 'display_size': '15.6'}

Utterance 8: I am looking specifically for a Samsung laptop.
{'brand': 'Samsung'}

Utterance 9: I want a laptop that has at least 16GB RAM and doesn’t cost more than $500. Any options?
{'ram_memo

In [44]:
from collections import defaultdict

# Per-slot training pairs: each *_context list holds utterance text and the
# matching *_answers list holds the annotated slot value at the same index.
brand_context = []
brand_answers = []
price_context = []
price_answers = []
processor_context = []
processor_answers = []
display_context = []
display_answers = []

# Routes a slot name to its (context, answers) lists. Slots that are not listed
# here are still recorded in slot_dic but produce no context/answer pair.
slot_targets = {
    "brand": (brand_context, brand_answers),
    "price": (price_context, price_answers),
    "processor_tier": (processor_context, processor_answers),
    "display_size": (display_context, display_answers),
}

# slot name -> {"values": set of every distinct value seen for that slot}
slot_dic = defaultdict(lambda: defaultdict(set))

# Explicit UTF-8: the utterances contain non-ASCII punctuation (e.g. ’).
with open("data/train_utt.txt", encoding="utf-8") as f1, \
        open("data/train_ans.txt", encoding="utf-8") as f2:
    # Strip once while reading so the global `utterances` stays clean
    # (no trailing newlines) for any cell that uses it afterwards.
    utterances = [utt.strip() for utt in f1]
    for i, line in enumerate(f2):
        ans = line.strip().split('|')
        # The intent (always 'find_laptop')
        intent = ans[0]

        # Process slots in the answer
        for a in ans[1:]:
            if not a:
                # Empty field, e.g. produced by a trailing "|".
                continue
            # Split on the first "=" only, so a value may itself contain "=".
            slot_name, separator, slot_value = a.partition("=")
            if not separator:
                raise ValueError(f"Malformed slot annotation (missing '='): {a!r}")
            if slot_name.endswith("!"):
                # Negated slot "name!=value": drop the "!" so it is filed
                # under the plain slot name, like a regular slot.
                slot_name = slot_name[:-1]

            # Collect context and answers for the slots we model
            if slot_name in slot_targets:
                contexts, answers = slot_targets[slot_name]
                contexts.append(utterances[i])
                answers.append(slot_value)

            # Add slot values to the slot dictionary
            slot_dic[slot_name]["values"].add(slot_value)

# Display the slot dictionary
slot_dic

defaultdict(<function __main__.<lambda>()>,
            {'brand': defaultdict(set,
                         {'values': {'ASUS',
                           'Apple',
                           'Dell',
                           'HP',
                           'Microsoft',
                           'Samsung',
                           'lenovo'}}),
             'price': defaultdict(set,
                         {'values': {'500',
                           '700',
                           'cheap',
                           'expensive',
                           'mid-range'}}),
             'processor_tier': defaultdict(set,
                         {'values': {'core i7',
                           'decent',
                           'high',
                           'i5+',
                           'i7',
                           'i9',
                           'latest',
                           'm1',
                           'm2'}}),
             'display_size': defaultdict

In [42]:
# Spot-check: the first three utterances collected for the "brand" slot.
brand_context[:3]

['Can you show me laptops from a specific brand, like Apple?',
 'I am looking specifically for a Samsung laptop.',
 'Can you recommend a laptop with a small display size but that is not an Apple brand?']

In [43]:
# Spot-check: the first three "brand" slot values, index-aligned with brand_context.
brand_answers[:3]

['Apple', 'Samsung', 'Apple']

Edge Case: Negation for specific 