In [37]:
import torch
from transformers import DistilBertForSequenceClassification, DistilBertForTokenClassification
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizerFast
from tqdm import tqdm

In [46]:
data = [
    {"text": "Book a flight from New York to Los Angeles on friday", "intent": "BookFlight", "slots": {"loc1": "New York", "loc2": "Los Angeles", "Date": "Friday"}},
    {"text": "Book a flight from Dhaka to Los Angeles on friday", "intent": "BookFlight", "slots": {"loc1": "Dhaka", "loc2": "Los Angeles", "Date": "Friday"}},
    {"text": "Book a flight from Ctg to Los Angeles on friday", "intent": "BookFlight", "slots": {"loc1": "Ctg", "loc2": "Los Angeles", "Date": "Friday"}},
    {"text": "Book a flight from New York to Dhaka on friday", "intent": "BookFlight", "slots": {"loc1": "New York", "loc2": "Dhaka", "Date": "Friday"}},
    {"text": "Book a flight from New York to Los Angeles on saturday", "intent": "BookFlight", "slots": {"loc1": "New York", "loc2": "Los Angeles", "Date": "saturday"}},
    {"text": "Book a flight from New York to Los Angeles on friday", "intent": "BookFlight", "slots": {"loc1": "New York", "loc2": "Los Angeles", "Date": "Friday"}},
    {"text": "Book a flight from Dhaka to Los Angeles on friday", "intent": "BookFlight", "slots": {"loc1": "Dhaka", "loc2": "Los Angeles", "Date": "Friday"}},
    {"text": "Book a flight from Ctg to Los Angeles on friday", "intent": "BookFlight", "slots": {"loc1": "Ctg", "loc2": "Los Angeles", "Date": "Friday"}},
    {"text": "Book a flight from New York to Dhaka on friday", "intent": "BookFlight", "slots": {"loc1": "New York", "loc2": "Dhaka", "Date": "Friday"}},
    {"text": "Book a flight from New York to Los Angeles on saturday", "intent": "BookFlight", "slots": {"loc1": "New York", "loc2": "Los Angeles", "Date": "saturday"}},
    {"text": "Book a flight from New York to Los Angeles on friday", "intent": "BookFlight", "slots": {"loc1": "New York", "loc2": "Los Angeles", "Date": "Friday"}},
    {"text": "Book a flight from Dhaka to Los Angeles on friday", "intent": "BookFlight", "slots": {"loc1": "Dhaka", "loc2": "Los Angeles", "Date": "Friday"}},
    {"text": "Book a flight from Ctg to Los Angeles on friday", "intent": "BookFlight", "slots": {"loc1": "Ctg", "loc2": "Los Angeles", "Date": "Friday"}},
    {"text": "Book a flight from New York to Dhaka on friday", "intent": "BookFlight", "slots": {"loc1": "New York", "loc2": "Dhaka", "Date": "Friday"}},
    {"text": "Book a flight from New York to Los Angeles on saturday", "intent": "BookFlight", "slots": {"loc1": "New York", "loc2": "Los Angeles", "Date": "saturday"}},
    {"text": "Book a flight from New York to Los Angeles on friday", "intent": "BookFlight", "slots": {"loc1": "New York", "loc2": "Los Angeles", "Date": "Friday"}},
    {"text": "Book a flight from Dhaka to Los Angeles on friday", "intent": "BookFlight", "slots": {"loc1": "Dhaka", "loc2": "Los Angeles", "Date": "Friday"}},
    {"text": "Book a flight from Ctg to Los Angeles on friday", "intent": "BookFlight", "slots": {"loc1": "Ctg", "loc2": "Los Angeles", "Date": "Friday"}},
    {"text": "Book a flight from New York to Dhaka on friday", "intent": "BookFlight", "slots": {"loc1": "New York", "loc2": "Dhaka", "Date": "Friday"}},
    {"text": "Book a flight from New York to Los Angeles on saturday", "intent": "BookFlight", "slots": {"loc1": "New York", "loc2": "Los Angeles", "Date": "saturday"}},

    # ... more examples ...
]


In [45]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 50  # Adjust this depending on your data

def encode_data(data):
    texts = [item['text'] for item in data]
    encoding = tokenizer(texts, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')
    intents = [item['intent'] for item in data]
    slots = [item['slots'] for item in data]
    return encoding, intents, slots

encoded_data, intents, slots = encode_data(data)

In [30]:
from torch.utils.data import DataLoader, random_split, Dataset

class IntentSlotDataset(Dataset):
    def __init__(self, encodings, intents, slots):
        self.encodings = encodings
        self.intents = intents
        self.slots = slots

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['intents'] = self.intents[idx]
        item['slots'] = self.slots[idx]
        return item

    def __len__(self):
        return len(self.intents)

dataset = IntentSlotDataset(encoded_data, intents, slots)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


In [31]:
dataloader = DataLoader(encoded_data, batch_size=8)

In [32]:
# Initialize intent detection model
intent_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=8)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
intent_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [34]:
optimizer_intent = torch.optim.Adam(intent_model.parameters(), lr=2e-5)

In [44]:
for epoch in range(3):  # Number of epochs
    intent_model.train()
    # slot_model.train()

    for batch in tqdm(dataloader):
        input_ids = torch.tensor(batch["input_ids"]).to(device)
        attention_mask = torch.tensor(batch["attention_mask"]).to(device)
        intent_labels = torch.tensor(batch["intent_label"]).to(device)

        optimizer_intent.zero_grad()
        # optimizer_slot.zero_grad()

        # Intent Detection
        intent_outputs = intent_model(input_ids=input_ids, attention_mask=attention_mask)
        intent_loss = intent_criterion(intent_outputs.logits, intent_labels)
        intent_loss.backward()
        optimizer_intent.step()

        # # Slot Filling
        # slot_outputs = slot_model(input_ids=input_ids, attention_mask=attention_mask)
        # slot_loss = compute_slot_loss(slot_outputs.logits, slot_labels)
        # slot_loss.backward()
        # optimizer_slot.step()


  0%|          | 0/3 [00:00<?, ?it/s]


TypeError: only integer tensors of a single element can be converted to an index

In [None]:
import sentencepiece as spm

# Assuming you have a text file named 'bangla_corpus.txt' with lots of Bengali sentences.
spm.SentencePieceTrainer.train('--input=bangla_corpus.txt --model_prefix=m_bangla --vocab_size=16000 --character_coverage=0.9995')
