# 📦 1. Install Dependencies
!pip install transformers scikit-learn pandas torch joblib

# 📚 1b. Import libraries

In [1]:
import json
import pandas as pd

# 📂 2. Load intents.json

In [2]:
# Read the intent -> example-phrases mapping. The encoding is given explicitly
# so the result does not depend on the platform's default locale encoding.
with open("../intents.json", "r", encoding="utf-8") as f:
    intents = json.load(f)

# Flatten into DataFrame: one {"text", "intent"} record per example phrase.
data = [
    {"text": phrase, "intent": intent}
    for intent, phrases in intents.items()
    for phrase in phrases
]

df = pd.DataFrame(data)
# Show a few random rows as a quick sanity check of the flattening.
df.sample(5)

Unnamed: 0,text,intent
26,thank you very much,thank_you
13,farewell,goodbye
18,until next time,goodbye
10,bye,goodbye
4,hey there,greet


# 🏷️ 3. Label Encoding

In [3]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the intent names and store the resulting integer id
# next to each phrase in a new "label" column.
le = LabelEncoder()
df["label"] = le.fit_transform(df["intent"])

# Number of distinct intents seen while fitting (one output class each).
num_labels = le.classes_.shape[0]

# 🔠 4. Tokenize Text

In [4]:
from transformers import BertTokenizer

# Tokenizer matching the "bert-base-uncased" checkpoint loaded later on.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode every phrase in one batch and return PyTorch tensors; padding and
# truncation are enabled so all rows can share one rectangular tensor.
tokens = tokenizer(
    df["text"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# 🧱 5. Create Dataset

In [5]:
import torch

class IntentDataset(torch.utils.data.Dataset):
    """Dataset that pairs per-example encodings with their labels.

    Args:
        encodings: Object exposing ``.items()`` that yields
            ``(field_name, values)`` pairs, where ``values[idx]`` is the
            entry for example ``idx``.
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx``: every encoding field plus a ``labels`` tensor."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = values[idx]
        # The label is wrapped in a tensor and stored under the "labels" key.
        example["labels"] = torch.tensor(self.labels[idx])
        return example

# Wrap the tokenized phrases and their integer labels for the trainer.
dataset = IntentDataset(encodings=tokens, labels=df["label"].tolist())

# 🤖 6. Load BERT for Classification

In [6]:
from transformers import BertForSequenceClassification

# The classification head (classifier.weight / classifier.bias) is not part of
# the pretrained checkpoint and is initialised randomly, so seed torch's RNG
# first to make the starting weights repeatable across runs.
torch.manual_seed(42)

# One output logit per intent class; `num_labels` comes from the label-encoding
# step, so the model head and the encoder cannot drift apart.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 🏋️ 7. Train the Model

In [7]:
from transformers import Trainer, TrainingArguments

# Training configuration: 20 epochs with a per-device batch size of 4,
# a loss log entry every 10 steps, and no external reporting integration.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=4,
    logging_steps=10,
    logging_dir="./logs",
    report_to="none",
)

# Only a training set is supplied; no evaluation dataset is passed.
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

trainer.train()

Step,Training Loss
10,1.383
20,1.0057
30,0.8293
40,0.5188
50,0.2933
60,0.112
70,0.0398
80,0.0259
90,0.0132
100,0.0091


TrainOutput(global_step=180, training_loss=0.237656617951062, metrics={'train_runtime': 282.3537, 'train_samples_per_second': 2.408, 'train_steps_per_second': 0.637, 'total_flos': 2096703872640.0, 'train_loss': 0.237656617951062, 'epoch': 20.0})

# 💾 8. Save Model and Tokenizer

In [11]:
import joblib

# Persist the fine-tuned model and the tokenizer to ./intent_model. Without
# this step the loads below fail on a fresh kernel, because nothing else in
# the notebook writes that directory.
model.save_pretrained("intent_model")
tokenizer.save_pretrained("intent_model")

# Reload both from disk so the objects used for inference below are exactly
# the saved artifacts (this also confirms the directory is loadable).
tokenizer = BertTokenizer.from_pretrained("intent_model")
model = BertForSequenceClassification.from_pretrained("intent_model")


# Save Label Encoder
joblib.dump(le, "label_encoder.pkl")

['label_encoder.pkl']

In [9]:
def predict_intent(text):
    """Return the predicted intent name for ``text``.

    Uses the notebook-level ``tokenizer``, ``model`` and ``le`` objects.

    Args:
        text: The utterance to classify.

    Returns:
        The intent label obtained by mapping the arg-max class index back
        through ``le.inverse_transform``.
    """
    # Inference mode: disables dropout, which would otherwise still be active
    # if the model was last used for training.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Run on whatever device the model lives on (CPU or an accelerator).
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    predicted = torch.argmax(outputs.logits, dim=1)
    # Move to CPU before converting: .numpy() is not available on accelerator tensors.
    return le.inverse_transform(predicted.cpu().numpy())[0]

# Try it! Expected output, in order: greet, joke
for sample_text in ("hiya!", "tell me a joke"):
    print(predict_intent(sample_text))


greet
joke
