# Chatbot NLP – Intents Classification + Q&A

This notebook fine-tunes a lightweight text classifier for intent detection and demonstrates simple answer retrieval based on the predicted intent.

Data format: CSV with `text,intent,answer` under `../data/intents_sample.csv`.

Run from the project root:

```bash
# Windows (cmd/PowerShell); on macOS/Linux use: source .venv/bin/activate
.venv\Scripts\activate
jupyter lab deep_learning/chatbot_nlp/notebooks/chatbot_nlp.ipynb
```



In [None]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# Locate the directory this code lives in. `__file__` is not defined inside a
# Jupyter kernel, so fall back to the working directory there.
# NOTE(review): the fallback assumes the kernel's working directory is the
# notebook's own directory (so that ../data resolves) - confirm.
try:
    _HERE = Path(__file__).resolve().parent
except NameError:
    _HERE = Path.cwd()
DATA_PATH = _HERE.parent / 'data' / 'intents_sample.csv'

# Load data
# Raise explicitly rather than assert: asserts are stripped under `python -O`.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Data file not found: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)
# Rows missing either the text or the intent cannot be used for training.
df = df.dropna(subset=['text', 'intent']).copy()
# Sorting the unique intents makes the intent <-> integer id mapping deterministic.
labels = sorted(df['intent'].unique())
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}
df['label_id'] = df['intent'].map(label2id)

# Split
# Stratify on the label id so both splits keep the intent distribution.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'].tolist(),
    df['label_id'].tolist(),
    test_size=0.2,
    stratify=df['label_id'],
    random_state=42,
)

# Tokenizer & model
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The label mappings are stored in the model config alongside the class count.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

def encode_batch(texts, labels):
    """Tokenize ``texts`` and attach ``labels`` as a tensor.

    Args:
        texts: Inputs handed to the module-level ``tokenizer`` with padding
            and truncation enabled and PyTorch tensors requested.
        labels: Label values converted with ``torch.tensor``.

    Returns:
        The tokenizer output with an additional ``'labels'`` entry.
    """
    tokenizer_options = {'padding': True, 'truncation': True, 'return_tensors': 'pt'}
    batch = tokenizer(texts, **tokenizer_options)
    batch['labels'] = torch.tensor(labels)
    return batch

# Encode the train split first, then the test split (texts plus label ids).
train_enc, test_enc = (
    encode_batch(split_texts, split_labels)
    for split_texts, split_labels in ((X_train, y_train), (X_test, y_test))
)

class SimpleDataset(torch.utils.data.Dataset):
    """Dataset view over a mapping of index-aligned encoded fields.

    ``enc`` must provide an ``'input_ids'`` entry exposing ``.shape``; each
    item is a dict holding the ``idx``-th element of every field in ``enc``.
    """

    def __init__(self, enc):
        self.enc = enc

    def __len__(self):
        # The leading dimension of ``input_ids`` is used as the example count.
        input_ids = self.enc['input_ids']
        return input_ids.shape[0]

    def __getitem__(self, idx):
        example = {}
        for field, values in self.enc.items():
            example[field] = values[idx]
        return example

# Wrap the train and test encodings (in that order) as datasets.
train_ds, test_ds = (SimpleDataset(encoded) for encoded in (train_enc, test_enc))

import inspect

# Training configuration: one epoch, evaluation at the end of each epoch,
# checkpoints and logs written under ./outputs.
_training_kwargs = dict(
    output_dir='outputs',
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=10,
    disable_tqdm=False,
)
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old name; pass whichever this installation accepts.
_training_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_key = 'eval_strategy' if 'eval_strategy' in _training_params else 'evaluation_strategy'
_training_kwargs[_eval_key] = 'epoch'
args = TrainingArguments(**_training_kwargs)

# Likewise, Trainer's `tokenizer` argument was superseded by `processing_class`.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = 'processing_class' if 'processing_class' in _trainer_params else 'tokenizer'
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    **{_tokenizer_key: tokenizer},
)

trainer.train()

# Evaluate
preds = trainer.predict(test_ds).predictions
# Each row of `preds` is reduced to the index of its largest score.
pred_ids = preds.argmax(axis=1)
# Pass every label id explicitly: without `labels=`, scikit-learn infers the
# classes from y_test/pred_ids, and classification_report raises when that
# inferred set does not match the length of `target_names` (e.g. a rare intent
# is absent from the test split). It also keeps the confusion matrix rows and
# columns aligned with the label ids.
all_label_ids = list(range(len(labels)))
print(classification_report(y_test, pred_ids, labels=all_label_ids, target_names=labels))
print(confusion_matrix(y_test, pred_ids, labels=all_label_ids))

# Simple retrieval for answers
# Map each intent to an answer string. When several rows share an intent the
# last row wins (dict construction keeps the final value for a repeated key).
# Only `text` and `intent` are NaN-filtered when loading, so blank answers are
# replaced with '' here to keep every mapped value a string.
if 'answer' in df.columns:
    _answers = df['answer'].fillna('')
else:
    _answers = [''] * len(df)
answer_map = dict(zip(df['intent'], _answers))

def answer_for_intent(intent: str) -> str:
    """Look up the entry stored in ``answer_map`` for ``intent``.

    Returns ``''`` when ``intent`` is not a key of ``answer_map``.
    """
    if intent in answer_map:
        return answer_map[intent]
    return ''

# Demo
sample = "what are your opening hours?"
inputs = tokenizer(sample, truncation=True, return_tensors='pt')
# The Trainer may have moved the model to an accelerator during training; the
# freshly tokenized tensors are on CPU, so move them to the model's device to
# avoid a device-mismatch error in the forward pass.
inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
# Make inference mode explicit (disables dropout) instead of relying on the
# state the Trainer left the model in.
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
# Single input, so the argmax over the class dimension yields one label id.
pred = logits.argmax(dim=1).item()
intent = id2label[pred]
print({'text': sample, 'intent': intent, 'answer': answer_for_intent(intent)})

