Intent classification with the CLINC150 dataset

In [None]:
import os
import math
import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

In [None]:
from transformers import BertModel, BertTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Fix the NumPy and TensorFlow random seeds so repeated runs draw the same numbers.
# NOTE(review): only these two libraries are seeded in this cell.
random_seed = 42
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

Data

In [None]:
import json
# Read the full dataset file once, then pull out its three splits.
with open('data_full.json') as json_file:
    CLINC150 = json.load(json_file)

CLINC150_train, CLINC150_test, CLINC150_val = (
    CLINC150[split_name] for split_name in ('train', 'test', 'val')
)

In [None]:
# Print the number of examples in each split (train / test / val) loaded from data_full.json
print("Number of training examples:", len(CLINC150_train))
print("Number of test examples:", len(CLINC150_test))
print("Number of validation examples:", len(CLINC150_val))


Number of training examples: 15000
Number of test examples: 4500
Number of validation examples: 3000


In [None]:
# The 20 intent names kept for this experiment; examples whose intent is not
# listed here are dropped when the splits are filtered below.
classes=['insurance',
 'next_holiday',
 'repeat',
 'credit_limit_change',
 'book_hotel',
 'yes',
 'damaged_card',
 'rewards_balance',
 'time',
 'pto_balance',
 'interest_rate',
 'change_volume',
 'taxes',
 'sync_device',
 'traffic',
 'what_song',
 'shopping_list',
 'todo_list_update',
 'order_checks',
 'shopping_list_update']

In [None]:
# Containers for the examples of each split whose intent is in `classes`.
train_data=[]
test_data=[]
val_data=[]

In [None]:
# Keep only the examples whose intent (second element of each record) is one
# of the selected classes. The lists are rebuilt from scratch rather than
# appended to, so re-running this cell cannot duplicate rows left over from a
# previous execution.
selected_intents = set(classes)
train_data = [example for example in CLINC150_train if example[1] in selected_intents]
test_data = [example for example in CLINC150_test if example[1] in selected_intents]
val_data = [example for example in CLINC150_val if example[1] in selected_intents]

# Persist the filtered training split (columns: text, intent) and read it back.
df = pd.DataFrame(train_data)
df.to_csv('train_data.csv', index=False,header=('text','intent'))
train=pd.read_csv('train_data.csv')
print(len(train))
train.head()

2000


Unnamed: 0,text,intent
0,"what time is it in punta gorda, florida",time
1,"what time is it in glenwood springs, co",time
2,"what time is it in fredericksburg, tx",time
3,"what time is it in las vegas, nv",time
4,"what time is it in houston, tx",time


In [None]:
# Intents present in the filtered training split, in order of first appearance.
train["intent"].unique()

array(['time', 'shopping_list_update', 'rewards_balance', 'repeat', 'yes',
       'insurance', 'todo_list_update', 'sync_device', 'damaged_card',
       'next_holiday', 'change_volume', 'what_song', 'book_hotel',
       'taxes', 'pto_balance', 'interest_rate', 'credit_limit_change',
       'shopping_list', 'traffic', 'order_checks'], dtype=object)

In [None]:
# Persist the filtered validation split to CSV (columns: text, intent) and
# read it back as the `valid` DataFrame.
df = pd.DataFrame(val_data)
df.to_csv('val_data.csv', index=False,header=('text','intent'))
valid=pd.read_csv('val_data.csv')
print(len(valid))
valid.head()

400


Unnamed: 0,text,intent
0,what time is it in france,time
1,what's the time in london right now,time
2,what hour is it in london,time
3,what's the time,time
4,what is the time in london,time


In [None]:
# Persist the filtered test split to CSV (columns: text, intent) and read it
# back as the `test` DataFrame.
df = pd.DataFrame(test_data)
df.to_csv('test_data.csv', index=False,header=('text','intent'))
test=pd.read_csv('test_data.csv')
print(len(test))
test.head()

600


Unnamed: 0,text,intent
0,i need you to tell me what time it is in new y...,time
1,"what time is it in adelaide, australia right now",time
2,is it after noon,time
3,is it six o clock yet,time
4,please give me the time in tanzania at this mo...,time


In [None]:

# Merge the validation examples into the training frame (a fresh validation
# hold-out is split off again further down). The training part is re-read from
# the CSV written above instead of reusing `train`, so executing this cell
# more than once does not append `valid` repeatedly.
train = pd.concat([pd.read_csv('train_data.csv'), valid], ignore_index=True)


In [None]:

# Define model name
bert_model_name = "bert-base-uncased"

# Load tokenizer and model directly from Hugging Face
# NOTE(review): `tokenizer` and `model` are both rebound in a later cell
# (to a BertForSequenceClassification model), so the bare BertModel loaded
# here is not the one that gets trained.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
model = BertModel.from_pretrained(bert_model_name)

Input Text Preparation

In [None]:
from transformers import BertTokenizer
import numpy as np
from tqdm import tqdm

class DataPreparation:
    """Tokenize text/intent DataFrames into fixed-length BERT input ids.

    NOTE(review): a later cell in this notebook defines another class with the
    same name that additionally returns attention masks; after a top-to-bottom
    run that later definition is the one in effect.

    Attributes set by ``__init__``:
        train_x, test_x: integer arrays of token ids, one row per example,
            padded/truncated to ``max_seq_len``.
        train_y, test_y: integer arrays holding each example's position in
            ``classes``.
    """

    text_column = "text"
    label_column = "intent"

    def __init__(self, train, test, classes, max_seq_len=192):
        """Load the tokenizer and encode both frames.

        Args:
            train: DataFrame with ``text`` and ``intent`` columns.
            test: DataFrame with ``text`` and ``intent`` columns.
            classes: ordered collection of intent names (list, tuple or
                NumPy array); an intent's position is its integer label.
            max_seq_len: length every token sequence is padded/truncated to.
        """
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.max_seq_len = max_seq_len
        self.classes = classes

        ((self.train_x, self.train_y), (self.test_x, self.test_y)) = map(self.prepare_data, [train, test])

    @staticmethod
    def _build_label_index(classes):
        """Map each class name to the position of its first occurrence."""
        index = {}
        for position, label in enumerate(classes):
            # setdefault keeps the first position, mirroring list.index().
            index.setdefault(label, position)
        return index

    def prepare_data(self, df):
        """Encode one DataFrame.

        Returns:
            Tuple ``(x, y)`` of NumPy arrays: token ids and integer labels.

        Raises:
            ValueError: if a row's intent is not one of ``self.classes``.
        """
        x, y = [], []
        # A dict lookup works for any sequence type (a NumPy array has no
        # .index method) and avoids a linear scan per row.
        label_to_index = self._build_label_index(self.classes)

        rows = zip(df[DataPreparation.text_column], df[DataPreparation.label_column])
        # total= lets tqdm display a real progress bar instead of a bare counter.
        for text, label in tqdm(rows, total=len(df)):
            if label not in label_to_index:
                raise ValueError(f"{label!r} is not one of the configured classes")
            # Tokenize, add special tokens, and pad to max sequence length
            encoding = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,  # Adds [CLS] and [SEP]
                max_length=self.max_seq_len,
                padding='max_length',  # Pads to max length
                truncation=True,  # Truncates to max length
                return_tensors="np"  # Returns numpy arrays
            )
            x.append(encoding['input_ids'][0])
            y.append(label_to_index[label])

        return np.array(x), np.array(y)


In [None]:
from transformers import BertTokenizer
import numpy as np
from tqdm import tqdm

class DataPreparation:
    """Tokenize text/intent DataFrames into fixed-length BERT inputs.

    Attributes set by ``__init__``:
        train_x, test_x: integer arrays of token ids, one row per example,
            padded/truncated to ``max_seq_len``.
        train_y, test_y: integer arrays holding each example's position in
            ``classes``.
        train_x_attention_mask, test_x_attention_mask: the tokenizer's
            attention masks, aligned with ``train_x`` / ``test_x``.
    """

    text_column = "text"
    label_column = "intent"

    def __init__(self, train, test, classes, max_seq_len=192):
        """Load the tokenizer and encode both frames.

        Args:
            train: DataFrame with ``text`` and ``intent`` columns.
            test: DataFrame with ``text`` and ``intent`` columns.
            classes: ordered collection of intent names (list, tuple or
                NumPy array); an intent's position is its integer label.
            max_seq_len: length every token sequence is padded/truncated to.
        """
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.max_seq_len = max_seq_len
        self.classes = classes

        # Prepare training and test data
        ((self.train_x, self.train_y, self.train_x_attention_mask),
         (self.test_x, self.test_y, self.test_x_attention_mask)) = map(self.prepare_data, [train, test])

    @staticmethod
    def _build_label_index(classes):
        """Map each class name to the position of its first occurrence."""
        index = {}
        for position, label in enumerate(classes):
            # setdefault keeps the first position, mirroring list.index().
            index.setdefault(label, position)
        return index

    def prepare_data(self, df):
        """Encode one DataFrame.

        Returns:
            Tuple ``(x, y, attention_masks)`` of NumPy arrays: token ids,
            integer labels and attention masks.

        Raises:
            ValueError: if a row's intent is not one of ``self.classes``.
        """
        x, y, attention_masks = [], [], []
        # A dict lookup works for any sequence type (a NumPy array has no
        # .index method) and avoids a linear scan per row.
        label_to_index = self._build_label_index(self.classes)

        rows = zip(df[DataPreparation.text_column], df[DataPreparation.label_column])
        # total= lets tqdm display a real progress bar instead of a bare counter.
        for text, label in tqdm(rows, total=len(df)):
            if label not in label_to_index:
                raise ValueError(f"{label!r} is not one of the configured classes")

            # Tokenize and create attention mask
            encoding = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=self.max_seq_len,
                padding='max_length',
                truncation=True,
                return_tensors="np"
            )
            x.append(encoding['input_ids'][0])
            attention_masks.append(encoding['attention_mask'][0])
            y.append(label_to_index[label])

        return np.array(x), np.array(y), np.array(attention_masks)


Model

In [None]:
from transformers import TFBertModel
import tensorflow as tf
from tensorflow import keras

def model_definition(max_seq_len, bert_model_name="bert-base-uncased", num_classes=None):
    """Build a Keras intent classifier on top of a pre-trained TF BERT encoder.

    Args:
        max_seq_len: length of the token-id and attention-mask inputs.
        bert_model_name: Hugging Face checkpoint name to load.
        num_classes: width of the output layer. When omitted, falls back to
            ``len(classes)`` using the notebook-level ``classes`` variable,
            which is what this function always did before the parameter
            existed.

    Returns:
        A ``keras.Model`` taking ``[input_ids, attention_mask]`` and returning
        per-class probabilities (the last layer applies softmax, so the
        outputs are not raw logits).
    """
    if num_classes is None:
        # Backward-compatible default: read the global defined elsewhere in the notebook.
        num_classes = len(classes)

    # Load pre-trained BERT model
    bert = TFBertModel.from_pretrained(bert_model_name)

    # Define input layer
    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    attention_mask = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="attention_mask")

    # Pass inputs through BERT model
    bert_output = bert(input_ids, attention_mask=attention_mask)

    # Extract the CLS token output (first position of the last hidden state)
    cls_out = bert_output.last_hidden_state[:, 0, :]
    cls_out = keras.layers.Dropout(0.5)(cls_out)

    # Classification head: dense -> dropout -> softmax
    hidden = keras.layers.Dense(units=768, activation="tanh")(cls_out)
    hidden = keras.layers.Dropout(0.5)(hidden)
    probabilities = keras.layers.Dense(units=num_classes, activation="softmax")(hidden)

    # Define the model with inputs and outputs
    model = keras.Model(inputs=[input_ids, attention_mask], outputs=probabilities)

    return model


In [None]:
# Get the list of unique classes (intents)
# NOTE: this rebinds `classes` (previously the hand-written intent list) to the
# intents found in `train`, in order of first appearance.
classes = train['intent'].unique().tolist()

# Initialize DataPreparation with updated structure
# (tokenizes `train` and `test`, padding every sequence to 128 tokens)
data = DataPreparation(train, test, classes, max_seq_len=128)


2400it [00:00, 7445.85it/s]
600it [00:00, 7575.00it/s]


PyTorch

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np


In [None]:


# Encode the intent names as integer ids. The encoded column goes into a new
# frame, leaving `train` with its original string labels, so this cell can be
# re-run without fitting the encoder on already-encoded integers.
label_encoder = LabelEncoder()
train_encoded = train.assign(intent=label_encoder.fit_transform(train['intent']))
classes = label_encoder.classes_

# Hold out 10% of the rows as a validation set
train_df, val_df = train_test_split(train_encoded, test_size=0.1, random_state=42)


In [None]:
# Show the encoder's class names; position i is the integer id of that intent.
classes

array(['book_hotel', 'change_volume', 'credit_limit_change',
       'damaged_card', 'insurance', 'interest_rate', 'next_holiday',
       'order_checks', 'pto_balance', 'repeat', 'rewards_balance',
       'shopping_list', 'shopping_list_update', 'sync_device', 'taxes',
       'time', 'todo_list_update', 'traffic', 'what_song', 'yes'],
      dtype=object)

In [None]:
# Preview the training partition; `intent` holds the integer ids from label_encoder.
train_df.head()

Unnamed: 0,text,intent
482,"yep, that's true",19
1368,how much money do i pay in taxes,14
2248,i need a suite that can accommodate 3 adults a...,0
857,how do i report my card if it got cut in half,3
1017,go now and increase the volume to 4 please,1


In [None]:
# (rows, columns) of the held-out validation partition.
val_df.shape

(240, 2)

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    Args:
        df: DataFrame with a ``text`` column and an integer ``intent`` column.
        tokenizer: callable tokenizer invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=...,
            return_tensors="pt")`` and returning ``input_ids`` and
            ``attention_mask`` tensors with a leading batch axis of size 1.
        max_length: length every sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_length=128):
        self.texts = df['text'].tolist()
        self.labels = df['intent'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the input text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Drop only the leading batch axis. A bare squeeze() would also remove
        # the sequence axis when max_length == 1, yielding 0-d tensors that the
        # default collate function stacks into the wrong shape.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [None]:
# Seed PyTorch before the model is created: the classification head is newly
# initialised (it is not part of the pre-trained checkpoint) and the training
# DataLoader shuffles with the global torch RNG, so without this the run is
# not repeatable even though NumPy/TensorFlow were seeded earlier.
torch.manual_seed(random_seed)

# Load the tokenizer and model from transformers
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(classes))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Create PyTorch datasets (texts are tokenized lazily, item by item)
train_dataset = TextDataset(train_df, tokenizer)
val_dataset = TextDataset(val_df, tokenizer)

# Create data loaders; only the training loader shuffles its examples
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


In [None]:
# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define optimizer. The AdamW class shipped with transformers is deprecated in
# favour of the PyTorch implementation; eps and weight_decay are spelled out to
# match the defaults of the transformers class (PyTorch's own defaults differ),
# so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)




In [None]:
from transformers import get_linear_schedule_with_warmup

# Set up training parameters
epochs = 3
# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps.
total_steps = len(train_loader) * epochs

# Set up the learning rate scheduler (no warmup steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

# Training function
def train_epoch(model, data_loader, optimizer, scheduler):
    """Run one training pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes a ``loss`` attribute.
        data_loader: iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        Mean training loss over the batches, as a Python float.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        raise ValueError("data_loader is empty; nothing to train on")

    model.train()
    # Send batches to wherever the model lives instead of relying on a
    # notebook-global `device` that may be stale or undefined.
    model_device = next(model.parameters()).device
    total_train_loss = 0

    for batch in data_loader:
        optimizer.zero_grad()

        # Move data to the model's device
        input_ids = batch['input_ids'].to(model_device)
        attention_mask = batch['attention_mask'].to(model_device)
        labels = batch['labels'].to(model_device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()

    return total_train_loss / num_batches

# Evaluation function
def eval_model(model, data_loader):
    """Evaluate ``model`` on ``data_loader`` without updating weights.

    Args:
        model: module called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``loss`` and ``logits``.
        data_loader: loader of batches with ``input_ids``, ``attention_mask``
            and ``labels``; must expose ``.dataset`` for the example count.

    Returns:
        ``(avg_eval_loss, accuracy)``: the mean batch loss as a Python float
        and the fraction of correct predictions as a 0-d float64 tensor.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        raise ValueError("data_loader is empty; nothing to evaluate")

    model.eval()
    # Send batches to wherever the model lives instead of relying on a
    # notebook-global `device` that may be stale or undefined.
    model_device = next(model.parameters()).device
    total_eval_loss = 0
    # Start from a tensor (not the int 0) so `.double()` below is always valid.
    correct_predictions = torch.zeros((), dtype=torch.long, device=model_device)

    with torch.no_grad():
        for batch in data_loader:
            # Move data to the model's device
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)
            labels = batch['labels'].to(model_device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_eval_loss += loss.item()

            # Compute accuracy: the predicted class is the highest-scoring logit
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)

    avg_eval_loss = total_eval_loss / num_batches
    accuracy = correct_predictions.double() / len(data_loader.dataset)

    return avg_eval_loss, accuracy


In [None]:
# Fine-tune for `epochs` passes, reporting the mean training loss and the
# validation loss/accuracy after every pass.
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # Train the model
    train_loss = train_epoch(model, train_loader, optimizer, scheduler)
    print(f"Train loss: {train_loss:.3f}")

    # Evaluate the model
    val_loss, val_acc = eval_model(model, val_loader)
    print(f"Validation loss: {val_loss:.3f}, Validation accuracy: {val_acc:.3f}")


Epoch 1/3
Train loss: 2.633
Validation loss: 2.097, Validation accuracy: 0.738
Epoch 2/3
Train loss: 1.812
Validation loss: 1.446, Validation accuracy: 0.950
Epoch 3/3
Train loss: 1.418
Validation loss: 1.253, Validation accuracy: 0.963


In [None]:
# Assuming 'test' is your test DataFrame with columns 'text' and 'intent'

# Encode the labels in the test dataset with the encoder fitted on the training
# labels. The guard makes the cell safe to re-run: once the column already
# holds integer ids, transforming it a second time would raise because those
# ids are not labels the encoder has seen.
if not pd.api.types.is_numeric_dtype(test['intent']):
    test['intent'] = label_encoder.transform(test['intent'])

# Create a PyTorch dataset for the test set
test_dataset = TextDataset(test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16)


In [None]:
def predict(model, data_loader):
    """Predict a class id for every example in ``data_loader``.

    Args:
        model: module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``logits`` with one row per example.
        data_loader: iterable of batches with ``input_ids`` and
            ``attention_mask`` tensors.

    Returns:
        1-D NumPy array with the index of the highest-scoring class per
        example, in loader order (an empty array if the loader is empty).
    """
    model.eval()
    # Send batches to wherever the model lives instead of relying on a
    # notebook-global `device` that may be stale or undefined.
    model_device = next(model.parameters()).device
    predictions = []

    with torch.no_grad():
        for batch in data_loader:
            # Move data to the model's device
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Get predictions
            _, preds = torch.max(logits, dim=1)
            predictions.extend(preds.cpu().numpy())

    return np.array(predictions)


In [None]:
# Predicted class ids for the test set, in the same row order as `test`
test_preds = predict(model, test_loader)

# Calculate accuracy using scikit-learn (optional)
# (`test['intent']` must already hold the encoded integer ids at this point)
from sklearn.metrics import accuracy_score
test_accuracy = accuracy_score(test['intent'], test_preds)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Test Accuracy: 94.67%


In [None]:
import random

def show_predictions(model, tokenizer, test_df, label_encoder, n=10, random_state=None):
    """Print actual vs. predicted intents for a random sample of ``test_df``.

    Args:
        model: module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``logits``.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors for a list of texts.
        test_df: DataFrame with a ``text`` column and an ``intent`` column of
            encoded integer ids.
        label_encoder: object whose ``inverse_transform`` maps integer ids
            back to intent names.
        n: maximum number of rows to show; capped at ``len(test_df)``.
        random_state: optional seed forwarded to ``DataFrame.sample``.

    Returns:
        None. The results are printed.
    """
    # Sample from the frame that was passed in (not a notebook global), and
    # never ask for more rows than it contains.
    sample_size = min(n, len(test_df))
    if sample_size <= 0:
        return
    samples = test_df.sample(n=sample_size, random_state=random_state).reset_index(drop=True)

    # Prepare the texts for prediction
    texts = samples['text'].tolist()

    # Encode the texts using the tokenizer
    encodings = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

    # Move tensors to the model's device rather than a notebook-global `device`
    model_device = next(model.parameters()).device
    input_ids = encodings['input_ids'].to(model_device)
    attention_mask = encodings['attention_mask'].to(model_device)

    # Perform inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

    # Convert predictions and actual labels to readable format
    preds = label_encoder.inverse_transform(preds.cpu().numpy())
    actuals = label_encoder.inverse_transform(samples['intent'].values)

    # Display the results
    for text, actual, predicted in zip(texts, actuals, preds):
        print(f"Text: {text}")
        print(f"Actual Intent: {actual}")
        print(f"Predicted Intent: {predicted}")
        print("-" * 50)

# Call the function on the test frame (its `intent` column must hold the
# encoded integer ids, since they are passed to label_encoder.inverse_transform)
show_predictions(model, tokenizer, test, label_encoder)


Text: i didn't understand you
Actual Intent: repeat
Predicted Intent: yes
--------------------------------------------------
Text: what should i squirrel away to pay in taxes
Actual Intent: taxes
Predicted Intent: taxes
--------------------------------------------------
Text: on the way to work is there traffic
Actual Intent: traffic
Predicted Intent: traffic
--------------------------------------------------
Text: i need to understand how many points have i earned with my credit card
Actual Intent: rewards_balance
Predicted Intent: rewards_balance
--------------------------------------------------
Text: i don't need grocery shopping on my todo list anymore
Actual Intent: todo_list_update
Predicted Intent: todo_list_update
--------------------------------------------------
Text: when will my next vacation day be
Actual Intent: next_holiday
Predicted Intent: next_holiday
--------------------------------------------------
Text: what time is it in the greenwich timezone
Actual Intent: tim

In [None]:
# Record the id <-> intent-name mapping in the model config before saving, so
# the checkpoint in "model_save" can translate predicted class ids back to
# intent names without needing the in-memory label encoder.
model.config.id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
model.config.label2id = {name: idx for idx, name in model.config.id2label.items()}

model.save_pretrained("model_save")
tokenizer.save_pretrained("model_save")


('model_save/tokenizer_config.json',
 'model_save/special_tokens_map.json',
 'model_save/vocab.txt',
 'model_save/added_tokens.json')