In [4]:
pip install datasets evaluate



In [5]:
# Data processing
import pandas as pd
import numpy as np

# Modeling
import torch

# Hugging Face Dataset
from datasets import Dataset

# Model performance evaluation
import evaluate

In [3]:
# Log in to the Hugging Face Hub from inside the notebook (renders a login widget).
# Presumably needed for the later push_to_hub calls — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the "distilbert-base-uncased" checkpoint.
checkpoint_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [7]:
# Read the utterance/intent table from disk and preview its first five rows.
data = pd.read_csv("dataset.csv")
data.head(n=5)

Unnamed: 0,utterance,intent
0,"have a question, can you call Customer Service?",contact_customer_service
1,need help informing of issues paying,payment_issue
2,need help modifying my profile,edit_account
3,"want to request some bills, tell me how to do it",get_invoice
4,"a error message pops when I try to pay, can yo...",payment_issue


In [8]:
# Number of rows per intent, most frequent first.
data["intent"].value_counts()

payment_issue               3873
contact_customer_service    1856
create_account              1776
get_invoice                 1346
get_refund                  1113
complaint                    719
delete_account               628
registration_problems        143
edit_account                 138
change_shipping_address      101
Name: intent, dtype: int64

In [9]:
from sklearn import preprocessing

# Swap the string intent names for integer codes; the fitted encoder is kept
# in `label_encoder` so the mapping can be inspected later.
label_encoder = preprocessing.LabelEncoder()
encoded_intents = label_encoder.fit_transform(data["intent"])
data["intent"] = encoded_intents
data.head()

Unnamed: 0,utterance,intent
0,"have a question, can you call Customer Service?",2
1,need help informing of issues paying,8
2,need help modifying my profile,5
3,"want to request some bills, tell me how to do it",6
4,"a error message pops when I try to pay, can yo...",8


In [10]:
# From here on the columns are called "text" and "label".
column_renames = {"utterance": "text", "intent": "label"}
data = data.rename(columns=column_renames)

In [11]:
# Wrap the pandas frame in a Hugging Face Dataset.
dataset = Dataset.from_pandas(data)

In [12]:
# Inspect the first record (a dict with "text" and "label").
dataset[0]

{'text': ' have a question, can you call Customer Service?', 'label': 2}

In [13]:
def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with the notebook-level tokenizer.

    Truncation is enabled; no padding is requested here.
    Returns whatever the tokenizer call returns.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [14]:
# Run preprocess_function over the whole dataset, in batches (batched=True).
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/11693 [00:00<?, ? examples/s]

In [16]:
# Inspect one tokenized record: the original fields plus input_ids and attention_mask.
tokenized_dataset[1]

{'text': ' need help informing of issues paying',
 'label': 8,
 'input_ids': [101, 2342, 2393, 21672, 1997, 3314, 7079, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [17]:
from torch.utils.data import TensorDataset, random_split

# Fixed seed for the split, so that "Restart & Run All" produces the same
# train/validation partition (and therefore comparable metrics) every time.
SPLIT_SEED = 42

# Create a 80-20 train-validation split.
train_size = int(0.8 * len(tokenized_dataset))
val_size = len(tokenized_dataset) - train_size

# Divide the dataset by randomly selecting samples. A dedicated, seeded
# generator is used instead of torch's global RNG so the result does not
# depend on what ran earlier in the session.
tokenized_train, tokenized_valid = random_split(
    tokenized_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

9,354 training samples
2,339 validation samples


In [20]:
# Inspect one record of the training subset.
tokenized_train[5]

{'text': 'could u ask an agent ot remove my profile?',
 'label': 4,
 'input_ids': [101,
  2071,
  1057,
  3198,
  2019,
  4005,
  27178,
  6366,
  2026,
  6337,
  1029,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [21]:
from transformers import DataCollatorWithPadding
# Collator built around the same tokenizer; handed to the Trainer below as `data_collator`.
# Per its name it pads the examples of a batch — see the transformers docs for details.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [22]:
import evaluate
# Load the "accuracy" metric; compute_metrics calls accuracy.compute(...) on it.
accuracy = evaluate.load("accuracy")

In [23]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    `eval_pred` is unpacked into a (predictions, labels) pair. The predicted
    class of each row is the arg-max of `predictions` along axis 1, and the
    result of `accuracy.compute` on those class ids is returned unchanged.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [24]:
# Intent names listed in id order: position i in this tuple is class id i.
# NOTE(review): presumably this must match the integer codes produced by the
# label encoder earlier in the notebook — confirm if the dataset changes.
INTENT_NAMES = (
    "change_shipping_address",
    "complaint",
    "contact_customer_service",
    "create_account",
    "delete_account",
    "edit_account",
    "get_invoice",
    "get_refund",
    "payment_issue",
    "registration_problems",
)

# Forward (id -> name) and reverse (name -> id) lookups, both built from the
# single list above so they cannot drift apart.
id2label = dict(enumerate(INTENT_NAMES))
label2id = {name: idx for idx, name in id2label.items()}

In [30]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint with a sequence-classification head.
# The number of classes is derived from id2label instead of being hard-coded,
# so the head size cannot fall out of sync with the label mapping.
# NOTE(review): dropout=0.2 is forwarded as an extra keyword — presumably it
# overrides the `dropout` value in the model config; confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    dropout=0.2,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.we

In [3]:
pip install transformers==4.30



In [31]:
# Settings for the fine-tuning run, grouped by purpose.
training_args = TrainingArguments(
    # Output location.
    output_dir="bert_classification_model",
    # Optimisation.
    num_train_epochs=2,
    learning_rate=3e-5,
    weight_decay=0.01,
    # Batch sizes (per device).
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save on the same "epoch" schedule, and reload the best
    # checkpoint once training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Publish to the Hugging Face Hub.
    push_to_hub=True,
)

In [32]:
# Bring together the model, run settings, data splits and helpers.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    # Data: the two subsets produced by the 80-20 split.
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    # Batch collation and metric reporting.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

/content/bert_classification_model is already a clone of https://huggingface.co/maurosm/bert_classification_model. Make sure you pull the latest changes with `repo.git_pull()`.


In [33]:
# Run fine-tuning with the settings in training_args (2 epochs, evaluated once per epoch).
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2428,0.010242,0.998717
2,0.0049,0.007454,0.998717


TrainOutput(global_step=1170, training_loss=0.10625843874409667, metrics={'train_runtime': 109.8381, 'train_samples_per_second': 170.323, 'train_steps_per_second': 10.652, 'total_flos': 120789206590800.0, 'train_loss': 0.10625843874409667, 'epoch': 2.0})

In [34]:
# Write the model's state_dict to a local file named 'bert_model' (no extension).
torch.save(model.state_dict(), 'bert_model')

In [35]:
# Upload the trainer's model to the Hugging Face Hub.
trainer.push_to_hub()

To https://huggingface.co/maurosm/bert_classification_model
   8a6e6c2..419e66f  main -> main

   8a6e6c2..419e66f  main -> main

To https://huggingface.co/maurosm/bert_classification_model
   419e66f..3b439ae  main -> main

   419e66f..3b439ae  main -> main



'https://huggingface.co/maurosm/bert_classification_model/commit/419e66f7cb18b9fade7081dc96c5168708564f7b'

In [36]:
# Sample utterance used to smoke-test the published model.
text = "i want to close my existing account"

In [37]:
import transformers
from transformers import pipeline

# Load the published model from the Hub and classify the sample utterance.
# The task is spelled "text-classification": the model predicts one of ten
# intents, not a sentiment. "sentiment-analysis" is only an alias for this same
# pipeline, so the behaviour is unchanged but the intent is clearer.
classifier = pipeline("text-classification", model="maurosm/bert_classification_model")
classifier(text)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


[{'label': 'delete_account', 'score': 0.9970633387565613}]

In [38]:
# Shell command: pack the saved /content/bert_model file into /content/file.zip.
!zip -r /content/file.zip /content/bert_model

  adding: content/bert_model (deflated 8%)


In [39]:
# Colab-only: hand the zip archive to the browser as a download.
from google.colab import files
files.download("/content/file.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>