In [1]:
!pip install transformers datasets evaluate

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [None]:
# Data processing
import pandas as pd
import numpy as np

# Modeling
import torch

# Hugging Face Dataset
from datasets import Dataset

# Model performance evaluation
import evaluate

In [None]:
# Authenticate against the Hugging Face Hub. This renders an interactive login
# widget in the notebook; the training run below is configured with
# push_to_hub=True, so a valid login is presumably required before training.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer belonging to the "distilbert-base-uncased" checkpoint, the same
# checkpoint name the classification model is loaded from later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Load only the two columns this notebook uses. The CSV also carries an unnamed
# leftover index column (shown as "Unnamed: 0" when everything is loaded), which
# would otherwise ride along into the Hugging Face dataset for no benefit.
data = pd.read_csv('dataset.csv', usecols=['utterance', 'intent'])
data.head(5)

Unnamed: 0,utterance,intent
0,"have a question, can you call Customer Service?",contact_customer_service
1,havent got a user account and i wannaregister,create_account
2,need ehlp modifying my profile,edit_account
3,need help informing of issues paying,payment_issue
4,"want to request some bills, tell me how to do it",get_invoice


In [None]:
from sklearn import preprocessing
# Replace each intent string with an integer class id; the fitted encoder is kept
# in `label_encoder`.
# NOTE(review): this overwrites the 'intent' column in place, so the original intent
# strings are no longer available in `data` once this cell has run. Re-running the
# cell would fit the encoder on the already-encoded integers.
label_encoder = preprocessing.LabelEncoder()
data['intent'] = label_encoder.fit_transform(data['intent'])
data.head()

Unnamed: 0,utterance,intent
0,"have a question, can you call Customer Service?",2
1,havent got a user account and i wannaregister,3
2,need ehlp modifying my profile,5
3,need help informing of issues paying,8
4,"want to request some bills, tell me how to do it",6


In [None]:
# Rename the columns to "text" and "label": the tokenization function in this
# notebook reads examples["text"], and "label" holds the integer class id.
data = data.rename(columns={"utterance": "text", "intent": "label"})

In [None]:
# Convert the pandas DataFrame into a Hugging Face `Dataset`.
dataset = Dataset.from_pandas(data)

In [None]:
# Sanity check: display the first example to confirm the fields of the converted dataset.
dataset[0]

{'text': ' have a question, can you call Customer Service?', 'label': 2}

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook's tokenizer.

    Args:
        examples: Mapping-like object exposing a ``"text"`` entry. It is passed
            straight to the tokenizer, so it may be a single string or a batch.

    Returns:
        The tokenizer's output for those texts, with truncation enabled.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize the whole dataset. batched=True hands preprocess_function batches of
# examples rather than one example at a time; the result is kept under a new name
# so the untokenized `dataset` stays available.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/13155 [00:00<?, ? examples/s]

In [None]:
from torch.utils.data import TensorDataset, random_split

# Fixed seed for the split: without it, every run draws a different random
# train/validation partition and the reported accuracy is not reproducible.
SPLIT_SEED = 42

# Create a 80-20 train-validation split.
train_size = int(0.8 * len(tokenized_dataset))
val_size = len(tokenized_dataset) - train_size

# Divide the dataset by randomly selecting samples, using a dedicated seeded
# generator so the partition does not depend on the global RNG state.
tokenized_train, tokenized_valid = random_split(
    tokenized_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

10,524 training samples
2,631 validation samples


In [None]:
from transformers import DataCollatorWithPadding
# Collator built around the tokenizer; it is handed to the Trainer as `data_collator`.
# The examples were tokenized with truncation only (no padding), so padding is
# presumably applied here, per batch, when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# NOTE(review): `evaluate` is already imported in the notebook's import cell, so this
# import is redundant (harmless, but it could be dropped).
import evaluate
# Load the "accuracy" metric; compute_metrics calls accuracy.compute(...) with it.
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score evaluation output with the notebook's ``accuracy`` metric.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``. The
            predicted class of each row is taken as the arg-max along axis 1
            of ``predictions``.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        compared against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Intent names listed in class-id order: position in the list == integer class id.
id2label = dict(enumerate([
    "change_shipping_address",
    "complaint",
    "contact_customer_service",
    "create_account",
    "delete_account",
    "edit_account",
    "get_invoice",
    "get_refund",
    "payment_issue",
    "registration_problems",
]))
# Reverse lookup (intent name -> class id), derived from id2label so the two
# mappings are always exact inverses of each other.
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load DistilBERT with a freshly initialised sequence-classification head.
# The head size is taken from the label map instead of a hard-coded 10, so the
# number of output classes cannot drift out of sync with id2label / label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.we

In [None]:
# Optional: if the Trainer reports missing PyTorch-related dependencies, run: pip install transformers[torch]

In [None]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="bert_classification_model",  # local directory for checkpoints and run artifacts
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # checkpoint once per epoch, matching the evaluation schedule
    # NOTE(review): no metric_for_best_model is set, so the library default decides
    # which checkpoint counts as "best" — confirm that is the intended criterion.
    load_best_model_at_end=True,
    push_to_hub=True,  # upload to the Hugging Face Hub; needs a prior Hub login
)

In [None]:
# Wire the model, training configuration, data splits, tokenizer, collator and
# metric function into a single Trainer object.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,  # 80% split produced by random_split
    eval_dataset=tokenized_valid,  # remaining 20% split
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # reports accuracy at each evaluation
)

/content/bert_classification_model is already a clone of https://huggingface.co/maurosm/bert_classification_model. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
# Run fine-tuning with the settings in training_args; the returned TrainOutput
# (step count, training loss, runtime metrics) is displayed as the cell result.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.217,0.008109,0.99886
2,0.0059,0.005343,0.99924


TrainOutput(global_step=1316, training_loss=0.08526211678075936, metrics={'train_runtime': 150.4118, 'train_samples_per_second': 139.936, 'train_steps_per_second': 8.749, 'total_flos': 135770508711600.0, 'train_loss': 0.08526211678075936, 'epoch': 2.0})

In [None]:
# Save only the model weights (state_dict) to a local file named 'bert_model'.
# NOTE(review): the path is relative and has no extension, and neither the model
# config nor the tokenizer is stored in this file — loading it back requires
# re-creating the same model architecture first.
torch.save(model.state_dict(), 'bert_model')

In [None]:
# Upload the training results to the Hugging Face Hub repository associated with
# this Trainer; the cell result is the URL of the resulting commit.
trainer.push_to_hub()

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 1.00/255M [00:00<?, ?B/s]

Upload file runs/Oct17_23-44-41_dcb8976a7ce7/events.out.tfevents.1697586309.dcb8976a7ce7.3387.0:   0%|        …

To https://huggingface.co/maurosm/bert_classification_model
   30baebd..f9af3c7  main -> main

   30baebd..f9af3c7  main -> main

To https://huggingface.co/maurosm/bert_classification_model
   f9af3c7..bcb8ba4  main -> main

   f9af3c7..bcb8ba4  main -> main



'https://huggingface.co/maurosm/bert_classification_model/commit/f9af3c7e5454232f82d920d7194d3ef6bf23bc65'

In [4]:
# Example customer utterance used to smoke-test the published model.
text = "i want to close my existing account"

In [5]:
import transformers
from transformers import pipeline

# Load the published model from the Hub and classify the sample utterance.
# The model predicts customer-support intents, not sentiment, so the generic
# "text-classification" task name is used ("sentiment-analysis" is only an alias
# for it and is misleading here).
classifier = pipeline("text-classification", model="maurosm/bert_classification_model")
classifier(text)

[{'label': 'delete_account', 'score': 0.9982580542564392}]

In [None]:
# Bundle the saved weights file into a zip archive for download.
# NOTE(review): the absolute /content/... paths assume the notebook's working
# directory is /content (as on Colab), because the weights were saved with the
# relative path 'bert_model'.
!zip -r /content/file.zip /content/bert_model

In [None]:
# Colab-only helper: ask the browser to download the zip archive created above.
# This import is only available inside Google Colab.
from google.colab import files
files.download("/content/file.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>