In [1]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer

text = []
unis = {}
unis_ordered = []
label = []

with open('/content/drive/My Drive/dataset.txt') as f:
    for line in f:
        data = line.strip().split('\t\t')
        text.append(data[0])
        if data[1] not in unis:
            unis[data[1]] = len(unis)
            unis_ordered.append(data[1])
        label.append(unis[data[1]])

df = pd.DataFrame({'text': text, 'label': label})

ds = Dataset.from_pandas(df)
ds = ds.train_test_split(test_size=0.2, shuffle=True)

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True)

tokenized_ds = ds.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(unis)).to("cuda")

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=280, training_loss=3.6395440237862724, metrics={'train_runtime': 212.9754, 'train_samples_per_second': 20.941, 'train_steps_per_second': 1.315, 'total_flos': 591594810101760.0, 'train_loss': 3.6395440237862724, 'epoch': 5.0})

In [3]:
trainer.save_model('bert_classifier')
!zip -r /content/bert_classifier.zip /content/bert_classifier
!cp /content/bert_classifier.zip /content/drive/My\ Drive

Saving model checkpoint to bert_classifier
Configuration saved in bert_classifier/config.json
Model weights saved in bert_classifier/pytorch_model.bin
tokenizer config file saved in bert_classifier/tokenizer_config.json
Special tokens file saved in bert_classifier/special_tokens_map.json


updating: content/bert_classifier/ (stored 0%)
updating: content/bert_classifier/vocab.txt (deflated 53%)
updating: content/bert_classifier/tokenizer.json (deflated 71%)
updating: content/bert_classifier/training_args.bin (deflated 48%)
updating: content/bert_classifier/pytorch_model.bin (deflated 8%)
updating: content/bert_classifier/special_tokens_map.json (deflated 40%)
updating: content/bert_classifier/tokenizer_config.json (deflated 40%)
updating: content/bert_classifier/config.json (deflated 72%)


In [4]:
from transformers import pipeline

def tokenizer_truncate(input, **kwargs):
    return tokenizer(input, truncation=True, padding=True, **kwargs)

pipe = pipeline(task="text-classification", model=model.to('cpu'), tokenizer=tokenizer_truncate)
predictions = pipe(ds['test']['text'])

Disabling tokenizer parallelism, we're using DataLoader multithreading already


In [5]:
y_pred = []
y_true = []

for i, el in enumerate(predictions):
    y_pred.append(model.config.label2id[el['label']])
    y_true.append(ds['test']['label'][i])

from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

print(f1_score(y_true, y_pred, average='macro'))
print(accuracy_score(y_true, y_pred))
confusion_matrix(y_true, y_pred)

0.05085619192684802
0.2556053811659193


array([[ 7,  2,  0, ...,  0,  0,  0],
       [ 0, 19,  0, ...,  0,  0,  0],
       [ 1,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  3,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])