#Installing Libraries

In [1]:
!pip install torchmetrics datasets transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


##Data Prepration

In [2]:
import pickle
import os
from google.cloud import storage
from google.cloud.storage.blob import Blob
from google.colab import auth
import pandas as pd
import numpy as np
import json
import glob
import sklearn
from datetime import datetime
from pathlib import Path
from sklearn.model_selection import train_test_split
from datasets import Dataset, Value, ClassLabel, Features
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import torch
from datasets import load_metric


pd.options.mode.chained_assignment = None 


def save_model(save_model_path):

    if not os.path.exists(save_model_path):
        os.mkdir(save_model_path)

    for f in os.listdir(save_model_path):
        os.remove(os.path.join(save_model_path, f))

    model.save_pretrained(save_model_path)


Updated property [core/project].


# Hyperparameters

In [18]:
SAVE_MODEL_PATH = "./best_model"

TEST_SIZE = 0.20
CHECK_POINT = "distilbert-base-uncased"
BATCH_SIZE = 16
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5

#Load `dataset_with_clusters.csv` gained from previous step

#Reading CSV and Prepration

In [37]:
df = pd.read_csv("dataset_with_clusters.csv")
df = df[['abstract','cluster_label']]
df.columns = ["input", "label"]
df = df.replace(np.nan, "")
df["input"] = df["input"].apply(lambda x: x.strip())
df = df.sample(frac=1).reset_index(drop=True)

df

Unnamed: 0,input,label
0,Near-infrared (IR) diffuse Galactic light (DGL...,7
1,Electromagnetic interactions of the spin 3/2 p...,8
2,The ongoing neural revolution in machine trans...,12
3,We describe the effective Lorentz forces on th...,4
4,Deep learning-based techniques have achieved s...,12
...,...,...
621741,In the context of SU(3)_c x SU(2)_L x [U(1)]^n...,18
621742,The Large Hadron Electron Collider and the Fut...,0
621743,One-way quantum computing is an important and ...,13
621744,The high luminosity that will be accumulated a...,2


#

#Creating Config File

In [38]:
classes_names = list(df["label"].unique())
classes_dict = dict(enumerate(classes_names))
classes_names = list(classes_names)
classes_names = [int(item) for item in classes_names]

features = Features({"input": Value("string"), "label": ClassLabel(num_classes=len(classes_names), names=classes_names)})

features

{'input': Value(dtype='string', id=None),
 'label': ClassLabel(names=[7, 8, 12, 4, 0, 1, 5, 14, 10, 3, 17, 9, 2, 19, 13, 16, 11, 6, 15, 18], id=None)}

#Dividing Dataset

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df["input"], df["label"], test_size = TEST_SIZE)

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

data_files = {"train": "train.csv", "test":"test.csv"}
datasets = load_dataset("csv", data_files=data_files, features=features)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-1e589f2032c49db2/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1e589f2032c49db2/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
datasets

DatasetDict({
    train: Dataset({
        features: ['input', 'label'],
        num_rows: 64000
    })
    test: Dataset({
        features: ['input', 'label'],
        num_rows: 16000
    })
})

##Training the Model

In [24]:
tokenizer = AutoTokenizer.from_pretrained(CHECK_POINT) 

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize_function(example):
    return tokenizer(example["input"], truncation=True)

tokenized_datasets = datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["input"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("pytorch")

Map:   0%|          | 0/64000 [00:00<?, ? examples/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

In [25]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 64000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
})

In [26]:
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
)

test_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=BATCH_SIZE, collate_fn=data_collator
)

In [27]:
for step, batch in enumerate(test_dataloader):
    if step >= 1 :
        break
    print(batch["input_ids"])
    print(batch["attention_mask"])
    print(batch["labels"])

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[  101,  2057,  2556,  ...,     0,     0,     0],
        [  101,  2057,  2817,  ...,     0,     0,     0],
        [  101,  2057,  2817,  ...,     0,     0,     0],
        ...,
        [  101, 17630, 11498,  ...,     0,     0,     0],
        [  101,  2057,  2556,  ...,     0,     0,     0],
        [  101,  2057,  6848,  ...,     0,     0,     0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([ 0, 12,  8, 13,  2, 16,  2, 11, 12,  4, 10, 15,  7,  4,  4, 18])


Creating Model

In [28]:
from transformers import AutoConfig, AutoModelForSequenceClassification

id2label = {int(k):int(v) for k,v in classes_dict.items()}
label2id = {int(v):int(k) for k,v in classes_dict.items()}

config = AutoConfig.from_pretrained(CHECK_POINT, label2id=label2id, id2label=id2label)

model = AutoModelForSequenceClassification.from_pretrained(CHECK_POINT, config=config)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

Creating Optimizer

In [29]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [30]:
from transformers import get_scheduler

num_training_steps = NUM_EPOCHS * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

4000


In [40]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(NUM_EPOCHS*len(train_dataloader)))

f1_micro = load_metric("f1")
f1_macro = load_metric("f1")
accuracy = load_metric("accuracy")

results = {}
best_model_score = -1

for epoch in range(NUM_EPOCHS):

    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


    model.eval()
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            test_outputs = model(**batch)

        logits = test_outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        f1_micro.add_batch(predictions=predictions, references=batch["labels"])
        f1_macro.add_batch(predictions=predictions, references=batch["labels"])
        accuracy.add_batch(predictions=predictions, references=batch["labels"])

    result = {}
    result["train_loss"] = loss.item()
    result["test_loss"] = test_outputs.loss.item()
    result["test_f1_micro"] = f1_micro.compute(average="micro")["f1"]
    result["test_f1_macro"] = f1_macro.compute(average="macro")["f1"]
    result["test_accuracy"] = accuracy.compute()["accuracy"]

    results["epoch_" + str(epoch+1)] = result

    print(result)

    if result["test_f1_micro"] > best_model_score:
        print("Saving best model ...")
        save_model(SAVE_MODEL_PATH)
        best_model_score = result["test_f1_micro"]



In [32]:
# tokenizer.save_pretrained(SAVE_MODEL_PATH)

('./best_model/tokenizer_config.json',
 './best_model/special_tokens_map.json',
 './best_model/vocab.txt',
 './best_model/added_tokens.json',
 './best_model/tokenizer.json')

##Inference Document Class

In [33]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(SAVE_MODEL_PATH) 
tokenizer = AutoTokenizer.from_pretrained(SAVE_MODEL_PATH) 

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

#Text Inference Function

In [41]:
text = "Give an abstract paper here!"

def infer_document_class(text, device=device):
    input = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    input = {k: v.to(device) for k, v in input.items()}
    output = model(**input)
    preds_indexes = torch.argmax(output.logits, dim=-1)
    preds_scores = torch.nn.functional.softmax(output.logits, dim=-1)
    outputs = []
    for idx, pred in enumerate(preds_indexes):
        label = model.config.id2label[pred.item()]
        score = preds_scores[idx, pred.item()]
        outputs.append([label, score.item()])
    return outputs

Predict Text

In [42]:
infer_document_class(text)

[[3, 0.0668896809220314]]