In [15]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [3]:
# Loading the pickled dataset
import pickle
with open('Data/Dataframes/newDF.pkl', 'rb') as f:
    df = pickle.load(f)

In [4]:
trainingDF = df.drop(columns=['ID', 'Label', 'TextLen'])

In [5]:
# Lets convert the text into numerical values using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


In [6]:

# Dataset
class ResumeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

We use 512 max length, but we have resumes longer than 512. We might want to up it to more.

In [7]:
texts = df['Resume'].tolist()
labels = trainingDF.drop(columns=['Resume']).values

X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.1)

train_dataset = ResumeDataset(X_train, y_train, tokenizer)
val_dataset = ResumeDataset(X_val, y_val, tokenizer)

In [8]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=labels.shape[1],
    problem_type="multi_label_classification"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import accelerate
print(accelerate.__version__)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

1.6.0


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [11]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.0305,0.029585
2,0.0117,0.021991
3,0.0164,0.018065


TrainOutput(global_step=9771, training_loss=0.03867846989406366, metrics={'train_runtime': 8241.5805, 'train_samples_per_second': 9.483, 'train_steps_per_second': 1.186, 'total_flos': 2.0565184708583424e+16, 'train_loss': 0.03867846989406366, 'epoch': 3.0})

In [1]:
# To upload the model, we have to save the tokenizer and the model
from transformers import BertTokenizer

# Load the same tokenizer you used during training
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Save it to the directory where your checkpoint is stored
tokenizer.save_pretrained("./results/checkpoint-9771")  # adjust if needed


  from .autonotebook import tqdm as notebook_tqdm


('./results/checkpoint-9771\\tokenizer_config.json',
 './results/checkpoint-9771\\special_tokens_map.json',
 './results/checkpoint-9771\\vocab.txt',
 './results/checkpoint-9771\\added_tokens.json')

In [None]:
# To train the model on an nvidia GPU, you need to download the CUDA toolkit and install the nvidia drivers.


import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")

True
NVIDIA GeForce RTX 4060


In [21]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Convert logits to predicted class indices
    preds = np.argmax(logits, axis=1)

    # If labels are one-hot, convert to class indices too
    if labels.ndim > 1 and labels.shape[1] > 1:
        labels = np.argmax(labels, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [22]:
trainer.compute_metrics = compute_metrics


In [23]:
results = trainer.evaluate()


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
print(results)

{'eval_loss': 0.018065307289361954, 'eval_accuracy': 0.8918825561312608, 'eval_precision': 0.922566883602777, 'eval_recall': 0.8918825561312608, 'eval_f1': 0.9027684731323408, 'eval_runtime': 73.8097, 'eval_samples_per_second': 39.222, 'eval_steps_per_second': 4.905, 'epoch': 3.0}


- eval_loss': 0.018065307289361954
- eval_accuracy': 0.8918825561312608
- eval_precision': 0.922566883602777
- eval_recall': 0.8918825561312608
- eval_f1': 0.9027684731323408
- eval_runtime': 73.8097
- eval_samples_per_second': 39.222
- eval_steps_per_second': 4.905
- epoch': 3.0

## ***To use the model:***

from transformers import BertForSequenceClassification, BertTokenizer

#### ***Load the model and tokenizer from Hugging Face Hub***

```
model = BertForSequenceClassification.from_pretrained("pelle112112/ResumeLabelBert")
tokenizer = BertTokenizer.from_pretrained("pelle112112/ResumeLabelBert")
```
