<a href="https://colab.research.google.com/github/nithya-nara/nithya-nara/blob/main/BERT_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
pip install transformers pandas torch



In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm import tqdm
import torch

In [17]:
# Load the dataset; the code below reads its 'Resume' (text) and
# 'Category' (label) columns.
data = pd.read_csv("/content/UpdatedResumeDataSet.csv")

# Build the label -> integer id mapping from the FULL dataset rather than from
# the training split only: a category that happens to land exclusively in the
# test split would otherwise raise KeyError when the test labels are encoded.
# Sorting makes the ids deterministic regardless of row order.
label_dict = {label: i for i, label in enumerate(sorted(data['Category'].unique()))}

# Split the dataset into train and test sets (80/20, fixed seed)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['Resume'], data['Category'], test_size=0.2, random_state=42)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize the input texts; sequences are truncated to 512 tokens and padded
# to the longest sequence within each split.
train_encodings = tokenizer(train_texts.tolist(), padding=True, truncation=True, max_length=512)
test_encodings = tokenizer(test_texts.tolist(), padding=True, truncation=True, max_length=512)

# Convert labels to numerical values. An explicit dict lookup is kept on
# purpose so that an unexpected label fails loudly instead of becoming NaN.
train_labels = train_labels.apply(lambda x: label_dict[x])
test_labels = test_labels.apply(lambda x: label_dict[x])

# Convert the tokenized inputs and labels to PyTorch tensors.
# Each dataset item is (input_ids, attention_mask, label).
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              torch.tensor(train_labels.tolist()))

test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']),
                             torch.tensor(test_encodings['attention_mask']),
                             torch.tensor(test_labels.tolist()))

In [18]:
# Define the BERT model for sequence classification, with one output logit
# per entry in label_dict.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_dict))

# Define the optimizer (no learning rate scheduler is used in this notebook).
# The AdamW class exported by `transformers` is deprecated in favour of
# PyTorch's implementation. eps and weight_decay are set explicitly to the
# defaults of the deprecated class so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Batch the datasets: shuffle the training data each epoch, keep the test
# data in a fixed order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Number of full passes over the training set
epochs = 3

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Choose the compute device: GPU when CUDA is available, CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tune the model for `epochs` passes over the training loader
for epoch in range(epochs):
    model.train()
    running_loss = 0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for input_ids, attention_mask, labels in progress:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        # Passing `labels` makes the model compute the loss itself
        result = model(input_ids=input_ids.to(device),
                       attention_mask=attention_mask.to(device),
                       labels=labels.to(device))
        batch_loss = result.loss
        running_loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()
    # Mean of the per-batch losses for this epoch
    avg_train_loss = running_loss / len(train_loader)
    print(f"Average training loss for Epoch {epoch+1}: {avg_train_loss}")

# Persist the fine-tuned weights and config to a local directory
model.save_pretrained("bert_resume_classification_model")

Epoch 1: 100%|██████████| 193/193 [1:14:30<00:00, 23.17s/it]


Average training loss for Epoch 1: 3.006173287648611


Epoch 2: 100%|██████████| 193/193 [1:13:50<00:00, 22.96s/it]


Average training loss for Epoch 2: 1.2155170309420076


Epoch 3:  14%|█▍        | 27/193 [10:22<1:03:48, 23.06s/it]


KeyboardInterrupt: 

In [20]:
# Evaluate on the held-out split: accumulate the per-batch loss with the model
# in eval mode and gradient tracking disabled.
model.eval()
total_eval_loss = 0
with torch.no_grad():
    for input_ids, attention_mask, labels in tqdm(test_loader, desc="Evaluating"):
        result = model(input_ids=input_ids.to(device),
                       attention_mask=attention_mask.to(device),
                       labels=labels.to(device))
        total_eval_loss += result.loss.item()

# Mean of the per-batch losses over the test loader
avg_eval_loss = total_eval_loss / len(test_loader)
print(f"Average evaluation loss: {avg_eval_loss}")

Evaluating: 100%|██████████| 49/49 [06:12<00:00,  7.60s/it]

Average evaluation loss: 0.2327915778269573



