In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import re

In [12]:
# Load and preprocess data (assuming you've already cleaned the text)
df = pd.read_csv('/kaggle/input/resume-data/Resume.csv')

In [13]:
# Preprocess text
def preprocess_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

df['cleaned_resume'] = df['Resume_str'].apply(preprocess_text)

In [14]:
# Encode labels
le = LabelEncoder()
df['encoded_category'] = le.fit_transform(df['Category'])


In [15]:
# Spliting the data
train_texts, val_texts, train_labels, val_labels = train_test_split(df['cleaned_resume'], df['encoded_category'], test_size=0.2, random_state=42)

In [None]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [17]:
# Tokenize and encode sequences
max_len = 128

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=max_len)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=max_len)

In [18]:
# Convert to PyTorch tensors
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels.values)
)

val_dataset = TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_labels.values)
)

In [19]:
# Create dataloaders
batch_size = 32

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

val_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)


In [None]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=24,
    output_attentions=False,
    output_hidden_states=False
)

In [24]:
# Set up optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 50
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [25]:
# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Average training loss: {avg_train_loss:.4f}')

    # Validation
    model.eval()
    val_accuracy = 0
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        val_accuracy += (predictions == inputs['labels']).float().mean().item()

    val_accuracy /= len(val_dataloader)
    print(f'Validation Accuracy: {val_accuracy:.4f}')

print("Training complete!")

Epoch 1/50: 100%|██████████| 63/63 [00:21<00:00,  2.87it/s]


Average training loss: 1.3109
Validation Accuracy: 0.7917


Epoch 2/50: 100%|██████████| 63/63 [00:22<00:00,  2.86it/s]


Average training loss: 0.9492
Validation Accuracy: 0.8056


Epoch 3/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.7420
Validation Accuracy: 0.8115


Epoch 4/50: 100%|██████████| 63/63 [00:22<00:00,  2.83it/s]


Average training loss: 0.5857
Validation Accuracy: 0.8076


Epoch 5/50: 100%|██████████| 63/63 [00:22<00:00,  2.83it/s]


Average training loss: 0.4667
Validation Accuracy: 0.7939


Epoch 6/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.3683
Validation Accuracy: 0.8095


Epoch 7/50: 100%|██████████| 63/63 [00:22<00:00,  2.83it/s]


Average training loss: 0.2845
Validation Accuracy: 0.8095


Epoch 8/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.2282
Validation Accuracy: 0.8076


Epoch 9/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.1692
Validation Accuracy: 0.8154


Epoch 10/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.1269
Validation Accuracy: 0.8054


Epoch 11/50: 100%|██████████| 63/63 [00:22<00:00,  2.83it/s]


Average training loss: 0.0918
Validation Accuracy: 0.7941


Epoch 12/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0697
Validation Accuracy: 0.7978


Epoch 13/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0529
Validation Accuracy: 0.8073


Epoch 14/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0312
Validation Accuracy: 0.8037


Epoch 15/50: 100%|██████████| 63/63 [00:22<00:00,  2.83it/s]


Average training loss: 0.0247
Validation Accuracy: 0.7997


Epoch 16/50: 100%|██████████| 63/63 [00:22<00:00,  2.83it/s]


Average training loss: 0.0168
Validation Accuracy: 0.7961


Epoch 17/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0132
Validation Accuracy: 0.8076


Epoch 18/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0251
Validation Accuracy: 0.7997


Epoch 19/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0114
Validation Accuracy: 0.8095


Epoch 20/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0075
Validation Accuracy: 0.8076


Epoch 21/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0070
Validation Accuracy: 0.8095


Epoch 22/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0057
Validation Accuracy: 0.8037


Epoch 23/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0051
Validation Accuracy: 0.8076


Epoch 24/50: 100%|██████████| 63/63 [00:22<00:00,  2.83it/s]


Average training loss: 0.0046
Validation Accuracy: 0.8037


Epoch 25/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0045
Validation Accuracy: 0.8076


Epoch 26/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0041
Validation Accuracy: 0.8037


Epoch 27/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0043
Validation Accuracy: 0.7997


Epoch 28/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0035
Validation Accuracy: 0.7997


Epoch 29/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0033
Validation Accuracy: 0.7997


Epoch 30/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0031
Validation Accuracy: 0.8037


Epoch 31/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0030
Validation Accuracy: 0.8037


Epoch 32/50: 100%|██████████| 63/63 [00:22<00:00,  2.84it/s]


Average training loss: 0.0044
Validation Accuracy: 0.8037


Epoch 33/50:  75%|███████▍  | 47/63 [00:16<00:05,  2.80it/s]


KeyboardInterrupt: 

## Trying RoBERTa

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
import torch
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from tqdm import tqdm
import re
from torch.nn.utils import clip_grad_norm_
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
# Load and preprocess data (assuming you've already cleaned the text)
df = pd.read_csv('/kaggle/input/resume-data/Resume.csv')

In [20]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return ' '.join(text.split())


df['cleaned_resume'] = df['Resume_str'].apply(preprocess_text)

In [21]:
le = LabelEncoder()
df['encoded_category'] = le.fit_transform(df['Category'])

In [22]:
# Calculate class weights
class_weights = torch.tensor(
    [1 / count for count in Counter(df['encoded_category']).values()],
    dtype=torch.float
)
class_weights /= class_weights.sum()

In [23]:
# Split data into train, validation, and test sets
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['cleaned_resume'], df['encoded_category'], test_size=0.3, random_state=2024
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=2024
)

In [None]:
# Load RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

In [25]:
# Tokenize and encode sequences
max_len = 256

def encode_texts(texts):
    return tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors='pt'
    )

train_encodings = encode_texts(train_texts)
val_encodings = encode_texts(val_texts)
test_encodings = encode_texts(test_texts)

In [26]:
# Data augmentation function
def augment_text(text):
    words = text.split()
    if len(words) > 10:
        # Randomly delete 10% of words
        words = [w for w in words if np.random.rand() > 0.1]
    return ' '.join(words)

In [27]:
# Applying data augmentation to training data
augmented_train_texts = train_texts.apply(augment_text)
train_texts = pd.concat([train_texts, augmented_train_texts])
train_labels = pd.concat([train_labels, train_labels])

train_encodings = encode_texts(train_texts)
val_encodings = encode_texts(val_texts)
test_encodings = encode_texts(test_texts)

In [29]:
# Creating dataloaders
batch_size = 16
accumulation_steps = 2

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

val_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)

test_dataloader = DataLoader(
    test_dataset,
    sampler=SequentialSampler(test_dataset),
    batch_size=batch_size
)


In [None]:
# Loading RoBERTa model
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=24,
    output_attentions=False,
    output_hidden_states=False
)

In [31]:
# Seting up optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 50
total_steps = len(train_dataloader) * epochs // accumulation_steps
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [32]:
# Moveing model and class weights to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
class_weights = class_weights.to(device)

In [33]:
# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    for step, batch in enumerate(tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}')):
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        
        outputs = model(**inputs)
        loss = outputs.loss
        
        # Apply class weights to the loss
        loss = (loss * class_weights[inputs['labels']]).mean()
        
        loss = loss / accumulation_steps
        loss.backward()
        
        total_loss += loss.item()
        
        if (step + 1) % accumulation_steps == 0:
            clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
    
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Average training loss: {avg_train_loss:.4f}')
    
    # Validation
    model.eval()
    val_accuracy = 0
    val_loss = 0
    
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        loss = outputs.loss
        val_loss += loss.item()
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        val_accuracy += (predictions == inputs['labels']).float().mean().item()
    
    val_loss /= len(val_dataloader)
    val_accuracy /= len(val_dataloader)
    print(f'Validation Loss: {val_loss:.4f}')
    print(f'Validation Accuracy: {val_accuracy:.4f}')

print("Training complete!")

Epoch 1/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0625
Validation Loss: 2.4157
Validation Accuracy: 0.4870


Epoch 2/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0363
Validation Loss: 1.2307
Validation Accuracy: 0.7469


Epoch 3/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0208
Validation Loss: 0.9862
Validation Accuracy: 0.7755


Epoch 4/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0160
Validation Loss: 0.8546
Validation Accuracy: 0.7917


Epoch 5/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0126
Validation Loss: 0.9293
Validation Accuracy: 0.7677


Epoch 6/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0106
Validation Loss: 0.8592
Validation Accuracy: 0.7943


Epoch 7/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0089
Validation Loss: 0.9001
Validation Accuracy: 0.7781


Epoch 8/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0075
Validation Loss: 0.8880
Validation Accuracy: 0.7891


Epoch 9/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0063
Validation Loss: 0.9033
Validation Accuracy: 0.7964


Epoch 10/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0054
Validation Loss: 0.8452
Validation Accuracy: 0.7990


Epoch 11/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0043
Validation Loss: 0.9646
Validation Accuracy: 0.7859


Epoch 12/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0037
Validation Loss: 0.9389
Validation Accuracy: 0.7833


Epoch 13/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0029
Validation Loss: 0.9570
Validation Accuracy: 0.7729


Epoch 14/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0026
Validation Loss: 1.0210
Validation Accuracy: 0.7755


Epoch 15/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0021
Validation Loss: 0.9663
Validation Accuracy: 0.7969


Epoch 16/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0018
Validation Loss: 0.9862
Validation Accuracy: 0.7969


Epoch 17/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0016
Validation Loss: 0.9946
Validation Accuracy: 0.7938


Epoch 18/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0014
Validation Loss: 1.0741
Validation Accuracy: 0.7859


Epoch 19/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0012
Validation Loss: 1.0337
Validation Accuracy: 0.7859


Epoch 20/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0012
Validation Loss: 1.0309
Validation Accuracy: 0.7865


Epoch 21/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0010
Validation Loss: 1.0265
Validation Accuracy: 0.7990


Epoch 22/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0009
Validation Loss: 1.0408
Validation Accuracy: 0.7938


Epoch 23/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0008
Validation Loss: 1.0194
Validation Accuracy: 0.8099


Epoch 24/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0006
Validation Loss: 1.0335
Validation Accuracy: 0.7917


Epoch 25/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0005
Validation Loss: 1.0442
Validation Accuracy: 0.8042


Epoch 26/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0005
Validation Loss: 1.0785
Validation Accuracy: 0.7964


Epoch 27/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0005
Validation Loss: 1.1349
Validation Accuracy: 0.7859


Epoch 28/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0004
Validation Loss: 1.0808
Validation Accuracy: 0.7943


Epoch 29/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0003
Validation Loss: 1.1024
Validation Accuracy: 0.7969


Epoch 30/50: 100%|██████████| 109/109 [00:40<00:00,  2.69it/s]


Average training loss: 0.0003
Validation Loss: 1.1230
Validation Accuracy: 0.7943


Epoch 31/50:  17%|█▋        | 18/109 [00:07<00:35,  2.54it/s]


KeyboardInterrupt: 

In [34]:
# Saving the model
torch.save(model.state_dict(), 'resume_categoriztion_model.pth')

In [None]:
# Loading the model
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=24,
    output_attentions=False,
    output_hidden_states=False
)
model.load_state_dict(torch.load('/kaggle/working/resume_categoriztion_model.pth'))
model.to(device)
model.eval()

In [36]:
# Evaluate on test data
test_predictions = []
test_true_labels = []

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {
        'input_ids': batch[0],
        'attention_mask': batch[1],
    }
    labels = batch[2]
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    
    test_predictions.extend(predictions.cpu().numpy())
    test_true_labels.extend(labels.cpu().numpy())


In [37]:
# Calculate metrics
conf_matrix = confusion_matrix(test_true_labels, test_predictions)
class_report = classification_report(test_true_labels, test_predictions, target_names=le.classes_, output_dict=True)

In [38]:
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix.png')
plt.close()

In [39]:
metrics = ['f1-score', 'precision', 'recall']
for metric in metrics:
    plt.figure(figsize=(12, 6))
    values = [class_report[cls][metric] for cls in le.classes_]
    plt.bar(le.classes_, values)
    plt.title(f'{metric.capitalize()} by Class')
    plt.xlabel('Class')
    plt.ylabel(metric.capitalize())
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(f'{metric}_by_class.png')
    plt.close()

In [40]:
print("\nTest Set Metrics:")
print(f"Accuracy: {class_report['accuracy']:.4f}")
print(f"Macro Avg F1-Score: {class_report['macro avg']['f1-score']:.4f}")
print(f"Weighted Avg F1-Score: {class_report['weighted avg']['f1-score']:.4f}")


Test Set Metrics:
Accuracy: 0.8043
Macro Avg F1-Score: 0.7607
Weighted Avg F1-Score: 0.8047
