In [258]:
!pip install transformers
import pandas as pd
import torch
import numpy as np
from torch import nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
import random

# Read the relabelled CSV and keep only the rows that actually carry a title
data = pd.read_csv("relabelled.csv")
data = data[data['title_found'].notna()]

# Rows without a label are set aside; the rest of the notebook uses labelled rows only
to_be_labelled = data[data['label'].isna()]

# Labelled rows, reduced to the two modelling columns, typed, then de-duplicated
data = (
    data
    .drop(index=to_be_labelled.index)
    .loc[:, ['title_found', 'label']]
    .astype({'label': int, 'title_found': str})
    .drop_duplicates()
)




Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [259]:
# Number of labelled rows in class 1
int(data['label'].eq(1).sum())

9174

In [260]:
# Down-sample class 1 to the size of class 0.
# A fixed random_state makes the balanced subset reproducible across runs.
positives = data[data['label'] == 1]
n_negatives = len(data[data['label'] == 0])
df4 = positives.sample(frac=1, random_state=42)[0:n_negatives]
# Class-1 rows that were left out of the balanced subset
df5 = positives.drop(df4.index)
df4.head()

Unnamed: 0,title_found,label
10902,Front-End WebTV Software Engineer (Mid/Senior),1
11545,Compliance Manager - Remote Opportunity,1
7983,Cost Control Specialist - Leatherhead,1
3180,Project Engineer,1
10412,Legal Assistant,1


In [261]:
# Number of labelled rows in class 0
int(data['label'].eq(0).sum())

5364

In [262]:
# All class-0 rows; the row count is shown as the cell output
df3 = data.loc[data['label'] == 0]
df3.shape[0]

5364

In [263]:
# Preview the first five class-0 rows
df3.head(n=5)

Unnamed: 0,title_found,label
6,TEAM MEMBER,0
26,Retail & Admissions,0
32,Consummate Care Ltd,0
33,CQC overall rating :,0
34,Outstanding,0


In [264]:

# Combine the two equally sized class subsets and shuffle them.
# A fixed random_state keeps the row order reproducible across runs.
new_data = pd.concat([df4, df3]).sample(frac=1, random_state=42)
new_data.head()

Unnamed: 0,title_found,label
4078,IT supporter,1
8280,li_sugr,0
8923,Flex,0
2901,Support Worker - Wallingford,1
7576,Oak Tree Federation- Laughton CP and Firle CEP...,0


In [265]:
# Distinct label values present in the balanced set
pd.unique(new_data['label'])

array([1, 0])

In [266]:
# Per-class 60% sample for training. A fixed random_state makes the
# train/validation split reproducible across runs.
train_data_1 = new_data[new_data['label']==1].sample(frac=0.6, random_state=42)
train_data_0 = new_data[new_data['label']==0].sample(frac=0.6, random_state=42)
train_data = pd.concat([train_data_1, train_data_0]).sample(frac=1, random_state=42)

# Validation set: every balanced row not used for training, plus the rows in df5
val_data = new_data.drop(train_data.index)
val_data = pd.concat([val_data, df5])

In [267]:
# Class-1 rows in the training split
int(train_data['label'].eq(1).sum())

3218

In [268]:
# Class-0 rows in the training split
int(train_data['label'].eq(0).sum())

3218

In [269]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint
PRETRAINED_CHECKPOINT = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = AutoModel.from_pretrained(PRETRAINED_CHECKPOINT)

# Define the neural network for fine-tuning
class FineTuneBERT(nn.Module):
    """
    Encoder followed by dropout and a linear classification head.

    Args:
        bert_model: encoder called as ``bert_model(input_ids, attention_mask=...)``
            whose first output is indexed as (batch, token, feature).
        num_labels (int): number of output classes. Defaults to 2.
        dropout (float): dropout probability applied before the head.
            Defaults to 0.3.
    """

    def __init__(self, bert_model, num_labels=2, dropout=0.3):
        super().__init__()
        self.bert = bert_model
        self.drop = nn.Dropout(dropout)
        # Read the feature width from the encoder config when it is exposed,
        # so encoders with a different hidden size also work; 768 is kept
        # as the fallback for encoders without a config.
        hidden_size = getattr(getattr(bert_model, "config", None), "hidden_size", 768)
        self.out = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the first token's features."""
        bert_output = self.bert(input_ids, attention_mask=attention_mask)[0]
        # Keep only the features at token position 0 of every sequence
        bert_output = bert_output[:, 0, :]
        bert_output = self.drop(bert_output)
        logits = self.out(bert_output)
        return logits

# Initialize the fine-tuning model (wraps the pretrained encoder loaded above)
model = FineTuneBERT(model)

criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=2e-5)

# Tokenize each split once and reuse the encoding for both ids and masks.
# NOTE(review): training truncates at max_length=28 while validation uses 128 —
# confirm the mismatch is intentional.
train_encodings = tokenizer(list(train_data['title_found']), truncation=True, padding=True, max_length=28, return_tensors='pt')
train_input_ids = train_encodings.input_ids
train_attention_masks = train_encodings.attention_mask
train_labels = torch.tensor(list(train_data['label']), dtype=torch.long)

val_encodings = tokenizer(list(val_data['title_found']), truncation=True, padding=True, max_length=128, return_tensors='pt')
val_input_ids = val_encodings.input_ids
val_attention_masks = val_encodings.attention_mask
val_labels = torch.tensor(list(val_data['label']), dtype=torch.long)

# Create a TensorDataset and DataLoader for the training data.
# shuffle=True draws a new batch order every epoch.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Create a TensorDataset and DataLoader for the validation data (order kept)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=64)

def accuracy(predictions, labels):
    """
    Fraction of rows whose highest-scoring class equals the label.

    Args:
        predictions (torch.Tensor): Per-class scores, one row per sample.
        labels (torch.Tensor): Ground-truth class index for each sample.

    Returns:
        float: Number of correct predictions divided by the number of labels.
    """
    # Index of the largest score along dim 1 is the predicted class
    predicted_classes = predictions.max(dim=1)[1]
    n_correct = int(predicted_classes.eq(labels).sum())
    return n_correct / labels.shape[0]


# Train the model
# Pick the best available backend and fall back to CPU, so the cell also
# runs on machines that have neither MPS nor CUDA.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

avg_train_accuracies = []
avg_val_accuracies = []
best_val_acc = 0.0

for epoch in range(4):
    train_accuracies = []
    val_accuracies = []

    # The validation pass below puts the model in eval mode; switch back to
    # train mode so dropout stays active in every epoch, not just the first.
    model.train()
    for step, batch in enumerate(train_dataloader):
        batch_input_ids = batch[0].to(device)
        batch_attention_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)
        optimizer.zero_grad()
        logits = model(batch_input_ids, batch_attention_masks)
        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()

        # Calculate the accuracy on the current training batch
        acc = accuracy(logits, batch_labels)
        train_accuracies.append(acc)
        print(f"Epoch {epoch+1}, step {step+1}, training accuracy: {acc}")

    # Validation loop
    model.eval()
    with torch.no_grad():
        total_val_loss = 0.0
        total_val_correct = 0
        total_val_samples = 0
        for batch in val_dataloader:
            batch_input_ids = batch[0].to(device)
            batch_attention_masks = batch[1].to(device)
            batch_labels = batch[2].to(device)
            logits = model(batch_input_ids, batch_attention_masks)
            val_loss = criterion(logits, batch_labels)
            # Weight per-batch means by batch size so a smaller final batch
            # does not skew the totals
            total_val_loss += val_loss.item() * len(batch_labels)
            total_val_correct += accuracy(logits, batch_labels) * len(batch_labels)
            total_val_samples += len(batch_labels)
        val_acc = total_val_correct / total_val_samples
        val_accuracies.append(val_acc)
        val_loss = total_val_loss / total_val_samples
        print(f"Epoch {epoch+1}, validation accuracy: {val_acc:.4f}")

        # Keep only the checkpoint with the best validation accuracy so far
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), "fine_tuned_job_title_classifier3.pt")

    avg_train_accuracies.append(sum(train_accuracies)/len(train_accuracies))
    avg_val_accuracies.append(sum(val_accuracies)/len(val_accuracies))


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1, step 1, training accuracy: 0.40625
Epoch 1, step 2, training accuracy: 0.4375
Epoch 1, step 3, training accuracy: 0.4375
Epoch 1, step 4, training accuracy: 0.640625
Epoch 1, step 5, training accuracy: 0.6875
Epoch 1, step 6, training accuracy: 0.828125
Epoch 1, step 7, training accuracy: 0.9375
Epoch 1, step 8, training accuracy: 0.90625
Epoch 1, step 9, training accuracy: 0.8125
Epoch 1, step 10, training accuracy: 0.890625
Epoch 1, step 11, training accuracy: 0.921875
Epoch 1, step 12, training accuracy: 0.984375
Epoch 1, step 13, training accuracy: 0.921875
Epoch 1, step 14, training accuracy: 0.953125
Epoch 1, step 15, training accuracy: 0.9375
Epoch 1, step 16, training accuracy: 0.859375
Epoch 1, step 17, training accuracy: 0.90625
Epoch 1, step 18, training accuracy: 0.921875
Epoch 1, step 19, training accuracy: 0.9375
Epoch 1, step 20, training accuracy: 0.953125
Epoch 1, step 21, training accuracy: 0.921875
Epoch 1, step 22, training accuracy: 0.953125
Epoch 1, step 

In [270]:
# Reload the saved weights. map_location="cpu" lets the file load on
# machines without the accelerator it was saved from; load_state_dict then
# copies the tensors into the model's existing parameters.
model.load_state_dict(torch.load("fine_tuned_job_title_classifier3.pt", map_location="cpu"))

<All keys matched successfully>

In [271]:
def test_labelled_data(classifier_model, data):
    '''
    Evaluate a classifier on labelled data and print the confusion matrix
    and classification report.

    Args:
        classifier_model: module called as
            ``classifier_model(input_ids, attention_mask)`` that returns
            per-class logits.
        data: DataFrame with a 'title_found' text column and a 'label' column.

    The model is moved to MPS when available, otherwise to CUDA when
    available, otherwise to CPU, and is left in eval mode.
    '''

    if torch.backends.mps.is_available():
        device = torch.device('mps')
    elif torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    classifier_model.to(device)

    # Tokenize once and reuse the encoding for both input ids and attention masks
    encodings = tokenizer(list(data['title_found']), truncation=True, padding=True, max_length=128, return_tensors='pt')
    test_labels = torch.tensor(list(data['label']), dtype=torch.long)

    # Create a TensorDataset and DataLoader for the test data
    test_dataset = TensorDataset(encodings.input_ids, encodings.attention_mask, test_labels)
    test_dataloader = DataLoader(test_dataset, batch_size=64)

    # Evaluate the model that was passed in, not whatever global `model` holds
    classifier_model.eval()

    # Iterate through test DataLoader and generate predictions
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            batch_input_ids = batch[0].to(device)
            batch_attention_masks = batch[1].to(device)
            batch_preds = classifier_model(batch_input_ids, batch_attention_masks).argmax(dim=1)
            all_preds += batch_preds.tolist()
            all_labels += batch[2].tolist()

    # Pin the label set so the matrix and report keep both classes even when
    # one of them is absent from the evaluated data
    class_ids = [0, 1]

    # Generate and print confusion matrix
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
    print('\nCONFUSION MATRIX\n', cm)

    # Generate and print classification report
    target_names = ['class 0', 'class 1']
    report = classification_report(all_labels, all_preds, labels=class_ids, target_names=target_names)
    print('\nCLASSIFICATION REPORT\n', report)
    
# Report validation metrics for the current model
test_labelled_data(classifier_model=model, data=val_data)


CONFUSION MATRIX
 [[2051   95]
 [  95 5861]]

CLASSIFICATION REPORT
               precision    recall  f1-score   support

     class 0       0.96      0.96      0.96      2146
     class 1       0.98      0.98      0.98      5956

    accuracy                           0.98      8102
   macro avg       0.97      0.97      0.97      8102
weighted avg       0.98      0.98      0.98      8102



In [272]:


# Function for calculating uncertainty
def get_most_uncertain_samples(model, data, tokenizer, threshold=0.5, batch_size=64):
    """
    Score every title in ``data`` with the classifier.

    Despite the name, all samples are returned in the row order of ``data``;
    no filtering or sorting by uncertainty is done here.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns logits with class 1 in column 1.
        data: DataFrame with a 'title_found' column.
        tokenizer: callable that takes the list of titles and returns an
            object exposing ``input_ids`` and ``attention_mask`` tensors.
        threshold (float): a sample is labelled 1 when its class-1 softmax
            probability is >= threshold, otherwise 0.
        batch_size (int): number of samples per forward pass.

    Returns:
        list of tuples ``(title, predicted_label, uncertainty, confidence)``
        where confidence is the largest softmax probability and uncertainty
        is ``1 - confidence``. An empty ``data`` yields an empty list.
    """
    samples = data['title_found'].tolist()
    if not samples:
        return []

    # Tokenize once and reuse the encoding for both input ids and attention masks
    encodings = tokenizer(samples, truncation=True, padding=True, max_length=128, return_tensors='pt')

    # Create a TensorDataset and DataLoader for the data
    dataset = TensorDataset(encodings.input_ids, encodings.attention_mask)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    # Run on whatever device the model already lives on instead of relying
    # on a global `device` variable
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    scores = []
    confidences = []
    predicted_labels = []

    # Inference only: disable dropout and gradient tracking, then restore the
    # caller's train/eval mode afterwards
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch_input_ids, batch_attention_masks in dataloader:
                logits = model(batch_input_ids.to(model_device), batch_attention_masks.to(model_device))
                probs = nn.functional.softmax(logits, dim=1)

                max_probs, _ = torch.max(probs, dim=1)
                confidences.extend(max_probs.cpu().tolist())
                scores.extend((1 - max_probs).cpu().tolist())

                # Generate labels based on the threshold value
                predicted_labels.extend((probs[:, 1] >= threshold).long().cpu().tolist())
    finally:
        model.train(was_training)

    return list(zip(samples, predicted_labels, scores, confidences))


In [273]:
results = get_most_uncertain_samples(model, data=val_data[['title_found']], tokenizer=tokenizer)
# Split the per-sample tuples into parallel lists (the title itself is not needed here)
predicted_label = [predicted for _, predicted, _, _ in results]
uncertainty_score = [score for _, _, score, _ in results]
confidence_score = [confidence for _, _, _, confidence in results]

In [274]:
# Attach the model outputs as three new columns on the validation frame
val_data = val_data.assign(
    predicted_label=predicted_label,
    uncertainty=uncertainty_score,
    confidence=confidence_score,
)
val_data.head()

Unnamed: 0,title_found,label,predicted_label,uncertainty,confidence
8923,Flex,0,0,0.00018,0.99982
6930,Benefits include;,0,0,0.000139,0.999861
2884,Machine Drivers,1,1,0.018916,0.981084
13244,Associate Health & Social Care Tutor,1,1,0.000165,0.999835
12721,Kettleby Foods/Hygiene Team Leader\nPM shift,1,1,0.000352,0.999648


In [275]:
# Rows labelled 0 that the model predicted as 1
a = val_data.loc[val_data['label'] == 0]
b = a.loc[a['predicted_label'] == 1]
b

Unnamed: 0,title_found,label,predicted_label,uncertainty,confidence
9582,KS 98-2 Multi-function Controller,0,1,0.213724,0.786276
14305,Stockists,0,1,0.007949,0.992051
8651,marquee build crew,0,1,0.001732,0.998268
14654,First time buyer,0,1,0.490622,0.509378
6315,Head Office (Central Support),0,1,0.004614,0.995386
...,...,...,...,...,...
13514,Tablet Hardness Tester: TBF 100i,0,1,0.037654,0.962346
14580,CodeRunner,0,1,0.021075,0.978925
9541,Level 2 YMCA Studio Instructor or RSA Exercise...,0,1,0.002401,0.997599
14449,MDO,0,1,0.000824,0.999176


In [276]:
# Write the label-0 / predicted-1 rows to a spreadsheet, without the index column
b.to_excel(excel_writer='check_false_positives_title.xlsx', index=False)

In [277]:
# Rows labelled 1 that the model predicted as 0
c = val_data.loc[val_data['label'] == 1]
d = c.loc[c['predicted_label'] == 0]
d

Unnamed: 0,title_found,label,predicted_label,uncertainty,confidence
899,Joiners - up to Â£17.50 per hour,1,0,0.192509,0.807491
3601,Lifeguard,1,0,0.000531,0.999469
2036,North-East of England / Field Sales role,1,0,0.000597,0.999403
8344,Freelancers,1,0,0.000518,0.999482
1992,BMW MOT Tester,1,0,0.144592,0.855408
...,...,...,...,...,...
14047,SYSTEMS SUPPORT ANALYST,1,0,0.151060,0.848940
14081,SALES DIRECTOR,1,0,0.013954,0.986046
14199,Freelance/Contractor,1,0,0.179731,0.820269
14340,1st Line IT Technical Support,1,0,0.120224,0.879776


In [278]:
# Write the label-1 / predicted-0 rows to a spreadsheet, without the index column
d.to_excel(excel_writer='check_false_neg_title.xlsx', index=False)

In [279]:
# Rows labelled 1 and predicted 1, written to a spreadsheet without the index column
e = val_data.loc[val_data['label'] == 1]
f = e.loc[e['predicted_label'] == 1]
f.to_excel(excel_writer='check_true_pos_titles.xlsx', index=False)

In [280]:
# Rows labelled 0 and predicted 0, written to a spreadsheet without the index column
e = val_data.loc[val_data['label'] == 0]
f = e.loc[e['predicted_label'] == 0]
f.to_excel(excel_writer='check_true_neg_titles.xlsx', index=False)

In [281]:
# val_data[['confidence']].describe()

In [282]:
# import seaborn as sns
# sns.violinplot(data = to_be_labelled[['confidence']])

In [283]:
# threshold = 0.9900
# confident_data = to_be_labelled.loc[to_be_labelled['confidence'] > threshold]
# confident_data.head()

In [284]:
# len(confident_data)

In [285]:
# remaining_data = to_be_labelled.loc[to_be_labelled['confidence'] <= threshold]
# remaining_data.head()

In [286]:
# len(remaining_data)

In [287]:
# confident_data['label'] = confident_data['predicted_label']
# confident_data.head()

In [288]:
# final_data = pd.concat([data, confident_data[['title_found', 'label']], remaining_data[['title_found',  'label']]])

In [289]:
# final_data.to_csv('relabelled.csv', index=False)