In [1]:
# %autosave 60

In [2]:
# !pip install bitsandbytes
# !pip install datasets

In [3]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
    pipeline
)
import bitsandbytes

#### Load the base Model to be finetuned

In [4]:
# Hugging Face Hub identifier of the base checkpoint that gets fine-tuned.
model_id = "FacebookAI/roberta-base"

In [5]:
# Build a text-classification pipeline directly on the base checkpoint.
classifier = pipeline(task='text-classification', model=model_id)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Syntax to run the model.
The output is not meaningful yet: this is the base model, so it has no built-in labels such as "POSITIVE" and "NEGATIVE".



In [6]:
# Sanity check: call the pipeline on a sample string.
classifier("HELLO!!")

[{'label': 'LABEL_1', 'score': 0.5334505438804626}]

## **CREATING** **DATASET**
#### TODO: Get a much larger and comprehensive dataset

In [8]:
# Read the two raw CSV files into separate frames.
# TODO - generate large synthetic data in future to prevent overfitting
import pandas as pd

df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/jira_scrum_role_dataset.csv",
        "./data/role_classification_dataset_long.csv",
    )
)

In [9]:
# Stack both frames row-wise into a single frame; ignore_index=True renumbers
# the rows from 0. The rows are not shuffled at this point.
dataset = pd.concat((df1, df2), axis=0, ignore_index=True)

In [10]:
# Display the combined frame.
dataset

Unnamed: 0,description,role
0,Implement responsive UI using React: Implement...,FrontEndEngineer
1,Add new components to the dashboard: Add new c...,FrontEndEngineer
2,Add new components to the dashboard: Add new c...,FrontEndEngineer
3,Fix CSS styling issues for mobile view: Fix CS...,FrontEndEngineer
4,Add new components to the dashboard: Add new c...,FrontEndEngineer
...,...,...
6995,Scale bleeding-edge web-readiness. Develop new...,FrontEndEngineer
6996,Revolutionize scalable solutions. Configure al...,DevOpsEngineer
6997,Syndicate customized paradigms. Improve access...,FrontEndEngineer
6998,Incubate mission-critical architectures. Fix b...,BackendEngineer


In [11]:
# Split the target column off the features.
# The guard plus a non-mutating drop keeps this cell re-runnable: the in-place
# version raised KeyError on a second run because 'role' was already gone.
if 'role' in dataset.columns:
    dataset_y = dataset['role']
    dataset = dataset.drop(columns=['role'])

In [12]:
# Display the target (role) series.
dataset_y

0       FrontEndEngineer
1       FrontEndEngineer
2       FrontEndEngineer
3       FrontEndEngineer
4       FrontEndEngineer
              ...       
6995    FrontEndEngineer
6996      DevOpsEngineer
6997    FrontEndEngineer
6998     BackendEngineer
6999     BackendEngineer
Name: role, Length: 7000, dtype: object

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset,
    dataset_y,
    test_size=0.2,
    random_state=42,
)

## Preprocessing
- Clean the data (if necessary)
- Need to tokenize the inputs to feed into model
- Need to label encode the output to multiple classes for classification


In [14]:
# Distinct role names that occur in the training labels.
cols = y_train.unique()
cols

array(['BackendEngineer', 'FrontEndEngineer', 'CloudEngineer',
       'DevOpsEngineer', 'AIEngineer', 'DatabaseDesignEngineer'],
      dtype=object)

# **TOKENIZING**

##### We tokenize input data using the existing tokenizer for the base model

In [15]:
# Load the fast tokenizer published with the base checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

In [16]:
# Display the training features.
X_train

Unnamed: 0,description
1032,Integrate third-party payment gateway: Integra...
6339,Envisioneer front-end e-services. Improve acce...
3886,Optimize cloud resource usage: Optimize cloud ...
2653,Configure Docker containers for services: Conf...
6914,Brand enterprise users. Develop new React comp...
...,...
3772,Migrate services to AWS Lambda: Migrate servic...
5191,Evaluate model performance on validation set: ...
5226,Set up ML pipeline for training and inference:...
5390,Fine-tune GPT model for text generation: Fine-...


In [17]:
# Tokenising the train and test inputs to finetune the model
def _encode_descriptions(frame):
    """Tokenize the 'description' column of ``frame`` and return PyTorch tensors."""
    return tokenizer(
        list(frame['description']),
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )


train_encodings = _encode_descriptions(X_train)
test_encodings = _encode_descriptions(X_test)

#### Since the task is classification, we use a Label Encoder to convert the roles into numeric representations to simplify the process of prediction

In [18]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels only, then apply the fitted mapping
# to both splits and wrap the integer ids in tensors.
label_encoder = LabelEncoder().fit(y_train)
y_train_enc = torch.tensor(label_encoder.transform(y_train))
y_test_enc = torch.tensor(label_encoder.transform(y_test))

In [19]:
# Display the encoded training labels.
y_train_enc

tensor([1, 5, 2,  ..., 0, 0, 5])

### Create the Role-to-Index and Index-to-Role Mappings

In [20]:
# A class's id is its position in label_encoder.classes_, so enumerating that
# array yields every (index, role) pair.
IndexToRole = {int(index): role for index, role in enumerate(label_encoder.classes_)}
RoleToIndex = {role: index for index, role in IndexToRole.items()}


In [21]:
# Display both mappings.
RoleToIndex, IndexToRole

({'AIEngineer': 0,
  'BackendEngineer': 1,
  'CloudEngineer': 2,
  'DatabaseDesignEngineer': 3,
  'DevOpsEngineer': 4,
  'FrontEndEngineer': 5},
 {0: 'AIEngineer',
  1: 'BackendEngineer',
  2: 'CloudEngineer',
  3: 'DatabaseDesignEngineer',
  4: 'DevOpsEngineer',
  5: 'FrontEndEngineer'})

The RoBERTa base model does not come with any predefined labels like "POSITIVE" or "NEGATIVE". This is because it has not been trained for any specific purpose.

We can define our own labels and can finetune the model on those classes.

The following cell updates the base model's configuration with the custom labels defined above.

In [22]:
# Attach the custom label set to the base configuration.
# Both directions are supplied so id2label and label2id agree with each other
# (setting only id2label does not touch label2id), and num_labels is passed
# explicitly so the classification head is sized for every role.
config = AutoConfig.from_pretrained(
    model_id,
    num_labels=len(IndexToRole),
    id2label=IndexToRole,
    label2id=RoleToIndex,
)

In [23]:
# Load the base checkpoint together with the customised label configuration.
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Converting the dataset to the standard format required for the training and finetuning of Roberta Models using pytorch

In [24]:
from torch.utils.data import Dataset,DataLoader
class RoleDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping of field name to an indexable container and
    ``labels`` is an indexable container of the same length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label container defines the number of samples.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at idx, then attach the label under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap the tokenized inputs and encoded labels of each split in a RoleDataset.
train_dataset = RoleDataset(train_encodings, y_train_enc)
test_dataset = RoleDataset(test_encodings, y_test_enc)

## **FINETUNING** **BEGINS**

We will save multiple checkpoints of the model in a directory and then pick the one with the lowest loss.

In [None]:
import time

# Directory that receives the checkpoints written during training.
output_dir = './summary-training'

In [None]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,  # entire dataset will be trained on twice
    # warmup_steps=1,  # gradually increases learning rate from 0 to alpha in <warmup_steps> steps. Currently not useful
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # 4 accumulated samples per optimizer step
    # max_steps=1000,  # can override num_train_epochs
    learning_rate=1e-5,
    optim="paged_adamw_8bit",  # optimiser used for gradient descent
    logging_strategy="steps",
    logging_steps=100,  # prints the loss every 100 steps
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=500,  # saves a checkpoint every 500 steps
    gradient_checkpointing=True,
    report_to="none",
    overwrite_output_dir=True,  # a real boolean; the string 'True' only worked because it is truthy
    group_by_length=True,
)

# Only a training set is supplied, so no evaluation runs during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [27]:
# Run the fine-tuning loop with the Trainer configured above.
trainer.train()

Step,Training Loss
100,1.519
200,0.2599
300,0.053
400,0.0279
500,0.004
600,0.0152
700,0.0021
800,0.0017
900,0.0014
1000,0.0012


TrainOutput(global_step=2800, training_loss=0.0677345957128065, metrics={'train_runtime': 471.7877, 'train_samples_per_second': 23.739, 'train_steps_per_second': 5.935, 'total_flos': 437437839283200.0, 'train_loss': 0.0677345957128065, 'epoch': 2.0})

## ***NOTE***

- The training loss is far lower than it should be, which points to the model over-fitting.

- This can be avoided by a larger dataset with more diverse examples

### After training, load the model from the directory

In [None]:
# Load the checkpoint saved at step 2800 of the training run.
# A forward slash is used as the separator: "\c" is not a valid escape sequence
# in a normal string literal (Python warns about it), and a backslash only
# works as a path separator on Windows.
model_path = "summary-training/checkpoint-2800"
model = RobertaForSequenceClassification.from_pretrained(model_path)

  model_path = "summary-training-1748945582\checkpoint-2800"


We require the tokenizer used to tokenize the input text and feed into the model

In [30]:
# Load the tokenizer of the base checkpoint for use at inference time.
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

#### Testing on a random entry from test dataset

---



In [31]:
# Collect every test description in a plain Python list, then show the test frame.
# NOTE(review): the name `input` shadows the built-in input(); it is kept
# unchanged because other cells may still reference it.
input = X_test['description'].tolist()
X_test

Unnamed: 0,description
6500,Re-intermediate e-business bandwidth. Fix bugs...
2944,Monitor server metrics with Prometheus: Monito...
2024,Monitor server metrics with Prometheus: Monito...
263,Fix CSS styling issues for mobile view: Fix CS...
4350,Normalize database schema: Normalize database ...
...,...
3484,Set up cloudwatch alarms for EC2 instances: Se...
1860,Implement RESTful APIs with Node.js: Implement...
4974,Add indexes to frequently queried columns: Add...
387,Refactor frontend state management: Refactor f...


In [32]:
# Take the test description at position 4 straight from the frame, so this cell
# does not rely on a variable that shadows the built-in input().
text = X_test['description'].iloc[4]

In [33]:
# Tokenize the single sample with the same arguments used for the training data.
inputs = tokenizer(text, padding=True, truncation=True, max_length=256, return_tensors="pt")

In [34]:
# Display the tokenized sample (input ids and attention mask).
inputs

{'input_ids': tensor([[    0, 45647,  2072,  8503, 47404,    35, 26411,  2072,  8503, 47404,
             4, 28688,  2072, 41614, 22680,    13,   819,     4, 38141,   275,
          3464,    32,  1432,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}

In [35]:
# Run one forward pass on the sample and pick the highest-scoring class.
import torch

model.eval()  # switch the model to evaluation mode
with torch.no_grad():  # gradients are not needed for prediction
    outputs = model(**inputs)
logits = outputs.logits
predicted_class_id = logits.argmax(dim=1).item()


In [36]:
# Translate the predicted class index back into its role name.
predicted_role = IndexToRole[predicted_class_id]
print(f"Predicted role: {predicted_role}")

Predicted role: DatabaseDesignEngineer


In [37]:
def predict(text, model, tokenizer, id2label=None):
    """Predict the role for a single description.

    Args:
        text: One description string.
        model: Sequence-classification model. It is called with the tokenizer
            output as keyword arguments and must return an object exposing
            ``.logits``.
        tokenizer: Callable that turns ``text`` into a mapping of PyTorch tensors.
        id2label: Optional mapping from class index to role name. When omitted,
            the notebook-level ``IndexToRole`` mapping is used.

    Returns:
        The role name of the highest-scoring class.
    """
    if id2label is None:
        id2label = IndexToRole

    # Always predict in evaluation mode, even if the caller forgot to set it.
    model.eval()

    inputs = tokenizer(text, padding=True, truncation=True, max_length=256, return_tensors="pt")

    # Keep the inputs on the same device as the model when the model reports one.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class_id = torch.argmax(logits, dim=1).item()
    return id2label[predicted_class_id]

Testing the Model:

In [38]:
def test_metric(model, X_test, y_test, tokenizer):
    """Predict a role for every test description and score the result.

    Args:
        model: Model handed to ``predict`` for each description.
        X_test: Frame with a "description" column.
        y_test: True role names, in the same order as ``X_test``.
        tokenizer: Tokenizer handed to ``predict``.

    Returns:
        A two-element list ``[predictions, accuracy]`` where ``predictions`` is
        the list of predicted role names and ``accuracy`` is the value returned
        by ``sklearn.metrics.accuracy_score``.
    """
    from sklearn.metrics import accuracy_score

    model.eval()  # Set model to evaluation mode

    predictions = [predict(text, model, tokenizer) for text in X_test["description"]]

    return [predictions, accuracy_score(y_test, predictions)]

In [39]:
# Score the reloaded model on the held-out test split.
predictions, score = test_metric(model, X_test, y_test, tokenizer)
# The individual predictions stay in `predictions`; only the accuracy is printed.
print("accuracy : ", score)

accuracy :  1.0


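(Again, this is most likely over-fitting: the dataset contains many repeated descriptions, so a perfect test score does not show that the model generalizes.)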

In [41]:
# Classify an ad-hoc description string with the fine-tuned model.
text = """
    Design a database schema to store user and admin data
"""
predict(text, model, tokenizer)

'DatabaseDesignEngineer'