<a href="https://colab.research.google.com/github/natrixbasil/taskmaster_classification/blob/main/TaskMaster_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

In [None]:
#Load pre-trained multilingual BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=3)
#The tags are: 0 - home, 1 - school-work, 3 - shopping
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#Load my dataset
file_path = '/content/clean_df.json'
df = pd.read_json(file_path)
df.head()

Unnamed: 0,TaskTitle,ListTitle
0,clean room bathroom,0
1,clean denture,0
2,get salt water softener,2
3,fix car cover,0
4,buy deo stick,2


In [None]:
#Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        task_title = str(self.dataframe.iloc[idx]['TaskTitle'])
        list_title = self.dataframe.iloc[idx]['ListTitle']

        # Tokenize the text
        inputs = self.tokenizer(
            task_title,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Get the label (ListTitle)
        label = list_title

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'label': label
        }

In [None]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [None]:
#Determine the max token length
token_lengths = [len(tokenizer.encode(text)) for text in df['TaskTitle']]
max_token_length = max(token_lengths)
print(f"Max token length: {max_token_length}")

Max token length: 18


In [None]:
max_length = 18

In [None]:
#Create training and validation datasets
train_dataset = CustomDataset(train_df, tokenizer, max_length)
val_dataset = CustomDataset(val_df, tokenizer, max_length)

In [None]:
#Create DataLoader instances for training and validation
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
#Set hyperparameters
num_epochs = 3
learning_rate = 1e-5

In [None]:
#Initialize the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [49]:
# Training loop
for epoch in range(num_epochs):
    # Training steps
    model.train()
    for batch in train_dataloader:
        inputs = torch.tensor(batch['input_ids'])
        attention_mask = torch.tensor(batch['attention_mask'])
        labels = torch.tensor(batch['label'])

        # Forward pass
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Validation steps
    model.eval()
    val_loss = 0.0
    correct_predictions = 0

    with torch.no_grad():
        for batch in val_dataloader:
            inputs = torch.tensor(batch['input_ids'])
            attention_mask = torch.tensor(batch['attention_mask'])
            labels = torch.tensor(batch['label'])

            # Forward pass
            outputs = model(inputs, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            val_loss += loss.item()

            # Calculate accuracy
            predictions = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(predictions == labels).item()

    average_val_loss = val_loss / len(val_dataloader)
    accuracy = correct_predictions / len(val_dataset)

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {average_val_loss}, Accuracy: {accuracy}')

  inputs = torch.tensor(batch['input_ids'])
  attention_mask = torch.tensor(batch['attention_mask'])
  labels = torch.tensor(batch['label'])
  inputs = torch.tensor(batch['input_ids'])
  attention_mask = torch.tensor(batch['attention_mask'])
  labels = torch.tensor(batch['label'])


Epoch 1/3, Loss: 0.5110280163910078, Accuracy: 0.8375
Epoch 2/3, Loss: 0.4095768196427304, Accuracy: 0.8611111111111112
Epoch 3/3, Loss: 0.3922855847555658, Accuracy: 0.8625


In [88]:
#Testing with russian language
russian_task = ['купить молоко']

In [89]:
#Tokenization
tokenized_russian_task = tokenizer(russian_task, return_tensors="pt", padding=True, truncation=True)

In [90]:
#Using the model!
predictions = model(**tokenized_russian_task)

In [91]:
#Unpacking the predictions
list_pred = predictions['logits'].tolist()
list_pred

[[-0.20535063743591309, -1.0940823554992676, 1.4615010023117065]]

In [92]:
#Getting the 1st and the 2nd tag
max_prob = max(list_pred[0])
max_index = list_pred[0].index(max_prob)
if max_index == 0:
  lucky_guess = 'home'
elif max_index == 1:
  lucky_guess = 'school-work'
elif max_index == 2:
  lucky_guess = 'shopping'
list_pred[0][max_index] = -100
second_max_prob = max(list_pred[0])
second_max_index = list_pred[0].index(second_max_prob)
if second_max_index == 0:
  unlucky_guess = 'home'
elif second_max_index == 1:
  unlucky_guess = 'school-work'
elif second_max_index == 2:
  unlucky_guess = 'shopping'

print(russian_task)
print(lucky_guess)
print(unlucky_guess)

['купить молоко']
shopping
home


In [55]:
#Saving the model
model.save_pretrained("/content/drive/MyDrive/TaskMaster-BERT")