<a href="https://colab.research.google.com/github/natrixbasil/taskmaster_classification/blob/main/Final_TaskMaster_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

In [4]:
#загружаем модель и токенизатор
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=3)
#num_labels: 1 - home, 2 - school-work, 3 - shopping
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [5]:
#загружаем подготовленнный csv-файл
file_path = '/content/preprocessed_dataset.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,tasks,tags
0,rearrange closet,1
1,meeting tasks,2
2,taste home,3
3,bring book,2
4,sociology paper,2


In [6]:
#разбиваем на тренировочную и тестовую выборки
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_encoder = LabelEncoder()
        self.dataframe['encoded_labels'] = self.label_encoder.fit_transform(dataframe['tags'])

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        task_title = str(self.dataframe.iloc[idx]['tasks'])
        label = self.dataframe.iloc[idx]['encoded_labels']

        # Tokenize the text
        inputs = self.tokenizer(
            task_title,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'label': label
        }


In [8]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [9]:
#Determine the max token length
token_lengths = [len(tokenizer.encode(text)) for text in df['tasks']]
max_token_length = max(token_lengths)
print(f"Max token length: {max_token_length}")

Max token length: 18


In [10]:
max_length = 18

In [11]:
#Create training and validation datasets
train_dataset = CustomDataset(train_df, tokenizer, max_length)
val_dataset = CustomDataset(val_df, tokenizer, max_length)

In [13]:
#Create DataLoader instances for training and validation
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

In [14]:
#Set hyperparameters
num_epochs = 3
learning_rate = 1e-5

In [15]:
#Initialize the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [16]:
# Training loop
for epoch in range(num_epochs):
    # Training steps
    model.train()
    for batch in train_dataloader:
        inputs = torch.tensor(batch['input_ids'])
        attention_mask = torch.tensor(batch['attention_mask'])
        labels = torch.tensor(batch['label'])

        # Forward pass
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Validation steps
    model.eval()
    val_loss = 0.0
    correct_predictions = 0

    with torch.no_grad():
        for batch in val_dataloader:
            inputs = torch.tensor(batch['input_ids'])
            attention_mask = torch.tensor(batch['attention_mask'])
            labels = torch.tensor(batch['label'])

            # Forward pass
            outputs = model(inputs, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            val_loss += loss.item()

            # Calculate accuracy
            predictions = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(predictions == labels).item()

    average_val_loss = val_loss / len(val_dataloader)
    accuracy = correct_predictions / len(val_dataset)

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {average_val_loss}, Accuracy: {accuracy}')

  self.pid = os.fork()
  inputs = torch.tensor(batch['input_ids'])
  attention_mask = torch.tensor(batch['attention_mask'])
  labels = torch.tensor(batch['label'])
  inputs = torch.tensor(batch['input_ids'])
  attention_mask = torch.tensor(batch['attention_mask'])
  labels = torch.tensor(batch['label'])


Epoch 1/3, Loss: 0.702024346380903, Accuracy: 0.7045579352004393
Epoch 2/3, Loss: 0.6793567867655503, Accuracy: 0.7204832509610104
Epoch 3/3, Loss: 0.6849768025833264, Accuracy: 0.7226798462383306


In [None]:
#Testing with russian language
russian_task = ['купить молоко']

In [None]:
#Tokenization
tokenized_russian_task = tokenizer(russian_task, return_tensors="pt", padding=True, truncation=True)

In [None]:
#Using the model!
predictions = model(**tokenized_russian_task)

In [None]:
#Unpacking the predictions
list_pred = predictions['logits'].tolist()
list_pred

[[-0.20535063743591309, -1.0940823554992676, 1.4615010023117065]]

In [None]:
#Getting the 1st and the 2nd tag
max_prob = max(list_pred[0])
max_index = list_pred[0].index(max_prob)
if max_index == 0:
  lucky_guess = 'home'
elif max_index == 1:
  lucky_guess = 'school-work'
elif max_index == 2:
  lucky_guess = 'shopping'
list_pred[0][max_index] = -100
second_max_prob = max(list_pred[0])
second_max_index = list_pred[0].index(second_max_prob)
if second_max_index == 0:
  unlucky_guess = 'home'
elif second_max_index == 1:
  unlucky_guess = 'school-work'
elif second_max_index == 2:
  unlucky_guess = 'shopping'

print(russian_task)
print(lucky_guess)
print(unlucky_guess)

['купить молоко']
shopping
home


In [17]:
#Saving the model
model.save_pretrained("/content/drive/MyDrive/TaskMaster-BERT2")
tokenizer.save_pretrained("/content/drive/MyDrive/TaskMaster-BERT2")

('/content/drive/MyDrive/TaskMaster-BERT2/tokenizer_config.json',
 '/content/drive/MyDrive/TaskMaster-BERT2/special_tokens_map.json',
 '/content/drive/MyDrive/TaskMaster-BERT2/vocab.txt',
 '/content/drive/MyDrive/TaskMaster-BERT2/added_tokens.json')

In [19]:
from google.colab import files
import os

# Create a zip file of your directory
os.system('zip -r model_files.zip /content/drive/MyDrive/TaskMaster-BERT2')

FileNotFoundError: Cannot find file: TaskMaster-BERT2.zip

In [20]:
# Download the zip file
files.download('model_files.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>