<a href="https://colab.research.google.com/github/natrixbasil/taskmaster_classification/blob/main/Multilingual_BERT_68.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import torch
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

In [8]:
#Load pre-trained multilingual BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=4)
#The tags are: 0 - shopping, 1 - home, 2 - school, 3 - work
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
#Load my dataset
file_path = '/content/filtered_data.json'
df = pd.read_json(file_path)
df.head()

Unnamed: 0,id,TaskTitle,ListTitle
0,1,rearrange closet,home
1,2,meeting tasks,work
2,3,taste of home,groceries
3,4,bring book in,default list
4,5,sociology paper,today


In [62]:
#Find out which tags are the most popular
variety = df['ListTitle'].value_counts()
variety.head(10)

default list    5393
to do           1367
work            1252
today            343
home             167
family           157
to do list       143
school           140
groceries        108
house             86
Name: ListTitle, dtype: int64

In [63]:
#Take the first 100 todos for each category
home = df.loc[df['ListTitle'] == 'home'][0:100]
work = df.loc[df['ListTitle'] == 'work'][0:100]
school = df.loc[df['ListTitle'] == 'school'][0:100]
groceries = df.loc[df['ListTitle'] == 'groceries'][0:100]
print(len(home), len(work), len(school), len(shopping))

100 100 100 100


In [68]:
#Make one dataframe
trimmed_df = pd.concat([home, work, school, groceries])
print(trimmed_df.head())
print(len(trimmed_df))

      id             TaskTitle ListTitle
0      1      rearrange closet      home
6      7             osteopath      home
48    49    change mobile plan      home
97    98   bring in headphones      home
167  168  emissions inspection      home
400


In [69]:
#Shuffle the dataframe
trimmed_df = trimmed_df.sample(frac=1)
print(trimmed_df.head())
print(len(trimmed_df))

        id             TaskTitle  ListTitle
32      33              copy dvd       work
4720  4721  cranberry lime juice  groceries
3944  3945            sweep pool       home
2078  2079            buy sander       home
3107  3108       grefe detergent  groceries
400


In [100]:
label_encoder = LabelEncoder()
trimmed_df['ListTitle'] = label_encoder.fit_transform(trimmed_df['ListTitle'])
print(trimmed_df[100:120])

        id                         TaskTitle  ListTitle
75      76                    zzzquil liquid          0
125    126                 astronaut costume          0
765    766              figure out voicemail          3
2537  2538                        us project          2
327    328                         ais final          2
3992  3993                 submit reflection          2
1935  1936                 gf spiral noodles          0
3163  3164  install carbon monoxide detector          1
8643  8644                       finnur milk          0
1475  1476                  book electrician          1
1006  1007                      white potato          0
4659  4660                        patho exam          2
661    662                 algebra questions          3
793    794                        email deon          3
4827  4828                          dinner x          0
491    492                  care plan review          3
2943  2944                straighten kitchen    

In [71]:
#Split the dataset into training and validation sets
train_df, val_df = train_test_split(trimmed_df, test_size=0.2, random_state=42)

In [82]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        task_title = str(self.dataframe.iloc[idx]['TaskTitle'])
        list_title = self.dataframe.iloc[idx]['ListTitle']

        # Tokenize the text
        inputs = self.tokenizer(
            task_title,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Get the label (ListTitle)
        label = list_title

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'label': label
        }

In [83]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [84]:
#Determine the max token length
token_lengths = [len(tokenizer.encode(text)) for text in trimmed_df['TaskTitle']]
max_token_length = max(token_lengths)
print(f"Max token length: {max_token_length}")

Max token length: 12


In [75]:
max_length = 12

In [85]:
#Create training and validation datasets
train_dataset = CustomDataset(train_df, tokenizer, max_length)
val_dataset = CustomDataset(val_df, tokenizer, max_length)

In [89]:
#Create DataLoader instances for training and validation
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [90]:
#Set hyperparameters
num_epochs = 3
learning_rate = 1e-5

In [91]:
#Initialize the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [92]:
# Training loop
for epoch in range(num_epochs):
    # Training steps
    model.train()
    for batch in train_dataloader:
        inputs = torch.tensor(batch['input_ids'])
        attention_mask = torch.tensor(batch['attention_mask'])
        labels = torch.tensor(batch['label'])

        # Forward pass
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Validation steps
    model.eval()
    val_loss = 0.0
    correct_predictions = 0

    with torch.no_grad():
        for batch in val_dataloader:
            inputs = torch.tensor(batch['input_ids'])
            attention_mask = torch.tensor(batch['attention_mask'])
            labels = torch.tensor(batch['label'])

            # Forward pass
            outputs = model(inputs, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            val_loss += loss.item()

            # Calculate accuracy
            predictions = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(predictions == labels).item()

    average_val_loss = val_loss / len(val_dataloader)
    accuracy = correct_predictions / len(val_dataset)

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {average_val_loss}, Accuracy: {accuracy}')

  inputs = torch.tensor(batch['input_ids'])
  attention_mask = torch.tensor(batch['attention_mask'])
  labels = torch.tensor(batch['label'])
  inputs = torch.tensor(batch['input_ids'])
  attention_mask = torch.tensor(batch['attention_mask'])
  labels = torch.tensor(batch['label'])


Epoch 1/3, Loss: 0.8216330409049988, Accuracy: 0.6375
Epoch 2/3, Loss: 0.8132106542587281, Accuracy: 0.65
Epoch 3/3, Loss: 0.7942898333072662, Accuracy: 0.6875


In [121]:
russian_tasks = ["написать отчет", "молоко", "математика", "помыть пол"]
results = []
other_results = []

In [122]:
tokenized_russian_tasks = tokenizer(russian_tasks, return_tensors="pt", padding=True, truncation=True)

In [123]:
predictions = model(**tokenized_russian_tasks)

In [124]:
list_pred = predictions['logits'].tolist()

In [125]:
for sublist in list_pred:
  max_prob = max(sublist)
  max_index = sublist.index(max_prob)
  if max_index == 0:
    lucky_guess = 'groceries'
  elif max_index == 1:
    lucky_guess = 'home'
  elif max_index == 2:
    lucky_guess = 'school'
  else:
    lucky_guess = 'work'
  results.append(lucky_guess)
  sublist.remove(max_prob)
  second_max_prob = max(sublist)
  second_max_index = sublist.index(second_max_prob)
  if second_max_index == 0:
    unlucky_guess = 'groceries'
  elif second_max_index == 1:
    unlucky_guess = 'home'
  elif second_max_index == 2:
    unlucky_guess = 'school'
  else:
    unlucky_guess = 'work'
  other_results.append(unlucky_guess)
print(russian_tasks)
print(results)
print(other_results)

['написать отчет', 'молоко', 'математика', 'помыть пол']
['work', 'groceries', 'school', 'home']
['school', 'groceries', 'school', 'groceries']
