Your task is to create a bert-base-classifier of vacancy areas based on their titles.

Each vacancy can have more than one area, so this is **multi-label classification**, not multiclass classification.




In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from nltk.tokenize import word_tokenize
from string import punctuation
from tqdm import tqdm

In [None]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, Dataset, SequentialSampler
import random
import transformers

# Try two or more different BERT-like models (different BERTs, RoBERTas, etc., or any other transformer-based model) (**2 points max**)
Your notebook should contain the training process of all your models!

In [None]:
# Hugging Face checkpoint name; passed to the tokenizer's and the encoder's
# `from_pretrained` calls further down in this notebook.
MODEL_NAME =  # ToDo try different models
# Maximum sequence length for the tokenized titles
# (presumably forwarded to the tokenizer as `max_length` — confirm when implementing).
MAX_SEQ_LENGTH = # ToDo choose seq len
# Checkpoint file that the evaluation cells restore with `torch.load`.
RESULT_MODEL_PATH = './model.pt'

In [None]:
def seed_everything(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU), and exports the value
    as the ``PYTHONHASHSEED`` environment variable. When CUDA is available,
    the CUDA generators are seeded as well and cuDNN is switched to
    deterministic mode with auto-tuning (benchmark) disabled.

    Args:
        seed_value: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # The CPU-side generators are independent of each other, so a loop is enough.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed = 12
seed_everything(seed)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
device

In [None]:
# Punctuation tokens that the title cleaner drops. This rebinds the name
# `punctuation` imported from `string` at the top of the notebook.
# '#' is deliberately left out of the set, so '#' tokens survive cleaning
# (presumably to keep titles such as "C#" intact — confirm); note that '+' and
# '.' are not in the set either.
punctuation = set('!"$%&\'()*,-/:;<=>?@[\\]^_`{|}~') # '#' removed from the set

In [None]:
def clean(text):
    """Normalize a vacancy title for tokenization.

    The text is split with NLTK's ``word_tokenize``. Tokens that are members
    of the module-level ``punctuation`` set are discarded; the remaining tokens
    are lower-cased and joined with single spaces.

    Args:
        text: Raw title string.

    Returns:
        The cleaned, lower-cased title.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in punctuation:
            continue
        kept_tokens.append(token.lower())
    return ' '.join(kept_tokens)

In [None]:
df = pd.read_csv('./dataset_2020.csv')
df.shape

Each vacancy can have more than one area; the areas are separated by spaces.

Example:

Malware Analyst for Imunify Security,analyst it_security

In [None]:
# Hold out 10% of the rows as the test set, then split the remaining 90% into
# 80% train / 20% validation, i.e. roughly 72% / 18% / 10% of the full dataset.
# The fixed random_state values make both splits reproducible.
df_train, df_test = train_test_split(df, train_size=0.9, random_state=42)
df_train, df_valid = train_test_split(df_train, train_size=0.8, random_state=42)

# Finish TextClassificationDataset (**1 point max**)

In [None]:
class TextClassificationDataset(Dataset):
    """Tokenized vacancy titles paired with multi-hot area labels.

    Every title is cleaned with ``clean`` and tokenized once in ``__init__``;
    ``__getitem__`` then only slices the pre-computed tensors.

    Args:
        data: DataFrame with a ``title`` column (raw title text) and an
            ``area`` column (space-separated area labels).
        tokenizer: Hugging Face tokenizer applied to the cleaned titles.
        bianizer: Fitted ``MultiLabelBinarizer`` used to encode the labels.
            The misspelled parameter name (and the ``bianiezer`` attribute)
            are kept so existing callers keep working.
    """

    def __init__(self, data, tokenizer, bianizer):
        self.data = data
        self.tokenizer = tokenizer
        sentences = [clean(sent) for sent in data.title.tolist()]
        self.target = [labels.split() for labels in data.area.tolist()]
        self.bianiezer = bianizer
        self.target_one_hot = torch.tensor(self.bianiezer.transform(self.target), dtype=torch.float)
        # Previously the cleaned sentences were computed and then thrown away.
        # Pad/truncate every title to MAX_SEQ_LENGTH so that all rows have the
        # same length and the default DataLoader collation can stack them.
        self.encodings = self.tokenizer(
            sentences,
            padding='max_length',
            truncation=True,
            max_length=MAX_SEQ_LENGTH,
            return_tensors='pt',
        )

    def __len__(self):
        """Return the number of titles in the dataset."""
        return len(self.target)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        The dict holds every tensor produced by the tokenizer (for example
        ``input_ids`` and ``attention_mask``) plus ``labels``, the multi-hot
        float target for this title.
        """
        item = {name: tensor[idx] for name, tensor in self.encodings.items()}
        item['labels'] = self.target_one_hot[idx]
        return item

In [None]:
# Tokenizer that belongs to the chosen checkpoint (resolved from MODEL_NAME).
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
# The label vocabulary is fitted on the training split only.
# NOTE(review): labels that occur only in the validation/test splits are unknown
# to the binarizer — presumably they are dropped (with a warning) when those
# splits are transformed; confirm this is acceptable for the reported metrics.
binarizer = MultiLabelBinarizer()
labels_train = [labels.split() for labels in df_train.area.tolist()]
binarizer.fit(labels_train)


In [None]:
batch_size = # ToDo

train_dataset = TextClassificationDataset(df_train, tokenizer, binarizer)
train_sampler = RandomSampler(train_dataset)
train_dataloader =  DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size,)

valid_dataset = TextClassificationDataset(df_valid, tokenizer, binarizer)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size)

test_dataset = TextClassificationDataset(df_test, tokenizer, binarizer)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


In [None]:
class BertForMultilabel(nn.Module):
    """Transformer encoder with a linear multi-label classification head.

    The encoder named by ``MODEL_NAME`` produces token embeddings; the
    embedding of the first token is passed through dropout and a linear layer
    that emits one raw logit per label (no sigmoid is applied here).

    Args:
        num_labels: Number of output labels (size of the logit vector).
    """

    def __init__(self, num_labels: int):
        super().__init__()

        # AutoModel instead of the BERT-only BertModel: the task asks for
        # several architectures (BERT, RoBERTa, ...), and the tokenizer is
        # already created with AutoTokenizer from the same MODEL_NAME.
        self.bert = transformers.AutoModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def train_bert(self, train_bert_flag=True):
        """Freeze (``False``) or unfreeze (``True``) all encoder parameters.

        Only ``self.bert`` is affected; the classification head always stays
        trainable.
        """
        for param in self.bert.parameters():
            param.requires_grad = train_bert_flag

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
        """Return logits of shape ``(batch_size, num_labels)``.

        Args:
            input_ids: Token ids produced by the tokenizer.
            attention_mask: Mask marking real tokens versus padding.
            token_type_ids: Optional segment ids. They are forwarded only when
                given, because some encoders do not accept this argument.
        """
        encoder_inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
        if token_type_ids is not None:
            encoder_inputs['token_type_ids'] = token_type_ids

        # Index 0 of the encoder output is the last hidden state,
        # for both tuple and ModelOutput return types.
        hidden_states = self.bert(**encoder_inputs)[0]
        first_token_embedding = hidden_states[:, 0]
        return self.classifier(self.dropout(first_token_embedding))

In [None]:
# One output logit per label known to the fitted binarizer.
num_labels = len(binarizer.classes_)
model = BertForMultilabel(num_labels)
# Module.to() returns the module itself; rebinding the name keeps the cell from
# echoing the model's repr. The previous stand-alone ';' line was a SyntaxError.
model = model.to(device)

# Train your classifier with frozen BERT and save the model with the lowest val loss during training (**2 points max**)

Print the train/val loss after each epoch.


In [None]:
# One training pass over `iterator` — not implemented yet.
# The task text asks for the train loss to be printed after each epoch, so this
# presumably should return the mean loss over the pass — confirm when implementing.
def train(model, iterator, optimizer, criterion):

     # ToDo write train loop

In [None]:
# Evaluation pass over `iterator` — not implemented yet.
# NOTE(review): the test cells assign the return value to `test_preds` and feed
# it to classification_report, i.e. they expect per-sample label predictions,
# while selecting the checkpoint "with the lowest val loss" needs a loss value.
# The return contract has to serve both uses — decide it explicitly.
def validate(model, iterator, criterion):

    # ToDo write val loop

In [None]:
def logits_to_labels(logits):
    """Convert raw model logits into boolean multi-label predictions.

    Args:
        logits: Tensor whose elements can be arranged as ``(-1, num_labels)``,
            where ``num_labels`` is the module-level label count. The tensor
            may live on any device and may still require grad.

    Returns:
        Nested list of ``bool`` with one row per sample and ``num_labels``
        entries per row. An entry is ``True`` when the sigmoid probability is
        strictly greater than 0.5.
    """
    # reshape (unlike view) also accepts non-contiguous tensors.
    probs = torch.sigmoid(logits.reshape(-1, num_labels))
    # detach() first: .numpy() raises on a tensor that still requires grad.
    return (probs.detach().cpu().numpy() > 0.5).tolist()

In [None]:
model.train_bert(False)

In [None]:
epochs = # ToDo
criterion = # ToDo what criterion do you need for multilabel classification?
optimizer  = # ToDo use adam optimizer
scheduler = # ToDo use StepLR scheduler

In [None]:
# ToDo Train your model

In [None]:
model.load_state_dict(torch.load(RESULT_MODEL_PATH, map_location=torch.device(device)))
test_preds = validate(model, test_dataloader, criterion)

In [None]:
print(classification_report(binarizer.transform(test_dataset.target), test_preds,
                            target_names=binarizer.classes_))

# Train your classifier with unfrozen BERT and save the model with the lowest val loss during training (**2 points max**)

Print the train/val loss after each epoch.

In [None]:
epochs = # ToDo
lr = # ToDo
WARMUP_PROPORTION = 0.1
warmup_steps = int(len(train_dataloader) * epochs * WARMUP_PROPORTION)

In [None]:
model.train_bert(True)

In [None]:
t_total = len(train_dataloader) * epochs
no_decay = # ToDo create a list of parameters to which weight_decay should not be applied, explain your choice in the results section
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]


criterion = # ToDo
lr = # ToDo
# torch.optim.AdamW replaces transformers.AdamW, which was deprecated and has
# been removed from recent transformers releases. The weight decay comes from
# the parameter groups, each of which sets 'weight_decay' explicitly.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)
# Linear warm-up for `warmup_steps`, then linear decay until `t_total`. Both
# counts are measured in batches, so call scheduler.step() once per batch.
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
)

In [None]:
model.load_state_dict(torch.load(RESULT_MODEL_PATH, map_location=torch.device(device)))
test_preds = validate(model, test_dataloader, criterion)

In [None]:
print(classification_report(binarizer.transform(test_dataset.target), test_preds,
                            target_names=binarizer.classes_))

In [None]:
# Results

# Results (3 points max)

Write your conclusion

What models and what training parameters did you use?

What was the reason for your choice?

What were the results?

What metrics do you consider the most important?