<a href="https://colab.research.google.com/github/mr1er0y/Topic-Modelling-Neuro/blob/main/model_SCIBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SCIBERT


Задача: по аннотации статьи определить темы, к которым она относится.

Эта задача относится к Multi-label classification.

Для задачи многоклассовой классификации с несколькими метками (multi-label classification) требуется модифицировать данные и метки, чтобы алгоритм мог предсказывать несколько меток для каждого объекта.



In [None]:
!pip install -U adapter-transformers  > /dev/null
!pip install datasets  > /dev/null

## Dataset Preprocessing

В первую очередь нужно скачать данные из Kaggle (у меня они хранятся на Google Drive) и произвести следующие преобразования:  
1. Токенизировать данные
2. Бинаризовать вектор тем с помощью MultiLabelBinarizer

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset CSV stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import ast

import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader


# --- Configuration ----------------------------------------------------------
path = "drive/MyDrive/Data_arXiv/filtered_arxiv_db.csv"  # CSV on the mounted Drive
MODEL_PATH = 'allenai/scibert_scivocab_uncased'           # Hugging Face checkpoint id
NUM_OBJ = 300000                                          # rows sampled for the experiment
SEED = 77                                                 # seed for sampling / splitting
LR = 3e-5                                                 # optimizer learning rate
# Sigmoid + binary cross-entropy per label: each label is scored independently.
CRITERION = nn.BCEWithLogitsLoss()

# --- Load and normalise the raw table ---------------------------------------
df = pd.read_csv(path)
df[['created_date', 'update_date']] = df[['created_date', 'update_date']].apply(pd.to_datetime)
df = df.drop(['versions', 'description', 'new_category', 'sub_category'], axis=1)
df.columns = ['id', 'title', 'authors', 'category', 'published_date', 'updated_date', 'abstract']
# The category column is stored as the text of a Python list. Parse it with
# ast.literal_eval, which only accepts literals, instead of eval(), which
# would execute any expression found in the file.
df["category"] = df["category"].apply(ast.literal_eval)
df.head()

  df = pd.read_csv(path)


Unnamed: 0,id,title,authors,category,published_date,updated_date,abstract
0,704.0033,Convergence of the discrete dipole approximati...,"Maxim A. Yurkin, Valeri P. Maltsev, Alfons G. ...","[physics.optics, physics.comp-ph]",2022-03-29,2022-03-31,We performed a rigorous theoretical converge...
1,704.0038,The discrete dipole approximation: an overview...,"Maxim A. Yurkin, Alfons G. Hoekstra","[physics.optics, physics.comp-ph]",2022-03-29,2022-03-30,We present a review of the discrete dipole a...
2,704.0275,Mapping radii of metric spaces,George M. Bergman (U.C.Berkeley),[math.MG],2008-03-28,2021-10-15,It is known that every closed curve of lengt...
3,704.0479,The affine part of the Picard scheme,T.Geisser,"[math.AG, math.KT]",2021-01-28,2021-01-29,We describe the maximal torus and maximal un...
4,704.0752,Actions for the Bosonic String with the Curved...,Davoud Kamani,[hep-th],2008-04-18,2020-08-21,At first we introduce an action for the stri...


In [None]:
def preprocessing(data):
    """Drop withdrawn papers and normalise the text of the ``abstract`` column.

    Steps, in order:
      1. remove rows whose abstract contains 'paper has been withdrawn'
         (rows with a missing abstract are removed as well);
      2. strip leading/trailing whitespace;
      3. replace newlines and tabs with a space and collapse whitespace runs;
      4. surround each of ``. , ! ? ( )`` with spaces.

    Args:
        data: DataFrame with a string column named ``abstract``.

    Returns:
        A new DataFrame with the filtered rows and cleaned ``abstract``;
        the input frame is left untouched.
    """
    # na=True makes missing abstracts count as "withdrawn", so they are dropped
    # explicitly rather than as a side effect of comparing NaN with False.
    withdrawn = data['abstract'].str.contains(
        'paper has been withdrawn', regex=False, na=True
    ).astype(bool)
    # .copy() so the assignments below write to an independent frame,
    # not to a slice of the caller's frame.
    data = data.loc[~withdrawn].copy()

    abstract = data['abstract'].str.strip()
    abstract = (
        abstract
        .str.replace('\n', ' ', regex=False)
        .str.replace('\t', ' ', regex=False)
        .str.replace(r'\s\s+', ' ', regex=True)
    )
    # regex=True must be explicit: the pattern uses a capture group, and newer
    # pandas versions treat the pattern as a literal string by default.
    data['abstract'] = abstract.str.replace('([.,!?()])', r' \1 ', regex=True)
    return data

# Clean the abstracts (and drop withdrawn papers), then preview the first rows.
df = preprocessing(df)
df.head()

  data['abstract'] = data['abstract'].str.replace('([.,!?()])', r' \1 ')


Unnamed: 0,id,title,authors,category,published_date,updated_date,abstract
0,704.0033,Convergence of the discrete dipole approximati...,"Maxim A. Yurkin, Valeri P. Maltsev, Alfons G. ...","[physics.optics, physics.comp-ph]",2022-03-29,2022-03-31,We performed a rigorous theoretical convergenc...
1,704.0038,The discrete dipole approximation: an overview...,"Maxim A. Yurkin, Alfons G. Hoekstra","[physics.optics, physics.comp-ph]",2022-03-29,2022-03-30,We present a review of the discrete dipole app...
2,704.0275,Mapping radii of metric spaces,George M. Bergman (U.C.Berkeley),[math.MG],2008-03-28,2021-10-15,It is known that every closed curve of length ...
3,704.0479,The affine part of the Picard scheme,T.Geisser,"[math.AG, math.KT]",2021-01-28,2021-01-29,We describe the maximal torus and maximal unip...
4,704.0752,Actions for the Bosonic String with the Curved...,Davoud Kamani,[hep-th],2008-04-18,2020-08-21,At first we introduce an action for the string...


In [None]:
# Model input text: title and abstract, each followed by the literal "[SEP]" marker.
# NOTE(review): the tokenizer presumably appends its own [SEP] at the end as
# well, so the trailing marker may be redundant - confirm before changing.
df['unifed_text'] = df['title'] + '[SEP]' + df['abstract'] + '[SEP]'

In [None]:
# Flatten the per-paper category lists and keep the sorted set of distinct names.
categories = np.unique([name for names in df["category"] for name in names])
NUM_LABELS = len(categories)

# Encode each paper's category list as a multi-hot 0/1 vector and store it
# as a plain Python list in the `labels` column.
mlb = MultiLabelBinarizer()
df['labels'] = mlb.fit_transform(df["category"].values).tolist()


In [None]:
NUM_LABELS

170

In [None]:
categories

array(['adap-org', 'alg-geom', 'astro-ph', 'astro-ph.CO', 'astro-ph.EP',
       'astro-ph.GA', 'astro-ph.HE', 'astro-ph.IM', 'astro-ph.SR',
       'chao-dyn', 'cmp-lg', 'comp-gas', 'cond-mat', 'cond-mat.dis-nn',
       'cond-mat.mes-hall', 'cond-mat.mtrl-sci', 'cond-mat.other',
       'cond-mat.quant-gas', 'cond-mat.soft', 'cond-mat.stat-mech',
       'cond-mat.str-el', 'cond-mat.supr-con', 'cs.AI', 'cs.AR', 'cs.CC',
       'cs.CE', 'cs.CG', 'cs.CL', 'cs.CR', 'cs.CV', 'cs.CY', 'cs.DB',
       'cs.DC', 'cs.DL', 'cs.DM', 'cs.DS', 'cs.ET', 'cs.FL', 'cs.GL',
       'cs.GR', 'cs.GT', 'cs.HC', 'cs.IR', 'cs.IT', 'cs.LG', 'cs.LO',
       'cs.MA', 'cs.MM', 'cs.MS', 'cs.NA', 'cs.NE', 'cs.NI', 'cs.OH',
       'cs.OS', 'cs.PF', 'cs.PL', 'cs.RO', 'cs.SC', 'cs.SD', 'cs.SE',
       'cs.SI', 'cs.SY', 'dg-ga', 'econ.EM', 'econ.GN', 'econ.TH',
       'eess.AS', 'eess.IV', 'eess.SP', 'eess.SY', 'funct-an', 'gr-qc',
       'hep-ex', 'hep-lat', 'hep-ph', 'hep-th', 'math-ph', 'math.AC',
       'math.AG', 'm

In [None]:
# Character-length statistics of the cleaned abstracts.
t = df["abstract"].apply(len)
for stat_name, stat_value in (("Min", min(t)), ("Max", max(t)), ("Mean", np.mean(t))):
    print(f"{stat_name}:{stat_value}")

Min:20
Max:5216
Mean:1098.1211080387932


In [None]:
from transformers import BertTokenizer, DistilBertTokenizerFast
from datasets import Dataset
import torch

# Draw a reproducible random subset of the corpus. The sample size is capped
# at the number of available rows, because DataFrame.sample raises when asked
# for more rows than exist (without replacement).
dataset = Dataset.from_pandas(
    df[["unifed_text", "labels"]].sample(min(NUM_OBJ, len(df)), random_state=SEED)
)
dataset.set_format(type="torch", columns=["unifed_text"])

# Tokenizer matching the checkpoint in MODEL_PATH.
# Background on tokenizer speed:
# https://stackoverflow.com/questions/66096703/running-huggingface-bert-tokenizer-on-gpu
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)



def encode_batch(batch):
    """Tokenize the ``unifed_text`` entries of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 350 tokens.
    Returns whatever the tokenizer call returns for the batch.
    """
    texts = batch["unifed_text"]
    # Fixed-length output: truncate longer inputs, pad shorter ones.
    return tokenizer(texts, max_length=350, truncation=True, padding="max_length")

# Tokenize the whole dataset; batched=True hands encode_batch a batch of rows
# per call rather than a single example.
dataset = dataset.map(encode_batch, batched=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

In [None]:
# The raw text is no longer needed once the tokenized columns exist;
# drop it and display the dataset summary.
dataset = dataset.remove_columns(['unifed_text'])
dataset

Dataset({
    features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 300000
})

In [None]:
# Expose only the columns the training loop reads, as torch tensors.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 80 % train / 20 % hold-out, shuffled with a fixed seed.
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=SEED)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# Split the hold-out in half: first half validation, second half test.
dataset = test_dataset.train_test_split(test_size=0.5, shuffle=False)
val_dataset = dataset["train"]
test_dataset = dataset["test"]


N_GPU = torch.cuda.device_count()
# device_count() is 0 on a CPU-only machine and DataLoader rejects
# batch_size=0, so fall back to a single-device batch of 8 in that case.
BATCH_SIZE = max(N_GPU, 1) * 8

# Reshuffle the training data every epoch, seeded for reproducibility;
# evaluation loaders keep a fixed order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

`labels` - это закодированные в бинарном виде темы статей. Вектор состоит из 0 и 1, где 1 на i-й позиции обозначает, что тема i соответствует тексту, а 0 - что не соответствует.

`input_ids` - это последовательность чисел, которые представляют отдельные токены в тексте, каждый токен преобразуется в уникальное число, которое соответствует его позиции в словаре модели BERT. Это основной входной тензор, который передается в модель.

`token_type_ids` - это последовательность чисел, которая указывает, к какому из двух предложений относится каждый токен в input_ids (0 для первого предложения и 1 для второго). Это необходимо во многих задачах, например в задачах сравнения двух предложений, где модель должна понимать, какой токен относится к какому предложению.

`attention_mask` - это последовательность чисел, которая указывает, какие токены модель должна игнорировать при обработке текста. Она состоит из 0 и 1, где 1 обозначает, что токен должен быть учтён моделью, а 0 - что он игнорируется.

Теперь мы готовы настроить Adapter Fusion...

## Fusion Training
Мы используем предварительно обученную лучшую модель из `Hugging Face` и создаем экземпляр нашей модели с помощью `BertModelWithHeads`.

Теперь у нас всё готово для настройки Adapter Fusion. Сначала мы загружаем из Hub три адаптера, предварительно обученных на различных задачах: `MultiNLI`, `QQP` и `QNLI`. Поскольку нам не нужны их головы предсказаний (prediction heads), мы передаём `with_head=False` методу загрузки. Далее мы добавляем новый слой fusion, который объединяет все только что загруженные адаптеры. Наконец, поверх мы добавляем новую классификационную голову для нашей целевой задачи.

## BertForSequenceClassification обучение 

In [None]:
from transformers import BertForSequenceClassification, AdamW
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from tqdm import tqdm



def _evaluate(model, dataloader, device, desc):
    """Run one gradient-free pass over ``dataloader`` and collect its outputs.

    Returns:
        (mean_loss, truth, preds, probs) where ``truth`` holds the label rows,
        ``preds`` the sigmoid outputs rounded to 0/1 and ``probs`` the raw
        sigmoid outputs, each as a list with one array per example.
    """
    model.eval()
    total_loss = 0.0
    truth, preds, probs = [], [], []

    # no_grad: evaluation never calls backward(), so building the autograd
    # graph would only cost memory and time.
    with torch.no_grad():
        for data in tqdm(dataloader, desc=desc):
            input_ids_ = data["input_ids"].to(device)
            masks_ = data["attention_mask"].to(device)
            labels_ = data["labels"].to(device)

            logits = model(input_ids=input_ids_, attention_mask=masks_)[0]
            loss = CRITERION(logits, labels_.float())
            if N_GPU > 1:
                loss = loss.mean()
            total_loss += loss.item()

            # CRITERION works on logits, so apply the sigmoid here to obtain
            # per-label probabilities; rounding gives the 0/1 decisions.
            batch_probs = torch.sigmoid(logits).cpu().numpy()
            truth.extend(labels_.cpu().numpy())
            preds.extend(np.round(batch_probs))
            probs.extend(batch_probs)

    return total_loss / len(dataloader), truth, preds, probs


def train_model(train_dataloader, val_dataloader, test_dataloader, EPOCHS):
    """Fine-tune BertForSequenceClassification for multi-label prediction.

    Loads the checkpoint named by MODEL_PATH with NUM_LABELS outputs, trains it
    with CRITERION for ``EPOCHS`` epochs and, after every epoch, reports loss,
    subset accuracy and micro-F1 on the validation and test loaders plus
    micro ROC-AUC on the test loader.

    Args:
        train_dataloader: batches with "input_ids", "attention_mask", "labels".
        val_dataloader: same batch layout, used for per-epoch validation.
        test_dataloader: same batch layout, evaluated every epoch for monitoring.
        EPOCHS: number of passes over ``train_dataloader``.

    Returns:
        (model, history): the trained model (wrapped in nn.DataParallel when
        N_GPU > 1) and a dict of per-epoch metric lists with keys
        train_loss, valid_loss, valid_f1, valid_accuracy,
        test_loss, test_f1, test_accuracy, test_auc.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # initialize model
    model = BertForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=NUM_LABELS)
    model = model.to(device)

    # Weight decay is disabled for biases and LayerNorm parameters.
    # The per-group key must be 'weight_decay': any other key is ignored by the
    # optimizer and the group silently falls back to the optimizer default.
    # 'gamma'/'beta' are kept for checkpoints using the old TF-style names;
    # 'LayerNorm.weight' covers the current parameter naming.
    no_decay = ('bias', 'gamma', 'beta', 'LayerNorm.weight')
    named_params = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in named_params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    # correct_bias=False mirrors the optimizer of the original TensorFlow BERT.
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=LR,
                      weight_decay=0.01,
                      correct_bias=False)

    if N_GPU > 1:
        model = nn.DataParallel(model)

    history = {
        'train_loss': [],
        'valid_loss': [],
        'valid_f1': [],
        'valid_accuracy': [],
        'test_loss': [],
        'test_f1': [],
        'test_accuracy': [],
        'test_auc': [],
    }

    for epoch in range(1, EPOCHS + 1):
        print('\tEPOCH:', epoch)

        ######### TRAINING #############
        model.train()
        train_loss = 0.0

        for data in tqdm(train_dataloader, desc="Training Progress"):
            input_ids_ = data["input_ids"].to(device)
            masks_ = data["attention_mask"].to(device)
            labels_ = data["labels"].to(device)

            optimizer.zero_grad()

            logits = model(input_ids=input_ids_, attention_mask=masks_)[0]
            loss = CRITERION(logits, labels_.float())
            if N_GPU > 1:
                loss = loss.mean()

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        epoch_t_loss = train_loss/len(train_dataloader)
        print(f"\t\tTrain loss: {epoch_t_loss}")

        ###### VALIDATION ########
        epoch_v_loss, valid_truth, valid_preds, _ = _evaluate(
            model, val_dataloader, device, "VALIDATION Progress")
        print(f"\t\tValid loss: {epoch_v_loss}")

        epoch_v_accuracy_score = accuracy_score(valid_truth, valid_preds)
        print('\t\tVal Accuracy:', epoch_v_accuracy_score)

        epoch_v_micro_f1_score = f1_score(valid_truth, valid_preds, average='micro')
        print('\t\tVal Micro F1 score:', epoch_v_micro_f1_score)

        ###### TEST ########
        epoch_tst_loss, test_truth, test_preds, test_probs = _evaluate(
            model, test_dataloader, device, "Test Progress")
        print(f"\t\tTest loss: {epoch_tst_loss}")

        epoch_tst_accuracy_score = accuracy_score(test_truth, test_preds)
        print('\t\tTest Accuracy:', epoch_tst_accuracy_score)

        epoch_tst_micro_f1_score = f1_score(test_truth, test_preds, average='micro')
        print('\t\tTest Micro F1 score:', epoch_tst_micro_f1_score)

        # ROC-AUC ranks examples by score, so it needs the probabilities;
        # feeding it the rounded 0/1 predictions collapses the curve to a
        # single operating point.
        epoch_tst_micro_roc_auc = roc_auc_score(test_truth, test_probs, average='micro')
        print('\t\tTest Micro roc_auc:', epoch_tst_micro_roc_auc)

        # record this epoch's metrics
        history['train_loss'].append(epoch_t_loss)
        history['valid_loss'].append(epoch_v_loss)
        history['valid_f1'].append(epoch_v_micro_f1_score)
        history['valid_accuracy'].append(epoch_v_accuracy_score)
        history['test_loss'].append(epoch_tst_loss)
        history['test_f1'].append(epoch_tst_micro_f1_score)
        history['test_accuracy'].append(epoch_tst_accuracy_score)
        history['test_auc'].append(epoch_tst_micro_roc_auc)

    return model, history
    


In [None]:
# Run the fine-tuning for 5 epochs. The call's result is bound to a name so it
# stays available to later cells instead of being discarded or echoed as output.
training_result = train_model(train_dataloader, val_dataloader, test_dataloader, 5)

Downloading pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

	EPOCH: 1


Training Progress: 100%|██████████| 30000/30000 [1:24:04<00:00,  5.95it/s]


		Train loss: 0.024176837874762713


VALIDATION Progress: 100%|██████████| 3750/3750 [03:25<00:00, 18.21it/s]


		Valid loss: 0.019030871451149383
		Val Accuracy: 0.3678
		Val Micro F1 score: 0.651462669486763


Test Progress: 100%|██████████| 3750/3750 [03:25<00:00, 18.24it/s]


		Test loss: 0.019104677985484403
		Test Accuracy: 0.3668666666666667
		Test Micro F1 score: 0.6499457423066685
		Test Micro roc_auc: 0.7820668278137948
	EPOCH: 2


Training Progress: 100%|██████████| 30000/30000 [1:24:02<00:00,  5.95it/s]


		Train loss: 0.01746335856957982


VALIDATION Progress: 100%|██████████| 3750/3750 [03:25<00:00, 18.22it/s]


		Valid loss: 0.018495173978743455
		Val Accuracy: 0.37726666666666664
		Val Micro F1 score: 0.666020581476671


Test Progress: 100%|██████████| 3750/3750 [03:26<00:00, 18.18it/s]


		Test loss: 0.01862243567965925
		Test Accuracy: 0.37716666666666665
		Test Micro F1 score: 0.6644757276689545
		Test Micro roc_auc: 0.7946134501249938
	EPOCH: 3


Training Progress:   2%|▏         | 510/30000 [01:25<1:22:21,  5.97it/s]