In [1]:
from tqdm.auto import tqdm
from grammar_ru.corpus import CorpusReader, CorpusBuilder
from diplom.utils.corpus_utils import CorpusFramework
from pathlib import Path
import warnings
warnings.simplefilter('ignore')
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from diplom.bert_training.mydatasets import MyDataset
import logging
import pytorch_lightning as pl
from diplom.bert_training.metrics import *
logging.basicConfig(level=logging.ERROR)

In [2]:
# Seed the Python, NumPy and PyTorch random generators through Lightning so that
# repeated runs of the notebook start from the same random state.
pl.seed_everything(42)

Seed set to 42


42

In [3]:
# Handle for the first CUDA device; used further down to place the class-weight tensor.
device = torch.device('cuda:0')
# Corpus-reader based loading, kept for reference only (the data is read from a CSV instead).
# path_corpus = Path(f"../data/corpora/diplom.wow.zip")
# corpus = CorpusReader(path_corpus)
# corpus_framework = CorpusFramework(corpus)
# authors = corpus.get_toc().author.unique()
# Display the properties of GPU 0 as a quick check of the runtime.
torch.cuda.get_device_properties(0)

_CudaDeviceProperties(name='AMD Radeon RX 6600 XT', major=10, minor=3, gcnArchName='gfx1030', total_memory=8176MB, multi_processor_count=16)

In [4]:
# Load the pre-filtered corpus; the first CSV column holds the row index.
text_corpus = pd.read_csv('../filtered_updated_text_corpus.csv', index_col=0)

# Normalise the verb once, up front, so that filtering, the label vocabulary and
# the label ids all see exactly the same strings. Stripping only after calling
# unique() could yield duplicate labels and non-contiguous class ids.
actions = text_corpus['action'].str.strip()
assert actions.notna().all(), "missing values in the 'action' column"

# Rows whose verb is 'said' are excluded from the classification task.
keep = actions != 'said'
actions = actions.loc[keep]
# Explicit copy: a column is added below, which must not write into a slice of
# the frame that was read from disk.
text_corpus = text_corpus.loc[keep].copy()

# Label vocabulary in order of first appearance, plus both lookup directions.
labels = actions.unique().tolist()
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

text_corpus['labels'] = actions.map(label2id)
text_corpus = text_corpus.drop(['action'], axis=1).rename({'speech': 'text'}, axis=1)
# Every label comes from the filtered column, so the vocabulary size equals the
# number of distinct class ids present in the data.
NUM_LABELS = len(labels)

labels


['muttered',
 'sounded',
 'called',
 'exclaimed',
 'murmured',
 'grumbled',
 'shouted',
 'growled',
 'wailed',
 'laughed',
 'say',
 'intoned',
 'spoke',
 'snorted',
 'sighed',
 'mumbled',
 'whined',
 'squeaked',
 'roared',
 'retorted',
 'barked',
 'moaned',
 'sound',
 'cried',
 'screamed',
 'hissed',
 'rumbled',
 'spoken',
 'rasped',
 'spat',
 'gasped',
 'poked',
 'bellowed',
 'grunted',
 'scoffed',
 'croaked',
 'whimpered',
 'grown',
 'sniffed',
 'snarled',
 'claimed',
 'sobbed',
 'squeezed',
 'groaned',
 'shivered',
 'crooned',
 'cackled',
 'hummed',
 'yelped',
 'sang',
 'grew',
 'voiced',
 'cooed',
 'wept',
 'grasped',
 'talked',
 'yawned',
 'shrieked',
 'squealed',
 'howled',
 'purred',
 'giggled',
 'yelled',
 'screeched',
 'rattled',
 'chanted',
 'grow',
 'scolded',
 'claim']

In [5]:
from functools import partial
import evaluate

def count_top_k(pred, labels, k=5):
    """Return the fraction of rows whose true label is among the k highest scores.

    Args:
        pred: array-like of shape (n_samples, n_classes) with one score per class.
        labels: array-like holding one integer class id per sample. Any shape
            with n_samples elements is accepted, e.g. (n,) or (n, 1).
        k: number of top-scoring classes that count as a hit.

    Returns:
        Top-k accuracy as a NumPy float between 0 and 1.
    """
    scores = np.asarray(pred)
    # Flatten the targets: with a trailing singleton axis the comparison below
    # would broadcast every row's candidates against every label and inflate
    # the result.
    targets = np.asarray(labels).reshape(-1)

    # Sort ascending, reverse to descending, keep the k best class ids per row.
    top_k_indices = scores.argsort(axis=1)[:, ::-1][:, :k]
    matches = np.any(top_k_indices == targets[:, None], axis=1)

    return np.sum(matches) / len(targets)

# Accuracy metric from the `evaluate` package, loaded once and reused by eval_acc.
metric = evaluate.load("accuracy")


def eval_acc(logits, labels):
    """Compute plain (top-1) accuracy from per-class scores.

    The predicted class of each row is the argmax over the last axis of
    `logits`; the result is the 'accuracy' entry returned by the loaded metric.
    """
    predicted_ids = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted_ids, references=labels)['accuracy']

# Fixed-k variants of count_top_k, one per cutoff reported during validation.
top_2, top_3, top_5, top_10 = (
    partial(count_top_k, k=cutoff) for cutoff in (2, 3, 5, 10)
)

In [6]:
from transformers import AutoTokenizer

# Training hyper-parameters used throughout the notebook.
# MAX_LEN is handed to MyDataset / MyDataModule; presumably the maximum token
# length per sample (TODO confirm against mydatasets).
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4 # a batch size of 4 works fine
VALID_BATCH_SIZE = 2
# Upper bound on the number of epochs given to the Trainer.
EPOCHS = 10
LEARNING_RATE = 1e-05# previously tried: 5e-04
# Tokenizer belonging to the distilbert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

from diplom.bert_training.configs import *
from transformers import get_linear_schedule_with_warmup
from torchmetrics import Accuracy
from sklearn.utils.class_weight import compute_class_weight

# Per-class loss weights: scikit-learn's 'balanced' mode weights each class
# inversely to its frequency, so rare verbs count more in the loss.
y = text_corpus['labels'].values
class_weights= torch.from_numpy(compute_class_weight('balanced',classes=np.unique(y),y=y)).float().to(device)


# NOTE(review): the config objects below are project types from
# diplom.bert_training.configs. In this notebook they are only consumed by the
# commented-out BertHudLit model; the SentenceClassifier that is actually trained
# does not read them, so the class-weighted criterion is not applied to that run.
model_config = MyBertConfig(label2id=label2id,id2label=id2label,model_name="distilbert-base-uncased")

# Weighted cross-entropy plus head settings (presumably hidden size and dropout of
# the classification head — verify against MyTrainConfig).
train_config = MyTrainConfig(NUM_LABELS=NUM_LABELS,criterion=CrossEntropyLoss(weight=class_weights),head_hidd_dim=768,out_head_dropout=0.1)

# Adam with a step-wise learning-rate decay: multiply the rate by 0.85 every 5
# scheduler steps.
optim_config = MyOptimizerConfig(
    optimizer=torch.optim.Adam,
    optimizer_params={'lr':LEARNING_RATE},
    scheduler=torch.optim.lr_scheduler.StepLR,
    scheduler_params={'step_size':5,'gamma':0.85})
# Alternative scheduler settings tried earlier (no scheduler / linear warm-up):
#scheduler=None,scheduler_params=None)
#scheduler=get_linear_schedule_with_warmup,
#scheduler_params={'num_warmup_steps':20,'num_training_steps':50})

# Metric callables paired position-by-position with the names they are reported under.
metrics_config = MyMetricsConfig(metrics=[eval_acc,top_2,top_3,top_5,top_10],names=['Acc == in_top_1',"in_top_2","in_top_3","in_top_5","in_top_10"])

In [7]:
# Random 80/20 split; random_state pins which rows land in the training part.
train_size = 0.8
train_dataset = text_corpus.sample(frac=train_size, random_state=200)
# Everything that was not sampled becomes the held-out set; both frames get a
# fresh 0..n-1 index afterwards.
test_dataset = text_corpus.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# Report the shapes of the full frame and of both parts.
print("\n".join([
    "FULL Dataset: {}".format(text_corpus.shape),
    "TRAIN Dataset: {}".format(train_dataset.shape),
    "TEST Dataset: {}".format(test_dataset.shape),
]))

# Wrap both frames in the project's dataset class (train first, then test).
training_set, testing_set = (
    MyDataset(frame, tokenizer, MAX_LEN) for frame in (train_dataset, test_dataset)
)

FULL Dataset: (10510, 2)
TRAIN Dataset: (8408, 2)
TEST Dataset: (2102, 2)


In [8]:
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, TQDMProgressBar
from pytorch_lightning.loggers import TensorBoardLogger

# Keep only the single best checkpoint, judged by the lowest logged "val_loss".
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)
# TensorBoard logger rooted at "lightning_logs" with the run name "distill_bert".
logger = TensorBoardLogger("lightning_logs", name="distill_bert")
# Early stopping on val_loss is currently disabled:
#early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

In [9]:
# Lightning trainer: runs on GPU 0 for at most EPOCHS epochs, with TensorBoard
# logging, best-checkpoint saving and a tqdm progress bar.
# Alternative progress bar with a lower refresh frequency:
#TQDMProgressBar(refresh_rate=100)
trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback,TQDMProgressBar()],#TQDMProgressBar()
    max_epochs=EPOCHS,
    devices=[0],
    # Uncomment to silence the progress bar entirely:
    #enable_progress_bar=False
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
# from diplom.bert_training.lighning_BERT import BertHudLit
# 
# model = BertHudLit(model_params=model_config,train_params=train_config,optim_params=optim_config,metrics_params=metrics_config)

In [11]:
# model.bert

In [12]:
from diplom.bert_training.mydatasets  import MyDataModule

# Build the Lightning data module from the two split frames. The batch size comes
# from the TRAIN_BATCH_SIZE constant of the configuration cell instead of a
# repeated literal, so the two cannot drift apart.
data_module = MyDataModule(train_dataset, test_dataset, tokenizer, TRAIN_BATCH_SIZE, MAX_LEN)

In [13]:
# Let PyTorch use reduced internal precision for float32 matrix multiplications,
# trading a little numerical accuracy for speed.
torch.set_float32_matmul_precision('medium')

In [14]:
import transformers

class SentenceClassifier(pl.LightningModule):
    """DistilBERT sequence classifier fine-tuned with PyTorch Lightning.

    Wraps ``DistilBertForSequenceClassification`` and uses the loss the model
    computes itself when labels are passed. Batches are dictionaries with the
    keys "ids" (token ids), "mask" (attention mask) and "targets" (class ids).
    """

    def __init__(self, learning_rate=5e-5):
        """Load the pretrained backbone and remember the optimizer step size.

        Args:
            learning_rate: learning rate handed to AdamW in configure_optimizers.
        """
        super().__init__()

        # Pretrained distilbert-base-uncased with a classification head sized by
        # the notebook-level NUM_LABELS constant.
        self.model = transformers.DistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased",
            num_labels=NUM_LABELS,
            output_attentions=False,  # do not return attention weights
            output_hidden_states=False,  # do not return all hidden states
        )
        self.learning_rate = learning_rate

    def _run_model(self, batch):
        """Run one forward pass and return (model outputs, labels) for the batch."""
        labels = batch["targets"]
        outputs = self.model(batch["ids"], attention_mask=batch["mask"], labels=labels)
        return outputs, labels

    def training_step(self, batch, batch_no):
        """Compute and log loss and accuracy for one training batch.

        Lightning calls this for every batch of the training loader and
        back-propagates the returned loss.
        """
        outputs, labels = self._run_model(batch)
        preds = torch.argmax(outputs["logits"], dim=1)
        # Tensor-level reduction; Python's builtin sum() would iterate over the
        # tensor element by element.
        accuracy = (preds.flatten() == labels.flatten()).float().mean()
        self.log("train_loss", outputs["loss"], on_step=True, on_epoch=True)
        self.log("train_acc", accuracy, on_step=True, on_epoch=True)
        return outputs["loss"]

    def validation_step(self, batch, batch_no):
        """Log loss, accuracy and top-k hit rates for one validation batch.

        Values are logged per epoch only; Lightning aggregates them over batches.
        """
        outputs, labels = self._run_model(batch)
        # Move to host memory once and reuse the arrays for every metric.
        logits_np = outputs["logits"].detach().cpu().numpy()
        labels_np = labels.detach().cpu().numpy()

        log_dict = {"val_loss": outputs["loss"]}
        log_dict['Acc == in_top_1'] = eval_acc(logits_np, labels_np)
        for k, top_i in zip([2, 3, 5, 10], [top_2, top_3, top_5, top_10]):
            log_dict[f"in_top_{k}"] = top_i(logits_np, labels_np)
        self.log_dict(log_dict, prog_bar=True, on_step=False, on_epoch=True)

    def configure_optimizers(self):
        """Return the optimizer Lightning uses for training.

        Uses torch.optim.AdamW: the AdamW class shipped with transformers is
        deprecated in favour of the PyTorch implementation and is absent from
        recent releases. weight_decay=0.0 keeps the default of the deprecated
        class (PyTorch's own default is 0.01).
        """
        return torch.optim.AdamW(
            self.model.parameters(),
            lr=self.learning_rate,
            eps=1e-8,
            weight_decay=0.0,
        )

In [15]:
# Instantiate the classifier with an explicit learning rate instead of the class default of 5e-5.
model = SentenceClassifier(learning_rate=1.75e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Run training and validation; the checkpoint callback saves the model with the lowest val_loss.
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                                | Params
--------------------------------------------------------------
0 | model | DistilBertForSequenceClassification | 67.0 M
--------------------------------------------------------------
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
268.026   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 2102: 'val_loss' reached 2.93366 (best 2.93366), saving model to '/home/mixailkys/DataSpell_Projects/grammar_ru/diplom/bert_training/checkpoints/best-checkpoint-v12.ckpt' as top 1


In [17]:
# Show the metrics logged by the training and validation steps, as held by the trainer.
trainer.callback_metrics

{'train_loss': tensor(3.7290, device='cuda:0'),
 'train_loss_step': tensor(3.7290, device='cuda:0'),
 'train_acc': tensor(0., device='cuda:0'),
 'train_acc_step': tensor(0., device='cuda:0'),
 'val_loss': tensor(2.9337, device='cuda:0'),
 'Acc == in_top_1': tensor(0.2479, device='cuda:0'),
 'in_top_2': tensor(0.3949, device='cuda:0'),
 'in_top_3': tensor(0.4853, device='cuda:0'),
 'in_top_5': tensor(0.5942, device='cuda:0'),
 'in_top_10': tensor(0.7303, device='cuda:0'),
 'train_loss_epoch': tensor(3.1124, device='cuda:0'),
 'train_acc_epoch': tensor(0.2217, device='cuda:0')}