In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from grammar_ru.corpus import CorpusReader, CorpusBuilder
from diplom.utils.corpus_utils import CorpusFramework
from diplom.utils.dialog_markuper import DialogMarkupFeaturizer
#from diplom.utils.speech_action_maker import SpeechActionFeaturizer
import matplotlib.pyplot as plt
from torch.nn.functional import cosine_similarity
from collections import defaultdict
import torch
# reported speech.

In [2]:
# Device used for tensors created in this notebook (first CUDA GPU).
device = torch.device('cuda:0')

# Open the corpus archive and wrap the reader in the project helper.
path_corpus = Path("../data/corpora/diplom.wow.zip")
corpus = CorpusReader(path_corpus)
corpus_framework = CorpusFramework(corpus)

# Unique values of the `author` field in the corpus table of contents.
toc = corpus.get_toc()
authors = toc.author.unique()

# Displayed as the cell output: properties of GPU 0.
torch.cuda.get_device_properties(0)

In [3]:
text_corpus = pd.read_csv('../text_corpus.csv')

# Strip the action names once, up front, so that the row filter, the label
# list and the id mapping all work on the same strings. Stripping only after
# `unique()` could leave duplicate names in `labels` (e.g. "asked" and
# "asked "), giving `id2label` more ids than there are distinct classes.
text_corpus = text_corpus.assign(action=text_corpus['action'].str.strip())

# Drop the rows whose action is the neutral "said". `.copy()` makes the
# result an independent frame so the column assignment below does not hit
# pandas' SettingWithCopy machinery.
text_corpus = text_corpus.loc[text_corpus['action'] != 'said'].copy()

# Class names in order of first appearance, and the two lookup tables the
# model config expects.
labels = text_corpus['action'].unique().tolist()
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in id2label.items()}

# Integer class id per row; `astype` fails loudly if any action was unmapped.
text_corpus['labels'] = text_corpus['action'].map(label2id).astype('int64')
text_corpus = (
    text_corpus
    .drop(['sample_id', 'action'], axis=1)
    .rename({'speech': 'text'}, axis=1)
)

# One output unit per class name; by construction equals labels.nunique().
NUM_LABELS = len(labels)

labels


In [4]:
from transformers import AutoTokenizer
# Tokenizer loaded from the 'distilbert-base-cased' checkpoint name.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')

In [5]:
import datasets

dataset = datasets.Dataset.from_pandas(text_corpus)


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Sequences are truncated but deliberately NOT padded here: the Trainer in
    this notebook is given a ``DataCollatorWithPadding``, which pads each
    batch to its own longest sequence. Padding every sample to the model
    maximum at this stage would make that collator a no-op and inflate every
    batch to full length.

    Args:
        examples: Batch passed by ``Dataset.map(..., batched=True)``; its
            ``"text"`` entry is handed to the tokenizer.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 90/10 split into (train + validation) and test, then 90/10 again into
# train and validation. The same seed is used for both splits.
dict_set = tokenized_datasets.train_test_split(test_size=0.1, train_size=0.9, seed=42)
test_dataset = dict_set['test']
train_val_dict = dict_set['train'].train_test_split(test_size=0.1, train_size=0.9, seed=42)
train_dataset, val_dataset = train_val_dict['train'], train_val_dict['test']

In [6]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model built from the 'distilbert-base-cased'
# checkpoint, with one output per label and both dropout rates set to 0.4.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
    dropout=0.4,
    attention_dropout=0.4,
)

In [1]:
import evaluate

def count_top_k(res, k = 5):
    """Return the share of samples whose true label is among the top-k scores.

    Args:
        res: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (2-D array of per-class scores, one row per
            sample).
        k: How many of the highest-scoring classes count as a hit.

    Returns:
        Number of hits divided by the number of labels.
    """
    truth = res.label_ids
    scores = res.predictions
    # Ascending argsort, flipped so the best class comes first, cut to k columns.
    ranked = np.flip(scores.argsort(axis=1), axis=1)
    best_k = ranked[:, :k]
    # A row is a hit when any of its k best classes equals the true label.
    hit = (best_k == np.expand_dims(truth, axis=1)).any(axis=1)
    return hit.sum() / len(truth)

# "accuracy" metric object from the `evaluate` library, loaded once and
# reused by eval_acc.
metric = evaluate.load("accuracy")


def eval_acc(eval_pred):
    """Compute accuracy of the arg-max predictions.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class ids
        against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

def compute_metrics(pred):
    """Collect top-1 accuracy and top-2/3/5/10 hit rates in one dict.

    NOTE(review): a later cell defines another ``count_top_k`` that returns a
    dict instead of a number; if that definition is the active one, the
    ``in_top_*`` values here become nested dicts. Confirm which one is meant.

    Args:
        pred: Prediction object accepted by both ``eval_acc`` and
            ``count_top_k``.

    Returns:
        Dict keyed by metric name.
    """
    report = {'Acc == in_top_1': eval_acc(pred)['accuracy']}
    for key, k in (("in_top_2", 2), ("in_top_3", 3), ("in_top_5", 5), ("in_top_10", 10)):
        report[key] = count_top_k(pred, k=k)
    return report


KeyboardInterrupt: 

In [8]:
from transformers import DataCollatorWithPadding

# Collator built around the notebook's tokenizer; handed to the Trainer below
# as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
from torch.nn import CrossEntropyLoss
from transformers import TrainingArguments, Trainer
from sklearn.utils.class_weight import compute_class_weight

# Integer class id of every row in the prepared corpus.
y = text_corpus['labels'].values

# sklearn's 'balanced' class weights, converted to a float32 tensor on the
# training device.
balanced_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = torch.from_numpy(balanced_weights).float().to(device)

class MyTrainer(Trainer):
    """Trainer whose default loss is a class-weighted cross-entropy.

    Args:
        class_weights: Tensor passed as ``weight`` to
            ``torch.nn.CrossEntropyLoss``, or ``None`` for an unweighted loss.
        *args: Forwarded unchanged to ``Trainer.__init__``.
        **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Kept on the instance so compute_loss can build the criterion.
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the training loss for one batch.

        When a label smoother is configured and the batch has labels, the
        smoother is used (class weights are not applied on that path).
        Otherwise the loss is cross-entropy over ``outputs['logits']`` weighted
        by ``self.class_weights``.

        Args:
            model: Model called as ``model(**inputs)``.
            inputs: Batch dict; must contain ``"labels"``.
            return_outputs: If true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted only so that Trainer versions which
                pass this keyword can call the override; it is not used.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs``
            is true.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        # Save past state if the arguments ask for it.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            # Replace the model's own loss with a class-weighted one.
            logits = outputs['logits']
            weights = self.class_weights
            if weights is not None:
                # The criterion needs its weights on the same device as the logits.
                weights = weights.to(logits.device)
            criterion = torch.nn.CrossEntropyLoss(weight=weights)
            loss = criterion(logits, inputs['labels'])

        return (loss, outputs) if return_outputs else loss

In [10]:
def count_top_k(res, k = 5):
    """Top-k hit rate packaged as a one-entry metrics dict.

    Shares its name with the helper defined next to ``compute_metrics``
    (which returns a bare number) and replaces it once this cell runs; this
    version returns a dict so it can be passed to the Trainer as
    ``compute_metrics`` directly.

    Args:
        res: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (2-D array of per-class scores, one row per
            sample).
        k: How many of the highest-scoring classes count as a hit.

    Returns:
        ``{"in_top_<k>": hits / number_of_labels}``.
    """
    truth = res.label_ids
    scores = res.predictions
    # Ascending argsort, flipped so the best class comes first, cut to k columns.
    best_k = np.flip(scores.argsort(axis=1), axis=1)[:, :k]
    hit = (best_k == np.expand_dims(truth, axis=1)).any(axis=1)
    rate = hit.sum() / len(truth)
    return {f'in_top_{k}': rate}



training_args = TrainingArguments(
    output_dir='./LessLRTrainBert',   # output directory
    num_train_epochs=100,             # number of training epochs
    per_device_train_batch_size=8,    # per-device batch size during training
    per_device_eval_batch_size=8,     # per-device batch size during evaluation
    weight_decay=0.01,                # weight decay
    logging_dir='./sec_logs',         # directory for the logs
    load_best_model_at_end=True,      # load the best model when training ends
    learning_rate=1e-5,               # learning rate
    evaluation_strategy='epoch',      # evaluate after every epoch (a step interval is also possible)
    logging_strategy='epoch',         # log after every epoch
    save_strategy='epoch',            # save after every epoch
    save_total_limit=1,
    fp16=True,
    seed=42,
)

# Per-epoch evaluation (and therefore best-checkpoint selection, since
# load_best_model_at_end=True) runs on the validation split. The test split
# is kept unseen until the final trainer.predict(test_dataset) call, so the
# reported test numbers are not used for model selection.
#
# The plain Trainer is used here; MyTrainer(class_weights=class_weights, ...)
# is the class-weighted alternative defined above.
trainer = Trainer(
    tokenizer=tokenizer,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=count_top_k,
    data_collator=data_collator,
)
trainer.train()

In [11]:
# NOTE(review): repeats the trainer.train() call that already ends the cell
# where the trainer is built — confirm another full run is intended.
trainer.train()

In [11]:
# NOTE(review): another repeat of trainer.train(); this cell carries the same
# execution count as the previous one — confirm it is not a leftover.
trainer.train()

In [None]:
# Write the model first, then the tokenizer, into the same directory.
model_path = "./experiments/fine-tune-bert"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)

In [None]:
def get_prediction():
    """Run the trainer on the test split and return arg-max class ids.

    Reads the notebook-level ``trainer`` and ``test_dataset``.

    Returns:
        Array with the index of the highest score along the last axis of the
        predictions.
    """
    output = trainer.predict(test_dataset)
    return np.argmax(output.predictions, axis=-1)


pred = get_prediction()

In [None]:
# Display the text and integer label columns of the test split.
test_dataset.to_pandas()[['text','labels']]

In [None]:
# Test-split texts with their true class ids turned back into label names.
true_ans = test_dataset.to_pandas()[['text', 'labels']]
true_ans['labels'] = [id2label[class_id] for class_id in true_ans['labels']]

In [None]:
# Same rows as the test split, with the label column replaced by the name of
# the predicted class for each row.
pred_l = test_dataset.to_pandas()[['text', 'labels']]
pred_l['labels'] = [id2label[class_id] for class_id in pred]

In [None]:
# `true_ans` and `pred_l` are both built from test_dataset.to_pandas(), so
# their rows are in the same order; pair them by position. Joining on "text"
# would match every copy of a repeated utterance with every other copy,
# producing extra rows and mismatched true/predicted pairs.
true_ans.rename(columns={'labels': 'labels_true'}).assign(
    labels_pred=pred_l['labels'].to_numpy()
)