In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from grammar_ru.corpus import CorpusReader, CorpusBuilder
from diplom.utils.corpus_utils import CorpusFramework
from diplom.utils.dialog_markuper import DialogMarkupFeaturizer
#from diplom.utils.speech_action_maker import SpeechActionFeaturizer
import matplotlib.pyplot as plt
from torch.nn.functional import cosine_similarity
from collections import defaultdict
import torch
# reported speech.

In [2]:
# Show the properties of CUDA device 0. On a machine without CUDA this evaluates
# to None (nothing is displayed) instead of raising, so the notebook can still be
# run top to bottom.
gpu_properties = torch.cuda.get_device_properties(0) if torch.cuda.is_available() else None
gpu_properties

In [3]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU
# so that anything later moved to `device` still works on a GPU-less machine.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [4]:
# Open the zipped corpus and wrap the reader in the project's CorpusFramework helper.
path_corpus = Path("./data/corpora/diplom.wow.zip")
corpus = CorpusReader(path_corpus)
corpus_framework = CorpusFramework(corpus)

In [5]:
# Distinct values of the `author` column in the corpus table of contents.
toc = corpus.get_toc()
authors = toc["author"].unique()

In [6]:
# whole_dataset = []
# 
# for author in tqdm(authors):
#     series_count = len(corpus_framework.get_books_by_author(author))
#     for book_id in range(series_count):
#         try:
#             book = corpus_framework.get_book_by_author_and_series_id(author, book_id)
#         except IndexError:
#             print(f"Не нашлась книга {book_id} для {author}, проверить df")
#             continue
#         chapters_count = len(book)
#         for chapter_id in range(chapters_count):
#             chapter = corpus_framework.get_chapter_as_bundle(book, chapter_id)
#             whole_dataset.append(chapter)

In [7]:
# text_corpus = pd.concat([d.speech_action for d in whole_dataset])

In [8]:
# text_corpus = text_corpus.reset_index(drop=True)#.to_csv('text_corpus.csv')
# text_corpus.sample_id = text_corpus.index
# text_corpus.to_csv('text_corpus.csv', index=False)

In [9]:
# Load the sample table from 'text_corpus.csv' in the working directory.
# Later cells read its `action`, `speech` and `sample_id` columns.
text_corpus = pd.read_csv('text_corpus.csv')

In [10]:
# Collect the distinct action names, in order of first appearance.
# Whitespace is stripped BEFORE de-duplicating: stripping afterwards would leave
# repeated entries whenever two raw values differ only in surrounding whitespace,
# which inflates the label count and leaves class ids that no sample maps to.
labels = text_corpus['action'].str.strip().unique().tolist()
labels

In [11]:
# Number of classes the classifier has to separate.
NUM_LABELS = len(labels)

# Two-way lookup between integer class ids and action names.
id2label = dict(enumerate(labels))

label2id = {name: idx for idx, name in enumerate(labels)}

In [12]:
def _action_to_label_id(action):
    """Return the integer class id for a raw action string (whitespace ignored)."""
    return label2id[action.strip()]


# Add the integer target column the model is trained on.
text_corpus["labels"] = text_corpus["action"].map(_action_to_label_id)

In [13]:
def pie_count(x):
    """Format a pie-slice percentage `x` as the absolute number of rows it stands for.

    The total is the count of non-null `action` values in `text_corpus`, read at
    call time.
    """
    total = text_corpus['action'].count()
    return '{:.0f}'.format(x * total / 100)


def pie_count_without_said(x):
    """Same as `pie_count`, but the total only covers rows whose action is not 'said'."""
    kept_actions = text_corpus.loc[text_corpus.action != 'said', 'action']
    return '{:.0f}'.format(x * kept_actions.count() / 100)

In [48]:
# Show the speech text of every row whose action is exactly 'say'.
say_mask = text_corpus.action == 'say'
text_corpus.loc[say_mask, 'speech']

# Диаграмма со всеми словами

In [14]:
# Pie chart of how often each action occurs; slices are annotated with absolute counts.
action_counts = text_corpus.action.value_counts()
action_counts.plot.pie(figsize=(10, 10), autopct=pie_count)

# Диаграмма без said

In [15]:
# Same pie chart, built only from rows whose action is not 'said'.
not_said = text_corpus.action != 'said'
other_action_counts = text_corpus.loc[not_said, 'action'].value_counts()
other_action_counts.plot.pie(figsize=(10, 10), autopct=pie_count_without_said)

# Токенизация

### Сохраняю реплики (speech) в файл, чтобы потом обучить на них токенизатор

In [16]:
#np.savetxt(r'speechs.txt', text_corpus.speech.values, fmt='%s')

In [17]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer serves as the starting point.
BASE_TOKENIZER_CHECKPOINT = "distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(BASE_TOKENIZER_CHECKPOINT)

In [18]:
# Raw speech strings used as the tokenizer's training corpus.
iterator = text_corpus.speech.values

# train_new_from_iterator does NOT modify the tokenizer it is called on; it returns
# a new one. Rebind the name so the following cells save and use the tokenizer
# trained on this corpus rather than the untouched pretrained one.
tokenizer = tokenizer.train_new_from_iterator(iterator, vocab_size=30522)
tokenizer

In [19]:
# Write the tokenizer files to the local 'best_bert_tokenizer' directory.
tokenizer.save_pretrained('best_bert_tokenizer')

In [20]:
# Load the tokenizer back from the local 'best_bert_tokenizer' directory.
tokenizer = AutoTokenizer.from_pretrained('best_bert_tokenizer')

In [21]:
# Sanity check: display how one sample line is split into tokens.
tokenizer.tokenize("""I wasn't sleeping, my Lord,""")

In [1]:
import datasets

# Drop the sample id and the raw action name, and expose the speech column under
# the name "text", which is the column tokenize_function reads.
model_frame = (
    text_corpus
    .drop(columns=['sample_id', 'action'])
    .rename(columns={'speech': 'text'})
)
dataset = datasets.Dataset.from_pandas(model_frame)

In [23]:
def tokenize_function(examples):
    """Tokenize one batch of examples; meant for `Dataset.map(..., batched=True)`.

    Args:
        examples: mapping whose "text" entry holds the raw strings to encode.

    Returns:
        Whatever the module-level `tokenizer` returns for that batch.
    """
    # Only truncate here. Padding is left to the DataCollatorWithPadding given to
    # the Trainer, which pads each batch to its own longest sequence;
    # padding="max_length" would store and process every row at the tokenizer's
    # maximum length and make that collator pointless.
    return tokenizer(examples["text"], truncation=True)

In [24]:
# Run tokenize_function over the dataset, passing it batches of rows.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [25]:
# Hold out 10% of the rows for testing. The fixed seed makes the shuffle, and
# therefore the train/test membership, identical on every run.
dict_set = tokenized_datasets.train_test_split(test_size=0.1, train_size=0.9, seed=42)
train_dataset, test_dataset = dict_set['train'], dict_set['test']

In [26]:
from transformers import DistilBertConfig, DistilBertForSequenceClassification, DataCollatorWithPadding

# Architecture of the small DistilBERT classifier, built from scratch rather than
# loaded from a checkpoint.
config_kwargs = {
    'vocab_size': 30522,
    'dim': 256,
    'hidden_dim': 256,
    'num_labels': NUM_LABELS,
    'n_heads': 4,
    'n_layers': 2,
}
config = DistilBertConfig(**config_kwargs)

model = DistilBertForSequenceClassification(config)
print('No of parameters: ', model.num_parameters())

# Collator that pads the samples of each batch using the notebook's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [27]:
from transformers import Trainer, TrainingArguments

# Training hyper-parameters, collected in one place.
training_kwargs = {
    'output_dir': 'custom_bert_output/',
    'overwrite_output_dir': True,
    'num_train_epochs': 1,
    'per_device_train_batch_size': 4,  # alternative value noted by the author: 16
    'save_steps': 10_000,
    'save_total_limit': 2,
    'prediction_loss_only': True,
    'report_to': ['none'],
}
training_args = TrainingArguments(**training_kwargs)

# Only a training set is passed; no evaluation dataset is configured here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

In [28]:
# Run training with the Trainer's configured arguments and training dataset.
trainer.train()

In [29]:
# Save the trained model to the 'custom_bert_output/' directory.
trainer.save_model('custom_bert_output/')

In [30]:
# Load the classifier back from the 'custom_bert_output/' directory for inference.
test_model = DistilBertForSequenceClassification.from_pretrained('custom_bert_output/')

In [31]:
from transformers import pipeline

# Inference helper that combines the reloaded classifier with the notebook's
# tokenizer; it is called directly on raw strings.
predict_label = pipeline(
    task='sentiment-analysis',
    model=test_model,
    tokenizer=tokenizer,
)

In [34]:
# Display the last five rows of the corpus table.
text_corpus.tail(5)

In [36]:
# Classify a single sample line and keep the pipeline's result for inspection.
pred_res = predict_label("""Get in here, Slayer,""")

In [42]:
# Display the raw pipeline result.
pred_res

In [41]:
# The model config defines no id2label mapping, so the pipeline is expected to
# report classes as "LABEL_<index>". Parse the whole numeric suffix: reading only
# the last character would turn e.g. "LABEL_12" into 2 once there are more than
# ten classes.
predicted_index = int(pred_res[0]['label'].rsplit('_', 1)[-1])
id2label[predicted_index]

In [46]:
# Display the item at index 3 of the held-out test split.
test_dataset[3]

In [47]:
# Classify another sample line and display the result.
predict_label("""I know this fellow,""")