### применяем модель bilstm-crf (которую обучали)

In [None]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
import torch

import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(1)

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [None]:
# Train/dev split: partition the reviews, then route every aspect and
# category annotation line to the split that owns its review id.
texts, ids = [], []
with open('train_reviews.txt', encoding='utf-8') as f:
    for line in f:
        # maxsplit=1 keeps a review intact even if its text contains a tab.
        text_id, text = line.rstrip('\r\n').split('\t', 1)
        texts.append(text)
        ids.append(text_id)

# A fixed random_state makes the split (and the files written below)
# reproducible across kernel restarts.
train_texts, dev_texts, train_ids, dev_ids = train_test_split(
    texts, ids, random_state=1)

# Set copies of the id lists give O(1) membership tests. The lists themselves
# are kept because their order pairs each id with its text when writing.
train_id_set, dev_id_set = set(train_ids), set(dev_ids)


def _split_annotations(path):
    """Split the lines of a tab-separated annotation file by review id.

    The first column of every line is taken as the review id. Returns a
    ``(train_lines, dev_lines)`` pair of lists with line endings stripped.
    """
    train_lines, dev_lines = [], []
    with open(path, encoding='utf-8') as src:
        for raw_line in src:
            stripped = raw_line.rstrip('\r\n')
            review_id = stripped.split('\t')[0]
            if review_id in train_id_set:
                train_lines.append(stripped)
            if review_id in dev_id_set:
                dev_lines.append(stripped)
    return train_lines, dev_lines


def _write_lines(path, lines):
    """Write every item of ``lines`` to ``path``, one item per line."""
    with open(path, 'w', encoding='utf-8') as dst:
        for item in lines:
            print(item, file=dst)


train_aspects, dev_aspects = _split_annotations('train_aspects.txt')
train_sentiment, dev_sentiment = _split_annotations('train_cats.txt')

_write_lines('train_split_aspects.txt', train_aspects)
_write_lines('dev_aspects.txt', dev_aspects)
_write_lines('train_split_reviews.txt',
             (f'{i}\t{t}' for i, t in zip(train_ids, train_texts)))
_write_lines('dev_reviews.txt',
             (f'{i}\t{t}' for i, t in zip(dev_ids, dev_texts)))
_write_lines('train_split_cats.txt', train_sentiment)
_write_lines('dev_cats.txt', dev_sentiment)


In [None]:
!pip install nltk pymorphy2
!pip install stanza

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m769.3 kB/s[0m eta [36m0:00:00[0m
Collecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=9c0eb034433f941c4d4a07cdda71b6ca9264956ea8e1398cae4ecbefd953a050
  Stored in directory: /root/.cach

In [None]:
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from pymorphy2 import MorphAnalyzer

import stanza

# Morphological analyzer used by normalize() to look up normal forms.
morph = MorphAnalyzer()
# Raw string: '\w' inside a plain literal is an invalid escape sequence,
# which newer Python versions warn about. The resulting pattern is unchanged.
token = RegexpTokenizer(r'\w+')

def normalize(text):
    """Return the normal forms of the words in ``text``.

    The text is split with ``tokenize``; each non-empty word is replaced by
    the ``normal_form`` of its first ``morph.parse`` result.
    """
    lemmas = []
    for word in tokenize(text):
        if not word:
            continue
        lemmas.append(morph.parse(word)[0].normal_form)
    return lemmas

def tokenize(text):
    """Split ``text`` into tokens using the module-level ``token`` tokenizer."""
    pieces = token.tokenize(text)
    return pieces

# Fetch the Russian stanza resources, then build a pipeline that runs only
# the tokenizer.
stanza.download(lang='ru')
nlp = stanza.Pipeline(lang='ru', processors='tokenize')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: ru (Russian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.7.0/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

INFO:stanza:Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Done loading processors!


In [None]:
# Further down, initializing and using the model requires the size of the
# training data and word_to_ix, which is awkward (but I don't see how to
# avoid it).
train_texts = pd.read_csv(
    'train_reviews.txt',
    sep='\t',
    names=['text_id', 'text'],
)
train_asp = pd.read_csv(
    'train_aspects.txt',
    sep='\t',
    names=['text_id', 'category', 'mention', 'start', 'end', 'sentiment'],
)

In [None]:
# Map review id -> review text; the file holds one tab-separated pair per line.
reviews = {}
with open('train_reviews.txt') as f:
    for raw_line in f:
        fields = raw_line.rstrip('\r\n').split('\t')
        review_id, review_text = fields[0], fields[1]
        reviews[review_id] = review_text

In [None]:
from collections import defaultdict
# Aspect annotations grouped by review id. Each annotation is a dict with the
# fields named in `keys`; values stay strings exactly as read from the file.
keys = ('category', 'mention', 'start', 'end', 'sentiment')
aspects = defaultdict(list)

with open('train_aspects.txt') as f:
    for raw_line in f:
        review_id, *values = raw_line.rstrip('\r\n').split('\t')
        aspects[review_id].append(dict(zip(keys, values)))

In [None]:
# Build one (tokens, labels) pair per training review. A token is labelled
# 'B-<category>' only when its character span coincides exactly with an
# annotated mention; every other token is labelled 'O'.
training_data = []

for text_id, text in reviews.items():
    # (start, end) -> category for this review. Offsets are parsed once here
    # instead of once per token; setdefault keeps the first annotation when
    # the same span is annotated more than once.
    span_to_category = {}
    for mention in aspects.get(text_id, []):
        span = (int(mention['start']), int(mention['end']))
        span_to_category.setdefault(span, mention['category'])

    new_sent = []
    new_lab = []
    # Named `tok`, not `token`: `token` is the module-level RegexpTokenizer
    # that tokenize() relies on, and rebinding it here would break that
    # function for the rest of the session.
    for tok in nlp(text).iter_tokens():
        category = span_to_category.get((tok.start_char, tok.end_char))
        # Exactly one entry per token, so a token is never duplicated in the
        # sentence when several annotations share its span.
        new_sent.append(tok.text)
        new_lab.append('O' if category is None else 'B-' + category)
    training_data.append((new_sent, new_lab))
  #break

#len(training_data)

284

In [None]:
def argmax(vec):
    """Return, as a Python int, the column index of the largest entry of ``vec``.

    ``vec`` is reduced along dimension 1 and the result is converted with
    ``.item()``, so the reduction must yield exactly one index (a single row).
    """
    best = torch.max(vec, 1).indices
    return best.item()


def prepare_sequence(seq, to_ix):
    """Convert a sequence of items into a 1-D ``torch.long`` tensor of indices.

    Every item of ``seq`` is looked up in ``to_ix``; an item that is missing
    from the mapping raises ``KeyError``.
    """
    indices = []
    for item in seq:
        indices.append(to_ix[item])
    return torch.tensor(indices, dtype=torch.long)


# Numerically stable log-sum-exp, used by the CRF forward algorithm.
def log_sum_exp(vec):
    """Return ``log(sum(exp(vec)))`` for a single-row tensor ``vec``.

    The largest entry is subtracted before exponentiating and added back
    afterwards, which keeps ``exp`` from overflowing.
    """
    peak = vec[0, argmax(vec)]
    shifted = vec - peak.view(1, -1).expand(1, vec.size()[1])
    return peak + torch.log(torch.sum(torch.exp(shifted)))

In [None]:
class BiLSTM_CRF(nn.Module):
    """Sequence tagger: word embeddings -> bidirectional LSTM -> linear -> CRF.

    The LSTM produces one score per tag for every position (the "emission"
    scores); a learned tag-to-tag transition matrix is combined with them to
    score whole tag sequences. Training uses ``neg_log_likelihood``; calling
    the module (``forward``) returns the best-scoring tag sequence found by
    Viterbi decoding.

    The class reads the module-level names ``START_TAG`` and ``STOP_TAG``;
    both must be defined, and present as keys of ``tag_to_ix``, before an
    instance is created.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        """Create the layers and the transition matrix.

        Args:
            vocab_size: number of rows of the embedding table.
            tag_to_ix: mapping from tag name to tag index; its length is the
                tag-set size and it must contain START_TAG and STOP_TAG.
            embedding_dim: size of each word embedding.
            hidden_dim: size of the concatenated BiLSTM output; each
                direction gets ``hidden_dim // 2`` units.
        """
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh random ``(h0, c0)`` pair for the LSTM.

        Each tensor has shape ``(2, 1, hidden_dim // 2)``.
        NOTE(review): the state is drawn with ``torch.randn`` on every call,
        and ``_get_lstm_features`` calls this for every sentence, so outputs
        depend on the RNG state even in eval mode.
        """
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        """Return the log of the summed exponentiated scores of all tag paths.

        ``feats`` is iterated position by position; each ``feat`` is indexed
        by tag id to obtain that tag's emission score.
        """
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # forward_var holds the running forward scores; it starts from the
        # initial alphas and is replaced after every position.
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        # Close every path with the transition into STOP_TAG.
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """Return the emission scores for ``sentence``.

        ``sentence`` is expected to be a 1-D tensor of word indices (as
        produced by ``prepare_sequence``). The result has one row per word
        and one column per tag.
        """
        # Start every sentence from a new (random) hidden state.
        self.hidden = self.init_hidden()
        # Reshape to (seq_len, batch=1, embedding_dim) for the LSTM.
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        # Drop the batch dimension: (seq_len, hidden_dim).
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Return the score of the given tag sequence.

        ``tags`` is a 1-D tensor of tag indices, one per row of ``feats``.
        The score sums, for every position, the transition into the tag and
        its emission score, plus the final transition into STOP_TAG.
        """
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        # Prepend START_TAG so that tags[i] is the tag preceding position i.
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Find the highest-scoring tag sequence for ``feats``.

        Returns ``(path_score, best_path)`` where ``best_path`` is a list of
        tag indices, one per row of ``feats``.
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Return the training loss for one sentence.

        The loss is the forward-algorithm score over all tag paths minus the
        score of the provided ``tags`` sequence.
        """
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Tag ``sentence``; returns ``(score, tag_seq)`` from Viterbi decoding."""
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [None]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
# Layer sizes of the network; load_state_dict() below only accepts a
# checkpoint whose tensors have the shapes these values produce.
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

# Vocabulary: every distinct training token gets the next free index, in
# order of first appearance.
# NOTE(review): the checkpoint's embedding table is indexed by these ids, so
# training_data presumably has to be built exactly as it was at training
# time -- confirm against the training notebook.
word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        # len() is evaluated before the insert, so a new word receives the
        # current vocabulary size as its index.
        word_to_ix.setdefault(word, len(word_to_ix))

tag_to_ix = {"B-Whole": 0, "B-Interior": 1, "B-Price": 2, 'B-Food': 3, 'B-Service': 4, 'O':5, START_TAG: 6, STOP_TAG: 7}

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
# map_location='cpu': BiLSTM_CRF creates its working tensors without a device
# argument, so the weights are placed on the CPU as well, even if the
# checkpoint was saved from a GPU.
model.load_state_dict(torch.load('model', map_location='cpu'))
model.eval()

BiLSTM_CRF(
  (word_embeds): Embedding(8034, 5)
  (lstm): LSTM(5, 2, bidirectional=True)
  (hidden2tag): Linear(in_features=4, out_features=8, bias=True)
)

In [None]:
# Inverse of tag_to_ix (tag index -> tag name), derived from it so that the
# two mappings cannot drift apart.
ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}

### в ячейке ниже нужно указать вместо 'dev_reviews.txt' путь к файлу с тестовыми данными

In [None]:
# Path of the reviews to tag -- CHANGE THIS to point at the test data.
TEST_REVIEWS_PATH = 'dev_reviews.txt'

# Review id -> review text, one tab-separated pair per line.
t_reviews = {}
with open(TEST_REVIEWS_PATH, encoding='utf-8') as f:
    for line in f:
        fields = line.rstrip('\r\n').split('\t')
        t_reviews[fields[0]] = fields[1]

# One (review_id, review_text, tokens) triple per review.
test_data = []
for text_id, text in tqdm(t_reviews.items()):
    # The comprehension keeps its loop variable local, so the module-level
    # `token` tokenizer used by tokenize() is not overwritten.
    new_sent = [tok.text for tok in nlp(text).iter_tokens()]
    test_data.append((text_id, text, new_sent))
  #break

  0%|          | 0/71 [00:00<?, ?it/s]

In [None]:
#test_data[0]

In [None]:
# Extract the aspect mentions. Each entry of `res` is
# [review_id, review_text, [aspect tokens], [category of each aspect]].
res = []
# Inference only: no_grad() avoids building the autograd graph.
with torch.no_grad():
    for entry in tqdm(test_data):
        text_id, text, sentence = entry[0], entry[1], entry[2]
        new_asps = []
        new_cats = []
        # The vocabulary has no unknown-word entry, so tokens never seen in
        # training cannot be embedded. They are left out of the model input
        # (and thus never reported as aspects) instead of raising KeyError.
        # NOTE(review): dropping them slightly changes the context the LSTM
        # sees; revisit if the model is retrained with an <UNK> token.
        known_positions = [pos for pos, word in enumerate(sentence)
                           if word in word_to_ix]
        if known_positions:
            precheck_sent = prepare_sequence(
                [sentence[pos] for pos in known_positions], word_to_ix)
            _, tag_seq = model(precheck_sent)
            for pos, tag_ix in zip(known_positions, tag_seq):
                tag = ix_to_tag[tag_ix]
                # Only 'B-<category>' tags are aspects; 'O' and the
                # START/STOP markers are skipped.
                if tag.startswith('B-'):
                    new_asps.append(sentence[pos])
                    new_cats.append(tag[2:])
        res.append([text_id, text, new_asps, new_cats])
  #break

  0%|          | 0/71 [00:00<?, ?it/s]

In [None]:
res[0]

['33912',
 'Добрый день! Были с подружкой еще две недели назад в ресторане "Оливье", вот только сейчас решила написать отзыв. Поэтому воспоминания немного притупились, но все таки поделиться впечатлениями очень захотелось!)) Пришли мы туда в пятницу вечером и как ни странно, народу было совсем немного, что в принципе нас порадовала, так как без проблем смогли выбрать любой понравившийся нам столик. На входе встретил приветливый официант, сразу проводил до столика, даже успели с ним обменяться парочкой шуточек) Милый интерьер, очень даже располагает к приятным беседам и хорошему времяпрепровождению! Теперь о самом приятном, о еде..)Было вкусно, очень по домашнему и знаете, можно даже сказать приготовлено с любовью)) Мы заказали котлетки из телятины с пюре, хачапури и два бокала белого вина. Котлетки нежнейшие, пюре вкусное, но было чуть теплое, хачапури превзошло все ожидания, съели за две минуты)))Вообщем мы остались очень довольны, думаю что обязательно приду туда еще попробовать сала

### дальше добавляем разметку тональности моделью отсюда https://huggingface.co/yangheng/deberta-v3-large-absa-v1.1

(тут надо на гпу)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("yangheng/deberta-v3-large-absa-v1.1")
# NOTE(review): this rebinds `model`, which up to here referred to the
# BiLSTM-CRF tagger; re-running the aspect extraction cell after this one
# would call the sentiment model instead.
model = AutoModelForSequenceClassification.from_pretrained("yangheng/deberta-v3-large-absa-v1.1")

# Use the first GPU when one is available; otherwise fall back to the CPU
# (device=-1) instead of failing on machines without CUDA.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import re  # needed only for the span lookup in this cell


def _find_mention(text, mention, search_from=0):
    """Locate ``mention`` in ``text`` and return its ``(start, end)`` offsets.

    A whole-word occurrence at or after ``search_from`` is preferred, then a
    whole-word occurrence anywhere, then any plain substring occurrence.
    Returns ``None`` when ``mention`` does not occur in ``text`` at all.
    """
    whole_word = re.compile(r'(?<!\w)' + re.escape(mention) + r'(?!\w)')
    for start_at in (search_from, 0):
        match = whole_word.search(text, start_at)
        if match:
            return match.span()
    start = text.find(mention)
    if start == -1:
        return None
    return start, start + len(mention)


with open('first.txt', 'w', encoding='utf-8') as fr:
    for ins in tqdm(res):
        text_id = ins[0]
        text = ins[1]
        # Local names chosen so the global `aspects` annotation dict built
        # earlier in the notebook is not overwritten.
        found_mentions = ins[2]
        found_categories = ins[3]
        # Mentions are listed in text order, so each lookup resumes after the
        # previous match; repeated words then map to successive occurrences.
        cursor = 0
        # No early exit: every aspect of the review is written.
        for mention, category in zip(found_mentions, found_categories):
            span = _find_mention(text, mention, cursor)
            if span is None:
                continue
            start, end = span
            cursor = end
            # Anya: the classifier scores the sentiment of the pair
            # (review text, aspect). Scoring within sentence boundaries
            # instead of the whole review was tried and gave worse results.
            ton = classifier(text, text_pair=mention)
            fr.write(f'{text_id}\t{category}\t{mention}\t{start}\t{end}\t{ton[0]["label"].lower()}\n')

  0%|          | 0/71 [00:00<?, ?it/s]