In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.3 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 31.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 4.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 32.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml


In [2]:
import os
import bz2

from six.moves import urllib
import torch
from torch.utils.data import DataLoader, Dataset
import os
import pickle
import sys
from transformers import AutoTokenizer
from transformers import AutoModel

import pandas as pd
import re

import os
import torch
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm



In [3]:
def download_lenta(path: str = '.'):
    output_dir = os.path.join(path, 'lenta')

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    url = 'https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.1/lenta-ru-news.csv.bz2'
    print("downloading url ", url)

    data = urllib.request.urlopen(url)
    file_path = os.path.join(output_dir, os.path.basename(url))
    print(file_path)
    with open(file_path, 'wb') as f:
        f.write(data.read())

    print("Extracting data")
    with open(file_path, 'rb') as source, open(os.path.join(output_dir, 'lenta.csv'), 'wb') as dest:
        dest.write(bz2.decompress(source.read()))

    os.remove(file_path)

    return output_dir


In [4]:
def collate_fn(batch):
    max_len = max(len(row["feature"]) for row in batch)

    input_ids = torch.empty((len(batch), max_len), dtype=torch.long)
    input_target = torch.empty((len(batch), max_len), dtype=torch.long)
    target_mask = torch.empty((len(batch), max_len), dtype=torch.long)
    attention_mask = torch.empty((len(batch), max_len), dtype=torch.long)

    for idx, row in enumerate(batch):
        to_pad = max_len - len(row["feature"])
        input_ids[idx] = torch.cat((row["feature"], torch.zeros(to_pad)))
        input_target[idx] = torch.cat((row["target"], torch.zeros(to_pad)))
        target_mask[idx] = torch.cat((row["target_mask"], torch.zeros(to_pad)))
        attention_mask[idx] = torch.cat((row["attention_mask"], torch.zeros(to_pad)))

    return {
        'feature': input_ids,
        'target': input_target,
        'target_mask': target_mask,
        'attention_mask': attention_mask
    }


def build_dataloader(dataset: Dataset, batch_size: int):
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)
    return loader

In [14]:
tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased-sentence')
targets = {',': 1, '.': 2}


def extract_sample(dataset_path: str = 'lenta/lenta.csv', sample_size: int = 100_000):
    output_path = os.path.join('data/raw/lenta', 'lenta_text.csv')
    pd.read_csv(dataset_path, low_memory=False)['text'][:sample_size].to_csv('lenta_text.csv')



In [16]:
def cut_text(max_length: int, sample_path: str = 'lenta_text.csv'):
    text = pd.read_csv(sample_path)['text'].values

    def reshape_sentence(sentence_splitted: list, n: int) -> str:
        for i in range(0, len(sentence_splitted), n):
            yield ' '.join(sentence_splitted[i:i + n])

    res = []
    for sample in text:
        for sentence in reshape_sentence(sample.split(), max_length):
            res.append(sentence)

    pd.DataFrame(res, columns=['text']).to_csv('lenta_cutted.csv', index=False)


In [7]:
def build_features(sample_path: str = 'lenta_cutted.csv',
                   text: list = None,
                   return_features: bool = False
                   ):
    if text is None:
        text = pd.read_csv(sample_path)['text'].values
    # text = ['казнить, нельзя помиловать#.', 'привет со дна #38.', 'что-то пошло не так (.']
    # text = text[0:1]
    tokenized_text = [tokenizer.tokenize(sent) for sent in text]
    tokenized_text = [['[SOS]'] + sentence + ['[EOS]'] for sentence in tokenized_text]
    # print(tokenized_text)

    input_tokens = list(
        map(lambda sentence: list(
            filter(
                lambda x: x not in targets.keys(),
                sentence)
        ),
            tokenized_text)
    )
    # print(input_tokens)

    input_ids = list(map(tokenizer.convert_tokens_to_ids, input_tokens))

    # print(input_ids)

    def shift_target(arr: list) -> list:
        res = []
        for i in arr:
            if i != 0:
                res.pop()
            res.append(i)
        return res

    input_targets = list(map(lambda sentence: shift_target([targets.get(x, 0) for x in sentence]), tokenized_text))

    # print(input_targets)

    def mask_tokens(tokens: list) -> list:
        res = []
        for i in range(len(tokens) - 1):
            if tokens[i + 1][0] != '#':
                res.append(1)
            elif tokens[i + 1] == '#':
                res.append(1)
            elif tokens[i + 1][0] == '#':
                res.append(0)
            else:
                raise NotImplementedError
        res.append(1)
        assert len(res) == len(tokens)
        return res

    target_mask = list(map(lambda x: mask_tokens(x), input_tokens))
    # print(target_mask)

    attention_mask = list(map(lambda x: [1 for _ in range(len(x))], input_ids))
    # print(attention_mask)

    with open('input_ids.pkl', 'wb') as f:
        pickle.dump(input_ids, f)
    with open('input_targets.pkl', 'wb') as f:
        pickle.dump(input_targets, f)
    with open('target_mask.pkl', 'wb') as f:
        pickle.dump(target_mask, f)
    with open('attention_mask.pkl', 'wb') as f:
        pickle.dump(attention_mask, f)

    with open('input_ids.pkl', 'rb') as f:
        input_ids = pickle.load(f)

    if return_features:
        return input_ids, input_targets, target_mask, attention_mask


In [8]:
class CommaDataset(Dataset):
    def __init__(self, input_ids, input_targets, target_mask, attention_mask):
        self.input_ids = list(map(torch.tensor, input_ids))
        self.input_targets = list(map(torch.tensor, input_targets))
        self.target_mask = list(map(torch.tensor, target_mask))
        self.attention_mask = list(map(torch.tensor, attention_mask))

    def __getitem__(self, item):
        return {
            'feature': self.input_ids[item],
            'target': self.input_targets[item],
            'target_mask': self.target_mask[item],
            'attention_mask': self.attention_mask[item]
        }

    def __len__(self):
        return len(self.input_ids)

In [9]:
pretrained_transformer = AutoModel.from_pretrained('DeepPavlov/rubert-base-cased-sentence')


class CommaModel(nn.Module):
    def __init__(self, num_class: int):
        super(CommaModel, self).__init__()
        bert_dim = 768
        hidden_size = bert_dim

        self.hidden_size = hidden_size
        self.pretrained_transformer = pretrained_transformer
        self.lstm = nn.LSTM(input_size=bert_dim,
                            hidden_size=hidden_size,
                            num_layers=1,
                            bidirectional=True)

        self.linear = nn.Linear(in_features=hidden_size * 2,
                                out_features=num_class)

    def forward(self, x: torch.tensor, attn_masks: torch.tensor) -> torch.tensor:
        # add dummy batch for single sample
        if len(x.shape) == 1:
            x = x.view(1, x.shape[0])
        # (B, N, E) -> (B, N, E)
        x = self.pretrained_transformer(x, attention_mask=attn_masks)[0]
        # (B, N, E) -> (N, B, E)
        x = torch.transpose(x, 0, 1)
        x, (_, _) = self.lstm(x)
        # (N, B, E) -> (B, N, E)
        x = torch.transpose(x, 0, 1)
        x = self.linear(x)
        return x


Downloading:   0%|          | 0.00/678M [00:00<?, ?B/s]

In [10]:
def train(epoch: int,
          model: nn.Module,
          training_data_loader: DataLoader,
          criterion: nn.Module,
          optimizer: torch.optim.Optimizer,
          device: str):
    """
    функция для обучения на одной эпохе
    :param epoch: номер эпохи
    :param model: модель для обучения
    :param training_data_loader: тренировочный DataLoader
    :param criterion: функция потерь
    :param optimizer: оптимизатор
    :param device: cuda или cpu
    :return:
    """
    train_loss = 0.0
    # train_accuracy = 0.0
    train_iteration = 0
    correct = 0.0
    total = 0.0

    model.train()
    for batch in tqdm(training_data_loader):
        x, y, y_mask, att_mask = batch['feature'], batch['target'], batch['target_mask'], batch['attention_mask']
        x = x.to(device)
        y = y.view(-1).to(device)
        y_mask = y_mask.view(-1).to(device)
        att_mask = att_mask.to(device)

        try:
            y_predict = model(x, att_mask)
        except:
            print(x.shape)
            print(att_mask.shape)
            continue
            raise NotImplementedError
        # print(y_predict.shape)

        y_predict = y_predict.view(-1, y_predict.shape[2])
        loss = criterion(y_predict, y)

        y_predict = torch.argmax(y_predict, dim=1).view(-1)
        correct += torch.sum(y_mask * (y_predict == y)).item()

        optimizer.zero_grad()
        train_loss += loss.item()
        train_iteration += 1
        loss.backward()

        optimizer.step()
        total += torch.sum(y_mask.view(-1)).item()

    train_loss /= train_iteration
    train_accuracy = correct / total

    return train_loss, train_accuracy


def fit(dataset: Dataset, epochs: int):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    batch_size = 128

    training_data_loader = build_dataloader(dataset, batch_size)

    # testing_data_loader = DataLoader(dataset=test_set, num_workers=2, batch_size=batch_size,
    #                                  shuffle=False)

    model = CommaModel(num_class=3).to(device)

    for param in model.pretrained_transformer.parameters():
        param.requires_grad = False

    optimizer = optim.Adam(model.parameters(), lr=2e-4)
    criterion = nn.CrossEntropyLoss()

    train_losses = []
    val_losses = []
    train_accuracy = []
    val_accuracy = []

    for epoch in range(1, epochs):
        train_loss, train_acc = train(epoch, model, training_data_loader, criterion, optimizer, device)
        # val_loss, val_acc = test(model, testing_data_loader, criterion, device)
        # checkpoint(epoch, model, 'models')

        train_losses.append(train_loss)
        # val_losses.append(val_loss)
        train_accuracy.append(train_acc)
        # val_accuracy.append(val_acc)
      
    print(train_losses)

    torch.save(model, 'model.pth')


In [18]:
def predict():
    text = ['Показатели давления могут изменяться в зависимости от ряда факторов Даже у одного и того же '
            'пациента в течение суток наблюдаются колебания АД Например утром после пробуждения кровяное '
            'давление может быть низким после обеда оно может начать подниматься']
    input_ids, input_targets, target_mask, attention_mask = build_features(
        text=text,
        return_features=True)
    dataset = CommaDataset(input_ids, input_targets, target_mask, attention_mask)
    train_dataloader = build_dataloader(dataset, 1)
    model = torch.load('model.pth')

    with torch.no_grad():
        for batch in train_dataloader:
            x, y, y_mask, att_mask = batch['feature'], batch['target'], batch['target_mask'], batch['attention_mask']
            y_mask = y_mask.view(-1)
            x = x.to('cuda')
            att_mask = att_mask.to('cuda')
            y_predict = model(x, att_mask)

    y_predict = y_predict.view(-1, y_predict.shape[2])
    y_predict = torch.argmax(y_predict, dim=1).view(-1)

    print(y_predict)

    result = ""
    decode_idx = 0
    decode_map = {0: '', 1: ',', 2: '.'}
    words_original_case = ['SOS'] + text[0].split() + ['EOS']

    for i in range(y_mask.shape[0]):
        if y_mask[i] == 1:
            result += words_original_case[decode_idx]
            result += decode_map[y_predict[i].item()]
            result += ' '
            decode_idx += 1

    result = result.strip()
    print(result)

In [21]:
if __name__ == '__main__':
    # download_lenta()
    extract_sample(sample_size=100_000)
    cut_text(100)
    build_features()

    with open('input_ids.pkl', 'rb') as f:
        input_ids = pickle.load(f)
    with open('input_targets.pkl', 'rb') as f:
        input_targets = pickle.load(f)
    with open('target_mask.pkl', 'rb') as f:
        target_mask = pickle.load(f)
    with open('attention_mask.pkl', 'rb') as f:
        attention_mask = pickle.load(f)

    print('data loaded')
    dataset = CommaDataset(input_ids, input_targets, target_mask, attention_mask)
    # print(dataset[0])
    # train_dataloader = build_dataloader(dataset, 32)

    print(len(dataset))

    # for batch in train_dataloader:
    #     x, y, y_mask, att_mask = batch['feature'], batch['target'], batch['target_mask'], batch['attention_mask']
    #     print(x)
    #     print(y)
    #     print(y_mask)
    #     print(att_mask)
    #     break

    fit(dataset, 2)
    predict()


data loaded
207780
cuda


100%|██████████| 1624/1624 [1:27:33<00:00,  3.23s/it]


[0.11555913676108633]
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
SOS Показатели давления могут изменяться в зависимости от ряда факторов. Даже у одного и того же пациента в течение суток наблюдаются колебания АД. Например утром после пробуждения кровяное давление может быть низким после обеда оно может начать подниматься EOS


In [22]:
def predict():
    text = ['Затем чтобы было весело и праздно но с тобой есть проблемы Сегодня чтобы узнать правду']
    input_ids, input_targets, target_mask, attention_mask = build_features(
        text=text,
        return_features=True)
    dataset = CommaDataset(input_ids, input_targets, target_mask, attention_mask)
    train_dataloader = build_dataloader(dataset, 1)
    model = torch.load('model.pth')

    with torch.no_grad():
        for batch in train_dataloader:
            x, y, y_mask, att_mask = batch['feature'], batch['target'], batch['target_mask'], batch['attention_mask']
            y_mask = y_mask.view(-1)
            x = x.to('cuda')
            att_mask = att_mask.to('cuda')
            y_predict = model(x, att_mask)

    y_predict = y_predict.view(-1, y_predict.shape[2])
    y_predict = torch.argmax(y_predict, dim=1).view(-1)

    print(y_predict)

    result = ""
    decode_idx = 0
    decode_map = {0: '', 1: ',', 2: '.'}
    words_original_case = ['SOS'] + text[0].split() + ['EOS']

    for i in range(y_mask.shape[0]):
        if y_mask[i] == 1:
            result += words_original_case[decode_idx]
            result += decode_map[y_predict[i].item()]
            result += ' '
            decode_idx += 1

    result = result.strip()
    print(result)

In [23]:
predict()

tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0], device='cuda:0')
SOS Затем чтобы было весело и праздно, но с тобой есть проблемы. Сегодня чтобы узнать правду, EOS
