## Устанавливаем библиотеку и скачиваем предобученную модель

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b5/d5/c6c23ad75491467a9a84e526ef2364e523d45e2b0fae28a7cbe8689e7e84/transformers-4.8.1-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 7.7MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 50.1MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |█████

In [2]:
!wget -c http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt.tar.gz
!tar -zxf rubert_cased_L-12_H-768_A-12_pt.tar.gz
!cp rubert_cased_L-12_H-768_A-12_pt/bert_config.json rubert_cased_L-12_H-768_A-12_pt/config.json


--2021-06-27 06:15:39--  http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt.tar.gz
Resolving files.deeppavlov.ai (files.deeppavlov.ai)... 93.175.29.74
Connecting to files.deeppavlov.ai (files.deeppavlov.ai)|93.175.29.74|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt.tar.gz [following]
--2021-06-27 06:15:40--  https://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt.tar.gz
Connecting to files.deeppavlov.ai (files.deeppavlov.ai)|93.175.29.74|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 662024852 (631M) [application/octet-stream]
Saving to: ‘rubert_cased_L-12_H-768_A-12_pt.tar.gz’


2021-06-27 06:17:51 (4.87 MB/s) - ‘rubert_cased_L-12_H-768_A-12_pt.tar.gz’ saved [662024852/662024852]



## Предобработка - удалить pos-тэги, добавить нужные тэги, токенизировать, частично маскировать и получить бертовские индексы


In [3]:
import transformers
from tqdm.auto import tqdm
import torch
from transformers import AdamW


In [4]:
from random import sample
def preproc(text):
  tokenizer = transformers.BertTokenizerFast.from_pretrained('rubert_cased_L-12_H-768_A-12_pt')
  sents_tagged = text.split('\n') #делим на предложения
  sents = []
  for sent in tqdm(sents_tagged): #уберем теги
    new_sent = ''
    for word in sent.split():
      new_sent += word.split('_')[0] + ' '
    sents.append(new_sent[:-1])

  sents = sents[:70000]
  inputs = tokenizer(sents, return_tensors='pt', max_length=100, \
                   truncation=True, padding='max_length')

  inputs['labels'] = inputs.input_ids.detach().clone()
  rand = torch.rand(inputs.input_ids.shape)
    # create mask array
  mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

  selection = []

  for i in range(inputs.input_ids.shape[0]):
    selection.append(
      torch.flatten(mask_arr[i].nonzero()).tolist()
    )
  for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

  return inputs

## Класс датасета для обучения

In [5]:
class Train_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

## Тренировочная петля

In [6]:
def training_loop(model, dataloader, optimizer, epochs, device):
  for epoch in range(epochs):
      loop = tqdm(dataloader, leave=True)
      for batch in loop:
          optimizer.zero_grad()
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          labels = batch['labels'].to(device)

          outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
          loss = outputs.loss

          loss.backward()
          optimizer.step()

          loop.set_description(f'Epoch {epoch}')
          loop.set_postfix(loss=loss.item())
          
  return model


## Функция обучения модели

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
def get_model(side: str, year: str):
    
    config = transformers.BertConfig.from_pretrained('rubert_cased_L-12_H-768_A-12_pt', output_hidden_states=True)
    model = transformers.BertForMaskedLM.from_pretrained('rubert_cased_L-12_H-768_A-12_pt', config=config)

    path = f'/content/drive/MyDrive/processed/{side}_{year}.txt'
    with open(path, 'r', encoding='utf-8') as file:
        text = file.read()


    inputs = preproc(text)
    dataset = Train_Dataset(inputs)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True) 

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.train()
    lr = 5e-5
    optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
    epochs = 1

    model = training_loop(model, dataloader, optimizer, epochs, device)

    return model


## Обучала я в итоге по одной, не циклом

In [28]:
periods = ['2015', '2016', '2017', '2018', '2019']

print('Training loyal models...')
loyal_dict = {}
loyal_dict[2017] = get_model('loyal', 2017)


Training loyal models...


Some weights of the model checkpoint at rubert_cased_L-12_H-768_A-12_pt were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=56428.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3527.0), HTML(value='')))

  """





In [29]:
village_2019 = loyal_dict[2019]
torch.save(village_2019, '/content/drive/MyDrive/models/village_2019.model')

## Загружаем модели

за 2019 год плохо обучилась 

In [10]:
model_2018 = torch.load('/content/drive/MyDrive/models/loyal_2018.model')
model_2017 = torch.load('/content/drive/MyDrive/models/loyal_2017.model')
model_2016 = torch.load('/content/drive/MyDrive/models/loyal_2016.model')
model_2015 = torch.load('/content/drive/MyDrive/models/loyal_2015.model')

In [44]:
model_2017_opp = torch.load('/content/drive/MyDrive/models/opp_2017.model')
model_2016_opp = torch.load('/content/drive/MyDrive/models/opp_2016.model')
model_2015_opp = torch.load('/content/drive/MyDrive/models/opp_2015.model')

In [12]:
village_2019 = torch.load('/content/drive/MyDrive/models/village_2019.model')

In [13]:
lenta_2019 = torch.load('/content/drive/MyDrive/models/lenta_2019.model')

## Поиграться

In [86]:
tokenizer = transformers.BertTokenizerFast.from_pretrained('rubert_cased_L-12_H-768_A-12_pt')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
def get_bert_substs(text, model):
    text_encoded = tokenizer.encode(text, return_tensors="pt").to(device)
    pos = [i for i in range(len(text_encoded[0])) if text_encoded[0][i] == 103]
    model_pred = model(text_encoded, return_dict=True)
    indices = model_pred.logits.topk(6).indices[0][pos]
    substs = tokenizer.convert_ids_to_tokens(indices.flatten().tolist())
    return substs



In [None]:
for mod, name in zip([model_2015, model_2016, model_2017,
            model_2018, model_2015_opp, model_2016_opp,
            model_2017_opp, lenta_2019, village_2019], ['loyal_2015', 'loyal_2016', 
            'loyal_2017', 'loyal_2018', 'opp_2015', 'opp_2016', 'opp_2017', 'lenta_2019', 'village_2019']):

  print(name, get_bert_substs("В России продолжается скандал с [MASK].", mod))

In [145]:
for mod, name in zip([model_2015, model_2016, model_2017,
            model_2018, model_2015_opp, model_2016_opp,
            model_2017_opp, lenta_2019, village_2019], ['loyal_2015', 'loyal_2016', 
            'loyal_2017', 'loyal_2018', 'opp_2015', 'opp_2016', 'opp_2017', 'lenta_2019', 'village_2019']):

  print(name, get_bert_substs("В России продолжаются уголовные преследования [MASK].", mod))

loyal_2015 ['дело', 'человек', 'гражданин', 'россиянин', 'женщина', 'лицо']
loyal_2016 ['мужчина', 'год', 'человек', 'ст', 'подозревать', 'задержать']
loyal_2017 ['коррупция', 'кража', 'человек', 'обвинять', 'против', 'подозревать']
loyal_2018 ['год', 'дело', 'см', 'р', 'вв', 'г']
opp_2015 ['ст', 'отношение', 'подозревать', 'г', 'россия', 'р']
opp_2016 ['россия', 'женщина', 'коррупция', 'человек', 'иностранец', 'год']
opp_2017 ['человек', 'коррупция', 'гражданин', 'грабеж', 'мошенничество', 'вымогательство']
lenta_2019 ['чиновников', 'банды', 'задержанных', 'осужденных', 'лиц', 'журналистов']
village_2019 ['оппозиционеров', 'россиян', 'политзаключенных', 'заключенных', 'осужденных', 'преступников']


In [138]:
for mod, name in zip([model_2015, model_2016, model_2017,
            model_2018, model_2015_opp, model_2016_opp,
            model_2017_opp, lenta_2019, village_2019], ['loyal_2015', 'loyal_2016', 
            'loyal_2017', 'loyal_2018', 'opp_2015', 'opp_2016', 'opp_2017', 'lenta_2019', 'village_2019']):

  print(name, get_bert_substs("В России продолжаются преследования [MASK].", mod))


loyal_2015 ['человек', 'ребенок', 'преступник', 'медведь', 'автомобиль', 'россиянин']
loyal_2016 ['человек', 'преступник', 'мужчина', 'ребенок', 'подозревать', 'женщина']
loyal_2017 ['подозревать', 'коррупция', 'террорист', 'боевик', 'человек', 'россия']
loyal_2018 ['год', 'человек', 'полиция', 'суд', 'мужчина', 'обвинение']
opp_2015 ['р', 'россия', 'ст', 'обвинять', 'подозревать', 'задержать']
opp_2016 ['страна', 'человек', 'россия', 'иностранец', 'воина', 'запад']
opp_2017 ['человек', 'геев', 'коррупция', 'гражданин', 'год', 'задержать']
lenta_2019 ['животных', 'биатлонистов', 'россиян', 'пилотов', 'геев', 'спортсменов']
village_2019 ['оппозиционеров', 'россиян', 'политзаключенных', 'активистов', 'протестующих', 'заключенных']


In [48]:
tokenizer = transformers.BertTokenizerFast.from_pretrained('rubert_cased_L-12_H-768_A-12_pt')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
village_2019.to(device)
def get_bert_substs(text):
    text_encoded = tokenizer.encode(text, return_tensors="pt").to(device)
    pos = [i for i in range(len(text_encoded[0])) if text_encoded[0][i] == 103]
    model_pred = model_2017(text_encoded, return_dict=True)
    indices = model_pred.logits.topk(10).indices[0][pos]
    substs = tokenizer.convert_ids_to_tokens(indices.flatten().tolist())
    return substs

print(get_bert_substs("Основатель Вконтакте Павел [MASK] ушел с поста генерального директора."))

['песок', 'морозов', 'романов', 'также', 'орлов', 'киров', 'сын', 'белов', 'это', 'виктор']


## не нужно


In [None]:
batches = []
for batch_hs in hidden_states_las_epoch:  #состакаем по слоям
  token_embeddings = torch.stack(batch_hs, dim=0)
  batches.append(token_embeddings)

In [None]:
token_embeddings = torch.cat(batches, dim=1) #объединим последовательно батчи
token_embeddings.shape


torch.Size([13, 54, 87, 768])

In [None]:
embs = torch.sum(token_embeddings[-4:], dim=0)  #возьмем сумму векторов последних четырех слоев

embs.shape #54 предложения, в котором 87 токенов, каждый кодируется 768 фичами

torch.Size([54, 87, 768])

In [None]:
all_embs_l = []  
for e in embs:
  all_embs_l.append(e)  #сделаем список по предложениям
all_embs = torch.cat(all_embs_l, dim=0) #конкатенируем все предложения в один длинный текст
all_embs.shape   #4698 токенов 

torch.Size([4698, 768])