In [1]:
import datetime
from tqdm import tqdm

import sys
sys.path.insert(0, '/home/skvortsova-ev/bert_cls/only_extractive/lib/python3.7/site-packages/')

import importlib.util
import torch
from transformers import BertTokenizer

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, classification_report

from IPython.display import clear_output
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch.optim import Adam, SGD
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification
from scipy.special import softmax

import pandas as pd
import numpy as np

In [7]:
# Pick the compute device. The notebook targets the second GPU (index 1); when
# the host exposes only one CUDA device, fall back to index 0 so that later
# `.to(device)` calls do not point at a GPU that does not exist.
if torch.cuda.is_available():
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device("cuda", gpu_index)
    print('GPU avaliable')
else:
    device = torch.device("cpu")
    print("GPU UNavaliable")

GPU avaliable


In [8]:
# Build an import spec that points at a local copy of the BERT tokenizer module.
# NOTE(review): `spec` is never referenced again in the visible cells (the
# tokenizer below comes from the regular `transformers` import) — confirm
# whether this cell is still needed.
spec = importlib.util.spec_from_file_location('BertTokenizer', '/data/bert_data/transformers/transformers/tokenization_bert.py')

In [9]:
# Locations of the pretrained RuBERT artifacts used by this notebook:
#   'vocab' -> WordPiece vocabulary passed to the tokenizer,
#   'json'  -> model configuration passed to `from_pretrained`,
#   'bin'   -> PyTorch weights loaded with `torch.load`.
# NOTE(review): these are absolute, machine-specific paths.
Path_pretrained = {
    'vocab': '/data/bert_data/rubert_cased_L-12_H-768_A-12_v1/vocab.txt',
    'json': '/data/bert_data/rubert_cased_L-12_H-768_A-12_v1/bert_config.json',
    'bin': '/data/bert_data/rubert_cased_L-12_H-768_A-12_v1/pytorch_model.bin',
}

In [10]:
# Create the WordPiece tokenizer from the local vocabulary file, lower-casing input.
# NOTE(review): the library reports that calling `from_pretrained` with a path
# to a single file is deprecated (see the cell output) — consider passing the
# model directory or constructing `BertTokenizer(vocab_file=...)` directly.
tokenizer = BertTokenizer.from_pretrained(Path_pretrained['vocab'], do_lower_case=True)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [11]:
import re

In [12]:
def clean_text(text):
    """Normalize a raw sentence before it is tokenized.

    Processing order:
      1. replace "Surname I. O."-style person names with a space;
      2. drop runs of three capitalized words (full names);
      3. replace "ё"/"Ё" with "е";
      4. drop dates of the form dd.mm.yy / dd.mm.yyyy together with an optional
         leading preposition ("с", "по", "до") and an optional trailing "г.";
      5. replace every non-letter character (punctuation, digits, "_") with a space;
      6. collapse whitespace, strip and lower-case.

    Args:
        text: input string.

    Returns:
        The cleaned, lower-cased string.
    """
    text = re.sub(r'[А-ЯЁ]\w+ +\w{1}\. ?\w{1}\.?', ' ', text)  # surname + initials
    text = re.sub(r'[А-ЯЁ][\w\.]+ [А-ЯЁ]\w+ [А-ЯЁ]\w+', '', text)  # three capitalized words
    text = re.sub(r'[ёЁ]', 'е', text)
    # The preposition must start at a word boundary; without `\b` the optional
    # group swallowed the last letters of the preceding word
    # (e.g. "адрес 01.02.2020" became "адре").
    text = re.sub(r'(?:\b(?:с|[пд]о))? ?\d{2}\.\d{2}\.\d{2,4} ?г?\.?', '', text)
    text = re.sub(r'[\W\d_]', ' ', text)  # anything that is not a letter
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace runs
    return text.strip().lower()

In [None]:
# Load the labelled sentence table from the working directory. Later cells read
# the columns `sentenized`, `data_type`, `label`, `violation_index`, `km_id`
# and `filename` from it.
df = pd.read_csv(r"working_labels_extractive.csv")

In [None]:
# Record the Python type of every `sentenized` value so non-string rows can be filtered out.
df['type'] = df['sentenized'].map(type)

In [None]:
# Show how many rows fall under each value type found in `sentenized`.
df['type'].value_counts()

In [None]:
# Keep only the rows whose `sentenized` value is a string.
df = df[df['type']==str]

In [None]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded.
df = df.reset_index(drop=True)

In [None]:
# Split by the precomputed `data_type` column. `.copy()` makes each split an
# independent frame, so the column assignments performed on `df_test` /
# `df_train` afterwards modify these frames themselves rather than a slice of
# `df` (the chained-assignment case pandas warns about).
df_test = df[df['data_type']=='test'].copy()
df_train = df[df['data_type']=='train'].copy()

In [None]:
# Normalize the sentence text of both splits (test first, then train) with clean_text.
for split_df in (df_test, df_train):
    split_df['sentenized'] = split_df['sentenized'].map(clean_text)

In [None]:
# Wrap every sentence in the BERT special tokens: "[CLS] <sentence> [SEP]".
df_test['sentenized'] = "[CLS] " + df_test['sentenized'] + " [SEP]"
df_train['sentenized'] = "[CLS] " + df_train['sentenized'] + " [SEP]"

In [None]:
# Split every sentence into WordPiece tokens.
tokenized_texts_train = list(map(tokenizer.tokenize, df_train['sentenized'].to_list()))
tokenized_texts_test = list(map(tokenizer.tokenize, df_test['sentenized'].to_list()))

# Store the tokens next to the text and map each token list to its vocabulary ids.
df_train['tokenized'] = tokenized_texts_train
df_train['ids'] = df_train['tokenized'].map(tokenizer.convert_tokens_to_ids)

df_test['tokenized'] = tokenized_texts_test
df_test['ids'] = df_test['tokenized'].map(tokenizer.convert_tokens_to_ids)

In [None]:
# Renumber the rows of each split 0..n-1; the index inherited from `df` is discarded.
df_train=df_train.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)

In [None]:
def pad_sequences(arr, len_, dtype=np.int64):
    """Return `arr` as a 1-D array of exactly `len_` elements.

    Sequences with `len_` or more elements are truncated to the first `len_`;
    shorter ones are right-padded with zeros.

    Args:
        arr: 1-D sequence of numbers (e.g. token ids).
        len_: target length.
        dtype: dtype of the returned array. Defaults to `np.int64`; the former
            default `np.long` is absent from NumPy 1.24+ and made the function
            definition itself fail there.

    Returns:
        numpy.ndarray of shape `(len_,)` when `len_ >= 0`.
    """
    arr = np.array(arr, dtype=dtype)
    if arr.shape[0] >= len_:
        return arr[:len_]
    padding = np.zeros(len_ - arr.shape[0], dtype=dtype)
    return np.concatenate((arr, padding))

In [None]:
# Distribution of the number of token ids per training sentence.
plt.hist(df_train['ids'].map(len).to_list(), bins=80)
plt.xlabel('words count')
plt.show()

In [None]:
# Distribution of the number of token ids per test sentence.
plt.hist(df_test['ids'].map(len).to_list(), bins=80)
plt.xlabel('words count')
plt.show()

In [None]:
# Bring every id sequence to a fixed length of 50 (truncate or zero-pad).
# NOTE(review): truncation keeps the first 50 ids, so sequences longer than
# that lose the "[SEP]" token that was appended at the end of the text.
df_train['input_ids'] = df_train['ids'].map(lambda ids: pad_sequences(ids, 50))
df_test['input_ids'] = df_test['ids'].map(lambda ids: pad_sequences(ids, 50))

# Attention mask: 1.0 where the id is greater than 0, 0.0 elsewhere (zero padding).
df_train['attention_masks'] = df_train['input_ids'].map(lambda padded: (padded > 0).astype(float).tolist())
df_test['attention_masks'] = df_test['input_ids'].map(lambda padded: (padded > 0).astype(float).tolist())

In [None]:
# Histogram of group sizes: number of training rows (non-null `filename`) per `km_id`.
df_train.groupby(['km_id']).count()[['filename']].hist(bins = 50)

In [None]:
# Split the training data into bins that hold the same number of sentences per
# description: count the rows (non-null `filename`) of every `violation_index`,
# order the violations by that count (largest first) and collect the distinct
# counts that occur.
len_violations = (
    df_train.groupby(['violation_index'])
    .count()[['filename']]
    .reset_index()
    .sort_values(['filename'], ascending=False)
)
lengths = list(set(len_violations['filename'].to_list()))

In [None]:
# Histogram of group sizes: number of test rows (non-null `filename`) per `violation_index`.
df_test.groupby(['violation_index']).count()[['filename']].hist(bins = 50)

In [None]:
%%time
torch.cuda.empty_cache()

# Build a two-class sequence classifier from the local RuBERT config and
# weights. The checkpoint is deserialized onto the CPU first so that loading
# does not depend on the device it was saved from; the model is moved to the
# selected device right after.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=Path_pretrained['json'],
    state_dict=torch.load(Path_pretrained['bin'], map_location='cpu'),
    num_labels=2
)

model.to(device)

# Apply weight decay to all parameters except biases and LayerNorm weights.
# The per-group option must be named 'weight_decay': that is the key AdamW
# reads. The previous key 'weight_decay_rate' was silently ignored, so no
# decay was applied at all.
param_optimizer = list(model.named_parameters())
no_decay = ["LayerNorm", "layer_norm", "bias"]
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

In [None]:
def DrawPlot(train_loss_set, val_loss_set, epoch, step, train_dataloader):
    """Redraw the live loss chart in the current output cell.

    Args:
        train_loss_set: per-batch training losses collected so far.
        val_loss_set: validation losses collected so far.
        epoch: zero-based index of the current epoch; labels are drawn for the
            `epoch` epochs before it.
        step: zero-based index of the current batch (used in the title only).
        train_dataloader: only its `len()` is used, as the number of batches
            per epoch when positioning labels and epoch separators.
    """
    steps_per_epoch = len(train_dataloader)

    clear_output(True)
    plt.figure(figsize=(20,10))
    plt.ylim(bottom=-.15, top=2.5)
    plt.plot(train_loss_set)
    plt.plot(val_loss_set, linewidth=4)
    plt.axhline(0, c='black', linewidth=2)

    for i in range(epoch):
        val_idx = i * steps_per_epoch
        # Label the epoch with its validation loss only when that value exists;
        # indexing unconditionally raised IndexError for a short val_loss_set.
        if val_idx < len(val_loss_set):
            plt.text(steps_per_epoch*i + steps_per_epoch//4, -.07, '{:.3f}'.format(val_loss_set[val_idx]), fontsize=10, bbox=dict(facecolor='red', alpha=0.5))
        plt.text(steps_per_epoch*i + steps_per_epoch//2, 2, i+1, bbox=dict(facecolor='yellow', alpha=0.3))

    for vline in range(epoch+1):
        # Full-height epoch separator. `axvline`'s ymax is a fraction of the
        # axes height, not a loss value, so passing max(train_loss_set) was
        # wrong (and raised ValueError on an empty list).
        plt.axvline((vline+1)*steps_per_epoch, color='red')

    plt.title(f"Training loss epoch {epoch+1}, step {step+1}/{steps_per_epoch}")
    plt.xlabel("Batch")
    plt.ylabel("Loss")
    plt.show()

In [None]:
# Split the test data into bins that hold the same number of sentences per
# violation description: count the rows (non-null `filename`) of every
# `violation_index`, order the violations by that count (largest first) and
# collect the distinct counts that occur.
len_violations_test = (
    df_test.groupby(['violation_index'])
    .count()[['filename']]
    .reset_index()
    .sort_values(['filename'], ascending=False)
)
lengths_test = list(set(len_violations_test['filename'].to_list()))

In [None]:
%%time

EPOCH_NUM = 10

# Losses are stored during training so that they can be plotted.
train_loss_set = []
val_loss_set = []

train_loss = 0  # running sum of training losses over all epochs
val_score = []    # macro precision of every evaluated validation batch
val_score_f = []  # macro F1 of every evaluated validation batch
model.train()

for epoch in range(EPOCH_NUM):
    # ---- Training: violations are processed bin by bin, where a bin groups
    # the violations that have the same row count (`lengths[num]`). ----
    for num in range(len(lengths)):
        # Batch size equals the bin's row count, so a violation fits into one
        # batch (assuming `filename` has no missing values — TODO confirm).
        BATCH_SIZE = lengths[num]
        temp_length_filter = set(len_violations[len_violations['filename']==lengths[num]]['violation_index'].to_list())
        temp_df_train = df_train[df_train['violation_index'].isin(temp_length_filter)]
        kms = list(set(temp_df_train['violation_index'].to_list()))
        for violation in tqdm(kms):
            df_batch = df_train[df_train['violation_index']==violation].reset_index(drop = True)
            train_inputs = torch.tensor(df_batch['input_ids'].to_list())
            train_labels = torch.tensor(df_batch['label'].to_list())
            train_masks = torch.tensor(df_batch['attention_masks'].to_list())

            train_data = TensorDataset(train_inputs, train_masks, train_labels)
            train_dataloader = DataLoader(
                train_data,
                sampler=RandomSampler(train_data),
                batch_size=BATCH_SIZE
            )

            for step, batch in enumerate(train_dataloader):
                # Move the batch to the compute device as int64 tensors.
                batch = tuple(t.to(device).to(torch.int64) for t in batch)
                b_input_ids, b_input_mask, b_labels = batch
                optimizer.zero_grad()

                # Forward pass; with `labels` given, the first output is the loss.
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
                batch_loss = outputs[0]

                train_loss_set.append(batch_loss.item())

                # Backward pass and parameter update.
                batch_loss.backward()
                optimizer.step()

                train_loss += batch_loss.item()
                optimizer.zero_grad()

            torch.cuda.empty_cache()

    # ---- Validation after every epoch, using the test-split bins. ----
    model.eval()
    val_loss = 0
    for num in range(len(lengths_test)):
        # Use the TEST bin size here. Indexing the training `lengths` with a
        # test-bin index gave a wrong batch size, or an IndexError when the
        # test split has more bins than the training split.
        BATCH_SIZE_test = lengths_test[num]
        temp_length_filter_test = set(len_violations_test[len_violations_test['filename']==lengths_test[num]]['violation_index'].to_list())
        temp_df_test = df_test[df_test['violation_index'].isin(temp_length_filter_test)]
        kms_test = list(set(temp_df_test['violation_index'].to_list()))
        for viol in kms_test:
            df_batch_test = df_test[df_test['violation_index']==viol].reset_index(drop = True)
            test_inputs = torch.tensor(df_batch_test['input_ids'].to_list())
            test_labels = torch.tensor(df_batch_test['label'].to_list())
            test_masks = torch.tensor(df_batch_test['attention_masks'].to_list())

            test_data = TensorDataset(test_inputs, test_masks, test_labels)
            # Evaluation does not need shuffling; a sequential pass keeps it deterministic.
            test_dataloader = DataLoader(
                test_data,
                sampler=SequentialSampler(test_data),
                batch_size=BATCH_SIZE_test
            )

            for i, val in enumerate(test_dataloader):
                # NOTE(review): `step` is the index of the LAST training batch,
                # left over from the loop above; it caps how many validation
                # batches are evaluated and scales `val_loss_set` below.
                # Confirm this coupling is intended.
                if i > step:
                    break
                val = tuple(t.to(device).to(torch.int64) for t in val)
                b_input_ids, b_input_mask, b_labels = val
                with torch.no_grad():
                    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
                val_loss += outputs[0].item()
                logits = outputs[1].detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()

                # Compute predictions and metrics once and reuse them for logging.
                preds = np.argmax(logits, axis=1)
                batch_precision = precision_score(label_ids, preds, average='macro')
                val_score.append(batch_precision)
                val_score_f.append(f1_score(label_ids, preds, average='macro'))
                print('EPOCH_NUMBER: ', epoch, 'PRECISION :', batch_precision)
            # `val_loss` keeps accumulating over all violations of the epoch.
            val_loss_set += [val_loss/(step+1)]*(step+1)
    model.train()

In [None]:
# File (relative to the working directory) that receives the fine-tuned weights.
path= 'bersumext.pt'

In [None]:
# Persist only the model parameters (state dict), not the whole model object.
torch.save(model.state_dict(), path)