In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_transformers import BertTokenizer, BertConfig
from pytorch_transformers import AdamW, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from IPython.display import clear_output
import tensorflow as tf

In [None]:
# Show up to 500 characters per column so long claim/evidence strings are not
# truncated when a DataFrame is displayed.
pd.set_option('display.max_colwidth', 500)

In [None]:
# Select the compute device once; the helpers below move models and batches to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define n_gpu on every path (0 without CUDA) so later cells can read it
# without hitting a NameError on CPU-only machines.
n_gpu = torch.cuda.device_count() if device.type == 'cuda' else 0
if device.type == 'cpu':
    print('cpu')
else:
    print(torch.cuda.get_device_name(0))

Tesla T4


In [None]:
def _build_bert_optimizer(model, lr = 2e-5, weight_decay = 0.01):
  """Create AdamW with weight decay switched off for bias and LayerNorm parameters."""
  # 'gamma'/'beta' are kept for checkpoints that still use those LayerNorm
  # parameter names; 'LayerNorm.weight' covers the current naming.
  no_decay = ('bias', 'gamma', 'beta', 'LayerNorm.weight')
  decay_params, no_decay_params = [], []
  for name, param in model.named_parameters():
    if any(nd in name for nd in no_decay):
      no_decay_params.append(param)
    else:
      decay_params.append(param)

  # The per-group key must be 'weight_decay'; an unknown key such as
  # 'weight_decay_rate' is ignored and the optimizer default is used instead.
  grouped_parameters = [
    {'params': decay_params, 'weight_decay': weight_decay},
    {'params': no_decay_params, 'weight_decay': 0.0},
  ]
  return AdamW(grouped_parameters, lr=lr)


def bert_classification(dataloader, model , model_type = 'train', optimizer = None):
  """Run one pass of a BERT-style sequence classifier over `dataloader`.

  Args:
    dataloader: iterable of batches `(input_ids, attention_mask, token_type_ids,
      labels)`; every tensor is moved to the global `device`.
    model: module called as `model(input_ids, token_type_ids=..., attention_mask=...,
      labels=...)` that returns a tuple whose first element is the loss (when
      labels are given) or the logits (when they are not).
    model_type: 'train' to run one optimisation pass, 'test' to predict.
    optimizer: optional optimizer to use in 'train' mode. Pass the same instance
      on every call to keep its state between epochs; when omitted a new AdamW
      (lr=2e-5, weight decay 0.01 except bias/LayerNorm) is created per call.

  Returns:
    'train': the sum of the per-batch losses as a float.
    'test': `(valid_labels, valid_preds, valid_preds_score, logits_array)` where
      the first two are flat lists, `valid_preds_score` is the max logit per
      sample and `logits_array` is a float64 array of shape (n_samples, n_classes).

  Raises:
    ValueError: if `model_type` is neither 'train' nor 'test'.
  """
  if model_type not in ('train', 'test'):
    raise ValueError("model_type must be 'train' or 'test', got %r" % (model_type,))

  model = model.to(device)

  # Training
  if model_type == 'train':
    print('Model train')
    if optimizer is None:
      optimizer = _build_bert_optimizer(model)

    train_loss = 0
    model.train()

    for batch in dataloader:
      b_input_ids, b_input_mask, b_seg, b_labels = tuple(t.to(device) for t in batch)

      optimizer.zero_grad()
      outputs = model(b_input_ids, token_type_ids=b_seg, attention_mask=b_input_mask, labels=b_labels)
      loss = outputs[0]  # with labels supplied the first output is the loss

      loss.backward()
      optimizer.step()
      train_loss += loss.item()

    return train_loss

  # Evaluation
  print('Model test')
  model.eval()
  valid_preds, valid_labels = [], []
  logit_batches = []

  for batch in dataloader:
    b_input_ids, b_input_mask, b_seg, b_labels = tuple(t.to(device) for t in batch)

    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=b_seg, attention_mask=b_input_mask)

    logits = outputs[0].detach().cpu().numpy()
    logit_batches.append(logits)

    valid_preds.extend(np.argmax(logits, axis=1))
    # reshape(-1) flattens both (batch, 1) and (batch,) label tensors.
    valid_labels.extend(b_labels.to('cpu').numpy().reshape(-1))

  if logit_batches:
    # Keep the model's own class dimension instead of assuming two classes;
    # float64 matches what the previous np.append-based accumulation produced.
    logits_array = np.concatenate(logit_batches, axis=0).astype(np.float64)
  else:
    logits_array = np.empty((0, 2), dtype=np.float64)
  valid_preds_score = logits_array.max(axis=1)

  return valid_labels, valid_preds, valid_preds_score, logits_array

In [None]:
def preprocessing_data_for_BertModel(df = None, sen1 = None, sen2 = None, claim = 'ru_claim', evidence = 'ru_evidence',\
                                     lang = 'ru', re_index = False, b_size = 8):
  """Turn a claim/evidence DataFrame into a DataLoader of BERT inputs.

  Steps: drop every row whose claim occurs more than once, drop rows with
  label 0, optionally clean English evidence and shuffle, map labels to a
  binary target (2 -> 0, anything else -> 1), tokenize the claim/evidence
  pairs with `encoded_inputs_` and pad them with `inputs_with_padding`.

  Args:
    df: DataFrame with the `claim` and `evidence` columns and a 'label' column.
      The caller's frame is not modified.
    sen1, sen2: unused; kept so existing calls keep working.
    claim, evidence: column names of the two sentences.
    lang: 'eng' additionally runs `clean_evidence` over the evidence column.
    re_index: if truthy, shuffle rows with a fixed seed (42) and reset the index.
    b_size: batch size of the returned DataLoader.

  Returns:
    DataLoader over a TensorDataset yielding
    `(input_ids, attention_mask, token_type_ids, labels)` with labels of shape
    (batch, 1), sampled with RandomSampler.

  Raises:
    ValueError: if `df` is None.
  """
  if df is None:
    raise ValueError("df must be a pandas DataFrame with claim, evidence and 'label' columns")

  df = df.drop_duplicates(subset = claim, keep = False)
  # Explicit copy: the column assignment below must write to our own frame,
  # not to a slice of the caller's (avoids SettingWithCopyWarning).
  df = df[df["label"] != 0].copy()

  if lang == 'eng':
    print('eng')
    df[evidence] = df[evidence].apply(clean_evidence)

  if re_index:
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle the DataFrame

  # Binary target in the (n, 1) layout the model loop expects.
  binary_labels = [[0] if label == 2 else [1] for label in df["label"]]
  test_labels = torch.tensor(binary_labels)

  # Tokenize the sentence pairs.
  test_input_ids, test_token_type_ids, test_attention_mask = encoded_inputs_(list(df[claim]), list(df[evidence]))

  # Pad every sequence to a common length.
  test_input_ids = inputs_with_padding(test_input_ids)
  test_token_type_ids = inputs_with_padding(test_token_type_ids)
  test_attention_mask = inputs_with_padding(test_attention_mask)

  # Convert to tensors and bundle them in the order the model loop unpacks.
  test_inputs = torch.tensor(test_input_ids)
  test_token = torch.tensor(test_token_type_ids)
  test_masks = torch.tensor(test_attention_mask)
  test_data = TensorDataset(test_inputs, test_masks, test_token, test_labels)
  return DataLoader(test_data, sampler=RandomSampler(test_data), batch_size=b_size)


In [None]:
# Clean special marker tokens and non-English characters, and re-attach 's.
# input: str
# output: str
import re
def clean_evidence(evidence):
    """Return `evidence` with wrapper characters, marker tokens and foreign symbols removed.

    The first two and last two characters are dropped unconditionally
    (presumably the "['" / "']" wrapper of a stringified list -- confirm
    against the data). Upper-case three-letter marker tokens of the form
    -XXX- (e.g. -LRB-, -RRB-) are deleted, " 's " is joined back onto the
    preceding word, and every character that is not an ASCII letter, digit,
    whitespace or one of . , - ' " % is removed.
    """
    evidence = evidence[2:-2]
    # Only upper-case marker tokens: a bare '-...-' would also swallow ordinary
    # hyphenated words such as "up-and-coming".
    clean = re.sub(r'-[A-Z]{3}-', '', evidence)
    clean = re.sub(" 's ", "'s ", clean)  # son 's -> son's
    # 'A-Za-z' rather than 'A-z': the latter range also admits [ \ ] ^ _ and `.
    # TODO: maybe also allow typographic quotation marks.
    return re.sub(r'[^A-Za-z0-9\s\.,\-\'\"%]+', '', clean)

In [None]:
def inputs_with_padding(encoded_input, MAXLEN = 512):
  """Pad or cut every sequence in `encoded_input` to exactly `MAXLEN` entries.

  Delegates to `pad_sequences`: padding and truncation are both applied at the
  end of each sequence ("post") and the result uses dtype "long".
  """
  pad_options = dict(
    maxlen=MAXLEN,
    dtype="long",
    padding="post",
    truncating="post",
  )
  return pad_sequences(encoded_input, **pad_options)

In [None]:
from transformers import AutoTokenizer
# input: claim(s) and evidence(s) as str or list of str
# output: input_ids, token_type_ids, attention_mask
# multilingual tokenizer: 'bert-base-multilingual-cased'
# Russian-only tokenizer: 'DeepPavlov/rubert-base-cased'
_DEFAULT_TOKENIZER_NAME = 'bert-base-multilingual-cased'
_DEFAULT_TOKENIZER = None

def _get_default_tokenizer():
  """Load the default tokenizer on first use and reuse it afterwards."""
  global _DEFAULT_TOKENIZER
  if _DEFAULT_TOKENIZER is None:
    _DEFAULT_TOKENIZER = AutoTokenizer.from_pretrained(_DEFAULT_TOKENIZER_NAME)
  return _DEFAULT_TOKENIZER

def encoded_inputs_(claim, evidence, tokenizer = None, MAXLEN = 512):
  """Tokenize claim/evidence pairs and return the three BERT input lists.

  Args:
    claim: first sentence(s) passed to the tokenizer.
    evidence: second sentence(s) passed to the tokenizer.
    tokenizer: callable tokenizer; when None the 'bert-base-multilingual-cased'
      tokenizer is loaded lazily (on first call, not when this cell is defined).
    MAXLEN: maximum number of tokens per encoded pair; longer pairs are truncated.

  Returns:
    Tuple `(input_ids, token_type_ids, attention_mask)` taken from the
    tokenizer output.
  """
  if tokenizer is None:
    tokenizer = _get_default_tokenizer()
  # truncation=True makes the max_length cut explicit instead of relying on the
  # tokenizer's warning-and-fallback behaviour.
  encoded_inputs = tokenizer(claim, evidence, max_length=MAXLEN, truncation=True)
  return encoded_inputs['input_ids'], encoded_inputs['token_type_ids'], encoded_inputs['attention_mask']

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961828.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…


