In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, \
f1_score, log_loss

import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification,  \
Trainer, TrainingArguments

import evaluate
from scipy.special import softmax
import warnings
from warnings import simplefilter

import json
import yaml

import tqdm
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

from typing import Dict

warnings.filterwarnings("ignore")
simplefilter("ignore", category=RuntimeWarning)

In [7]:
config_path = "../config/config.yaml"
config = yaml.load(open(config_path), Loader=yaml.FullLoader)

In [15]:
preproc = config['preprocessing']
train_params = config['train']
test_params = config['test']

In [12]:
def get_dataset(path: str):
    
    data = pd.read_csv(path)
    cols_to_drop = data.columns[data.columns.str.startswith('Unnamed')]
    
    data.drop(cols_to_drop, axis=1, inplace=True)
    data = data.drop_duplicates().reset_index(drop=True)
    
    return data


def plotting_trainer_loss(trainer):
    auc = []
    eval_loss = []
    
    
    for step in trainer.state.log_history:
        try:
            auc.append(step["eval_roc_auc"])
            eval_loss.append(step['eval_loss'])
            
        except KeyError:
            continue
    
    fig, axes = plt.subplots(ncols=2, figsize=(15, 5))
    sns.lineplot(eval_loss, ax=axes[0], color='orange')
    sns.lineplot(auc, ax=axes[1])
    
    axes[0].set_title('Validation Loss')
    axes[1].set_title('ROC-AUC SCORE')
    axes[0].set(xlabel='Epochs')
    axes[1].set(xlabel='Epochs')
    
    
def compute_metrics(eval_preds):
    """Расчет метрики roc-auc"""

    metric = evaluate.load("roc_auc")
    logits, labels = eval_preds
    predictions = softmax(logits)[:, 1]
    res = metric.compute(prediction_scores=predictions, 
                         references=labels)

    return {'roc_auc': res['roc_auc']}


def get_metrics(y_test: np.array, y_pred: np.array, y_proba: np.array) -> Dict:
    dict_metrics = {
        'roc_auc': round(roc_auc_score(y_test, y_proba[:, 1]), 3),
        'precision': round(precision_score(y_test, y_pred), 3),
        'recall': round(recall_score(y_test, y_pred), 3),
        'f1': round(f1_score(y_test, y_pred), 3),
        'logloss': round(log_loss(y_test, y_proba), 3)
    }

    return dict_metrics

In [23]:
class PrepareData:
    
    def __init__(self, texts, tokenizer, batch_size_split=train_params['batch_size_split'], 
                 max_length=train_params['max_length']):
        
        self.texts = texts
        self.tokenizer = tokenizer
        self.batch_size_split = batch_size_split
        self.max_length = max_length
        
    def pre_tokenizer(self, text):
        return self.tokenizer(text,
                              add_special_tokens=True, 
                              max_length=self.max_length,
                              pad_to_max_length=True,
                              truncation=True,
                              return_attention_mask=True,
                              return_tensors='pt')
    
        
    def transform(self):
        
        N = len(self.texts)
        size_split = N // self.batch_size_split

        train_encodings = self.pre_tokenizer(self.texts[:size_split])
        input_ids = train_encodings['input_ids']
        attention_mask = train_encodings['attention_mask']
        token_type_ids = train_encodings['token_type_ids']

        for pos in tqdm(range(size_split, N, size_split)):
            train_encodings_2 = self.pre_tokenizer(self.texts[pos:pos +
                                                              size_split])
            input_ids = torch.cat((input_ids, train_encodings_2['input_ids']))
            attention_mask = torch.cat(
                (attention_mask, train_encodings_2['attention_mask']))
            token_type_ids = torch.cat(
                (token_type_ids, train_encodings_2['token_type_ids']))
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids
        } 

In [14]:
class CustomDataset(torch.utils.data.Dataset):
    
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [16]:
preproc

{'random_state': 10,
 'test_size': 0.25,
 'raw_path': '../data/raw/data_reviews.csv',
 'train_path': '../data/processed/train_data.csv',
 'test_path': '../data/processed/test_data.csv'}

In [19]:
test_df = get_dataset(preproc['test_path'])

In [22]:
tokenizer = BertTokenizer.from_pretrained(train_params['tokenizer_path'])

In [25]:
clf = PrepareData(test_df.reviewText.tolist(), tokenizer)
test_encodings = clf.transform()

  0%|          | 0/10 [00:00<?, ?it/s]

In [26]:
test_dataset = CustomDataset(test_encodings, test_df.target.tolist())

In [31]:
train_params

{'max_length': 512,
 'batch_size': 8,
 'random_state': 10,
 'learning_rate': 2e-05,
 'tokenizer_path': 'cointegrated/rubert-tiny2',
 'model_path': 'cointegrated/rubert-tiny2',
 'epochs': 10,
 'weight_decay': 0.01,
 'per_device_batch_size': 64,
 'batch_size_split': 10,
 'metrics_path': '../metrics/metrics.json'}

In [33]:
model = BertForSequenceClassification.from_pretrained(test_params['model_path'])

In [39]:
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
)

# Предсказание модели классификации Bert
y_pred = trainer.predict(test_dataset)

# Получение вероятностей
pred_proba = y_pred[0]

# Получение предсказанных меток класса
pred = pred_proba.argmax(axis=1)

In [41]:
metrics = get_metrics(test_df.target.tolist(), pred, pred_proba)
metrics

{'roc_auc': 0.973,
 'precision': 0.924,
 'recall': 0.898,
 'f1': 0.911,
 'logloss': 1.021}

In [42]:
with open (train_params['metrics_path'], 'a') as f:
    json.dump(metrics, f)

In [45]:
# with open("../config/config.yaml", 'w',) as f :
#     yaml.dump(config, f, sort_keys=False)