## Модель

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Identifier of the pretrained checkpoint; the tokenizer and the classifier
# below are both loaded from it via `from_pretrained(model_name)`.
model_name = 'DeepPavlov/rubert-base-cased-conversational'

In [4]:
# Tokenizer matching the checkpoint named in `model_name`.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [5]:
# Sequence-classification model built from the checkpoint. As the load-time
# warning reports, `classifier.weight` / `classifier.bias` are not in the
# checkpoint and are newly initialized, so the model must be fine-tuned
# before its predictions are meaningful.
model = BertForSequenceClassification.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Данные

In [7]:
# Read the train / validation / test splits (in that order) from the
# working directory.
df, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "val.csv", "test.csv")
)

In [9]:
df.columns

Index(['text', 'inappropriate', 'offline_crime', 'online_crime', 'drugs',
       'gambling', 'pornography', 'prostitution', 'slavery', 'suicide',
       'terrorism', 'weapons', 'body_shaming', 'health_shaming', 'politics',
       'racism', 'religion', 'sexual_minorities', 'sexism', 'social_injustice',
       'human_labeled'],
      dtype='object')

In [11]:
def round_val(val):
    """Round a label score to the nearest integer with the built-in ``round``.

    Ties follow the built-in's round-half-to-even rule, so
    ``round_val(0.5) == 0`` while ``round_val(1.5) == 2``.
    """
    nearest = round(val)
    return nearest

In [12]:
label_name = 'inappropriate'
threshold = 0


def keep_confident_rows(frame, column, margin):
    """Keep rows with a confident label score and turn that score into an integer.

    A row is kept when its ``column`` value is ``>= 1 - margin`` or
    ``<= margin``; the kept values are then passed through ``round_val``.

    Args:
        frame: DataFrame holding the label column.
        column: name of the label column to filter on and round.
        margin: allowed distance from 0 or 1 (0 keeps only scores <= 0 or >= 1).

    Returns:
        A new, independent DataFrame; ``frame`` itself is not modified.
    """
    scores = frame[column]
    confident = (scores >= 1 - margin) | (scores <= margin)
    # Copy explicitly: assigning into a boolean-mask slice is chained
    # assignment and triggers pandas' SettingWithCopyWarning.
    kept = frame[confident].copy()
    kept[column] = kept[column].apply(round_val)
    return kept


df = keep_confident_rows(df, label_name, threshold)
df_val = keep_confident_rows(df_val, label_name, threshold)
df_test = keep_confident_rows(df_test, label_name, threshold)

In [13]:
df.columns

Index(['text', 'inappropriate', 'offline_crime', 'online_crime', 'drugs',
       'gambling', 'pornography', 'prostitution', 'slavery', 'suicide',
       'terrorism', 'weapons', 'body_shaming', 'health_shaming', 'politics',
       'racism', 'religion', 'sexual_minorities', 'sexism', 'social_injustice',
       'human_labeled'],
      dtype='object')

In [14]:
class UnsafeDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized examples with optional labels.

    Args:
        encodings: mapping from feature name to a per-example sequence. It must
            contain the key ``'input_ids'``, which defines the dataset length.
        labels: optional per-example labels, indexable by position. When given,
            each item also carries a ``'labels'`` entry.

    Raises:
        ValueError: if ``labels`` is given and its length differs from the
            number of encoded examples.
    """

    def __init__(self, encodings, labels=None):
        # Fail fast on misaligned inputs instead of hitting an IndexError (or
        # silently ignoring surplus labels) somewhere inside the training loop.
        if labels is not None and len(labels) != len(encodings['input_ids']):
            raise ValueError(
                "labels and encodings differ in length: "
                f"{len(labels)} != {len(encodings['input_ids'])}"
            )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of example ``idx`` as a dict (plus ``'labels'`` if available)."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Number of examples, taken from the length of ``encodings['input_ids']``."""
        return len(self.encodings['input_ids'])

In [16]:
# Tokenize the training texts (truncated to 64 tokens, padded to the longest
# sequence in this split). Labels are read through `label_name` so they always
# come from the same column that was filtered and rounded earlier, rather than
# from a hard-coded attribute.
train_dataset = UnsafeDataset(tokenizer(df.text.tolist(),
                                        max_length=64,
                                        truncation=True,
                                        padding='longest'), df[label_name].tolist())

In [18]:
# Tokenize the validation texts (truncated to 64 tokens, padded to the longest
# sequence in this split). Labels are read through `label_name` so they always
# come from the same column that was filtered and rounded earlier, rather than
# from a hard-coded attribute.
eval_dataset = UnsafeDataset(tokenizer(df_val.text.tolist(),
                                       max_length=64,
                                       truncation=True,
                                       padding='longest'), df_val[label_name].tolist())


In [19]:
# Tokenize the test texts (truncated to 64 tokens, padded to the longest
# sequence in this split). Labels are read through `label_name` so they always
# come from the same column that was filtered and rounded earlier, rather than
# from a hard-coded attribute.
test_dataset = UnsafeDataset(tokenizer(df_test.text.tolist(),
                                       max_length=64,
                                       truncation=True,
                                       padding='longest'), df_test[label_name].tolist())

## Обучение

In [22]:
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple

# Device used for the model and handed to the Trainer through `TrAr` below.
device = torch.device('cuda')

class TrAr(TrainingArguments):
    """TrainingArguments variant whose device setup always yields the notebook-level `device`.

    Overriding the cached `_setup_devices` property replaces the library's own
    device selection with the module-level `device`.
    presumably this keeps training on a single, explicitly chosen GPU -- verify
    against the installed transformers version, since `_setup_devices` is a
    private hook.
    """

    @cached_property
    def _setup_devices(self) -> Tuple["torch.device", int]:
        # NOTE(review): the annotation promises a (device, int) tuple, but a bare
        # `torch.device` is returned -- confirm which form the installed
        # transformers release expects.
        return device

In [23]:
# torch.cuda.set_device() requires a device with an explicit index (or an int);
# an index-less torch.device('cuda') is rejected with a ValueError. Only pin the
# process to a GPU when an index was actually given -- otherwise the current
# CUDA device is used as is.
if device.index is not None:
    torch.cuda.set_device(device)
model.to(device);

In [24]:
# Mark every parameter of the BERT encoder as trainable, so the whole network
# (not only the classification head) is updated during fine-tuning.
for weight in model.bert.parameters():
    weight.requires_grad_(True)

In [25]:
# Fine-tuning configuration, grouped by concern.
training_args = TrAr(
    # -- outputs: checkpoints and logs --
    output_dir='./unsafe/FINAL_VERS',
    overwrite_output_dir=True,
    save_total_limit=2,
    logging_dir='./logs',
    # -- optimisation --
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=1e-8,
    warmup_steps=0,
    # -- log, evaluate and checkpoint on the same 2500-step cadence --
    evaluation_strategy='steps',
    logging_steps=2500,
    eval_steps=2500,
    save_steps=2500,
    # -- model selection: higher 'f1' (as reported by compute_metrics) wins --
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
)

In [26]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Score one evaluation pass for the Trainer.

    Args:
        pred: object exposing ``predictions`` (per-class scores; the argmax over
            the last axis is taken as the predicted class) and ``label_ids``.

    Returns:
        dict with ``'accuracy'`` plus support-weighted ``'f1'``,
        ``'precision'`` and ``'recall'``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    # The fourth element returned (support) is not needed here.
    weighted_precision, weighted_recall, weighted_f1 = precision_recall_fscore_support(
        y_true, y_hat, average='weighted'
    )[:3]
    return dict(
        accuracy=accuracy_score(y_true, y_hat),
        f1=weighted_f1,
        precision=weighted_precision,
        recall=weighted_recall,
    )

In [27]:
# Wire the model, its configuration, the train/validation data and the metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


In [28]:
from transformers.trainer_callback import EarlyStoppingCallback
# Stop training once the tracked metric has failed to improve for 3
# consecutive evaluations.
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

In [29]:
training_args.device

device(type='cuda', index=3)

In [31]:
# Run fine-tuning. With the early-stopping callback attached the run can end
# before the configured 5 epochs (the recorded run stopped at epoch 4.71).
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
2500,0.3138,0.274069,0.885798,0.882627,0.882859,0.885798,13.5431,782.982
5000,0.1629,0.322487,0.888533,0.887054,0.886421,0.888533,13.531,783.684
7500,0.0792,0.523052,0.88344,0.88221,0.881518,0.88344,13.5414,783.078
10000,0.0381,0.730064,0.885043,0.884201,0.883625,0.885043,13.5603,781.989
12500,0.0181,0.814693,0.885326,0.882978,0.882551,0.885326,13.5514,782.503


TrainOutput(global_step=12500, training_loss=0.12241329833984375, metrics={'train_runtime': 2201.8128, 'train_samples_per_second': 6.027, 'total_flos': 27311694989644800, 'epoch': 4.71})

## Evaluation

In [32]:
# Score the test split; `pred` is consumed by get_metrics below, which reads
# its `.predictions` and `.label_ids` attributes.
pred = trainer.predict(test_dataset)

In [33]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score

In [34]:
import numpy as np
from scipy.special import softmax
from sklearn.metrics import precision_recall_fscore_support, classification_report, roc_auc_score
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score
def get_metrics(preds, thresholds=None):
    """Print test metrics plus a decision-threshold sweep and return the weighted F1.

    Two views of the same predictions are reported:

    * argmax decisions: support-weighted precision / recall / F1 and a
      classification report;
    * thresholded decisions: class scores are soft-maxed and column 1 is
      compared against each threshold; the five best rows by F1 are printed.

    Args:
        preds: object exposing ``predictions`` (2-D per-class scores; column 1
            is treated as the positive class) and ``label_ids``.
        thresholds: optional iterable of cut-offs for the sweep. Defaults to
            ``[0.01, 0.1, 0.2, ..., 0.9, 1]``.

    Returns:
        The support-weighted F1 of the argmax decisions.
    """
    if thresholds is None:
        thresholds = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

    scores, labels = preds.predictions, preds.label_ids

    # Standard approach: the highest-scoring class wins.
    pred_flat = np.argmax(scores, axis=1).flatten()
    # zero_division=0 yields the same value as sklearn's default for an
    # undefined metric (0) but without emitting a warning.
    pr, rec, f, _ = precision_recall_fscore_support(
        labels, pred_flat, average='weighted', zero_division=0
    )

    print("precision", pr)
    print("recall", rec)
    print("fscore_weighted", f)

    # Threshold approach: turn scores into probabilities first.
    probs = softmax(np.asarray(scores, dtype=float), axis=1)
    positive_prob = probs[:, 1]
    roc_auc = roc_auc_score(labels, positive_prob)
    print("roc_auc", roc_auc)

    all_metrics = []
    for threshold in thresholds:
        pred_labels = (positive_prob >= threshold).astype(int)
        # Extreme thresholds can leave a class without any predicted sample,
        # which makes precision undefined -- hence zero_division=0.
        # Note: 'f1' is support-weighted, whereas 'prec' and 'rec' are computed
        # with sklearn's default averaging (no `average` argument is passed).
        all_metrics.append([
            threshold,
            round(f1_score(labels, pred_labels, average='weighted', zero_division=0), 2),
            round(precision_score(labels, pred_labels, zero_division=0), 2),
            round(recall_score(labels, pred_labels, zero_division=0), 2),
            round(accuracy_score(labels, pred_labels), 2),
        ])

    df_metrics = pd.DataFrame(data=all_metrics, columns=['threshold', 'f1', 'prec', 'rec', 'acc'])
    # Rounded F1 values tie frequently; a stable sort keeps tied rows in
    # threshold order so the printed table is deterministic.
    df_metrics = df_metrics.sort_values(by='f1', ascending=False, kind='mergesort')

    print(classification_report(labels, pred_flat, zero_division=0))

    print(df_metrics.head())

    return f

# Report metrics for the test-set predictions; as the cell's last expression,
# the returned weighted F1 is also echoed as the cell output.
get_metrics(pred)

precision 0.8845704002666753
recall 0.8867960246095599
fscore_weighted 0.8851992422915465
roc_auc 0.935834400995755
              precision    recall  f1-score   support

           0       0.91      0.94      0.92      7839
           1       0.81      0.74      0.77      2726

    accuracy                           0.89     10565
   macro avg       0.86      0.84      0.85     10565
weighted avg       0.88      0.89      0.89     10565

   threshold    f1  prec   rec   acc
3        0.3  0.89  0.77  0.80  0.88
4        0.4  0.89  0.79  0.77  0.89
5        0.5  0.89  0.81  0.74  0.89
2        0.2  0.88  0.74  0.82  0.88
6        0.6  0.88  0.82  0.71  0.89


  _warn_prf(average, modifier, msg_start, len(result))


0.8851992422915465