In [None]:
import gc

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, GPT2ForSequenceClassification, GPT2TokenizerFast, AutoTokenizer, AutoModelForSequenceClassification

### Данные

In [None]:
# Read both splits and keep only the raw text and its `unsafe` score.
# Note: `test_df` is read from the validation file (val_randst0.csv).
train_df = pd.read_csv('/kaggle/input/unsafe/train_randst0.csv')[["text", "unsafe"]]
test_df = pd.read_csv('/kaggle/input/unsafe/val_randst0.csv')[["text", "unsafe"]]

In [None]:
# Keep only training rows with an `unsafe` score >= 0.8 or <= 0.2; rows in between
# are dropped. Only the training split is filtered.
# .copy() makes the result an independent frame, so the later in-place assignment
# `train_df['unsafe'] = ...` does not raise SettingWithCopyWarning.
train_df = train_df.loc[(train_df['unsafe'] >= 0.8) | (train_df['unsafe'] <= 0.2)].copy()

In [None]:
def binary(val):
    """Convert a fractional score into an integer label using built-in ``round``.

    Python's ``round`` rounds halves to the nearest even integer, so a score of
    exactly 0.5 becomes 0.
    """
    label = round(val)
    return label

In [None]:
# Replace the fractional `unsafe` scores with integer class labels, element by element.
train_df['unsafe'] = train_df['unsafe'].map(binary)
test_df['unsafe'] = test_df['unsafe'].map(binary)

In [None]:
# Preview the first rows of each split. A bare tuple of whole frames prints as plain
# text; `display` (IPython built-in) renders each frame with its rich repr instead.
display(train_df.head(), test_df.head())

In [None]:
# Plain Python lists of texts and labels; UnsafeData indexes them by position.
x_train, y_train = train_df['text'].tolist(), train_df['unsafe'].tolist()
x_test, y_test = test_df['text'].tolist(), test_df['unsafe'].tolist()

In [None]:
class UnsafeData(Dataset):
    """Map-style dataset that tokenizes one text per lookup and attaches its label.

    Args:
        texts: indexable collection of raw texts.
        targets: indexable collection of labels, aligned with ``texts`` by position.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=True, max_length=max_len, padding='max_length')``;
            its result must expose ``.items()`` with values ``torch.tensor`` accepts.
        max_len: forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        super().__init__()
        self.texts, self.targets = texts, targets
        self.max_len, self.tokenizer = max_len, tokenizer

    def __len__(self):
        """Return the number of samples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize the text at ``index`` and return a dict of int64 tensors.

        The dict holds every field produced by the tokenizer plus a ``'labels'``
        entry with the matching target.
        """
        encoded = self.tokenizer(
            self.texts[index],
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
        )

        sample = {}
        for name, values in encoded.items():
            sample[name] = torch.tensor(values).long()
        # Added last, so it overrides any 'labels' field the tokenizer might emit.
        sample['labels'] = torch.tensor(self.targets[index]).long()
        return sample

In [None]:
# UnsafeData requires a tokenizer, and on a fresh kernel none exists yet at this point,
# so load the RuBERT tokenizer here (same checkpoint as the RuBERT section below).
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased-conversational')

# Encode both splits; every sample is truncated/padded to 60 tokens by UnsafeData.
train_dataset = UnsafeData(x_train, y_train, tokenizer, max_len = 60)
test_dataset = UnsafeData(x_test, y_test, tokenizer, max_len = 60)

In [None]:
# Sanity check: number of samples in each split.
len(train_dataset), len(test_dataset)

In [None]:
# Spot-check one encoded training sample (requires len(train_dataset) > 23000).
train_dataset[23000]

### Руберт

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1 for a Trainer evaluation.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the last axis).
            Presumably a transformers ``EvalPrediction`` - confirm against the Trainer.

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'``, ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Tuple layout: (precision, recall, f-score, support); support is unused.
    scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': scores[2],
        'precision': scores[0],
        'recall': scores[1],
    }

In [None]:
# Checkpoint identifier passed to `from_pretrained` for the RuBERT run.
model_name = 'DeepPavlov/rubert-base-cased-conversational'

In [None]:
# Load the tokenizer and the sequence-classification model for the RuBERT run.
tokenizer = BertTokenizer.from_pretrained(model_name)
# num_labels is spelled out (labels are 0/1) to match the other model sections,
# rather than relying on the config default.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
# Training configuration for the RuBERT run.
training_args = TrainingArguments(
    # Output locations
    output_dir='/kaggle/working/bert1',
    logging_dir='/kaggle/working/bert1/logs',
    # Optimisation schedule
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Logging / evaluation / checkpointing cadence
    logging_steps=250,
    evaluation_strategy='steps',
    save_steps=500,
    load_best_model_at_end=True,
)

In [None]:
# Wire the RuBERT model, both datasets and the metric function into a Trainer.
# `test_dataset` serves as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the RuBERT model with the settings in `training_args`.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
250,0.5424,0.46454,0.777397,0.771103,0.769191,0.777397,44.3719,552.174
500,0.4031,0.459837,0.782172,0.759164,0.777525,0.782172,44.3495,552.453
750,0.3879,0.441375,0.797192,0.79195,0.790612,0.797192,44.3606,552.314
1000,0.3792,0.453131,0.796131,0.780167,0.790522,0.796131,44.3561,552.371
1250,0.3589,0.416489,0.806661,0.804773,0.803541,0.806661,44.3318,552.674
1500,0.3487,0.418545,0.809559,0.808421,0.80754,0.809559,44.344,552.521
1750,0.3425,0.409088,0.811844,0.810816,0.810004,0.811844,44.334,552.645


TrainOutput(global_step=1877, training_loss=0.39004441405079376, metrics={'train_runtime': 1179.8352, 'train_samples_per_second': 1.591, 'total_flos': 7691018384649600, 'epoch': 1.0})

In [None]:
# Score the trained model on the Trainer's eval_dataset (`test_dataset`).
trainer.evaluate()

{'eval_loss': 0.40908750891685486,
 'eval_accuracy': 0.8118444145136933,
 'eval_f1': 0.8108156957257882,
 'eval_precision': 0.8100040633064035,
 'eval_recall': 0.8118444145136933,
 'eval_runtime': 44.383,
 'eval_samples_per_second': 552.036,
 'epoch': 1.0}

In [None]:
# Persist the fine-tuned RuBERT model (path is relative to the working directory).
trainer.save_model('rubert-base-cased-conversational-1')
# The Trainer was not given the tokenizer, so store it alongside the weights explicitly;
# the directory can then be reloaded on its own with `from_pretrained`.
tokenizer.save_pretrained('rubert-base-cased-conversational-1')

### Токсичный Берт

In [None]:
# Checkpoint identifier passed to `from_pretrained` for the second BERT run.
model_name2 = 'sismetanin/rubert-toxic-pikabu-2ch'

In [None]:
# Load the tokenizer and a 2-label sequence classifier from `model_name2`.
tokenizer2 = BertTokenizer.from_pretrained(model_name2)
model2 = BertForSequenceClassification.from_pretrained(model_name2, num_labels=2)

In [None]:
# Re-encode both splits with this section's tokenizer (60-token truncation/padding).
# This rebinds the shared `train_dataset` / `test_dataset` names.
train_dataset = UnsafeData(texts=x_train, targets=y_train, tokenizer=tokenizer2, max_len=60)
test_dataset = UnsafeData(texts=x_test, targets=y_test, tokenizer=tokenizer2, max_len=60)

In [None]:
# Training configuration for the second BERT run (same hyper-parameters, own output dir).
training_args = TrainingArguments(
    # Output locations
    output_dir='/kaggle/working/bert2',
    logging_dir='/kaggle/working/bert2/logs',
    # Optimisation schedule
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Logging / evaluation / checkpointing cadence
    logging_steps=250,
    evaluation_strategy='steps',
    save_steps=500,
    load_best_model_at_end=True,
)

In [None]:
# Trainer for the second BERT model; `test_dataset` serves as the evaluation set.
trainer2 = Trainer(
    args=training_args,
    model=model2,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune `model2` with the settings in `training_args`.
trainer2.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
250,0.5423,0.5024,0.750867,0.750583,0.750308,0.750867,44.3987,551.84
500,0.4462,0.48126,0.767724,0.748523,0.755767,0.767724,44.3902,551.947
750,0.4322,0.469718,0.773519,0.772027,0.770823,0.773519,44.459,551.092
1000,0.4175,0.48289,0.785274,0.773931,0.775797,0.785274,44.6374,548.889
1250,0.3926,0.446189,0.786907,0.787545,0.788251,0.786907,44.3687,552.213
1500,0.3889,0.454392,0.792743,0.791523,0.790544,0.792743,44.4086,551.718
1750,0.3792,0.4413,0.796008,0.793187,0.791603,0.796008,44.3823,552.045


TrainOutput(global_step=1877, training_loss=0.4238505924897557, metrics={'train_runtime': 1211.3602, 'train_samples_per_second': 1.549, 'total_flos': 7691018384649600, 'epoch': 1.0})

In [None]:
# Score the trained model on the Trainer's eval_dataset (`test_dataset`).
trainer2.evaluate()

{'eval_loss': 0.4412999153137207,
 'eval_accuracy': 0.7960083261907678,
 'eval_f1': 0.7931873626954754,
 'eval_precision': 0.7916032416799048,
 'eval_recall': 0.7960083261907678,
 'eval_runtime': 44.4463,
 'eval_samples_per_second': 551.25,
 'epoch': 1.0}

In [None]:
# Persist the fine-tuned model (path is relative to the working directory).
trainer2.save_model('rubert-toxic-pikabu-2ch-1')
# The Trainer was not given the tokenizer, so store it alongside the weights explicitly;
# the directory can then be reloaded on its own with `from_pretrained`.
tokenizer2.save_pretrained('rubert-toxic-pikabu-2ch-1')

### Мультиязычный Берт

In [None]:
# Checkpoint identifier passed to `from_pretrained` for the multilingual BERT run.
model_name3 = 'bert-base-multilingual-cased'

In [None]:
# Load the tokenizer and the sequence-classification model for the multilingual run.
tokenizer3 = BertTokenizer.from_pretrained(model_name3)
# num_labels is spelled out (labels are 0/1) to match the other model sections,
# rather than relying on the config default.
model3 = BertForSequenceClassification.from_pretrained(model_name3, num_labels=2)

In [None]:
# Re-encode both splits with this section's tokenizer (60-token truncation/padding).
# This rebinds the shared `train_dataset` / `test_dataset` names.
train_dataset = UnsafeData(texts=x_train, targets=y_train, tokenizer=tokenizer3, max_len=60)
test_dataset = UnsafeData(texts=x_test, targets=y_test, tokenizer=tokenizer3, max_len=60)

In [None]:
# Training configuration for the multilingual BERT run (same hyper-parameters, own output dir).
training_args = TrainingArguments(
    # Output locations
    output_dir='/kaggle/working/bert3',
    logging_dir='/kaggle/working/bert3/logs',
    # Optimisation schedule
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Logging / evaluation / checkpointing cadence
    logging_steps=250,
    evaluation_strategy='steps',
    save_steps=500,
    load_best_model_at_end=True,
)

In [None]:
# Trainer for the multilingual BERT model; `test_dataset` serves as the evaluation set.
trainer3 = Trainer(
    args=training_args,
    model=model3,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune `model3` with the settings in `training_args`.
trainer3.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
250,0.5644,0.537485,0.723277,0.716637,0.71286,0.723277,44.578,549.62
500,0.4941,0.510846,0.742582,0.737213,0.73418,0.742582,44.3589,552.336
750,0.4724,0.494198,0.761602,0.75336,0.751125,0.761602,44.3315,552.676
1000,0.4578,0.533611,0.767152,0.755627,0.755381,0.767152,44.3842,552.02
1250,0.4377,0.479622,0.770499,0.764939,0.762651,0.770499,44.358,552.347
1500,0.4233,0.49613,0.777887,0.768942,0.768148,0.777887,44.688,548.268
1750,0.4107,0.470671,0.78058,0.770685,0.770759,0.78058,44.7894,547.027


TrainOutput(global_step=1877, training_loss=0.46105092068460307, metrics={'train_runtime': 1192.2598, 'train_samples_per_second': 1.574, 'total_flos': 7691018384649600, 'epoch': 1.0})

In [None]:
# Score the trained model on the Trainer's eval_dataset (`test_dataset`).
trainer3.evaluate()

{'eval_loss': 0.47067052125930786,
 'eval_accuracy': 0.780580384474103,
 'eval_f1': 0.7706852379297162,
 'eval_precision': 0.7707590875588345,
 'eval_recall': 0.780580384474103,
 'eval_runtime': 44.3899,
 'eval_samples_per_second': 551.95,
 'epoch': 1.0}

In [None]:
# Persist the fine-tuned model (path is relative to the working directory).
trainer3.save_model('bert-base-multilingual-cased-1')
# The Trainer was not given the tokenizer, so store it alongside the weights explicitly;
# the directory can then be reloaded on its own with `from_pretrained`.
tokenizer3.save_pretrained('bert-base-multilingual-cased-1')

### GPT

In [None]:
# Checkpoint identifier passed to `from_pretrained` for the GPT run.
model_name4 = 'sberbank-ai/rugpt3medium_based_on_gpt2'

In [None]:
# Load the GPT-2 tokenizer and a 2-label sequence classifier from `model_name4`.
# Per the load log, `score.weight` is newly initialized and `lm_head.weight` is unused.
tokenizer4 = GPT2TokenizerFast.from_pretrained(model_name4)
model4 = GPT2ForSequenceClassification.from_pretrained(model_name4, num_labels=2)

Downloading:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

Some weights of the model checkpoint at sberbank-ai/rugpt3medium_based_on_gpt2 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/rugpt3medium_based_on_gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Register '<pad>' as the padding token; UnsafeData pads every sample with
# padding='max_length', which needs the tokenizer to have one.
tokenizer4.pad_token = '<pad>'
# Take the id from the tokenizer instead of hard-coding it, so the model config and
# tokenizer cannot drift apart. For this vocabulary it resolves to 0, as the padded
# sample displayed further down shows.
model4.config.pad_token_id = tokenizer4.pad_token_id

In [None]:
# Re-encode both splits with the GPT tokenizer (60-token truncation/padding).
# This rebinds the shared `train_dataset` / `test_dataset` names.
train_dataset = UnsafeData(texts=x_train, targets=y_train, tokenizer=tokenizer4, max_len=60)
test_dataset = UnsafeData(texts=x_test, targets=y_test, tokenizer=tokenizer4, max_len=60)

In [None]:
# Spot-check one encoded sample: padded positions should carry the pad id and a 0 attention mask.
train_dataset[23000]

{'input_ids': tensor([  404, 45601, 18387,    16,   374,   505,  2156,  4155, 21691,    18,
         10118,    16,   374,   492,   282, 13331,  7757,    16,   367,  1122,
           726,   289,  4181,   375,   282, 17570,  4172,  5920,  5346,   289,
          4136,    17,  1796, 47643, 12904,   289, 17664, 30809,   449,  1369,
          1262,   282, 44999,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor(1)}

In [None]:
# Training configuration for the GPT run: smaller batches and sparser logging and
# checkpointing than the BERT runs.
training_args = TrainingArguments(
    # Output locations
    output_dir='/kaggle/working/gpt',
    logging_dir='/kaggle/working/gpt/logs',
    # Optimisation schedule
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Logging / evaluation / checkpointing cadence
    logging_steps=500,
    evaluation_strategy='steps',
    save_steps=3000,
    # load_best_model_at_end stays at its default here (it was commented out for this run).
)

In [None]:
# NOTE(review): presumably stops the Trainer from treating the GPT-2 model as
# model-parallel - confirm against the installed transformers version.
model4.is_parallelizable = False

In [None]:
# Trainer for the GPT classifier; `test_dataset` serves as the evaluation set.
trainer4 = Trainer(
    args=training_args,
    model=model4,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


In [None]:
# Fine-tune `model4` with the settings in `training_args`.
trainer4.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
500,0.4497,0.514598,0.76254,0.725942,0.760833,0.76254,188.8852,129.714
1000,0.3867,0.452288,0.786703,0.783783,0.782053,0.786703,188.1372,130.229
1500,0.3742,0.424989,0.801845,0.800154,0.798966,0.801845,187.9244,130.377
2000,0.3662,0.433921,0.804743,0.797204,0.797745,0.804743,187.845,130.432
2500,0.35,0.418852,0.808089,0.808487,0.808919,0.808089,187.8284,130.444
3000,0.341,0.412172,0.814497,0.813299,0.812402,0.814497,187.9495,130.359
3500,0.3324,0.404389,0.816252,0.813831,0.812587,0.816252,188.0015,130.323


TrainOutput(global_step=3754, training_loss=0.36774657772269476, metrics={'train_runtime': 4230.5572, 'train_samples_per_second': 0.887, 'total_flos': 15389121562214400, 'epoch': 1.0})

In [None]:
# Score the trained model on the Trainer's eval_dataset (`test_dataset`).
trainer4.evaluate()

{'eval_loss': 0.4090506136417389,
 'eval_accuracy': 0.8171095057344598,
 'eval_f1': 0.814694757221525,
 'eval_precision': 0.8134638453515362,
 'eval_recall': 0.8171095057344598,
 'eval_runtime': 188.2163,
 'eval_samples_per_second': 130.175,
 'epoch': 1.0}

In [None]:
# Persist the fine-tuned GPT classifier (path is relative to the working directory).
trainer4.save_model('rugpt3medium_based_on_gpt2-1')
# The Trainer was not given the tokenizer, and this tokenizer had its pad token set by
# hand, so store it alongside the weights to keep that setting with the model.
tokenizer4.save_pretrained('rugpt3medium_based_on_gpt2-1')

In [None]:
# Run a garbage-collection pass and ask PyTorch to release cached, unused CUDA memory
# before the next model is loaded. `gc` is imported in the notebook's import cell.
# NOTE(review): objects still referenced (e.g. model4 / trainer4) are presumably not
# freed by this - `del` them first if GPU memory is tight.
gc.collect()
torch.cuda.empty_cache()

### Роберта

In [None]:
# Checkpoint identifier passed to `from_pretrained` for the RoBERTa run.
model_name5 = 'blinoff/roberta-base-russian-v0'

In [None]:
# Load the tokenizer and a 2-label classifier via the Auto* classes; per the load log
# the model resolves to RobertaForSequenceClassification.
tokenizer5 = AutoTokenizer.from_pretrained(model_name5)
model5 = AutoModelForSequenceClassification.from_pretrained(model_name5, num_labels=2)

Some weights of the model checkpoint at blinoff/roberta-base-russian-v0 were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at blinoff/roberta-base-russian-v0 and

In [None]:
# Re-encode both splits with the RoBERTa tokenizer (60-token truncation/padding).
# This rebinds the shared `train_dataset` / `test_dataset` names.
train_dataset = UnsafeData(texts=x_train, targets=y_train, tokenizer=tokenizer5, max_len=60)
test_dataset = UnsafeData(texts=x_test, targets=y_test, tokenizer=tokenizer5, max_len=60)

In [None]:
# Spot-check one encoded sample: padded positions should carry the pad id and a 0 attention mask.
train_dataset[23000]

{'input_ids': tensor([    0,   340, 21880, 14563,    16,   392,   508,  1887,  6878,  9687,
            18,   225,   145,   246,  3078,   563,    16,   392,   457,   283,
         12555,  6447,    16,   225,   145,   113,   845,   644,   292, 45871,
           283, 10118,   340, 28478,   292,  2715,    17,  1734, 40008,  3952,
           292, 14301, 28005,  1281,   488,   728,   872,   283, 34457,     2,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor(1)}

In [None]:
# Training configuration for the RoBERTa run (same hyper-parameters as the BERT runs, own output dir).
training_args = TrainingArguments(
    # Output locations
    output_dir='/kaggle/working/roberta',
    logging_dir='/kaggle/working/roberta/logs',
    # Optimisation schedule
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Logging / evaluation / checkpointing cadence
    logging_steps=250,
    evaluation_strategy='steps',
    save_steps=500,
    load_best_model_at_end=True,
)

In [None]:
# Trainer for the RoBERTa classifier; `test_dataset` serves as the evaluation set.
trainer5 = Trainer(
    args=training_args,
    model=model5,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


In [None]:
# Fine-tune `model5` with the settings in `training_args`.
trainer5.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
250,0.5786,0.601927,0.660096,0.661018,0.661983,0.660096,44.6258,549.033
500,0.5517,0.588827,0.656953,0.670227,0.706609,0.656953,44.6197,549.108
750,0.5137,0.531183,0.735113,0.721182,0.718657,0.735113,44.5747,549.662
1000,0.4987,0.564144,0.736582,0.684643,0.7265,0.736582,44.5684,549.739
1250,0.4822,0.511923,0.745602,0.740826,0.737994,0.745602,44.5735,549.676
1500,0.4717,0.518572,0.748541,0.745081,0.742725,0.748541,44.5803,549.592
1750,0.4639,0.506725,0.754459,0.746256,0.743611,0.754459,44.5631,549.805


TrainOutput(global_step=1877, training_loss=0.5044689920269941, metrics={'train_runtime': 1103.7289, 'train_samples_per_second': 1.701, 'total_flos': 5382039072009600, 'epoch': 1.0})

In [None]:
# Score the trained model on the Trainer's eval_dataset (`test_dataset`).
trainer5.evaluate()

{'eval_loss': 0.5067250728607178,
 'eval_accuracy': 0.7544590016734011,
 'eval_f1': 0.7462563967776344,
 'eval_precision': 0.7436107816231792,
 'eval_recall': 0.7544590016734011,
 'eval_runtime': 44.6039,
 'eval_samples_per_second': 549.301,
 'epoch': 1.0}

In [None]:
# Persist the fine-tuned model (path is relative to the working directory).
trainer5.save_model('roberta-base-russian-v0-1')
# The Trainer was not given the tokenizer, so store it alongside the weights explicitly;
# the directory can then be reloaded on its own with `from_pretrained`.
tokenizer5.save_pretrained('roberta-base-russian-v0-1')