In [None]:
!pip install transformers

In [36]:
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F

from tqdm import tqdm
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import numpy as np

In [3]:
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model_bert = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711M [00:00<?, ?B/s]

In [5]:
train_df = pd.read_csv('/kaggle/input/unsafe/train_randst0.csv')
test_df = pd.read_csv('/kaggle/input/unsafe/val_randst0.csv')
train_df = train_df[["text", "unsafe"]]
test_df = test_df[["text", "unsafe"]]

In [6]:
train_df = train_df.loc[(train_df['unsafe'] >= 0.8) | (train_df['unsafe'] <= 0.2)]

In [7]:
def binary(prob):
    if prob < 0.5:
        return 0.
    else:
        return 1.0

In [8]:
train_df['unsafe'] = train_df['unsafe'].apply(binary)
test_df['unsafe'] = test_df['unsafe'].apply(binary)

In [9]:
train_df, test_df

(                                                     text  unsafe
 0       я думал что левиафаны - это те медленные страх...     1.0
 1       А был бы этот полицейский в Петербурге, так пе...     1.0
 2       Напоминаю, что пора искать актис невзрослого п...     1.0
 3       курю лет пятнадцать никаких проблем кроме како...     1.0
 4       окей, я тогда проведу парад гетеросексуалов, п...     1.0
 ...                                                   ...     ...
 138825      Перед клиентом отвечает банк, а не сотрудник.     0.0
 138826  Так воооот откуда я их знаю, какое старое виде...     0.0
 138827  Да потом просто "такие вот люди" начинают жало...     0.0
 138829  А теперь давай фоточки не миллионеров, а обычн...     0.0
 138830  Не нашел информации о том, что он был доктором...     0.0
 
 [120120 rows x 2 columns],
                                                     text  unsafe
 0                       уровень ссачнее, чем ад в доом 3     1.0
 1      У нас несколько спортсмено

In [10]:
x_train = train_df['text'].tolist()
y_train = train_df['unsafe'].tolist()
x_test = test_df['text'].tolist()
y_test = test_df['unsafe'].tolist()

In [12]:
max_len = 45

In [13]:
class UnsafeData(Dataset):

    def __init__(self, texts, targets, tokenizer, max_len):
        
        super().__init__()
        
        self.texts = texts
        self.targets = targets        
        self.max_len = max_len
        
        self.tokenizer = tokenizer

    def __len__(self):
        
        return len(self.texts)

    
    def __getitem__(self, index):
        x = self.texts[index]
        y = self.targets[index]
        
        enc_dict = self.tokenizer(x, truncation=True, max_length=self.max_len, padding='max_length')
        
        tokenized = enc_dict['input_ids']
        mask = enc_dict['attention_mask']
        
        x = torch.tensor(tokenized).long()
        mask = torch.tensor(mask).long()
        y = torch.tensor(y).float()
        
        return x, mask, y

In [14]:
train_dataset = UnsafeData(x_train, y_train, tokenizer, max_len)
test_dataset = UnsafeData(x_test, y_test, tokenizer, max_len)

In [15]:
len(train_dataset), len(test_dataset)

(120120, 24501)

In [16]:
x_train[23000], y_train[23000]

('не расстраивайтесь, что его быстро отпустят. Главное, что он в отделение попал, а там уже и героин в чемодане найдётся и какая-нибудь порнография и экстремистские репосты в телефоне',
 1.0)

In [17]:
train_dataset[23000]

(tensor([   101,   1699, 108867,  11213,  77489,    128,   1997,   2752,  13586,
          26375,   2190,    132,  27609,    128,   1997,   2886,    845,  15636,
          15380,    128,    625,   8528,   4745,    851,  87943,    845,  81741,
            842,   1469,  10596,   1523,    851,  36014,    130,  22655,  36522,
           1577,    851, 114928,  12710, 107485,    845, 111152,    102,      0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]),
 tensor(1.))

In [18]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle = True)

In [19]:
for x, m, y in train_loader:
    break

x.shape, m.shape, y.shape

(torch.Size([64, 45]), torch.Size([64, 45]), torch.Size([64]))

Новая модель

In [48]:
class RCNN(nn.Module):

    
    def __init__(self, embedding_dim, hidden_size, hidden_size_linear, class_num, dropout, n_layers):
        super(RCNN, self).__init__()
        self.pretrained_model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True, bidirectional=True, dropout=dropout, num_layers=n_layers)
        self.W = nn.Linear(embedding_dim + 2*hidden_size, hidden_size_linear)
        self.fc = nn.Linear(hidden_size_linear, class_num)
        
        self.act = nn.Sigmoid()
        
        for param in self.pretrained_model.parameters():
                param.requires_grad = False

        
    def forward(self, x, mask):
        # x = |bs, seq_len|
        x_emb = self.pretrained_model(x, mask)[0]
        # x_emb = |bs, seq_len, embedding_dim|
        output, _ = self.lstm(x_emb)
        # output = |bs, seq_len, 2*hidden_size|
        output = torch.cat([output, x_emb], 2)
        # output = |bs, seq_len, embedding_dim + 2*hidden_size|
        output = self.W(output).transpose(1, 2)
        # output = |bs, seq_len, hidden_size_linear| -> |bs, hidden_size_linear, seq_len|
        output = F.max_pool1d(output, output.size(2)).squeeze(2)
        # output = |bs, hidden_size_linear|
        output = self.fc(output)
        # output = |bs, class_num|
        return self.act(output)

In [49]:
model = RCNN(
    embedding_dim = 768,
    hidden_size = 256,
    hidden_size_linear = 128,
    class_num = 1,
    n_layers = 3,
    dropout = 0.5
)

In [35]:
model(x, m).size()

torch.Size([64, 1])

In [50]:
device = torch.device('cuda')
#device = torch.device('cpu')
model.to(device)

RCNN(
  (pretrained_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=

In [51]:
num_epochs = 7
learning_rate = 0.0001
warmup_steps = 50
total_steps = len(train_loader) * num_epochs

In [52]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)

In [53]:
criterion = nn.BCELoss()

In [54]:
def metrics(true, predictions):
    
    rounded_preds = torch.round(predictions)
    
    precision, recall, f1, _ = precision_recall_fscore_support(true, rounded_preds, average='binary', zero_division = 0)
    acc = accuracy_score(true, rounded_preds)
    #roc_auc = roc_auc_score(true, predictions)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
        #'roc_auc': roc_auc
    }   
    

In [55]:
def train(model, loader, optimizer, scheduler, criterion, last_n_losses=200, verbose=True):

    losses = []
    f_scores = []
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    #roc_auc_scores = []

    progress_bar = tqdm(total=len(loader), disable=not verbose, desc='Train')

    model.train()

    for x, m, y in loader:

        x = x.to(device)
        m = m.to(device)
        y = y.to(device)
        
        optimizer.zero_grad()
        
        yhat = model(x, m).squeeze()
        
        loss = criterion(yhat, y)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        cur_metrics = metrics(y.cpu(), yhat.detach().cpu())


        losses.append(loss.item())
        f_scores.append(cur_metrics['f1'])
        accuracy_scores.append(cur_metrics['accuracy'])
        precision_scores.append(cur_metrics['precision'])
        recall_scores.append(cur_metrics['recall'])
        #roc_auc_scores.append(cur_metrics['roc_auc'])
        

        progress_bar.set_postfix(loss=np.mean(losses[-last_n_losses:]), f1=np.mean(f_scores[-last_n_losses:]),
                                accuracy=np.mean(accuracy_scores[-last_n_losses:]))
        
        progress_bar.update()

    progress_bar.close()
    
    return {'loss': np.sum(losses)/len(loader), 'f_score': np.sum(f_scores)/len(loader), 'accuracy': np.sum(accuracy_scores)/len(loader),
           'precision': np.sum(precision_scores)/len(loader), 'recall': np.sum(recall_scores)/len(loader)}

In [56]:
def evaluate(model, loader, criterion, last_n_losses=200, verbose=True):

    losses = []
    f_scores = []
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    #roc_auc_scores = []

    progress_bar = tqdm(total=len(loader), disable=not verbose, desc='Eval')

    model.eval()
    with torch.no_grad():

        for x, m, y in loader:

            x = x.to(device)
            m = m.to(device)
            y = y.to(device)
        
            yhat = model(x, m).squeeze()
        
            loss = criterion(yhat, y)
        
            cur_metrics = metrics(y.cpu(), yhat.detach().cpu())


            losses.append(loss.item())
            f_scores.append(cur_metrics['f1'])
            accuracy_scores.append(cur_metrics['accuracy'])
            precision_scores.append(cur_metrics['precision'])
            recall_scores.append(cur_metrics['recall'])
            #roc_auc_scores.append(cur_metrics['roc_auc'])
        

            progress_bar.set_postfix(loss=np.mean(losses[-last_n_losses:]), f1=np.mean(f_scores[-last_n_losses:]),
                                accuracy=np.mean(accuracy_scores[-last_n_losses:]))
        
            progress_bar.update()

        progress_bar.close()
    
    return {'loss': np.sum(losses)/len(loader), 'f_score': np.sum(f_scores)/len(loader), 'accuracy': np.sum(accuracy_scores)/len(loader),
           'precision': np.sum(precision_scores)/len(loader), 'recall': np.sum(recall_scores)/len(loader)}

In [57]:
#tqdm._instances.clear()

In [38]:
save_best_model_path = '/kaggle/working/best_model_state_dict.pth'
save_best_optimizer_path = '/kaggle/working/best_optimizer_state_dict.pth'

In [58]:
n_epochs = 7
best_valid_loss = float('inf')
patience = 0
for epoch in range(n_epochs):
     
    #train the model
    train_metrics = train(model, train_loader, optimizer, scheduler, criterion)
    
    #evaluate the model
    valid_metrics = evaluate(model, test_loader, criterion)
    
    print(train_metrics)
    print(valid_metrics)
    
    #save the best model
    if valid_metrics['loss'] < best_valid_loss:
        best_valid_loss = valid_metrics['loss']
        torch.save(model.state_dict(), save_best_model_path)
        torch.save(optimizer.state_dict(), save_best_optimizer_path)
    else:
        patience +=1
        if patience>3:
            break

Train: 100%|██████████| 1877/1877 [04:53<00:00,  6.39it/s, accuracy=0.772, f1=0.481, loss=0.472]
Eval: 100%|██████████| 383/383 [00:47<00:00,  8.07it/s, accuracy=0.751, f1=0.509, loss=0.504]


{'loss': 0.48951109199048864, 'f_score': 0.4387170341768001, 'accuracy': 0.7606184355734834, 'precision': 0.6072930644693312, 'recall': 0.3642996738627255}
{'loss': 0.504410288352568, 'f_score': 0.5067518978830489, 'accuracy': 0.7474013498201882, 'precision': 0.6081783674765513, 'recall': 0.4454706205444552}


Train:  22%|██▏       | 407/1877 [08:34<30:57,  1.26s/it, accuracy=0.755, f1=0.44, loss=0.496]]
Train: 100%|██████████| 1877/1877 [04:55<00:00,  6.35it/s, accuracy=0.789, f1=0.55, loss=0.44]  
Eval: 100%|██████████| 383/383 [00:47<00:00,  8.06it/s, accuracy=0.757, f1=0.508, loss=0.498]


{'loss': 0.4573057105106817, 'f_score': 0.5160473756741593, 'accuracy': 0.7799930074587107, 'precision': 0.6613117393834752, 'recall': 0.4409182372835891}
{'loss': 0.5022160625644515, 'f_score': 0.5096757546461277, 'accuracy': 0.754579966993448, 'precision': 0.6357995072179342, 'recall': 0.4357155363031775}


Train: 100%|██████████| 1877/1877 [04:56<00:00,  6.34it/s, accuracy=0.79, f1=0.567, loss=0.436] 
Eval: 100%|██████████| 383/383 [00:47<00:00,  8.03it/s, accuracy=0.759, f1=0.514, loss=0.495]


{'loss': 0.43944619017264663, 'f_score': 0.552622187859983, 'accuracy': 0.791391563284877, 'precision': 0.6778449824295868, 'recall': 0.4813499733899634}
{'loss': 0.49438481480272256, 'f_score': 0.5180041938122543, 'accuracy': 0.7599581568057541, 'precision': 0.6550837232305435, 'recall': 0.4397148889890095}


Train: 100%|██████████| 1877/1877 [04:56<00:00,  6.34it/s, accuracy=0.796, f1=0.577, loss=0.424]
Eval: 100%|██████████| 383/383 [00:47<00:00,  8.03it/s, accuracy=0.758, f1=0.513, loss=0.489]


{'loss': 0.42593506083376687, 'f_score': 0.5761581310447734, 'accuracy': 0.7988788149783089, 'precision': 0.6886865492445897, 'recall': 0.5117209774554276}
{'loss': 0.4903745595219864, 'f_score': 0.5140265404131781, 'accuracy': 0.7605701019754668, 'precision': 0.6523443471658726, 'recall': 0.43505871898228626}


Train: 100%|██████████| 1877/1877 [04:56<00:00,  6.33it/s, accuracy=0.809, f1=0.602, loss=0.41] 
Eval: 100%|██████████| 383/383 [00:47<00:00,  8.00it/s, accuracy=0.77, f1=0.56, loss=0.487]  
Train:   0%|          | 1/1877 [00:00<05:08,  6.07it/s, accuracy=0.875, f1=0.636, loss=0.346]

{'loss': 0.41400120004447905, 'f_score': 0.5989752783694108, 'accuracy': 0.8078311705609255, 'precision': 0.7031250565166668, 'recall': 0.5361469069674489}
{'loss': 0.49416776456658584, 'f_score': 0.560535533270204, 'accuracy': 0.7655487955071678, 'precision': 0.6400814983752748, 'recall': 0.5123898497712712}


Train: 100%|██████████| 1877/1877 [04:56<00:00,  6.33it/s, accuracy=0.815, f1=0.618, loss=0.406]
Eval: 100%|██████████| 383/383 [00:47<00:00,  8.05it/s, accuracy=0.762, f1=0.528, loss=0.504]
Train:   0%|          | 1/1877 [00:00<05:12,  6.01it/s, accuracy=0.812, f1=0.647, loss=0.367]

{'loss': 0.4047103945362358, 'f_score': 0.6132111585886557, 'accuracy': 0.8119886406880279, 'precision': 0.7075688442623046, 'recall': 0.5549054505440082}
{'loss': 0.5038097676658132, 'f_score': 0.529398676561949, 'accuracy': 0.7635582417853096, 'precision': 0.6593071762448177, 'recall': 0.4530765881481388}


Train:   3%|▎         | 60/1877 [00:09<04:43,  6.40it/s, accuracy=0.826, f1=0.636, loss=0.385]

KeyboardInterrupt: 