In [36]:
import pandas as pd
from nltk.tokenize import wordpunct_tokenize
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score


import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F

In [3]:
# Load both splits, keeping only the raw text and its `unsafe` label column.
train_df = pd.read_csv('/kaggle/input/unsafe/train_randst0.csv')[["text", "unsafe"]]
test_df = pd.read_csv('/kaggle/input/unsafe/val_randst0.csv')[["text", "unsafe"]]

In [4]:
# Keep only confidently labelled training rows (score >= 0.8 or <= 0.2);
# the test frame is left unfiltered.
# `.copy()` detaches the result from the frame it was sliced from, so the
# later in-place assignment to the `unsafe` column does not trigger
# pandas' SettingWithCopyWarning.
train_df = train_df.loc[(train_df['unsafe'] >= 0.8) | (train_df['unsafe'] <= 0.2)].copy()

In [5]:
def binary(val, threshold=0.5):
    """Convert a continuous score into a hard 0/1 label.

    The cut-off is explicit instead of relying on ``round()``: a score exactly
    equal to ``threshold`` maps to 0 (which is what ``round(0.5)`` gives), and
    the result is always 0 or 1 even for scores outside ``[0, 1]``.

    Args:
        val: Numeric score to binarise.
        threshold: Scores strictly greater than this value become 1.

    Returns:
        int: 1 if ``val > threshold``, otherwise 0.

    Raises:
        ValueError: If ``val`` is NaN (``round()`` also rejects NaN).
    """
    # NaN is the only value that is not equal to itself; a plain comparison
    # would silently turn it into label 0.
    if val != val:
        raise ValueError("cannot convert float NaN to integer")
    return int(val > threshold)

In [6]:
# Replace the continuous `unsafe` scores with hard labels in both frames.
for frame in (train_df, test_df):
    frame['unsafe'] = frame['unsafe'].apply(binary)

In [7]:
# Display both frames (pandas truncates the repr) to check the binarised labels.
train_df, test_df

(                                                     text  unsafe
 0       я думал что левиафаны - это те медленные страх...       1
 1       А был бы этот полицейский в Петербурге, так пе...       1
 2       Напоминаю, что пора искать актис невзрослого п...       1
 3       курю лет пятнадцать никаких проблем кроме како...       1
 4       окей, я тогда проведу парад гетеросексуалов, п...       1
 ...                                                   ...     ...
 138825      Перед клиентом отвечает банк, а не сотрудник.       0
 138826  Так воооот откуда я их знаю, какое старое виде...       0
 138827  Да потом просто "такие вот люди" начинают жало...       0
 138829  А теперь давай фоточки не миллионеров, а обычн...       0
 138830  Не нашел информации о том, что он был доктором...       0
 
 [120120 rows x 2 columns],
                                                     text  unsafe
 0                       уровень ссачнее, чем ад в доом 3       1
 1      У нас несколько спортсмено

In [8]:
# Plain Python lists of texts and labels for each split.
x_train, y_train = train_df['text'].to_list(), train_df['unsafe'].to_list()
x_test, y_test = test_df['text'].to_list(), test_df['unsafe'].to_list()

### Готовые векторы fasttext

In [9]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz

--2021-03-11 10:44:14--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 172.67.9.4, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1306357571 (1.2G) [binary/octet-stream]
Saving to: ‘cc.ru.300.vec.gz’


2021-03-11 10:44:47 (38.4 MB/s) - ‘cc.ru.300.vec.gz’ saved [1306357571/1306357571]



In [10]:
!gunzip cc.ru.300.vec.gz

In [13]:
# Number of rows to collect for the embedding table and the width of each row.
vocab_size = 400000
embedding_dim = 300

# Index 0 is reserved for the padding token, represented by an all-zero vector.
pad_token = 'PAD'
vocab = {pad_token: 0}
embeddings = [np.zeros(embedding_dim)]

In [15]:
# Read pretrained vectors from the fastText text file into `vocab`
# (token -> row index) and `embeddings` (list of vectors in the same order).
# The encoding is given explicitly so that decoding the tokens does not
# depend on the platform's default locale.
with open('cc.ru.300.vec', 'r', encoding='utf-8') as f:
    # The first line is skipped: it is not parsed as a vector (in the
    # fastText .vec format it is a "<word count> <dimension>" header).
    f.readline()
    for line in f:
        # Checked before parsing, and with `>=` rather than `==`, so the loop
        # also stops immediately when the vocabulary is already full (e.g.
        # the cell is run a second time) instead of scanning the whole file.
        if len(vocab) >= vocab_size:
            break

        parts = line.strip().split()
        # The last `embedding_dim` fields are the vector; whatever precedes
        # them is the token (re-joined in case it was split on whitespace).
        token = ' '.join(parts[:-embedding_dim])
        if token in vocab:
            continue
        word_vector = np.array(list(map(float, parts[-embedding_dim:])))

        vocab[token] = len(vocab)
        embeddings.append(word_vector)

In [16]:
# Stack the collected per-token vectors along a new first axis into one matrix.
embeddings = np.stack(embeddings, axis=0)
embeddings.shape

(400000, 300)

In [17]:
# Every text is truncated or right-padded to exactly this many token ids.
max_len = 45

In [18]:
class UnsafeData(Dataset):
    """Dataset of (token-id tensor, label tensor) pairs built from raw texts.

    Each text is tokenised with ``wordpunct_tokenize``, mapped to vocabulary
    indices (tokens missing from the vocabulary are dropped), and then
    truncated or right-padded to exactly ``max_len`` indices.
    """

    def __init__(self, texts, targets, vocab, max_len, pad_index = 0):
        """Store the data and the encoding settings.

        Args:
            texts: Sequence of raw strings.
            targets: Sequence of labels, indexed in parallel with ``texts``.
            vocab: Mapping from token string to integer index.
            max_len: Length of every encoded sequence.
            pad_index: Index used to fill sequences shorter than ``max_len``.
        """
        super().__init__()

        self.vocab = vocab
        self.max_len = max_len
        self.pad_index = pad_index

        self.texts = texts
        self.targets = targets

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def tokenization(self, text):
        """Split ``text`` into tokens and return the indices of the known ones."""
        known_ids = []
        for token in wordpunct_tokenize(text):
            # Out-of-vocabulary tokens are skipped rather than mapped to a
            # placeholder index.
            if token in self.vocab:
                known_ids.append(self.vocab[token])
        return known_ids

    def padding(self, text):
        """Return a new list of exactly ``max_len`` indices.

        ``text`` (a list of indices) is cut to ``max_len`` and, if shorter,
        filled on the right with ``pad_index``. The input list is not modified.
        """
        clipped = text[:self.max_len]
        missing = self.max_len - len(clipped)
        return clipped + [self.pad_index] * missing

    def __getitem__(self, index):
        """Return ``(ids, label)``: a long tensor of ids and a float label tensor."""
        raw_text, raw_label = self.texts[index], self.targets[index]

        ids = self.padding(self.tokenization(raw_text))

        features = torch.tensor(ids).long()
        label = torch.tensor(raw_label).float()
        return features, label

In [19]:
# Wrap both splits; they share the vocabulary and the fixed sequence length.
train_dataset = UnsafeData(texts=x_train, targets=y_train, vocab=vocab, max_len=max_len)
test_dataset = UnsafeData(texts=x_test, targets=y_test, vocab=vocab, max_len=max_len)

In [20]:
# Number of examples in each split.
len(train_dataset), len(test_dataset)

(120120, 24501)

In [21]:
# Look at one raw training text; the same index is encoded in the next cell for comparison.
x_train[23000]

'не расстраивайтесь, что его быстро отпустят. Главное, что он в отделение попал, а там уже и героин в чемодане найдётся и какая-нибудь порнография и экстремистские репосты в телефоне'

In [22]:
# The same example as returned by the dataset: padded token ids plus its label.
train_dataset[23000]

(tensor([    12,  75546,      1,     20,     40,    478, 121841,      2,   2786,
              1,     20,     53,      4,   2777,   3574,      1,     28,    202,
             73,      3,  77816,      4, 113093,  40576,      3,   2332,     15,
           6459, 111797,      3, 144472, 202000,      4,  10982,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0]),
 tensor(1.))

In [23]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps the batch
# composition, and therefore the per-batch averaged metrics, reproducible.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [24]:
# Grab a single batch to sanity-check the tensor shapes.
x, y = next(iter(test_loader))

x.shape, y.shape

(torch.Size([32, 45]), torch.Size([32]))

### Модель RCNN

In [26]:
# Convert the embedding matrix to a float32 tensor. `torch.as_tensor` accepts
# either the NumPy matrix or an already converted tensor, so re-running this
# cell is harmless, and it avoids first materialising a float64 tensor copy
# of the whole matrix.
embeddings = torch.as_tensor(embeddings, dtype=torch.float32)

In [29]:
class RCNN(nn.Module):
    """Bidirectional-LSTM text classifier with max-over-time pooling.

    Token embeddings are concatenated with the LSTM outputs for the same
    positions, projected by a linear layer, max-pooled over the sequence
    axis and mapped through a final linear layer and a sigmoid.
    """

    def __init__(self, embeddings, embedding_dim, hidden_size, hidden_size_linear, class_num, dropout, n_layers):
        """Build the layers.

        Args:
            embeddings: Pretrained embedding matrix passed to
                ``nn.Embedding.from_pretrained`` (``freeze`` is left at the
                library default); row 0 is treated as padding.
            embedding_dim: Width of one embedding vector (LSTM input size).
            hidden_size: LSTM hidden size per direction.
            hidden_size_linear: Output width of the projection layer ``W``.
            class_num: Number of output units.
            dropout: Dropout value handed to ``nn.LSTM``.
            n_layers: Number of stacked LSTM layers.
        """
        super().__init__()

        self.embedding = nn.Embedding.from_pretrained(embeddings, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=n_layers,
            dropout=dropout,
            bidirectional=True,
            batch_first=True,
        )
        # The projection sees the raw embedding plus both LSTM directions.
        context_dim = embedding_dim + 2 * hidden_size
        self.W = nn.Linear(context_dim, hidden_size_linear)
        self.fc = nn.Linear(hidden_size_linear, class_num)
        self.act = nn.Sigmoid()

    def forward(self, x):
        """Map token ids |bs, seq_len| to sigmoid outputs |bs, class_num|."""
        embedded = self.embedding(x)                          # |bs, seq_len, embedding_dim|
        contextual, _ = self.lstm(embedded)                   # |bs, seq_len, 2*hidden_size|
        combined = torch.cat((contextual, embedded), dim=2)   # |bs, seq_len, embedding_dim + 2*hidden_size|
        projected = self.W(combined)                          # |bs, seq_len, hidden_size_linear|
        # Pool over the whole sequence: one maximum per projected feature.
        channels_first = projected.transpose(1, 2)            # |bs, hidden_size_linear, seq_len|
        pooled = F.max_pool1d(channels_first, kernel_size=channels_first.size(2)).squeeze(2)
        logits = self.fc(pooled)                              # |bs, class_num|
        return self.act(logits)

In [44]:
model = RCNN(
    embeddings = embeddings,
    # Taken from the embedding matrix itself so the LSTM input width cannot
    # drift from the vectors that were actually loaded.
    embedding_dim = embeddings.shape[1],
    hidden_size = 300,
    hidden_size_linear = 128,
    class_num = 1,
    n_layers = 4,
    dropout = 0.5
)

In [31]:
# Smoke test: run the batch `x` fetched earlier through the model and check the output shape.
model(x).size()

torch.Size([32, 1])

In [45]:
# Binary cross-entropy on probabilities (the model's forward ends in a sigmoid).
criterion = nn.BCELoss()
# Adam over the model's parameters; no hyperparameters are overridden.
optimizer = torch.optim.Adam(model.parameters())

In [43]:
def metrics(true, predictions):
    """Compute classification metrics for one batch.

    Args:
        true: Ground-truth labels.
        predictions: Tensor of predicted scores; they are rounded to the
            nearest integer with ``torch.round`` to obtain hard labels.

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``. Precision, recall and F1 use ``average='weighted'``;
        with that averaging the recall is arithmetically the same number as
        the accuracy.
    """
    hard_labels = torch.round(predictions)

    # zero_division=0 scores classes without predicted/true samples as 0.
    scores = precision_recall_fscore_support(true, hard_labels, average='weighted', zero_division = 0)
    batch_accuracy = accuracy_score(true, hard_labels)

    return {
        'accuracy': batch_accuracy,
        'f1': scores[2],
        'precision': scores[0],
        'recall': scores[1],
    }
    
    

In [46]:
# Fall back to the CPU when no CUDA device is available, so the notebook
# still runs (slowly) on machines without a GPU instead of raising here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

RCNN(
  (embedding): Embedding(400000, 300, padding_idx=0)
  (lstm): LSTM(300, 300, num_layers=4, batch_first=True, dropout=0.5, bidirectional=True)
  (W): Linear(in_features=900, out_features=128, bias=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (act): Sigmoid()
)

In [47]:
def train(model, loader, optimizer, criterion, last_n_losses=200, verbose=True, device=None):
    """Run one training epoch and return the epoch-averaged loss and metrics.

    Args:
        model: Module mapping a batch ``x`` to scores of shape ``(bs, 1)``.
        loader: Iterable of ``(x, y)`` batches that also supports ``len()``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(yhat, y)``.
        last_n_losses: Window (in batches) of the running averages shown in
            the progress bar.
        verbose: Show the progress bar when True.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter, so the function does not depend on
            a global variable.

    Returns:
        dict with keys ``'loss'``, ``'f_score'``, ``'accuracy'``,
        ``'precision'`` and ``'recall'``; each is the sum of the per-batch
        values divided by ``len(loader)``.
    """
    if device is None:
        device = next(model.parameters()).device

    history = {'loss': [], 'f_score': [], 'accuracy': [], 'precision': [], 'recall': []}

    model.train()

    # The context manager closes the bar even if a batch raises or the run
    # is interrupted.
    with tqdm(total=len(loader), disable=not verbose, desc='Train') as progress_bar:
        for x, y in loader:

            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()

            # Drop only the trailing class dimension. A bare `.squeeze()`
            # would also remove the batch dimension of a one-element batch
            # and leave a 0-d tensor that no longer matches `y`.
            yhat = model(x).squeeze(-1)

            loss = criterion(yhat, y)
            loss.backward()
            optimizer.step()

            cur_metrics = metrics(y.cpu(), yhat.detach().cpu())

            history['loss'].append(loss.item())
            history['f_score'].append(cur_metrics['f1'])
            history['accuracy'].append(cur_metrics['accuracy'])
            history['precision'].append(cur_metrics['precision'])
            history['recall'].append(cur_metrics['recall'])

            progress_bar.set_postfix(loss=np.mean(history['loss'][-last_n_losses:]),
                                     f1=np.mean(history['f_score'][-last_n_losses:]),
                                     accuracy=np.mean(history['accuracy'][-last_n_losses:]))

            progress_bar.update()

    n_batches = len(loader)
    return {name: np.sum(values) / n_batches for name, values in history.items()}

In [48]:
# Empty tqdm's internal collection of bar instances.
# NOTE(review): `_instances` is a private tqdm attribute; presumably this works
# around stale bars left by an earlier, interrupted run — confirm it is still needed.
tqdm._instances.clear()

In [49]:
def evaluate(model, loader, criterion, last_n_losses=200, verbose=True, device=None):
    """Evaluate the model on ``loader`` without updating its weights.

    Args:
        model: Module mapping a batch ``x`` to scores of shape ``(bs, 1)``.
        loader: Iterable of ``(x, y)`` batches that also supports ``len()``.
        criterion: Loss called as ``criterion(yhat, y)``.
        last_n_losses: Window (in batches) of the running averages shown in
            the progress bar.
        verbose: Show the progress bar when True.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter, so the function does not depend on
            a global variable.

    Returns:
        dict with keys ``'loss'``, ``'f_score'``, ``'accuracy'``,
        ``'precision'`` and ``'recall'``; each is the sum of the per-batch
        values divided by ``len(loader)``.
    """
    if device is None:
        device = next(model.parameters()).device

    history = {'loss': [], 'f_score': [], 'accuracy': [], 'precision': [], 'recall': []}

    model.eval()

    # Gradients are not needed for evaluation; the bar is a context manager
    # so it is closed even if a batch raises.
    with torch.no_grad(), tqdm(total=len(loader), disable=not verbose, desc='Eval') as progress_bar:
        for x, y in loader:

            x = x.to(device)
            y = y.to(device)

            # Drop only the trailing class dimension. A bare `.squeeze()`
            # would also remove the batch dimension of a one-element batch
            # and leave a 0-d tensor that no longer matches `y`.
            yhat = model(x).squeeze(-1)

            loss = criterion(yhat, y)

            cur_metrics = metrics(y.cpu(), yhat.cpu())

            history['loss'].append(loss.item())
            history['f_score'].append(cur_metrics['f1'])
            history['accuracy'].append(cur_metrics['accuracy'])
            history['precision'].append(cur_metrics['precision'])
            history['recall'].append(cur_metrics['recall'])

            progress_bar.set_postfix(loss=np.mean(history['loss'][-last_n_losses:]),
                                     f1=np.mean(history['f_score'][-last_n_losses:]),
                                     accuracy=np.mean(history['accuracy'][-last_n_losses:]))

            progress_bar.update()

    n_batches = len(loader)
    return {name: np.sum(values) / n_batches for name, values in history.items()}

In [39]:
# Checkpoint locations: the model and optimizer state dicts are written here
# by the training loop whenever the validation loss improves.
save_best_model_path = '/kaggle/working/best_model_state_dict.pth'
save_best_optimizer_path = '/kaggle/working/best_optimizer_state_dict.pth'

In [50]:
n_epochs = 7
# Training stops early once the validation loss has failed to improve for
# more than this many epochs in a row.
max_patience = 3
best_valid_loss = float('inf')
patience = 0
for epoch in range(n_epochs):

    #train the model
    train_metrics = train(model, train_loader, optimizer, criterion)

    #evaluate the model (test_loader doubles as the validation set here)
    valid_metrics = evaluate(model, test_loader, criterion)

    print(train_metrics)
    print(valid_metrics)

    #save the best model
    if valid_metrics['loss'] < best_valid_loss:
        best_valid_loss = valid_metrics['loss']
        # An improvement resets the counter, so only *consecutive* epochs
        # without improvement count towards early stopping.
        patience = 0
        torch.save(model.state_dict(), save_best_model_path)
        torch.save(optimizer.state_dict(), save_best_optimizer_path)
    else:
        patience += 1
        if patience > max_patience:
            break
    
   

Train: 100%|██████████| 3754/3754 [03:18<00:00, 18.89it/s, accuracy=0.784, f1=0.769, loss=0.465]
Eval: 100%|██████████| 766/766 [00:11<00:00, 65.12it/s, accuracy=0.755, f1=0.741, loss=0.504]


{'loss': 0.4828137460546285, 'f_score': 0.7485742820326823, 'accuracy': 0.7746764562244718, 'precision': 0.7631495961252673, 'recall': 0.7746764562244718}
{'loss': 0.5074483891146613, 'f_score': 0.7422692265866033, 'accuracy': 0.7561524773094616, 'precision': 0.7550186176469228, 'recall': 0.7561524773094616}


Train: 100%|██████████| 3754/3754 [03:18<00:00, 18.88it/s, accuracy=0.799, f1=0.783, loss=0.445]
Eval: 100%|██████████| 766/766 [00:11<00:00, 64.24it/s, accuracy=0.759, f1=0.737, loss=0.507]


{'loss': 0.4499317804509233, 'f_score': 0.7787348709526101, 'accuracy': 0.792918109572012, 'precision': 0.7938788806810664, 'recall': 0.792918109572012}
{'loss': 0.4998512322535403, 'f_score': 0.7418788733460003, 'accuracy': 0.7613938362551287, 'precision': 0.7593719915915885, 'recall': 0.7613938362551287}


Train: 100%|██████████| 3754/3754 [03:18<00:00, 18.88it/s, accuracy=0.797, f1=0.785, loss=0.441]
Eval: 100%|██████████| 766/766 [00:11<00:00, 64.86it/s, accuracy=0.757, f1=0.732, loss=0.506]


{'loss': 0.4358515429787326, 'f_score': 0.7876578042237112, 'accuracy': 0.8003518469188421, 'precision': 0.8023475489934392, 'recall': 0.8003518469188421}
{'loss': 0.49396109907807634, 'f_score': 0.7405235279301529, 'accuracy': 0.7628255936839489, 'precision': 0.7621384316164349, 'recall': 0.7628255936839489}


Train: 100%|██████████| 3754/3754 [03:18<00:00, 18.92it/s, accuracy=0.807, f1=0.796, loss=0.427]
Eval: 100%|██████████| 766/766 [00:11<00:00, 64.22it/s, accuracy=0.771, f1=0.759, loss=0.487]


{'loss': 0.42369382437288794, 'f_score': 0.7966916505444869, 'accuracy': 0.8079076762564377, 'precision': 0.8114353165577556, 'recall': 0.8079076762564377}
{'loss': 0.4927181245530554, 'f_score': 0.7548162286709772, 'accuracy': 0.7666196537361681, 'precision': 0.7673781475010625, 'recall': 0.7666196537361681}


Train: 100%|██████████| 3754/3754 [03:18<00:00, 18.90it/s, accuracy=0.82, f1=0.81, loss=0.412]  
Eval: 100%|██████████| 766/766 [00:11<00:00, 63.90it/s, accuracy=0.774, f1=0.762, loss=0.499]
Train:   0%|          | 2/3754 [00:00<03:34, 17.48it/s, accuracy=0.854, f1=0.846, loss=0.383]

{'loss': 0.41024466378135, 'f_score': 0.80571532996075, 'accuracy': 0.8154718300479489, 'precision': 0.8195157155094174, 'recall': 0.8154718300479489}
{'loss': 0.5010131045631266, 'f_score': 0.7542180569894014, 'accuracy': 0.7665128061668531, 'precision': 0.7671131261995193, 'recall': 0.7665128061668531}


Train: 100%|██████████| 3754/3754 [03:18<00:00, 18.90it/s, accuracy=0.823, f1=0.816, loss=0.387]
Eval: 100%|██████████| 766/766 [00:11<00:00, 64.28it/s, accuracy=0.772, f1=0.758, loss=0.511]
Train:   0%|          | 2/3754 [00:00<03:24, 18.38it/s, accuracy=0.844, f1=0.832, loss=0.338]

{'loss': 0.39637428208328207, 'f_score': 0.8133243130759508, 'accuracy': 0.8219010832889363, 'precision': 0.8262839668876089, 'recall': 0.8219010832889363}
{'loss': 0.5116566867414405, 'f_score': 0.7553526431855613, 'accuracy': 0.7701300820589333, 'precision': 0.769323224480295, 'recall': 0.7701300820589333}


Train: 100%|██████████| 3754/3754 [03:18<00:00, 18.87it/s, accuracy=0.83, f1=0.824, loss=0.382] 
Eval: 100%|██████████| 766/766 [00:12<00:00, 63.16it/s, accuracy=0.755, f1=0.751, loss=0.515]


{'loss': 0.3795214213359432, 'f_score': 0.82381760912425, 'accuracy': 0.8314270333866097, 'precision': 0.836019396684565, 'recall': 0.8314270333866097}
{'loss': 0.5156627208305713, 'f_score': 0.7564544109890723, 'accuracy': 0.761296702101206, 'precision': 0.766289687633515, 'recall': 0.761296702101206}
