In [1]:
!nvidia-smi

Fri Apr 24 00:32:38 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 435.21       Driver Version: 435.21       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:1A:00.0 Off |                  N/A |
| 27%   25C    P8    13W / 260W |   1341MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:1B:00.0 Off |                  N/A |
| 27%   24C    P8     9W / 260W |    165MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:1C:00.0 Off |                  N/A |
| 27%   

In [2]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import re

from sklearn.model_selection import train_test_split

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [3]:
np.random.seed(42)
device = torch.device('cuda:4')

In [4]:
train_data_path='train.csv'
train = pd.read_csv(train_data_path)
test_texts = pd.read_csv('test.csv')
test_labels = pd.read_csv('test_labels.csv')

test_labels = test_labels.iloc[:,1:]
test = test_texts.join(test_labels)
test = test[test['toxic']!=-1]

train_texts = train.iloc[:,1].values
train_labels = train.iloc[:,2:].values

test_texts = test.iloc[:,1].values
test_labels = test.iloc[:,2:].values

In [5]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
check_line = tokenizer.tokenize(train_texts[0])
print(check_line)

['explanation', 'why', 'the', 'edit', '##s', 'made', 'under', 'my', 'user', '##name', 'hardcore', 'metallic', '##a', 'fan', 'were', 'reverted', '?', 'they', 'weren', "'", 't', 'van', '##dal', '##isms', ',', 'just', 'closure', 'on', 'some', 'gas', 'after', 'i', 'voted', 'at', 'new', 'york', 'dolls', 'fa', '##c', '.', 'and', 'please', 'don', "'", 't', 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', 'i', "'", 'm', 'retired', 'now', '.', '89', '.', '205', '.', '38', '.', '27']


In [9]:
def vectorize_elem(text, labels, max_len=128):
    
    new_words = tokenizer.tokenize(text)
    input_masks = [1]*len(new_words)

    new_words = ['[CLS]'] + new_words[:max_len-2] + ['[SEP]']
    input_masks = [1] + input_masks[:max_len-2] + [1]
  
    input_ids = tokenizer.convert_tokens_to_ids(new_words)
    #right padding
    padding = [0] * (max_len - len(input_ids))
    input_ids += padding 
    input_masks += padding

    assert len(input_ids) == max_len
    assert len(input_masks) == max_len

    label_ids = list(labels) 

    return torch.tensor(input_ids), torch.tensor(input_masks), torch.tensor(label_ids)
    

In [11]:
def preprocess(X, y, n):
    train, attention_masks_train, y_train = [], [], []
    data_train=[]
    for i, (t, l) in enumerate(zip(X, y)):
        if i%10000 == 0:
            print(i)
        ids, masks, labels = vectorize_elem(t, l, max_len=n)
        train.append(ids)
        attention_masks_train.append(masks)
        y_train.append(labels)

    for i in range(len(train)):
        data_train.append((train[i], attention_masks_train[i], y_train[i]))

    return data_train

In [12]:
%%time
MAX_LEN = 128
X_train = preprocess(train_texts, train_labels,n=MAX_LEN)
print('finished preprocessing of train set')
X_test = preprocess(test_texts, test_labels, n=MAX_LEN)
print('finished preprocessing of test set')

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
finished preprocessing of train set
0
10000
20000
30000
40000
50000
60000
finished preprocessing of test set
CPU times: user 4min 14s, sys: 1.1 s, total: 4min 15s
Wall time: 4min 15s


In [15]:
def accuracy(preds, labs):
    num_right = torch.sum(preds == labs).item()
    return num_right/preds.shape[0]

In [16]:
def right(preds, labs):
    #just a number of right answers in the batch
    num_right = torch.sum(predictions == labels)
    return num_right.item()

In [17]:
def train_epoch(model, train_dataset, test_dataset, bs_train, bs_test):
    train_losses = []
    train_right_answers = 0
    test_losses = []
    test_accuracies = []
    
   
    train_loader = DataLoader(train_dataset, batch_size=bs_train, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=bs_test, shuffle=False)

    for j, train_batch, in enumerate(train_loader):
        if j%100==9:
            print(j)
        #train
        model.train()
        b_ids, b_mask, b_labels = train_batch
        b_ids = b_ids.to(device)
        b_mask = b_mask.to(device)
        b_labels = b_labels.to(device)

        model.zero_grad()

        output = model(b_ids, token_type_ids=None, attention_mask=b_mask, labels=b_labels)
        loss = output[0]
        logits = output[1]

        train_losses.append(loss.item())
        train_right_answers += right(logits, b_labels)

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        #test
        #model.eval()
#
        #b_ids, b_mask, b_labels = test_batch
#
        #b_ids = b_ids.to(device)
        #b_mask = b_mask.to(device)
        #b_labels = b_labels.to(device)
        if j % 500 == 9:
            model.eval()
            test_loss = 0
            test_right_answer = 0
            for test_batch in test_loader:
                test_ids, test_mask, test_labels = test_batch
                test_ids.to(device)
                test_mask.to(device)
                test_labels.to(device)
                with torch.no_grad():
                    output = model(test_ids, token_type_ids=None, attention_mask=test_mask, labels=test_labels)
                    loss = output[0]
                    logits = output[1]

                    test_loss += loss.item()
                    test_right_answer += right(logits, test_labels)
            test_losses.append(test_loss)
            test_accuracies.append(test_right_answer/len(test_dataset))
    
  
    print("Last average batch_loss on train: {}, \t Accuracy: {}".format(train_losses[-1], 
                                                                         train_right_answers/(j+1)/bs))
    print("Last average batch_loss on test: {}, \t Accuracy: {}".format(test_losses[-1], 
                                                                        test_right_answer/len(test_dataset)))
    return train_losses, train_right_answers/(j+1)/bs, test_losses, test_accuracies




In [18]:
%%time
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup

#for lr in [1e-3, 5e-4, 1e-4, 5e-5, 1e-5, 5e-6, 1e-6]:
BATCH_SIZE = 16
BATCH_SIZE_TEST = 64
MAX_N_EPOCHS = 3
LEARNING_RATE = 2e-5



model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6).to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1,
                                               num_training_steps=(len(X_train) // BATCH_SIZE)*MAX_N_EPOCHS)
#lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0.1,
                                    #t_total=(len(X_train) // BATCH_SIZE)*MAX_N_EPOCHS)

train_losses = []
train_accs = []
test_losses = []
test_accs = []
n_batches = len(X_train)//BATCH_SIZE

for i in range(MAX_N_EPOCHS):
    stats = train_epoch(model, X_train, X_test, bs_train=BATCH_SIZE, bs_test=BATCH_SIZE_TEST)
    print(f"Epoch {i} finished.")
    train_losses += stats[0]
    train_accs.append(stats[1])
    test_losses += stats[2]
    test_accs += stats[3]

ValueError: Expected input batch_size (16) to match target batch_size (96).

In [29]:
p = DataLoader(X_train, batch_size=4, shuffle=True)
for i, train_batch in enumerate(p):
    a, b, c = train_batch
    break

In [31]:
print(a.shape)

torch.Size([4, 128])


In [32]:
print(b.shape)

torch.Size([4, 128])


In [48]:
print(c.unsqueeze(0).shape)

torch.Size([1, 4, 6])


In [33]:
print(c.shape)

torch.Size([4, 6])


In [45]:
model_test = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)

In [51]:
out = model_test(a, token_type_ids=None, attention_mask=b, labels=)

ValueError: Expected input batch_size (4) to match target batch_size (24).