In [1]:
#@title Installing libraries { display-mode: "form" }
!pip3 install transformers 
!pip3 install tensorboard 
!pip3 install simpletransformers
!pip3 install jsonlines
!pip3 install pytelegrambotapi
!pip3 install --upgrade tqdm
!pip3 install tg-logger

Requirement already up-to-date: tqdm in /usr/local/lib/python3.7/dist-packages (4.58.0)


In [2]:
#@title Setup & Config & Imports
# !mkdir /usr/local/lib/python3.6/dist-packages/tqdm-4.41.1.dist-info/METADATA
# Notebook progress bar; QADataset wraps its file-reading and slicing loops in it.
from tqdm.notebook import tqdm
# Disabled experiment: a tqdm subclass that turned progress bars off unless overridden.
# class _TQDM(tqdm.tqdm):
#     def __init__(self, *argv, **kwargs):
#         kwargs['disable'] = True
#         if kwargs.get('disable_override', 'def') != 'def':
#             kwargs['disable'] = kwargs['disable_override']
#         super().__init__(*argv, **kwargs)
# tqdm.tqdm = _TQDM
 
import pandas as pd
import numpy as np
import json
import jsonlines
import re
from collections import defaultdict, Counter
 
import matplotlib.pyplot as plt
import matplotlib as mpl
# Draw figure text, axis labels and tick labels in white.
mpl.rcParams['text.color'] = mpl.rcParams['axes.labelcolor'] = mpl.rcParams['xtick.color'] = mpl.rcParams['ytick.color'] = 'white'
 
# Populates the interactive namespace from numpy and matplotlib (see the cell output).
# NOTE(review): no cell imports `random`, so the bare `random.randint` calls in QADataset
# presumably resolve to the `numpy.random` module injected here - confirm.
%pylab inline
# Star import that supplies precision_score, recall_score, f1_score, accuracy_score,
# classification_report and confusion_matrix to the evaluation cells.
from sklearn.metrics import *
 
# Both branches run the identical import: if the first attempt raises FileNotFoundError or
# AttributeError, it is simply tried again.
# NOTE(review): presumably a workaround for a failing first import - confirm it is still needed.
try:
    from simpletransformers.classification import ClassificationModel, ClassificationArgs
except (FileNotFoundError, AttributeError):
    from simpletransformers.classification import ClassificationModel, ClassificationArgs
# Not referenced by any cell visible in this notebook.
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs
 
%load_ext tensorboard
 
import string
 
# Random seed; passed to ClassificationArgs as manual_seed.
RS = 179
# Not referenced by any cell visible in this notebook.
PUNCTUATION = string.punctuation
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

Populating the interactive namespace from numpy and matplotlib


In [3]:
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [4]:
# Delete the sample_data/ directory from the working directory (-f: no error if it is absent).
!rm -rf sample_data/

# Downloading data

In [5]:
!curl https://onti2020.ai-academy.ru/task/rucos.zip -o data.zip
!unzip -o data.zip

!mv RuCoS/ data/

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 53.6M  100 53.6M    0     0  9311k      0  0:00:05  0:00:05 --:--:-- 13.0M
Archive:  data.zip
   creating: RuCoS/
  inflating: __MACOSX/._RuCoS        
  inflating: RuCoS/.DS_Store         
  inflating: __MACOSX/RuCoS/._.DS_Store  
  inflating: RuCoS/rucos_test.jsonl  
  inflating: __MACOSX/RuCoS/._rucos_test.jsonl  
  inflating: RuCoS/rucos_val.jsonl   
  inflating: __MACOSX/RuCoS/._rucos_val.jsonl  
  inflating: RuCoS/rucos_train.jsonl  
  inflating: __MACOSX/RuCoS/._rucos_train.jsonl  


In [6]:
# List the extracted files with sizes to confirm the train/val/test jsonl files are in data/.
!ls -lah data/

total 210M
drwxr-xr-x 2 root root 4.0K Feb 26 12:40 .
drwxr-xr-x 1 root root 4.0K Mar  2 15:05 ..
-rw-r--r-- 1 root root 6.1K Nov 10 21:06 .DS_Store
-rwxr-xr-x 1 root root  17M Oct 15 03:28 rucos_test.jsonl
-rwxr-xr-x 1 root root 175M Oct 15 03:26 rucos_train.jsonl
-rwxr-xr-x 1 root root  19M Oct 15 03:25 rucos_val.jsonl


# Process data

In [7]:
class QADataset:
    """Candidate-answer classification view of a jsonl question-answering file.

    Every line of the file is a JSON object with a ``passage`` (``text`` plus
    ``entities`` given as ``start``/``end`` character offsets), an ``idx`` and a
    list of ``qas`` whose ``query`` contains an ``@placeholder`` token.  For each
    query one row is produced per distinct entity string, with the placeholder
    replaced by that entity.  When the query carries ``answers`` the row also gets
    a binary ``label`` (1 if the entity equals the answer text after stripping).

    Attributes:
        path: path the dataset was read from.
        text: passage texts (with ``@``-markers replaced by spaces), indexed by ``text_id``.
        data: DataFrame indexed by ``idx`` with columns ``text_id``, ``query`` and,
            when answers were present, ``label``.
    """

    # Word budget of one example: passage window + query words + RESERVED_WORDS == MAX_WORDS.
    # NOTE(review): the 5 reserved slots presumably leave room for special tokens - confirm.
    MAX_WORDS = 512
    RESERVED_WORDS = 5

    def __init__(self, path, making_smaller=False, lim=-1):
        """Read ``path`` and build the candidate table.

        Args:
            path (string): Path to jsonl file.
            making_smaller (bool): If True, negative candidates (label 0) are kept
                only when ``np.random.randint(1, 10) == 1``, i.e. with probability
                1/9 because the upper bound is exclusive.  Positives are always kept.
            lim (int): Number of leading lines to read; -1 reads the whole file.
        """
        self.path = path
        self.text = []
        data = []

        # Explicit encoding: do not depend on the platform default for non-ASCII text.
        with open(path, 'r', encoding='utf-8') as json_file:
            json_list = list(json_file)
        if lim != -1:
            json_list = json_list[:lim]

        for json_str in tqdm(json_list):
            item = json.loads(json_str)
            text = item['passage']['text']
            self.text.append(text)
            text_id = len(self.text) - 1

            # Distinct entity strings.  The candidate order is the iteration order of this
            # set; save_pred() rebuilds the set the same way to map predictions back, so the
            # construction must not be changed on one side only.
            used = set()
            for en in item['passage']['entities']:
                used.add(text[en['start']:en['end']])
            entities = list(used)

            for row in item['qas']:
                q_text = row['query']
                # Unlabelled files have no 'answers' key: emit the candidates once, without label.
                # With several answers the candidates are emitted once per answer.
                for ans in row.get('answers', [None]):
                    for i, word in enumerate(entities):
                        record = {"idx": f'{item["idx"]}/{i}',
                                  "text_id": text_id,
                                  "query": q_text.replace('@placeholder', word)}
                        if ans:
                            label = int(ans['text'].strip() == word.strip())
                            if making_smaller and not label and np.random.randint(1, 10) != 1:
                                continue  # down-sample negatives
                            record["label"] = label
                        data.append(record)

            # Entity offsets refer to the raw text, so the markup is stripped only now:
            # @-prefixed markers and literal backslash-n sequences become spaces.
            self.text[-1] = re.sub(r'(@\w+)|(\\n)', " ", text)
        self.data = pd.DataFrame.from_dict(data).set_index("idx")

    def __len__(self):
        """Number of candidate rows."""
        return len(self.data)

    def _build_example(self, idx, row):
        """Return ``[idx, text_a, text_b, label]`` for one row of ``self.data``.

        ``text_a`` is a random contiguous window of passage words, sized so that window
        and query together stay within the word budget; ``text_b`` is the query.
        """
        words = self.text[row['text_id']].split()
        query = row['query']

        window = self.MAX_WORDS - len(query.split()) - self.RESERVED_WORDS
        # max(1, ...) keeps the upper bound valid for passages shorter than the window,
        # in which case the start is always 0 and the whole passage is used.
        start = np.random.randint(0, max(1, len(words) - window))
        passage = ' '.join(words[start:start + window])

        label = row['label'] if 'label' in self.data.columns else None
        return [idx, passage, query, label]

    def __getitem__(self, ind):
        """Materialise model-ready examples.

        Args:
            ind: an integer position or a slice of positions into ``self.data``.

        Returns:
            DataFrame indexed by ``idx`` with columns ``text_a``, ``text_b``, ``labels``
            (``labels`` is None when the dataset has no label column).

        Raises:
            TypeError: if ``ind`` is neither an integer nor a slice.
        """
        if isinstance(ind, (int, np.integer)):
            res = [self._build_example(self.data.index[ind], self.data.iloc[ind])]
        elif isinstance(ind, slice):
            rows = self.data.iloc[ind]
            res = [self._build_example(idx, row)
                   for idx, row in tqdm(rows.iterrows(), total=len(rows))]
        else:
            raise TypeError(f"QADataset indices must be integers or slices, not {type(ind).__name__}")
        return pd.DataFrame(res, columns=["idx", "text_a", "text_b", "labels"]).set_index("idx")

    def get_df(self):
        """Return a DataFrame with the filled-in queries as ``text`` and their ``labels``.

        ``labels`` is all None when the dataset has no label column.
        """
        # The table has no 'data' column; the filled-in query is the text of a candidate.
        texts = self.data['query']
        labels = self.data['label'] if 'label' in self.data.columns else [None] * self.data.shape[0]

        return pd.DataFrame({
                    'text': texts,
                    'labels': labels
                })

In [8]:
# Seed NumPy's global RNG before building the datasets so that the random down-sampling of
# negative candidates (making_smaller=True) yields the same training table on every run.
np.random.seed(RS)

# Training split: first 7000 passages only, with most negative candidates dropped.
train_df = QADataset('data/rucos_train.jsonl', making_smaller=True, lim=7000)
val_df = QADataset('data/rucos_val.jsonl')
test_df = QADataset('data/rucos_test.jsonl')

  0%|          | 0/7000 [00:00<?, ?it/s]

  0%|          | 0/7577 [00:00<?, ?it/s]

  0%|          | 0/7257 [00:00<?, ?it/s]

In [9]:
# Preview the first ten training examples as the idx / text_a / text_b / labels table.
train_df[:10]

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0_level_0,text_a,text_b,labels
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0/2,"Наблюдатели полагают, что подоплекой теракта в...","Кроме того, серьезным вызовом для России стано...",1
0/4,"Наблюдатели полагают, что подоплекой теракта в...","Кроме того, серьезным вызовом для России стано...",0
1/7,О вторжении на Украину танковой колонны из РФ ...,Россия категорически опровергла сообщение СНБО...,0
1/9,О вторжении на Украину танковой колонны из РФ ...,Россия категорически опровергла сообщение ОПЕК...,0
1/13,О вторжении на Украину танковой колонны из РФ ...,Россия категорически опровергла сообщение Лысе...,1
1/5,О вторжении на Украину танковой колонны из РФ ...,Россия категорически опровергла сообщение Андр...,1
2/0,Год назад Владимир Путин вновь стал президенто...,"Инго Маннтойфель, руководитель отдела Восточно...",1
2/6,Год назад Владимир Путин вновь стал президенто...,"Кремль, руководитель отдела Восточной Европы и...",0
3/3,Союз девяти ведущих технических университетов ...,Германию позиционирует себя как союз независим...,0
3/5,Союз девяти ведущих технических университетов ...,TU9 позиционирует себя как союз независимых ун...,1


# Model params

In [10]:
# Download the Google Drive file with this id; per the log below it is saved as RuBERT.tar.gz.
!gdown --id 1Mw3fgA35nqIRFb8lQZDkwZGUWJEnjaGJ

Downloading...
From: https://drive.google.com/uc?id=1Mw3fgA35nqIRFb8lQZDkwZGUWJEnjaGJ
To: /content/RuBERT.tar.gz
662MB [00:06, 104MB/s]


In [11]:
!tar -xvf RuBERT.tar.gz
!mv rubert_cased_L-12_H-768_A-12_pt/ RuBERT/
!mv RuBERT/bert_config.json RuBERT/config.json

rubert_cased_L-12_H-768_A-12_pt/
rubert_cased_L-12_H-768_A-12_pt/bert_config.json
rubert_cased_L-12_H-768_A-12_pt/vocab.txt
rubert_cased_L-12_H-768_A-12_pt/pytorch_model.bin


In [12]:
# @title Model params { run: "auto" }
# Hyper-parameters read by the cell that builds ClassificationModel / ClassificationArgs.
# NOTE(review): the trailing "#@param" annotations presumably drive Colab form widgets - keep them verbatim.
model_name = "RuBERT/" #@param {type:"string"}
num_train_epochs = 2 #@param {type:"slider", min:1, max:50, step:1}
use_cuda = True #@param ["True", "False"] {type:"raw"}
learning_rate =  5e-5 #@param {type:"number"}
train_batch_size = 8 #@param {type:"slider", min:1, max:100, step:1}
# Passed to ClassificationArgs(no_save=...).
no_save = True #@param ["True", "False"] {type:"raw"}
adam_epsilon =  3e-8 #@param {type:"number"}

# Creating model

In [17]:
# Count the training labels straight from the dataset's table.  Slicing `train_df[:]` would
# rebuild every example (one random passage window per row) only to read the same labels.
cnt = Counter(train_df.data['label'])
cnt

  0%|          | 0/37988 [00:00<?, ?it/s]

Counter({0: 20142, 1: 17846})

In [18]:
# Training/runtime configuration, assembled on its own so the model call stays short.
model_args = ClassificationArgs(
    # optimisation
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    adam_epsilon=adam_epsilon,
    # input handling and reproducibility
    train_batch_size=train_batch_size,
    max_seq_length=512,
    manual_seed=RS,
    # caching / checkpointing
    overwrite_output_dir=True,
    no_cache=False,
    no_save=no_save,
    save_eval_checkpoints=False,
    save_model_every_epoch=False,
    save_steps=-1,
)

# One weight per class: each class is weighted by the number of training rows of the
# *other* class, so the rarer class receives the larger weight.
class_weights = [cnt[1], cnt[0]]

model = ClassificationModel(
    model_type='bert',
    model_name=model_name,
    use_cuda=use_cuda,
    num_labels=2,
    weight=class_weights,
    args=model_args,
)

Some weights of the model checkpoint at RuBERT/ were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at RuBERT/ a

# Training

In [19]:
# Fine-tune on all training rows; `train_df[:]` builds the text_a / text_b / labels DataFrame.
model.train_model(train_df[:])

  0%|          | 0/37988 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/4749 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/4749 [00:00<?, ?it/s]

(9498, 0.4722772575839358)

# Evaluate the model

In [27]:
def eval_info(model, predictions, y_test):
    """Print classification metrics for ``predictions`` and plot the confusion matrix.

    Args:
        model: object printed as the header of the report.
        predictions: predicted class labels.
        y_test: ground-truth class labels, aligned with ``predictions``.
    """
    print(model)
    print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro', zero_division=0)))
    print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro', zero_division=0)))
    print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro', zero_division=0)))
    print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))
    print(classification_report(y_test, predictions, zero_division=0))

    labels = ['0', '1']
    matrix = confusion_matrix(y_test, predictions)

    # Drawn with matplotlib directly: seaborn is never imported in this notebook, so the
    # previous `sns.heatmap(...)` call raised NameError.
    fig, ax = plt.subplots()
    ax.imshow(matrix, cmap='Blues')
    n_rows, n_cols = matrix.shape
    ax.set_xticks(range(n_cols))
    ax.set_xticklabels(labels[:n_cols])
    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(labels[:n_rows])

    # Annotate every cell with its count; switch the text colour on dark cells so the
    # number stays readable.
    threshold = matrix.max() / 2 if matrix.size else 0
    for r in range(n_rows):
        for c in range(n_cols):
            ax.text(c, r, format(matrix[r, c], 'd'),
                    ha='center', va='center',
                    color='white' if matrix[r, c] > threshold else 'black')

    ax.set_title("Confusion matrix")
    plt.show()

In [None]:
# Build the validation examples once and keep them in `data`; the next cell reads `data.labels`.
data = val_df[:]

# Run the model on (passage window, query) pairs; the two results are unpacked as the
# predicted labels and the raw model outputs.
predictions1, raw_outputs1 = model.predict(list(zip(data.text_a, data.text_b)))

  0%|          | 0/256003 [00:00<?, ?it/s]

In [None]:
# Macro-averaged F1 on the validation predictions; `f1` is also quoted in the caption of the
# Telegram message sent at the end of the notebook.
f1 = f1_score(data.labels, predictions1, average='macro', zero_division=0)

# Full metric report plus confusion matrix for the same predictions.
eval_info(model, predictions1, data.labels)

# Predict

In [20]:
def save_pred(filename, preds, test_path='data/rucos_test.jsonl', out_dir='submitions'):
    """Pick the best-scoring entity for every test passage and write a jsonl submission.

    ``preds`` holds one score per candidate row, in the order in which QADataset emits
    candidates for the same file: passage by passage, one score per distinct entity string.
    NOTE(review): this assumes exactly one query per passage - confirm for the test file.

    Args:
        filename: name of the submission file created inside ``out_dir``.
        preds: sequence of scores (higher = more likely the answer), one per candidate.
        test_path: jsonl file the predictions were made for.
        out_dir: directory for the submission; created if missing.

    Raises:
        IndexError: if ``preds`` runs out before all passages are covered.
    """
    import os
    import warnings

    with open(test_path, 'r', encoding='utf-8') as json_file:
        test = [json.loads(line) for line in json_file]

    ind = 0
    res = []
    for item in test:
        passage = item['passage']

        # Rebuild the distinct entity strings exactly as QADataset does, so that the
        # candidate order matches the order of the scores in `preds`.
        used = set()
        for en in passage['entities']:
            used.add(passage['text'][en['start']:en['end']])
        entities = list(used)

        scores = preds[ind:ind + len(entities)]
        if len(scores) < len(entities):
            raise IndexError(
                f"preds has {len(preds)} scores, not enough for passage idx={item['idx']}")
        ind += len(entities)

        # A passage without entities has no candidates (and no scores): emit an empty
        # answer instead of failing on the argmax of an empty array.
        answer = entities[int(np.argmax(scores))] if entities else ""
        res.append({"idx": item["idx"], "text": answer})

    if ind != len(preds):
        warnings.warn(
            f"save_pred used {ind} of {len(preds)} predictions; "
            "predictions and test file may be misaligned")

    # exist_ok: re-running must not fail or complain when the directory is already there.
    os.makedirs(out_dir, exist_ok=True)

    with jsonlines.open(os.path.join(out_dir, filename), 'w') as file:
        file.write_all(res)

In [21]:
# Materialise the test examples once: every `test_df[:]` call rebuilds all rows and draws a
# new random passage window for each, so slicing twice doubled the preprocessing work.
test_data = test_df[:]
predictions, raw_outputs = model.predict(list(zip(test_data.text_a, test_data.text_b)))

  0%|          | 0/67751 [00:00<?, ?it/s]

  0%|          | 0/67751 [00:00<?, ?it/s]

  0%|          | 0/8469 [00:00<?, ?it/s]

In [22]:
# Distribution of the predicted labels on the test set.
# NOTE: this rebinds `cnt`, which until here held the training label counts used for the
# class weights; the Telegram caption cell reads this new value.
cnt = Counter(predictions)

# Raw counts, then the share of each label in percent.
print(f'    cnt: {cnt}')
print(f"0 label: {cnt[0] / len(predictions) * 100:.3f}%")
print(f"1 label: {cnt[1] / len(predictions) * 100:.3f}%")

    cnt: Counter({0: 46578, 1: 21173})
0 label: 68.749%
1 label: 31.251%


In [23]:
from scipy.special import softmax

# Row-wise softmax over the raw outputs, then keep only the column of class 1.
class_proba = np.asarray(softmax(raw_outputs, axis=1))
pred_proba = class_proba[:, 1]
pred_proba

array([0.00406223, 0.91895426, 0.00259603, ..., 0.06822643, 0.00262144,
       0.97254539])

In [24]:
from datetime import datetime

# Timestamped submission name built with strftime so it contains no spaces or colons
# (str(datetime.now()) has the form "YYYY-MM-DD HH:MM:SS.ffffff", which is awkward in paths).
file_name = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.jsonl'
save_pred(file_name, pred_proba)

In [25]:
#@title tg-logger (doc sender) { run: "auto", vertical-output: true, display-mode: "form" }
import os
from getpass import getpass

import tg_logger

# The bot token is a credential and must not live in the notebook source: read it from the
# TG_BOT_TOKEN environment variable, or ask for it interactively without echoing it.
# NOTE(review): the token that used to be hard-coded here should be treated as leaked and revoked.
token = os.environ.get("TG_BOT_TOKEN") or getpass("Telegram bot token: ")
user_id = -584355986 #@param {type:"integer"}

tg_files_logger = tg_logger.TgFileLogger(
    token=token,  # tg bot token
    users=[user_id],  # list of user_id
    timeout=10  # 10 seconds by default
)

In [26]:
import time

MAX_SEND_ATTEMPTS = 5

# `f1` exists only if the validation cells were executed; report "n/a" otherwise instead
# of failing while the caption is built.
f1_text = f'{f1 * 100:.2f}%' if 'f1' in globals() else 'n/a'
caption = (f'Model: {model_name}\nLR: {learning_rate}\nAdam eps: {adam_epsilon}\n'
           f'Epochs: {num_train_epochs}\nTrain len: {len(train_df)}\n{cnt}\n'
           f'Label 0: {cnt[0] / len(predictions) * 100:.3f}%\n'
           f'Label 1: {cnt[1] / len(predictions) * 100:.3f}%\n'
           f'F1 score on val set: {f1_text}')
submission_path = 'submitions/' + file_name

# Send through the TgFileLogger configured above (no `bot` object is ever created in this
# notebook).  Retries are bounded and the last error is re-raised: the former
# `while True` + bare `except` turned any permanent error into an endless silent loop.
for attempt in range(1, MAX_SEND_ATTEMPTS + 1):
    try:
        tg_files_logger.send(submission_path, caption)
        break
    except Exception as err:
        print(f'Sending failed (attempt {attempt}/{MAX_SEND_ATTEMPTS}): {err!r}')
        if attempt == MAX_SEND_ATTEMPTS:
            raise
        time.sleep(1)