In [None]:
#@title Installing libraries { display-mode: "form" }
!pip3 install transformers 
!pip3 install tensorboard 
!pip3 install simpletransformers
!pip3 install jsonlines
!pip3 install pytelegrambotapi
!pip3 install --upgrade tqdm
!pip3 install tg-logger

In [1]:
#@title Setup & Config & Imports
# !mkdir /usr/local/lib/python3.6/dist-packages/tqdm-4.41.1.dist-info/METADATA
from tqdm.notebook import tqdm
# class _TQDM(tqdm.tqdm):
#     def __init__(self, *argv, **kwargs):
#         kwargs['disable'] = True
#         if kwargs.get('disable_override', 'def') != 'def':
#             kwargs['disable'] = kwargs['disable_override']
#         super().__init__(*argv, **kwargs)
# tqdm.tqdm = _TQDM
 
import pandas as pd
import numpy as np
import json
import jsonlines
import re
from collections import defaultdict, Counter
 
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['text.color'] = mpl.rcParams['axes.labelcolor'] = mpl.rcParams['xtick.color'] = mpl.rcParams['ytick.color'] = 'white'
 
%pylab inline
from sklearn.metrics import *
 
try:
    from simpletransformers.classification import ClassificationModel, ClassificationArgs
except (FileNotFoundError, AttributeError):
    from simpletransformers.classification import ClassificationModel, ClassificationArgs
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs
 
%load_ext tensorboard
 
import string
 
RS = 179

import random

random.seed(RS)
PUNCTUATION = string.punctuation
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

Populating the interactive namespace from numpy and matplotlib


In [None]:
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [None]:
!rm -rf sample_data/

# Downloading data

In [2]:
!curl https://onti2020.ai-academy.ru/task/rucos.zip -o data.zip
!unzip -o data.zip

!mv RuCoS/ data/

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 53.6M  100 53.6M    0     0  10.0M      0  0:00:05  0:00:05 --:--:-- 11.5M
Archive:  data.zip
   creating: RuCoS/
  inflating: __MACOSX/._RuCoS        
  inflating: RuCoS/.DS_Store         
  inflating: __MACOSX/RuCoS/._.DS_Store  
  inflating: RuCoS/rucos_test.jsonl  
  inflating: __MACOSX/RuCoS/._rucos_test.jsonl  
  inflating: RuCoS/rucos_val.jsonl   
  inflating: __MACOSX/RuCoS/._rucos_val.jsonl  
  inflating: RuCoS/rucos_train.jsonl  
  inflating: __MACOSX/RuCoS/._rucos_train.jsonl  


In [3]:
!ls -lah data/

total 210M
drwxr-xr-x 2 root root 4.0K Feb 26 12:40 .
drwxr-xr-x 1 root root 4.0K Mar  3 11:29 ..
-rw-r--r-- 1 root root 6.1K Nov 10 21:06 .DS_Store
-rwxr-xr-x 1 root root  17M Oct 15 03:28 rucos_test.jsonl
-rwxr-xr-x 1 root root 175M Oct 15 03:26 rucos_train.jsonl
-rwxr-xr-x 1 root root  19M Oct 15 03:25 rucos_val.jsonl


# Process data

In [4]:
class QADataset: 
    """Question and answers dataset""" 
     
    def __init__(self, path, making_smaller=False, lim=-1): 
        """ 
        Args: 
            path (string): Path to jsonl file 
        """ 
        random.seed(RS)
        self.path = path 
        data = [] 
        self.text = [] 
        with open(path, 'r') as json_file: 
            json_list = list(json_file) 
            for json_str in tqdm(json_list if lim == -1 else json_list[:lim]): 
                item = json.loads(json_str) 
                text = item['passage']['text'] 
                self.text.append(text) 
                entities = item['passage']['entities'] 
                used = list()
                for en in entities:
                    word = text[en['start']:en['end']] 
                    if word not in used:
                        used.append(word)
                entities = list(used)

                for row in item['qas']: 
                    for ans in row.get('answers', [None]): 
                        q_text = row['query'] 
                        for i, word in enumerate(entities): 
                            n_q = q_text.replace('@placeholder', word) 
                            data.append({"idx": f'{item["idx"]}/{i}', 
                                         "text_id": len(self.text) - 1,
                                         "query": n_q})
                            
                            if not ans:
                                continue 
 
                            label = int(ans['text'].strip() == word.strip())

                            tmp = random.randint(1, 10)
                            if not making_smaller or label or tmp == 1:
                                data[-1]["label"] = label
                            else:
                                data.pop()
                self.text[-1] = re.sub(r'(@\w+)|(\\n)', " ", self.text[-1])
        self.data = pd.DataFrame.from_dict(data).set_index("idx")
     
    def __len__(self): 
        return len(self.data) 
 
    def __getitem__(self, ind):  
        if type(ind) == int: 
            row = self.data.iloc[ind] 
            text_a = self.text[row.text_id].split() 
            text_b = row.query 
 
            w = 512 - len(text_b.split()) - 5 
            e = random.randint(0, len(text_a) - w) 
            text_a = ' '.join(text_a[e:e + w]) 
 
            res = [[list(self.data.index)[ind], text_a, text_b, row.label if 'label' in self.data.columns else None]] 
            return pd.DataFrame(res, columns=["idx", "text_a", "text_b", "labels"]).set_index("idx") 
        if type(ind) == slice: 
            res = []
            rows = self.data.iloc[ind] 
            for idx, row in tqdm(rows.iterrows(), total=len(rows)): 
                text_a = self.text[row.text_id].split() 
                text_b = row.query 
 
                w = 512 - len(text_b.split()) - 5 
                e = random.randint(0, max(1, len(text_a) - w)) 
                text_a = ' '.join(text_a[e:e + w]) 
 
                res.append([idx, text_a, text_b, row.label if 'label' in self.data.columns else None]) 
            return pd.DataFrame(res, columns=["idx", "text_a", "text_b", "labels"]).set_index("idx")
 
 
    def get_df(self): 
        texts = self.data['data'] 
        labels = self.data['label'] if 'label' in self.data.columns else [None] * self.data.shape[0] 
 
        return pd.DataFrame({
                    'text': texts,
                    'labels': labels
                })

In [5]:
train_df = QADataset('data/rucos_train.jsonl', making_smaller=True)
# val_df = QADataset('data/rucos_val.jsonl')
test_df = QADataset('data/rucos_test.jsonl')

  0%|          | 0/72193 [00:00<?, ?it/s]

  0%|          | 0/7257 [00:00<?, ?it/s]

In [6]:
train_df[:10]

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0_level_0,text_a,text_b,labels
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0/3,"Наблюдатели полагают, что подоплекой теракта в...","Кроме того, серьезным вызовом для России стано...",0
0/7,"Наблюдатели полагают, что подоплекой теракта в...","Кроме того, серьезным вызовом для России стано...",1
1/4,О вторжении на Украину танковой колонны из РФ ...,Россия категорически опровергла сообщение Лысе...,1
1/7,О вторжении на Украину танковой колонны из РФ ...,Россия категорически опровергла сообщение Киев...,0
1/8,вторжении на Украину танковой колонны из РФ со...,Россия категорически опровергла сообщение Луга...,0
1/9,вторжении на Украину танковой колонны из РФ со...,Россия категорически опровергла сообщение Андр...,1
1/11,О вторжении на Украину танковой колонны из РФ ...,Россия категорически опровергла сообщение Герм...,0
2/1,назад Владимир Путин вновь стал президентом Ро...,"России, руководитель отдела Восточной Европы и...",0
2/3,Год назад Владимир Путин вновь стал президенто...,"Инго Маннтойфель, руководитель отдела Восточно...",1
2/6,Год назад Владимир Путин вновь стал президенто...,"Россия, руководитель отдела Восточной Европы и...",0


# Upload Model

In [7]:
!gdown --id 15Tf3n2U4g1uRmn0oijJrThtdHYEJ-85I

Downloading...
From: https://drive.google.com/uc?id=15Tf3n2U4g1uRmn0oijJrThtdHYEJ-85I
To: /content/model.bin
716MB [00:05, 133MB/s]


In [8]:
import torch

model = torch.load('model.bin')

In [9]:
def save_pred(filename, preds):
    with open('data/rucos_test.jsonl', 'r') as json_file:
        test = list(map(lambda x: json.loads(x), list(json_file)))
    ind = 0
    res = []
    
    for i in range(len(test)):
        ent_prob = []
        
        entities = test[i]['passage']['entities']
        used = set()
        for en in entities:
            word = test[i]['passage']['text'][en['start']:en['end']] 
            if word not in used:
                used.add(word)
        entities = list(used)
        
        for j in range(len(entities)):
            ent_prob.append(preds[ind])
            ind += 1
        ent_prob = np.array(ent_prob).argmax()
        
#         start, end = test[i]['passage']["entities"][ent_prob]['start'], test[i]['passage']["entities"][ent_prob]['end']
        answer = entities[ent_prob]
        res.append({"idx": test[i]["idx"], "text": answer})
    
    !mkdir submitions/
    
    with jsonlines.open("submitions/" + filename, 'w') as file:
        file.write_all(res)

In [10]:
!mkdir cache_dir/
! echo "Hello!" > cache_dir/cached_dev_bert_512_2_10

In [11]:
data = test_df[:]

predictions, raw_outputs = model.predict(list(zip(data.text_a, data.text_b)))

  0%|          | 0/67751 [00:00<?, ?it/s]

  0%|          | 0/8469 [00:00<?, ?it/s]

In [12]:
cnt = Counter(predictions)

print(f'    cnt: {cnt}')
print(f"0 label: {cnt[0] / len(predictions) * 100:.3f}%")
print(f"1 label: {cnt[1] / len(predictions) * 100:.3f}%")

    cnt: Counter({0: 51294, 1: 16457})
0 label: 75.710%
1 label: 24.290%


In [13]:
from scipy.special import softmax
pred_proba = np.array([elem[1] for elem in softmax(raw_outputs, axis=1)])
pred_proba

array([6.44943220e-04, 9.64153072e-01, 7.11056842e-04, ...,
       8.04098434e-01, 4.97016942e-01, 5.74786178e-04])

In [14]:
file_name = f'{__import__("datetime").datetime.now()}.jsonl'
save_pred(file_name, pred_proba)

In [15]:
#@title tg-logger (doc sender) { run: "auto", vertical-output: true, display-mode: "form" }
token = "1282076787:AAEC3rYsSHh_livmutl0qDEemVhlhjWpWyI" #@param {type:"string"}
user_id = -584355986 #@param {type:"integer"}

import tg_logger

tg_files_logger = tg_logger.TgFileLogger(
    token=token,  # tg bot token
    users=[user_id],  # list of user_id
    timeout=10  # 10 seconds by default
)

In [18]:
tg_files_logger.send('/content/submitions/' + file_name,
                    f'Model: Model-misha')


# Save raw_outputs

# Saving test

In [19]:
test_out = pd.DataFrame({'idx': list(test_df[:].index), "proba": list(pred_proba)})
test_out = test_out.set_index('idx')
test_out

  0%|          | 0/67751 [00:00<?, ?it/s]

Unnamed: 0_level_0,proba
idx,Unnamed: 1_level_1
0/0,0.000645
0/1,0.964153
0/2,0.000711
0/3,0.713881
0/4,0.000592
...,...
7256/2,0.000573
7256/3,0.002731
7256/4,0.804098
7256/5,0.497017


In [23]:
with open(f'out/test_out.pickle', 'wb') as f:
    pickle.dump(test_out, f)

import time

time.sleep(5)

tg_files_logger.send(f'out/test_out.pickle', "misha-model")

## Saving train

In [39]:
batch_size = 10000

In [40]:
train_raw_outputs = np.array([])

for ind in tqdm(range(0, len(train_df), batch_size)):
    data = train_df[ind:ind + batch_size]
    _, temp = model.predict(list(zip(data.text_a, data.text_b)))

    if len(train_raw_outputs) == 0:
        train_raw_outputs = temp.copy()
    else:
        train_raw_outputs = np.concatenate((train_raw_outputs, temp), axis=0)
    del data


train_pred_proba = np.array([elem[1] for elem in softmax(train_raw_outputs, axis=1)])
train_pred_proba

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/4911 [00:00<?, ?it/s]

  0%|          | 0/614 [00:00<?, ?it/s]

array([3.56114134e-02, 8.27758548e-01, 8.61712858e-01, ...,
       9.90997163e-01, 9.88624400e-01, 5.65879987e-04])

In [41]:
train_out = pd.DataFrame({'idx': train_df[:].index, "proba": train_pred_proba}, columns=["idx", "proba"]).set_index("idx")
train_out

  0%|          | 0/364911 [00:00<?, ?it/s]

Unnamed: 0_level_0,proba
idx,Unnamed: 1_level_1
0/3,0.035611
0/7,0.827759
1/4,0.861713
1/7,0.000691
1/8,0.000761
...,...
72192/7,0.988624
72192/8,0.008644
72192/7,0.990997
72192/7,0.988624


In [42]:
!mkdir out/
import pickle

mkdir: cannot create directory ‘out/’: File exists


In [45]:
with open(f'out/train_out.pickle', 'wb') as f:
    pickle.dump(train_out, f)

import time

time.sleep(5)

tg_files_logger.send('/content/out/train_out.pickle', "misha-model")