In [56]:
import math
import matplotlib as plt
from typing import List
from typing import Tuple

import gym
import numpy as np
from gym import logger
from gym import spaces
from gym.utils import seeding
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


import ipywidgets as widgets

from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased-sentence')
model = BertModel.from_pretrained('DeepPavlov/rubert-base-cased-sentence')


In [57]:
data_1 = pd.read_json('/Users/ruagmn9/PycharmProjects/raif-bootcamp-2021/data/umnik.json')


# umnik data

In [58]:
variants = pd.concat([data_1['variants'].apply(lambda x: x[i]) for i in range(4)],axis=1)
variants.columns = ['variant_1', 'variant_2', 'variant_3', 'variant_4']

data_1[variants.columns] = variants

data_1['tags'] = data_1['tags'].apply(lambda x: ' '.join(x))

data_1['reward'] = data_1.tags.apply(lambda x: x.split('руб.',1)[0][-15:])

data_1['reward'] = data_1['reward'].str.replace(" ", "").str.extract('(\d+)').astype(np.int32)

data_1 = data_1[(data_1.reward > 1) & (data_1.reward < 0.8e6)]

data_1 = data_1.drop(columns=['variants', 'tags'])

In [59]:
data_1.head()

Unnamed: 0,question,answer_index,variant_1,variant_2,variant_3,variant_4,reward
0,"Кто с кем остаётся один на один, когда назнача...",0,Игрок с вратарём,Вратарь с тренером,Судья с болельщиком,Комментатор со зрителем,1000
1,Что растёт на грядках?,1,Винегрет,Салат,Пудинг,Компот,500
2,Что умеет птичка оляпка?,0,бегать по дну,вить гнездо в песке,скользить по льду,охотиться ночью,25000
3,Сколько кубических сантиметров в кубометре?,0,миллион,тысяча,десять тысяч,сто тысяч,15000
4,Землю какой страны топчут священные коровы?,0,Индия,Япония,Турция,Тунис,5000



## boot data

In [60]:
data_2 = pd.read_csv('/Users/ruagmn9/PycharmProjects/raif-bootcamp-2021/data/boot_camp_train.csv')



In [61]:
data_2 = data_2.rename(columns={'Вопрос': 'question',
                    '1': 'variant_1',
                    '2': 'variant_2',
                    '3': 'variant_3',
                    '4': 'variant_4',
                    'Правильный ответ': 'answer_index'}).drop(columns=['Unnamed: 0'])

In [62]:
elements = [100, 200, 300, 500, 1000, 2000, 3000, 5000, 8000, 10000 ]
data_2['reward'] = np.random.choice(elements, data_2.shape[0])

In [63]:
data_2['answer_index'] -=1

In [64]:
data_2 = data_2.dropna()

In [65]:
data = pd.concat([data_1, data_2],axis=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [84]:
def str_to_embbeding(series, max_len = None):
    # todo to same padding
    # tokenize
    tokenized = series.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
    # 2  padding
    if max_len is None:
        max_len=0
        for i in tokenized.values:
            if len(i) > max_len:
                max_len = len(i)

    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    # Masking
    attention_mask = np.where(padded != 0, 1, 0)

    input_ids = torch.tensor(padded)  
    attention_mask = torch.tensor(attention_mask)
    
    #predict
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)

    features = last_hidden_states[0][:,0,:].numpy()

    return  pd.DataFrame(features).to_numpy().tolist()

In [85]:
%%time
data['emb_question'] = str_to_embbeding(data['question'])

CPU times: user 17min 5s, sys: 6min 39s, total: 23min 44s
Wall time: 22min 59s


In [86]:
%%time
data['emb_variant_1'] = str_to_embbeding(data['variant_1'])

CPU times: user 5min 23s, sys: 49.2 s, total: 6min 12s
Wall time: 5min 9s


In [87]:
%%time
data['emb_variant_2'] = str_to_embbeding(data['variant_2'])

CPU times: user 6min 12s, sys: 57 s, total: 7min 9s
Wall time: 5min 56s


In [88]:
%%time
data['emb_variant_3'] = str_to_embbeding(data['variant_3'])

CPU times: user 6min 16s, sys: 57 s, total: 7min 13s
Wall time: 5min 58s


In [89]:
%%time
data['emb_variant_4'] = str_to_embbeding(data['variant_4'])

CPU times: user 6min 20s, sys: 59.4 s, total: 7min 19s
Wall time: 6min 7s


max_len

In [91]:
dt = data.drop(columns=['question', 'variant_1', 'variant_2', 'variant_3', 'variant_4'])

In [92]:
dt.head()

Unnamed: 0,answer_index,reward,emb_question,emb_variant_1,emb_variant_2,emb_variant_3,emb_variant_4
0,0.0,1000,"[-0.10200223326683044, -1.4214788675308228, 0....","[-0.5423567295074463, -0.8888402581214905, 0.2...","[-0.9303133487701416, -0.40989843010902405, -0...","[-0.7537140846252441, -0.5191406607627869, -0....","[-1.007237434387207, -0.1734929084777832, 0.17..."
1,1.0,500,"[-0.4592267870903015, -0.34668463468551636, -0...","[-0.7928611636161804, -0.18484751880168915, -0...","[0.08326661586761475, -0.23326626420021057, -0...","[-0.5638677477836609, -0.13601256906986237, -0...","[-0.6220911145210266, -0.5006184577941895, -0...."
2,0.0,25000,"[-0.2850097417831421, -0.2843513488769531, -0....","[-0.47498512268066406, 0.6844162344932556, -0....","[-0.21893970668315887, 0.40654319524765015, -0...","[-0.28078991174697876, -1.3100969791412354, 0....","[-0.16805793344974518, -0.8447439074516296, -0..."
3,0.0,15000,"[-0.7940405607223511, 0.36787229776382446, -0....","[-0.5132683515548706, 0.19531279802322388, -0....","[-0.5314285159111023, 0.36896395683288574, -0....","[-0.7507291436195374, 0.3132028877735138, -0.2...","[-0.8408227562904358, 0.392622709274292, -0.01..."
4,0.0,5000,"[-0.6078025698661804, -0.07159502059221268, 0....","[-0.35910868644714355, 0.12508320808410645, 0....","[-1.0322422981262207, 0.5259677171707153, 0.63...","[-0.4389076828956604, 0.2178155481815338, 0.60...","[-0.118193618953228, -0.26641929149627686, -0...."


In [93]:
dt['state'] = dt.drop(columns=['answer_index', 'reward']).to_numpy().tolist()

dt.state = dt.state.apply(lambda x: np.array(x))

In [94]:
scaler = MinMaxScaler()


dt['reward'] = scaler.fit_transform(np.log1p(dt.reward).to_numpy().reshape(-1,1))

In [96]:
dt[['reward', 'answer_index', 'state']].to_pickle('/Users/ruagmn9/PycharmProjects/raif-bootcamp-2021/data/data_train.pkl')