In [134]:
import math
import matplotlib as plt
from typing import List
from typing import Tuple

import gym
import numpy as np
from gym import logger
from gym import spaces
from gym.utils import seeding
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


import ipywidgets as widgets

from transformers import BertTokenizer, BertModel
import torch

# Tokenizer and encoder are loaded from one checkpoint name so they cannot drift apart.
BERT_CHECKPOINT = 'DeepPavlov/rubert-base-cased-sentence'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [69]:
# Load the raw quiz dump.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_json('/Users/ruagmn9/PycharmProjects/raif-bootcamp-2021/data/umnik.json')

# Spread the per-question list of answer options into four separate text columns.
variants = pd.concat([data['variants'].apply(lambda options: options[i]) for i in range(4)], axis=1)
variants.columns = ['variant_1', 'variant_2', 'variant_3', 'variant_4']
data[variants.columns] = variants

# Tags arrive as a list of strings; join them so the prize can be searched as plain text.
data['tags'] = data['tags'].apply(lambda parts: ' '.join(parts))

# The prize is read from the 15 characters preceding the first 'руб.' in the tags:
# spaces are stripped and the first run of digits is kept.
reward_text = data['tags'].apply(lambda text: text.split('руб.', 1)[0][-15:])
reward_digits = (
    reward_text
    .str.replace(" ", "", regex=False)
    .str.extract(r'(\d+)', expand=False)  # raw string: '\d' is not a valid str escape
)
# Rows without any digits become NaN instead of crashing an integer cast;
# NaN fails both comparisons below, so those rows are dropped by the range filter.
data['reward'] = pd.to_numeric(reward_digits, errors='coerce')

# Keep plausible prizes only, then cast (safe now: no NaN, values below 0.8e6)
# and drop the raw columns that have been unpacked above.
data = (
    data[(data.reward > 1) & (data.reward < 0.8e6)]
    .astype({'reward': np.int32})
    .drop(columns=['variants', 'tags'])
)

In [132]:
def str_to_embbeding(series, batch_size=None):
    """Embed every string of ``series`` with the module-level BERT ``model``.

    Each text is encoded with the module-level ``tokenizer`` (special tokens
    included), right-padded with id 0 to the longest sequence in ``series`` and
    passed through ``model`` without gradient tracking. The hidden state at
    sequence position 0 of the model's first output is used as the embedding.

    Args:
        series: pandas Series of strings.
        batch_size: optional number of rows per forward pass. ``None`` (the
            default) sends all rows through the model in a single pass; a
            positive integer bounds peak memory for long inputs. Padding is
            always to the longest sequence of the whole series.

    Returns:
        A list with one list of floats per row of ``series``, in row order.
        An empty series yields an empty list.

    Raises:
        ValueError: if ``batch_size`` is given and is smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")
    if len(series) == 0:
        # Nothing to encode; the model cannot be called on an empty batch.
        return []

    token_ids = [tokenizer.encode(text, add_special_tokens=True) for text in series]

    # Right-pad to a rectangular id matrix. The mask is built from the real
    # sequence lengths rather than from "id != 0", so it does not depend on
    # which id the vocabulary assigns to padding.
    longest = max(len(ids) for ids in token_ids)
    padded = np.zeros((len(token_ids), longest), dtype=np.int64)
    mask = np.zeros((len(token_ids), longest), dtype=np.int64)
    for row, ids in enumerate(token_ids):
        padded[row, :len(ids)] = ids
        mask[row, :len(ids)] = 1

    input_ids = torch.from_numpy(padded)
    attention_mask = torch.from_numpy(mask)

    rows_per_pass = len(token_ids) if batch_size is None else int(batch_size)
    first_token_states = []
    with torch.no_grad():
        for start in range(0, len(token_ids), rows_per_pass):
            rows = slice(start, start + rows_per_pass)
            outputs = model(input_ids[rows], attention_mask=attention_mask[rows])
            # outputs[0] is indexed as (row, token position, feature); keep position 0.
            first_token_states.append(outputs[0][:, 0, :])

    return torch.cat(first_token_states).numpy().tolist()

In [84]:
# Embed the question and each answer option; every result lands in the matching 'emb_*' column.
for text_column in ('question', 'variant_1', 'variant_2', 'variant_3', 'variant_4'):
    data['emb_' + text_column] = str_to_embbeding(data[text_column])

In [87]:
# The raw text is no longer needed once it has been embedded.
data = data.drop(columns=['question', 'variant_1', 'variant_2', 'variant_3', 'variant_4'])

# One state per row: every remaining column except the label and the reward, in column order.
# NOTE(review): presumably only the five emb_* columns remain here — confirm against the JSON schema.
data['state'] = data.drop(columns=['answer_index', 'reward']).to_numpy().tolist()

# `dt` was used below without ever being assigned, which fails on a fresh kernel.
# Bind it explicitly; the copy keeps later edits of `dt` from leaking back into `data`.
dt = data.copy()
dt['state'] = dt['state'].apply(lambda row_state: np.array(row_state))

In [135]:
# Compress the prize scale with log1p, then min-max scale the result.
# NOTE(review): dt['reward'] is overwritten in place, so re-running this cell
# rescales values that were already scaled.
scaler = MinMaxScaler()
log_reward = np.log1p(dt['reward']).to_numpy().reshape(-1, 1)
dt['reward'] = scaler.fit_transform(log_reward)

# Persist only the columns that MainEnv reads.
# NOTE(review): absolute, machine-specific output path.
persisted_columns = ['reward', 'answer_index', 'state']
dt[persisted_columns].to_pickle('/Users/ruagmn9/PycharmProjects/raif-bootcamp-2021/data/embedded_data.pkl')

In [150]:
# Reload the preprocessed frame from the same location the `to_pickle` call writes to,
# so a full re-run does not depend on the notebook's working directory.
dt = pd.read_pickle('/Users/ruagmn9/PycharmProjects/raif-bootcamp-2021/data/embedded_data.pkl')

In [151]:
"""
train env
"""

import gym
import numpy as np
from gym import spaces
from gym.utils import seeding


class MainEnv(gym.Env):
    """Quiz environment that walks through questions grouped by reward level.

    Rows of ``data`` are grouped by their ``reward`` value; every distinct value
    is one level. ``reset()`` draws a random question from the first level, and
    each ``step()`` scores the action against the current question and then
    draws a question from the next level (wrapping around after the last one).

    Actions ``0 .. action_dim - 1`` pick an answer option. The extra action
    ``action_dim`` (4 with the default) stops the game: the episode ends and the
    reward of the last correctly answered question is returned. The action space
    therefore has ``action_dim + 1`` entries, so the stop action is a legal action.
    NOTE(review): this assumes ``answer_index`` is 0-based. The answer check runs
    before the stop check, so a 1-based index of 4 would shadow the stop action —
    confirm against the data.

    Observations are the contents of the ``state`` column of the sampled row.
    Question sampling is driven by the RNG created in ``seed()``.
    """

    def __init__(self,
                 data,
                 action_dim=4
                 ):
        """Group ``data`` into levels and initialise the episode bookkeeping.

        Args:
            data: DataFrame with ``reward``, ``answer_index`` and ``state`` columns.
            action_dim: number of answer options per question.

        Raises:
            ValueError: if ``data`` contains no rows to build levels from.
        """
        by_reward = data.groupby('reward')
        self.data_dict = {value: by_reward.get_group(value) for value in by_reward.groups}
        if not self.data_dict:
            raise ValueError("MainEnv requires at least one row of data")

        self.levels = list(self.data_dict.keys())
        self.iter_len = len(self.levels)
        self._iter = 0

        # Answer options plus one trailing "stop" action.
        self.stop_action = action_dim
        self.action_space = spaces.Discrete(action_dim + 1)

        self._observation_space = None  # cache for the observation_space property
        self.steps_beyond_done = None   # None while the episode is running
        self.current_level = None       # one-row frame holding the current question
        self.current_reward = 0         # reward of the last correct answer
        self.current_session = None     # not used by this class
        self.state_cols = None          # not used by this class

        # Must come last: nothing may overwrite the RNG that seed() installs.
        self.seed()

    @property
    def observation_space(self):
        """Unbounded float32 box shaped like a single ``state`` entry."""
        if self._observation_space is None:
            first_level = self.data_dict[self.levels[0]]
            sample_state = np.asarray(first_level['state'].iloc[0])
            # A sample's own values are not valid bounds (negative entries would
            # give low > high), so only its shape is used.
            self._observation_space = spaces.Box(
                low=-np.inf, high=np.inf, shape=sample_state.shape, dtype=np.float32
            )
        return self._observation_space

    def seed(self, seed=None):
        """(Re)create the environment RNG and return the seed in a list."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _sample_question(self, level):
        """Draw one random row of ``level`` using the environment RNG."""
        rng = self.np_random
        # Legacy RandomState exposes randint, newer Generator exposes integers.
        draw = rng.integers if hasattr(rng, 'integers') else rng.randint
        return self.data_dict[level].sample(1, random_state=int(draw(0, 2 ** 31 - 1)))

    def step(self, action: int):
        """Score ``action`` against the current question and move to the next level.

        Returns:
            ``(state, reward, done, info)``. A correct answer yields the
            question's reward; the stop action yields the last banked reward and
            ends the episode; any other action yields 0 and ends the episode.
            The episode also ends once the last level has been played. Calls
            made after the episode has ended return reward ``0.`` and
            ``done=True`` without advancing the level.
        """
        if self.steps_beyond_done is not None:
            # The episode is over; remember that instead of scoring further actions.
            if self.steps_beyond_done == 0:
                print(
                    "You are calling 'step()' even though this environment "
                    "has already returned done = True. You should always call "
                    "'reset()' once you receive 'done = True' -- any further "
                    "steps are undefined behavior.")
            self.steps_beyond_done += 1
            return self._get_state(), 0., True, {}

        done = self.is_done()
        question = self.current_level

        if action == question['answer_index'].iloc[0]:
            reward = question['reward'].iloc[0]
            self.current_reward = reward
        elif action == self.stop_action:
            reward = self.current_reward
            done = True
        else:
            reward = 0
            done = True

        # Reading `iterator` advances the level counter.
        self.current_level = self._sample_question(self.levels[self.iterator])

        if done:
            self.steps_beyond_done = 0
        return self._get_state(), reward, done, {}

    def is_done(self):
        """Return True when the current level is the last one."""
        return self._iter == self.iter_len - 1

    @property
    def iterator(self):
        """Advance to the next level index (wrapping around) and return it.

        Note that merely reading this property mutates ``_iter``.
        """
        self._iter = (self._iter + 1) % self.iter_len
        return self._iter

    def _get_state(self):
        """Return the ``state`` entry of the current question."""
        return (self.current_level['state']).iloc[0]

    def reset(self):
        """Start a new episode at the first level and return its initial state."""
        self.steps_beyond_done = None
        self.current_reward = 0
        self._iter = 0
        self.current_level = self._sample_question(self.levels[self._iter])

        return self._get_state()

In [152]:
# Build the environment on top of the embedded question table.
env = MainEnv(data=dt)


In [158]:
# Draw a fresh observation so this preview does not rely on a `state` left over from another cell.
state = env.reset()
state[0][:5]

array([ 4.60222363e-05, -1.72591054e+00,  1.08200979e+00, -3.52031052e-01,
       -5.11471331e-01])

In [163]:
# Random-policy smoke test of the environment.
eval_episodes = 20
max_episode_steps = 20
n_actions = 5  # actions 0..4; MainEnv.step treats 4 as "stop"
episode_rewards = []  # one total per episode, so the run can be summarised afterwards
for episode in range(eval_episodes):
    avg_session_reward = 0.  # running total of the current episode (name kept from the original cell)
    state, done = env.reset(), False
    t = 0
    while not done and t < max_episode_steps:
        action = np.random.randint(n_actions)
        state, reward, done, info = env.step(action)
        print(state[0][:5], reward, done)
        avg_session_reward += reward
        t += 1
    # Previously this total was overwritten at the start of the next episode and never reported.
    episode_rewards.append(avg_session_reward)

print('mean episode reward:', np.mean(episode_rewards))


[-0.1666079  -0.15910499  0.39075381 -0.40866804 -0.33198822] 0 True
[ 0.09215854 -0.85184926 -0.22114967 -0.36713228 -0.39433855] 0 True
[-0.46650845 -1.59769881  0.2385564  -0.17435637 -0.88047463] 0 True
[-0.28783214  0.31482112 -0.45269984 -0.60699511 -0.25548476] 0 True
[-0.19080602 -0.81654906  0.45044118 -0.60847211 -1.16273761] 0 True
[-0.59890926  0.20519266  1.61185551 -0.64738327 -0.92345721] 0 True
[-0.16038889  0.36259979  0.20558883 -0.95559734  0.00549053] 0 True
[-0.60422921 -0.57331944 -0.4248988  -0.76547676  0.02314907] 0 True
[-0.08803634 -0.32092762 -0.18132989 -0.78330684 -0.87422466] 0.0 False
[-0.52494681 -0.88954484  0.81242764  0.14403158 -0.74984878] 0 True
[-0.63790303 -0.33330089  0.44721937 -0.31062505 -0.1350895 ] 0 True
[-0.44947165 -1.03878248  0.20184521 -0.53930414 -1.13021028] 0 True
[-0.46650845 -1.59769881  0.2385564  -0.17435637 -0.88047463] 0.0 False
[-0.48424703 -1.58185041  0.750579   -0.34632874 -0.76225847] 0 True
[ 0.09215854 -0.85184926 -0.