In [3]:
import numpy as np
import pandas as pd
import sys
sys.path.append("/Users/emrebaloglu/Documents/RL/basic_reinforcement_learning")
from RL_for_NLP.text_environments import TextEnvClfBert, TextEnvClf
from RL_for_NLP.text_data_pools import PartialReadingDataPoolWithTokens, PartialReadingDataPoolWithBertTokens
import NLP_utils.preprocessing as nlp_processing
from sklearn.model_selection import train_test_split

In [4]:
import json

# Read the dataset metadata JSON; the resulting dict is shown as the cell output.
with open("/Users/emrebaloglu/Documents/RL/basic_reinforcement_learning/NLP_datasets/RT_Polarity/data_info_bert.json") as info_file:
    data_info = json.load(info_file)
data_info

{'path': '/Users/emrebaloglu/Documents/RL/basic_reinforcement_learning/NLP_datasets/RT_Polarity',
 'max_len': 50,
 'vocab_size': 28996}

In [None]:
# Load the training split from the dataset directory recorded in `data_info`
# and preview its first rows (presumably a pandas DataFrame, given `.head()`).
data_train = nlp_processing.openDfFromPickle(
    f'{data_info["path"]}/rt-polarity-train-bert.pkl'
)
data_train.head()

In [None]:
# Constants shared by the cells below:
#   WINDOW_SIZE is passed to every data pool,
#   MAX_STEPS is passed to the training environment.

WINDOW_SIZE = 5
MAX_STEPS = 100_000

In [None]:
# Load the test and validation splits (in that order) from the dataset directory.
data_test, data_val = (
    nlp_processing.openDfFromPickle(data_info["path"] + file_name)
    for file_name in ("/rt-polarity-test-bert.pkl", "/rt-polarity-val-bert.pkl")
)

# Build one data pool per split, in the order train, test, val, with identical
# arguments: the "review" and "label" strings, the "good" label string and the
# shared WINDOW_SIZE.
train_pool, test_pool, val_pool = (
    PartialReadingDataPoolWithBertTokens(split_df, "review", "label", "good", WINDOW_SIZE)
    for split_df in (data_train, data_test, data_val)
)

In [None]:
# One environment per pool. The second argument is MAX_STEPS for training and
# 1000 for validation/test (presumably a step limit — confirm against TextEnvClfBert).
train_env = TextEnvClfBert(train_pool, MAX_STEPS)
val_env, test_env = (
    TextEnvClfBert(eval_pool, 1000) for eval_pool in (val_pool, test_pool)
)

In [None]:
# Show the training environment's current observation as the cell output.
train_env.current_observation

In [None]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import reinforce_algorithm_utils as rl_monte_carlo
import gym
import torch
from torch.optim import Adam

import imageio

In [None]:
# Use the first CUDA device when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}.")

In [None]:
# Print the shape of the current state's input ids, then derive the two sizes
# used to build the policy: the first dimension of that shape and the number
# of entries in the action space.
print(train_env.current_state_input_id.shape)
s_size, a_size = (
    train_env.current_state_input_id.shape[0],
    len(train_env.action_space),
)
print(s_size, a_size)

In [None]:
# Training configuration. The cells below read n_training_episodes, max_t,
# gamma, lr, state_space and action_space from this dict; the remaining keys
# are not referenced by any cell visible in this notebook.
hyperparameters = dict(
    h_sizes=[64, 32],
    n_training_episodes=500,
    n_evaluation_episodes=10,
    max_t=100,
    gamma=1.0,
    lr=1e-2,
    env_id=None,
    state_space=s_size,
    action_space=a_size,
)

In [None]:
# Create policy and place it to the device
import policy_networks as pn
"""policy = pn.Transformer_Baseline_Policy(data_info["vocab_size"], hyperparameters["state_space"], hyperparameters["action_space"],
                                        num_heads=1, num_layers=1)"""
policy = pn.RNN_Baseline_Policy(data_info["vocab_size"], hyperparameters["state_space"], hyperparameters["action_space"])
optimizer = Adam(policy.parameters(), lr=hyperparameters["lr"])

In [None]:
# Train the policy on the training environment and keep whatever the helper
# returns in `scores`.
scores = rl_monte_carlo.reinforce_algorithm(
    train_env,
    policy,
    optimizer,
    hyperparameters["n_training_episodes"],
    hyperparameters["max_t"],
    hyperparameters["gamma"],
    50,  # NOTE(review): meaning not visible here (possibly a logging interval) — confirm
)

In [None]:
# Evaluate the trained policy on the training environment.
# NOTE(review): the positional 10 and 100 are not tied to
# hyperparameters["n_evaluation_episodes"] / hyperparameters["max_t"]; which one
# is the episode count and which the step limit depends on evaluate_agent's
# signature — confirm.
rl_monte_carlo.evaluate_agent(train_env, 10, 100, policy)

In [None]:
# Evaluate the trained policy on the validation environment, with the same
# positional 10 and 100 as the training-environment evaluation.
# NOTE(review): confirm the order of these two arguments against
# evaluate_agent's signature.
rl_monte_carlo.evaluate_agent(val_env, 10, 100, policy)

In [None]:
# Classification-style evaluation of the policy on the validation environment.
# pos_label is the same "good" string that was passed to the data pools.
rl_monte_carlo.evaluate_on_clf(val_env, policy, pos_label="good")

In [None]:
import mlflow

In [None]:

# Log the trained policy to the local MLflow file store under the "rnn" experiment.
mlflow.set_tracking_uri("file:/Users/emrebaloglu/Documents/RL/basic_reinforcement_learning/agent_models/rt-polarity")
mlflow.set_experiment("rnn")
# The context manager ends the run even when logging raises, so a failed cell
# cannot leave an active run dangling for later cells to clean up.
with mlflow.start_run():
    # Record the episode count actually used for training instead of a
    # hard-coded copy that can drift from `hyperparameters`.
    mlflow.log_param("n_episodes", hyperparameters["n_training_episodes"])
    mlflow.pytorch.log_model(policy, "rnn-default")

In [None]:
# Manual cleanup cell: ends the currently active MLflow run.
# NOTE(review): presumably only needed after an interrupted logging cell — confirm.
mlflow.end_run()

In [None]:
# Point MLflow at the same local file store that the logging cell uses.
mlflow.set_tracking_uri("file:/Users/emrebaloglu/Documents/RL/basic_reinforcement_learning/agent_models/rt-polarity")
# Hard-coded run id plus the "rnn-default" artifact name used when logging.
# NOTE(review): this run id only exists in the author's local store — confirm
# it before re-running.
uri = "runs:/9f09e3fa6df6427ba2889e89ac787bcc/rnn-default"

# Reload the logged PyTorch model from that run.
model = mlflow.pytorch.load_model(uri)

In [None]:
# Evaluate the reloaded model on the validation environment. This call used to
# reference `env`, which no visible cell defines, so it raised NameError on a
# fresh kernel; `val_env` is the held-out environment used for `policy` above.
# NOTE(review): confirm the order of the positional 10 and 100 against
# evaluate_agent's signature.
rl_monte_carlo.evaluate_agent(val_env, 10, 100, model)

In [None]:
# Classification-style evaluation of the reloaded model. This call used to
# reference `env`, which no visible cell defines; it now uses `val_env` and the
# same pos_label="good" as the classification evaluation of `policy`.
rl_monte_carlo.evaluate_on_clf(val_env, model, pos_label="good")

In [None]:
def eval_model(model, env, deterministic=True):
    """Run a single episode of ``model`` in ``env`` and print a summary.

    Args:
        model: Object exposing ``predict(obs, deterministic=...)`` that returns
            an ``(action, state)`` pair whose action supports ``.item()``.
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(obs, reward, done, info)``, ``action_space.ix_to_action(ix)``
            and ``current_label``.
        deterministic: Forwarded to ``model.predict``. Defaults to ``True`` so
            the printed episode reflects the learned policy rather than
            exploratory sampling; pass ``False`` to get the library's default
            prediction mode instead.

    Returns:
        None. The actions taken, the environment's current label and the
        accumulated reward are printed.
    """
    obs = env.reset()
    total_reward = 0.0
    actions = []
    done = False
    while not done:
        action, _ = model.predict(obs, deterministic=deterministic)
        # Convert the predicted action to a plain Python scalar for the environment.
        action = action.item()
        obs, reward, done, _info = env.step(action)
        actions.append(env.action_space.ix_to_action(action))
        total_reward += reward
    print("---------------------------------------------")
    print(f"Predicted Label {actions}")
    print(f"Oracle Label: {env.current_label}")
    print(f"Total Reward: {total_reward}")
    print("---------------------------------------------")


In [None]:
from stable_baselines3.dqn.policies import MlpPolicy
from stable_baselines3 import DQN

# Train a DQN agent on the training environment in five rounds of 1000
# timesteps, printing one evaluation episode on the validation environment
# after each round.
model = DQN(policy=MlpPolicy, env=train_env, learning_rate=0.001, batch_size=3)

for round_idx in range(5):
    model.learn(total_timesteps=1000, reset_num_timesteps=False)
    eval_model(model, val_env)