In [2]:
# import the libraries

import numpy as np
import pandas as pd
import platform

# Resolve the project checkout, the dataset descriptor and the path separator
# for the machine this notebook is running on.
if platform.system() == "Windows":
    root_path = "C:\\Users\\mrbal\\Documents\\NLP\\RL\\basic_reinforcement_learning"
    sep = '\\'
    info_path = root_path + "\\NLP_datasets\\toy_data\\data_info.json"
    print("Running on windows...")
else:
    root_path = "/Users/emrebaloglu/Documents/RL/basic_reinforcement_learning"
    sep = '/'
    info_path = root_path + "/NLP_datasets/toy_data/data_info.json"

import sys
sys.path.append(root_path)
from RL_for_NLP.text_environments import TextEnvClfWithBertTokens, TextEnvClf, TextEnvClfForBertModels
from RL_for_NLP.text_reward_functions import calculate_stats_from_cm

from RL_for_NLP.text_data_pools import PartialReadingDataPoolWithTokens, PartialReadingDataPoolWithBertTokens
import NLP_utils.preprocessing as nlp_processing
import reinforce_algorithm_utils as reinforce_utils
import policy_networks as pn

import torch
from torch.optim import Adam
import mlflow
from RL_for_NLP.text_reward_functions import calculate_stats_from_cm


from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.dqn.policies import MlpPolicy
from stable_baselines3 import DQN

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import json
from collections import Counter
from tqdm import tqdm

Using TensorFlow backend.


In [5]:
# Read the JSON dataset descriptor; its "path" entry locates the pickled splits.
with open(info_path, "r") as f:
    data_info = json.load(f)

# Load the training split and show its size plus the first few rows.
data_train = nlp_processing.openDfFromPickle(
    sep.join([data_info["path"], "toy_data/toy-data-train.pkl"])
)
print(data_train.shape)
data_train.head()

(80000, 5)


Unnamed: 0.1,Unnamed: 0,review,label,label_str,review_tokenized
0,75220,emphasis attraction adware paste geography usu...,0,bad,"[7501, 26, 8285, 1713, 9550, 5291, 4490, 6625,..."
1,48955,cole worcester automation cal mating recommend...,1,good,"[9235, 9430, 3566, 7007, 9631, 7841, 2612, 972..."
2,44966,functionality later wines think citysearch scr...,1,good,"[8400, 9089, 6629, 5490, 5871, 807, 6807, 5661..."
3,13568,picked knowing wishlist optional start temp du...,1,good,"[6444, 4686, 4881, 18, 165, 3181, 7675, 2799, ..."
4,92727,serves screensaver effectively gale timer teac...,1,good,"[6630, 1844, 8995, 7676, 7010, 963, 1967, 6264..."


In [6]:
# Notebook-wide hyperparameters used by the data pools and environments below.
WINDOW_SIZE = 5
MAX_STEPS = 100_000
TRAIN_STEPS = 1_000
REWARD_FN = "score"
# The vocabulary size comes from the dataset descriptor rather than being hard-coded.
VOCAB_SIZE = data_info["vocab_size"]
print(f"Vocab size: {VOCAB_SIZE}")

Vocab size: 10002


In [7]:
# Load the held-out splits from the same dataset directory as the training pickle.
data_test = nlp_processing.openDfFromPickle(sep.join([data_info["path"], "toy_data/toy-data-test.pkl"]))
data_val = nlp_processing.openDfFromPickle(sep.join([data_info["path"], "toy_data/toy-data-val.pkl"]))

# One pool per split, constructed in train -> test -> val order with identical settings.
train_pool, test_pool, val_pool = (
    PartialReadingDataPoolWithTokens(frame, "review", "label", WINDOW_SIZE, mask=False)
    for frame in (data_train, data_test, data_val)
)

Maximum sentence length in pool: 49


Padding the data and populating samples...: 100%|██████████| 80000/80000 [00:03<00:00, 21789.90it/s]


Maximum sentence length in pool: 49


Padding the data and populating samples...: 100%|██████████| 10000/10000 [00:00<00:00, 24034.71it/s]


Maximum sentence length in pool: 49


Padding the data and populating samples...: 100%|██████████| 10000/10000 [00:00<00:00, 24295.14it/s]


In [8]:
# Build one classification environment per data split.
# REWARD_FN (declared with the other hyperparameters) replaces the repeated
# "score" literal so the reward function is configured in a single place.
# EVAL_MAX_STEPS fills the same argument slot as MAX_STEPS does for the
# training environment.
EVAL_MAX_STEPS = 2000
train_env = TextEnvClf(train_pool, VOCAB_SIZE, MAX_STEPS, REWARD_FN, False)
val_env = TextEnvClf(val_pool, VOCAB_SIZE, EVAL_MAX_STEPS, REWARD_FN, False)
test_env = TextEnvClf(test_pool, VOCAB_SIZE, EVAL_MAX_STEPS, REWARD_FN, False)
print("All environments are created.")

All environments are created.


In [9]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}.")

Using device: cpu.


In [10]:
def eval_model(model, env, total_timesteps=10000, deterministic=False):
    """Roll `model` out on `env` for a fixed step budget and print a summary.

    The environment is reset whenever a step reports `done`, so the rollout
    covers as many episodes as fit into `total_timesteps`.

    Args:
        model: Agent exposing `predict(obs)` that returns `(action, state)`,
            where `action` supports `.item()`.
        env: Environment providing `reset()`, `step(action)`,
            `action_space.ix_to_action(ix)`, `pool.possible_actions` and a
            `confusion_matrix` attribute.
        total_timesteps: Number of environment steps to take.
        deterministic: When True, `deterministic=True` is forwarded to
            `model.predict` so the agent can be evaluated without sampling or
            exploration noise. The default (False) calls `model.predict(obs)`
            exactly as before.

    Returns:
        None. The step/sample counts, summed reward, the statistics derived
        from `env.confusion_matrix` and the relative frequency of every action
        are printed.
    """
    # Forward the flag only when requested, so models whose `predict` takes no
    # such keyword keep working with the default call.
    predict_kwargs = {"deterministic": True} if deterministic else {}

    obs = env.reset()
    total_reward = 0.0
    actions = []
    seen_samples = 0
    for _ in tqdm(range(total_timesteps)):
        action, _states = model.predict(obs, **predict_kwargs)
        action = action.item()
        obs, rewards, done, info = env.step(action)
        # Decode the action index into the environment's action label.
        action = env.action_space.ix_to_action(action)
        # Actions listed in the pool's `possible_actions` are reported as
        # "seen samples" (presumably the label decisions — confirm in the pool).
        if action in env.pool.possible_actions:
            seen_samples += 1
        if done:
            obs = env.reset()
        actions.append(action)
        total_reward += rewards

    # NOTE(review): the confusion matrix is read as-is; whether the environment
    # clears it between calls is not visible here.
    action_counts = Counter(actions)
    total = len(actions)
    action_stats = [{name: count / total} for name, count in action_counts.items()]

    print("---------------------------------------------")
    print(f"Total Steps and seen samples: {len(actions), seen_samples}")
    print(f"Total reward: {total_reward}")
    print(f"Stats:  {calculate_stats_from_cm(env.confusion_matrix)}")
    print(f"Action stats --  {action_stats}")
    print("---------------------------------------------")

In [11]:
# Replace the default feature extractor with the project's RNN extractor,
# configured here as a small bidirectional GRU.
policy_kwargs = dict(
    features_extractor_class=pn.RNNExtractor,
    features_extractor_kwargs=dict(vocab_size = VOCAB_SIZE, embed_dim = 5,
                 rnn_type = "gru", rnn_hidden_size = 2, rnn_hidden_out = 2, rnn_bidirectional = True,
                 features_dim = 10, units = 10),
)

N_TRAIN_ROUNDS = 15               # number of train -> evaluate cycles
STEPS_PER_ROUND = int(1e+4)*10    # environment steps per cycle

model = DQN("MlpPolicy", train_env, policy_kwargs=policy_kwargs, verbose=1, batch_size=64, gamma=1.)
for i in range(N_TRAIN_ROUNDS):
    # Training continues across rounds, so the timestep counter must keep
    # running: with reset_num_timesteps=True every round starts again from
    # step 0, which restarts the schedules derived from that counter
    # (exploration rate, learning warm-up) and the logged timesteps.
    # NOTE(review): eval_model below steps train_env directly, outside the
    # wrappers DQN put around it; consider a separate env for train-set evaluation.
    model.learn(total_timesteps=STEPS_PER_ROUND, reset_num_timesteps=False, progress_bar=True)
    print("======= On train env: ===============")
    eval_model(model, train_env)
    print("======= On val env: ===============")
    eval_model(model, val_env)

Output()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




100%|██████████| 10000/10000 [00:06<00:00, 1516.08it/s]


---------------------------------------------
Total Steps and seen samples: (10000, 9850)
Total reward: 9520.750000000025
Stats:  {'accuracy': 0.791981739309166, 'precision': 0.7832945113610537, 'recall': 0.7880666084156995, 'f1': 0.7852384170649513}
Action stats --  [{'bad': 0.3963}, {'good': 0.5887}, {'<next>': 0.015}]
---------------------------------------------


100%|██████████| 10000/10000 [00:07<00:00, 1274.39it/s]

Output()




---------------------------------------------
Total Steps and seen samples: (10000, 9829)
Total reward: -1257.2900000000013
Stats:  {'accuracy': 0.4986265133787771, 'precision': 0.49841337435099087, 'recall': 0.4983476083106628, 'f1': 0.49347745975097457}
Action stats --  [{'good': 0.4904}, {'bad': 0.4925}, {'<next>': 0.0171}]
---------------------------------------------




100%|██████████| 10000/10000 [00:07<00:00, 1276.08it/s]


---------------------------------------------
Total Steps and seen samples: (10000, 9827)
Total reward: 9446.23000000003
Stats:  {'accuracy': 0.7999437679780262, 'precision': 0.7917148120808506, 'recall': 0.7981543102766819, 'f1': 0.7940914582031421}
Action stats --  [{'bad': 0.3968}, {'good': 0.5859}, {'<next>': 0.0173}]
---------------------------------------------


100%|██████████| 10000/10000 [00:07<00:00, 1360.49it/s]

Output()




---------------------------------------------
Total Steps and seen samples: (10000, 9841)
Total reward: -1234.1600000000012
Stats:  {'accuracy': 0.4991865785460092, 'precision': 0.49936922407965906, 'recall': 0.4993430551650617, 'f1': 0.4942414943913668}
Action stats --  [{'good': 0.4949}, {'bad': 0.4892}, {'<next>': 0.0159}]
---------------------------------------------


**REINFORCE Algorithm**

In [8]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import reinforce_algorithm_utils as rl_monte_carlo
import gym
import torch
from torch.optim import Adam

import imageio

In [9]:
# Same device selection as earlier in the notebook: first CUDA GPU if available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}.")

Using device: cpu.


In [10]:
# Derive the policy's input and output sizes from the training environment:
# the length of the current state's id vector and the number of actions.
print(train_env.current_state_input_id.shape)
s_size, a_size = train_env.current_state_input_id.shape[0], len(train_env.action_space)
print(s_size, a_size)

(8,)
8 4


In [11]:
# Settings for the REINFORCE experiment; the state/action sizes come from the
# training environment inspected in the previous cell.
hyperparameters = dict(
    h_sizes=[64, 32],
    n_training_episodes=100000,
    n_evaluation_episodes=10,
    max_t=25,
    gamma=1.0,
    lr=1e-2,
    env_id=None,
    state_space=s_size,
    action_space=a_size,
)

In [12]:
# Create the policy network and its optimizer.
# NOTE(review): nothing in this cell moves the policy to `device`; add
# `.to(device)` here if GPU training is intended.
import policy_networks as pn
# The string literal below is an unused expression: it keeps an alternative
# Transformer-based policy construction around as disabled code.
"""policy = pn.Transformer_Baseline_Policy(data_info["vocab_size"], hyperparameters["state_space"], hyperparameters["action_space"],
                                        num_heads=1, num_layers=1)"""
# RNN policy sized from the dataset vocabulary and the state/action sizes above.
policy = pn.RNN_Baseline_Policy(data_info["vocab_size"], hyperparameters["state_space"], hyperparameters["action_space"])
# Adam over all policy parameters, using the learning rate from `hyperparameters`.
optimizer = Adam(policy.parameters(), lr=hyperparameters["lr"])

In [51]:
# Map every sub-module name of the policy to the module itself ('' is the
# root module). Despite the variable name, these are modules, not parameters.
params = {name: module for name, module in policy.named_modules()}
print(params)


{'': RNN_Baseline_Policy(
  (embed_enc): Embedding(28996, 5, max_norm=True)
  (rnn): GRU(5, 2, num_layers=2, batch_first=True, bidirectional=True)
  (flat): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=32, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=4, bias=True)
), 'embed_enc': Embedding(28996, 5, max_norm=True), 'rnn': GRU(5, 2, num_layers=2, batch_first=True, bidirectional=True), 'flat': Flatten(start_dim=1, end_dim=-1), 'fc1': Linear(in_features=32, out_features=50, bias=True), 'fc2': Linear(in_features=50, out_features=50, bias=True), 'fc3': Linear(in_features=50, out_features=4, bias=True)}


In [55]:
import mlflow
import json

# String form of the whole mapping, kept under its original name.
params_s = str(params)

# mlflow.log_dict serialises a mapping to JSON/YAML. Passing the single string
# above would store one opaque JSON string, so log a real
# {module name: module repr} dictionary instead. The root module's empty name
# is replaced by a readable key.
module_reprs = {name or "<root>": repr(module) for name, module in params.items()}
mlflow.log_dict(module_reprs, "params.json")

In [None]:
# Train the policy with the project's REINFORCE implementation; episode count,
# per-episode step cap and discount factor all come from `hyperparameters`.
scores = rl_monte_carlo.reinforce_algorithm(
    train_env,
    policy,
    optimizer,
    hyperparameters["n_training_episodes"],
    hyperparameters["max_t"],
    hyperparameters["gamma"],
)

In [18]:
import torch
import torch.nn as nn

# Shape walkthrough: token ids -> embeddings -> stacked bidirectional LSTM.
x = torch.tensor([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], dtype=torch.int32)
emb = nn.Embedding(num_embeddings=20, embedding_dim=5)
lstm = nn.LSTM(input_size=5, hidden_size=11, num_layers=7, bidirectional=True, batch_first=True)
print(x.shape)
# (batch, seq) ids become (batch, seq, embedding_dim) vectors.
x = emb(x)
print(x)
print(x.shape)
# The output keeps both directions side by side; `h` stacks the final hidden
# state of every layer and direction.
x, (h, _) = lstm(x)
print(x)
print(x.shape, h.shape)

torch.Size([2, 5])
tensor([[[-0.4974, -0.1800,  0.1256, -0.0997, -2.6214],
         [-1.6004,  0.7747,  0.3733,  1.3647,  0.0687],
         [-0.7068,  0.6754,  0.6912,  1.7885, -0.4849],
         [-1.1904, -0.5578,  1.5177, -0.2507, -0.8633],
         [ 1.2573,  0.4593,  1.2733,  0.4671,  0.1200]],

        [[-1.1528,  2.0933, -0.6916, -0.2654,  1.5681],
         [-1.5847, -0.2593,  0.0507,  0.8566, -0.7230],
         [-0.7984, -0.9824, -0.1521,  0.4700, -1.4515],
         [ 1.7227,  0.5693,  0.6390, -0.5363, -0.9067],
         [ 1.0751, -0.5434,  0.9034, -0.1167,  1.8581]]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([2, 5, 5])
tensor([[[-1.2247e-01, -8.0512e-02, -3.4734e-02, -2.2318e-03,  7.6149e-04,
           1.1641e-01, -1.8003e-02,  5.5952e-02,  1.7178e-02, -4.6703e-02,
           2.3925e-03,  1.3036e-01,  1.6704e-01,  1.1473e-01, -7.4383e-02,
           5.7412e-02,  8.1825e-02, -4.2972e-02,  9.8029e-03, -1.9270e-01,
           5.2298e-02,  1.0932e-01],
         [-1.6308e-01

In [None]:
# Scratch calculation: -0.1 * log2(500000). Its purpose is not stated in the notebook.
-1e-1*(np.log2(500000))

-1.8931568569324175

In [None]:
# Evaluate the REINFORCE policy on the training environment.
# NOTE(review): the meaning of the positional 10 and 100 is defined by
# evaluate_agent in reinforce_algorithm_utils — confirm before changing them.
rl_monte_carlo.evaluate_agent(train_env, 10, 100, policy)

In [None]:
# Same evaluation as the previous cell, on the validation environment.
rl_monte_carlo.evaluate_agent(val_env, 10, 100, policy)

In [None]:
# Classification-style evaluation of the policy on the validation environment,
# treating the "good" label as the positive class.
rl_monte_carlo.evaluate_on_clf(val_env, policy, pos_label="good")

In [None]:
def eval_model(model, env):
    """Play a single episode of `env` with `model` and print what happened.

    NOTE: this redefines the `eval_model(model, env, total_timesteps=...)`
    helper declared earlier in the notebook; once this cell has run, the
    single-episode version is the one that gets called.

    Args:
        model: Agent exposing `predict(obs)` that returns `(action, state)`,
            where `action` supports `.item()`.
        env: Environment providing `reset()`, `step(action)`,
            `action_space.ix_to_action(ix)` and `current_label`.

    Returns:
        None. The decoded action sequence, the environment's current label and
        the summed reward are printed.
    """
    obs = env.reset()
    taken = []
    episode_return = 0.0
    # At least one step is always taken; the loop ends on the first `done`.
    while True:
        action, _states = model.predict(obs)
        action_ix = action.item()
        obs, reward, done, info = env.step(action_ix)
        taken.append(env.action_space.ix_to_action(action_ix))
        episode_return += reward
        if done:
            break

    separator = "---------------------------------------------"
    print(separator)
    print(f"Predicted Label {taken}")
    print(f"Oracle Label: {env.current_label}")
    print(f"Total Reward: {episode_return}")
    print(separator)


In [None]:
# Play one validation episode with the REINFORCE policy.
# NOTE(review): eval_model calls `model.predict(obs)`; confirm that
# pn.RNN_Baseline_Policy exposes a compatible `predict` method.
eval_model(policy, val_env)

In [None]:
import mlflow

In [None]:

# Store the trained policy in the local MLflow file store under the "rnn" experiment.
mlflow.set_tracking_uri("file:/Users/emrebaloglu/Documents/RL/basic_reinforcement_learning/agent_models/rt-polarity")
mlflow.set_experiment("rnn")
# The context manager ends the run even if logging raises, so no run is left
# active to block the next start_run() call.
with mlflow.start_run():
    # Log the episode count the training cell actually used instead of a
    # hard-coded number that can drift away from `hyperparameters`.
    mlflow.log_param("n_episodes", hyperparameters["n_training_episodes"])
    mlflow.pytorch.log_model(policy, "rnn-default")

In [None]:
# Manual cleanup: ends whichever MLflow run is currently active.
mlflow.end_run()

In [None]:
# Point MLflow at the local file store and load a previously logged policy.
mlflow.set_tracking_uri("file:/Users/emrebaloglu/Documents/RL/basic_reinforcement_learning/agent_models/rt-polarity")
# The run id is hard-coded; it must exist in the tracking store configured above.
uri = "runs:/9f09e3fa6df6427ba2889e89ac787bcc/rnn-default"

# NOTE: this rebinds `model`, which earlier cells used for the DQN agent.
model = mlflow.pytorch.load_model(uri)

In [None]:
# NOTE(review): `env` is not defined in any cell visible in this notebook
# (only train_env / val_env / test_env are), so this raises NameError on a
# fresh kernel — confirm which environment the loaded model should be run on.
rl_monte_carlo.evaluate_agent(env, 10, 100, model)

In [None]:
# NOTE(review): `env` is not defined in any cell visible in this notebook
# (only train_env / val_env / test_env are) — confirm the intended environment.
rl_monte_carlo.evaluate_on_clf(env, model)

In [None]:
from stable_baselines3.dqn.policies import MlpPolicy
from stable_baselines3 import DQN

# Small DQN run with the stock MLP policy: five short training bursts, each
# followed by one evaluation on the validation environment. The timestep
# counter is carried over between bursts.
model = DQN(MlpPolicy, train_env, learning_rate=1e-3, batch_size=3)

for i in range(5):
    model.learn(total_timesteps=1000, reset_num_timesteps=False)
    eval_model(model, val_env)

In [None]:
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
# State size from the observation space's first dimension, action count from
# the action space's list of actions.
s_size, a_size = train_env.observation_space.shape[0], len(train_env.action_space.actions)
print(s_size, a_size)

In [None]:
# A2C baseline on the training environment; rebinds `model`.
# NOTE(review): `log_std_init` looks like a continuous-action setting —
# confirm it has any effect for this environment's action space.
model = A2C(
    "MlpPolicy",
    train_env,
    learning_rate=0.00096,
    n_steps=8,
    gamma=0.99,
    gae_lambda=0.9,
    ent_coef=0.0,
    vf_coef=0.4,
    max_grad_norm=0.5,
    use_rms_prop=True,
    normalize_advantage=False,
    tensorboard_log="./tensorboard",
    policy_kwargs=dict(log_std_init=-2, ortho_init=False),
    verbose=1,
)

In [None]:
# Three short training bursts, each followed by a 3-episode evaluation on the
# validation environment.
for i in range(3):
    model.learn(total_timesteps=500, log_interval=250)
    print(evaluate_policy(model, val_env, n_eval_episodes=3))

In [None]:
# Single-episode evaluation of the current `model` on the validation environment.
evaluate_policy(model, val_env, n_eval_episodes=1)

In [None]:
# NOTE(review): `model` is the Stable-Baselines3 A2C agent at this point, while
# evaluate_on_clf was called with the REINFORCE policy earlier — confirm the
# helper accepts both kinds of object.
rl_monte_carlo.evaluate_on_clf(val_env, model)