In [8]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import reinforce_algorithm_utils as rl_monte_carlo
import gym
import torch
from torch.optim import Adam



In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}.")

Using device: cuda:0.


## Cartpole Environment 

In [4]:
env_id = "CartPole-v1"
# Create the env
env_cartpole = gym.make(env_id)

# Create the evaluation env
eval_env_cartpole = gym.make(env_id)

# Get the state space and action space
s_size = env_cartpole.observation_space.shape[0]
a_size = env_cartpole.action_space.n

In [5]:
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env_cartpole.observation_space.sample()) # Get a random observation

_____OBSERVATION SPACE_____ 

The State Space is:  4
Sample observation [-4.7851624e+00 -1.2317310e+37  2.0815159e-01  1.1744039e+38]


In [6]:
print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
print("Action Space Sample", env_cartpole.action_space.sample()) # Take a random action


 _____ACTION SPACE_____ 

The Action Space is:  2
Action Space Sample 1


In [2]:
import torch

torch.device("cuda:0")

device(type='cuda', index=0)

In [12]:
l1 = np.array([1,2,3,4])
l2 = np.array([1,2,3,4])
print(np.average(2*l1*l2 / (l1+l2)))

2.5


In [13]:
import torch

# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cpu



In [7]:
cartpole_hyperparameters = {
    "h_sizes": [16],
    "n_training_episodes": 1000,
    "n_evaluation_episodes": 10,
    "max_t": 1000,
    "gamma": 1.0,
    "lr": 1e-2,
    "env_id": env_id,
    "state_space": s_size,
    "action_space": a_size,
}

In [8]:
# Create policy and place it to the device
cartpole_policy = rl_monte_carlo.Softmax_Policy_Dense_Layers(cartpole_hyperparameters["state_space"], cartpole_hyperparameters["action_space"], cartpole_hyperparameters["h_sizes"])
cartpole_optimizer = Adam(cartpole_policy.parameters(), lr=cartpole_hyperparameters["lr"])

Using device: cuda:0.


In [9]:
scores = rl_monte_carlo.reinforce_algorithm(env_cartpole, cartpole_policy,
                   cartpole_optimizer,
                   cartpole_hyperparameters["n_training_episodes"], 
                   cartpole_hyperparameters["max_t"],
                   cartpole_hyperparameters["gamma"], 
                   100)

Episode 100	Average Score: 25.69
Episode 200	Average Score: 33.21
Episode 300	Average Score: 38.77
Episode 400	Average Score: 46.19
Episode 500	Average Score: 49.11
Episode 600	Average Score: 61.80
Episode 700	Average Score: 69.54
Episode 800	Average Score: 73.69
Episode 900	Average Score: 88.63
Episode 1000	Average Score: 104.82


In [10]:
rl_monte_carlo.evaluate_agent(eval_env_cartpole, 
               cartpole_hyperparameters["max_t"], 
               cartpole_hyperparameters["n_evaluation_episodes"],
               cartpole_policy)

(120.4, 79.58165617779012)

## Pixel Copter

In [11]:
import gym_pygame

env_id = "Pixelcopter-PLE-v0"
env_pixel_copter = gym.make(env_id)
eval_env = gym.make(env_id)
s_size = env_pixel_copter.observation_space.shape[0]
a_size = env_pixel_copter.action_space.n

pygame 2.1.2 (SDL 2.0.18, Python 3.7.11)
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [12]:
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env_pixel_copter.observation_space.sample()) # Get a random observation

_____OBSERVATION SPACE_____ 

The State Space is:  7
Sample observation [ 0.81130373  0.31586105  0.29252818 -0.06413182 -0.01478996  0.71727496
 -0.5530129 ]


In [13]:
print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
print("Action Space Sample", env_pixel_copter.action_space.sample()) # Take a random action


 _____ACTION SPACE_____ 

The Action Space is:  2
Action Space Sample 1


In [14]:
pixelcopter_hyperparameters = {
    "h_size": [64, 32, 16],
    "n_training_episodes": 50000,
    "n_evaluation_episodes": 10,
    "max_t": 10000,
    "gamma": 0.99,
    "lr": 1e-4,
    "env_id": env_id,
    "state_space": s_size,
    "action_space": a_size,
}

In [15]:
# Create policy and place it to the device
# torch.manual_seed(50)
pixelcopter_policy = rl_monte_carlo.Softmax_Policy_Dense_Layers(pixelcopter_hyperparameters["state_space"], pixelcopter_hyperparameters["action_space"], pixelcopter_hyperparameters["h_size"])
pixelcopter_optimizer = Adam(pixelcopter_policy.parameters(), lr=pixelcopter_hyperparameters["lr"])

Using device: cuda:0.


In [16]:
scores = rl_monte_carlo.reinforce_algorithm(env_pixel_copter, pixelcopter_policy,
                   pixelcopter_optimizer,
                   pixelcopter_hyperparameters["n_training_episodes"], 
                   pixelcopter_hyperparameters["max_t"],
                   pixelcopter_hyperparameters["gamma"], 
                   100)

Episode 100	Average Score: -4.40
Episode 200	Average Score: -4.25
Episode 300	Average Score: -4.29
Episode 400	Average Score: -3.60
Episode 500	Average Score: -3.33
Episode 600	Average Score: -2.80
Episode 700	Average Score: -2.44
Episode 800	Average Score: -1.23
Episode 900	Average Score: -1.22
Episode 1000	Average Score: -1.38
Episode 1100	Average Score: -1.03
Episode 1200	Average Score: -0.68
Episode 1300	Average Score: -0.52
Episode 1400	Average Score: -0.74
Episode 1500	Average Score: -0.29
Episode 1600	Average Score: -0.60
Episode 1700	Average Score: -0.18
Episode 1800	Average Score: -0.73
Episode 1900	Average Score: -0.25
Episode 2000	Average Score: -0.67
Episode 2100	Average Score: 0.20
Episode 2200	Average Score: 0.50
Episode 2300	Average Score: -0.06
Episode 2400	Average Score: 0.55
Episode 2500	Average Score: 0.39
Episode 2600	Average Score: 0.59
Episode 2700	Average Score: 1.21
Episode 2800	Average Score: 0.34
Episode 2900	Average Score: 0.49
Episode 3000	Average Score: 0.6

In [17]:
rl_monte_carlo.evaluate_agent(eval_env, 
               pixelcopter_hyperparameters["max_t"], 
               pixelcopter_hyperparameters["n_evaluation_episodes"],
               pixelcopter_policy)

(16.8, 16.624078921853084)

In [1]:
import torch

model_dict = torch.load("C:\\Users\\mrbal\\Documents\\NLP\\NLP_Classification\\model_state_dict")

In [2]:
model_dict

OrderedDict([('embed_enc.weight',
              tensor([[-8.7591e-02, -1.0967e-01,  6.4074e-02,  ..., -2.0866e-01,
                       -1.5149e-01, -2.2125e-02],
                      [ 4.4670e-01, -5.9795e-01, -7.9977e-01,  ...,  2.5457e-01,
                        2.0292e+00,  2.8061e-01],
                      [-1.7325e+00,  9.5825e-02,  3.2361e-01,  ..., -4.5796e-04,
                       -1.1509e-01,  5.3223e-01],
                      ...,
                      [-6.8459e-01, -9.7837e-01, -4.8688e-01,  ...,  9.2992e-01,
                       -8.1525e-01,  1.5659e-01],
                      [ 1.4869e+00, -9.2087e-02,  1.0076e+00,  ..., -1.0114e+00,
                        6.7163e-01, -2.9542e+00],
                      [-1.2746e+00, -6.8976e-01,  6.1937e-01,  ...,  1.8670e+00,
                       -8.8855e-01, -1.1423e-01]])),
             ('rnn.weight_ih_l0',
              tensor([[-4.0955e-02,  5.6538e-02,  3.8464e-01, -3.8552e-01,  3.8304e-01,
                       -2.02

In [3]:
import numpy as np
import pandas as pd
import platform

root_path = "C:\\Users\\mrbal\\Documents\\NLP\\RL\\basic_reinforcement_learning"
info_path = ""
sep = '/'
if platform.system() == "Windows":
    info_path = root_path + "\\NLP_datasets\\RT_Polarity\\data_info_bert_windows.json"
    sep = '\\'
    print("Running on windows...")
else:
    root_path = "/Users/emrebaloglu/Documents/RL/basic_reinforcement_learning"
    info_path = root_path + "/NLP_datasets/RT_Polarity/data_info_bert.json"

import sys
sys.path.append(root_path)
from RL_for_NLP.text_environments import TextEnvClfBert, TextEnvClf
from RL_for_NLP.text_reward_functions import calculate_stats_from_cm

from RL_for_NLP.text_data_pools import PartialReadingDataPoolWithTokens, PartialReadingDataPoolWithBertTokens
import NLP_utils.preprocessing as nlp_processing
import reinforce_algorithm_utils as rl_monte_carlo
import policy_networks as pn

import torch
from torch.optim import Adam
import mlflow

from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.dqn.policies import MlpPolicy
from stable_baselines3 import DQN

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import json
from collections import Counter
from tqdm import tqdm


with open(info_path, "r") as f:
    data_info = json.load(f)


data_train = nlp_processing.openDfFromPickle(data_info["path"] + sep + "rt-polarity-train-bert.pkl")




# declare some hyperparameters
WINDOW_SIZE = 10
MAX_STEPS = int(1e+5)
VOCAB_SIZE = data_info["vocab_size"]
REWARD_FN = "score"
print(f"Vocab size: {VOCAB_SIZE}")

data_test = nlp_processing.openDfFromPickle(data_info["path"] + sep + "rt-polarity-test-bert.pkl")
data_val = nlp_processing.openDfFromPickle(data_info["path"] + sep + "rt-polarity-val-bert.pkl")

train_pool = PartialReadingDataPoolWithBertTokens(data_train, "review", "label", WINDOW_SIZE)
test_pool = PartialReadingDataPoolWithBertTokens(data_test, "review", "label", WINDOW_SIZE)
val_pool = PartialReadingDataPoolWithBertTokens(data_val, "review", "label", WINDOW_SIZE)

train_env_params = dict(data_pool=train_pool, max_time_steps=MAX_STEPS, reward_fn=REWARD_FN, random_walk=False, vocab_size=VOCAB_SIZE)
train_env = TextEnvClfBert(**train_env_params)
val_env = TextEnvClfBert(val_pool, 1000, reward_fn=REWARD_FN, vocab_size=VOCAB_SIZE)
test_env = TextEnvClfBert(test_pool, 1000, reward_fn=REWARD_FN, vocab_size=VOCAB_SIZE)

Running on windows...




Vocab size: 28996
Maximum sentence length in pool: 50
Maximum sentence length in pool: 50
Maximum sentence length in pool: 50


In [4]:
train_env.observation_space

Box([-1 -1 -1 -1 -1 -1 -1 -1 -1 -1], [1000000 1000000 1000000 1000000 1000000 1000000 1000000 1000000 1000000
 1000000], (10,), int32)

In [5]:
rnn_params = {
    "embed_dim": 50,
    "rnn_type": "gru",
    "rnn_hidden_size": 15,
    "rnn_hidden_out": 5,
    "rnn_bidirectional": True,
    "units": 64
    }
model2 = pn.RNNExtractor(train_env.observation_space, VOCAB_SIZE, **rnn_params)

In [6]:
for param in model_dict.keys():
    print(param)
    print(model_dict[param])
    break
    

embed_enc.weight
tensor([[-8.7591e-02, -1.0967e-01,  6.4074e-02,  ..., -2.0866e-01,
         -1.5149e-01, -2.2125e-02],
        [ 4.4670e-01, -5.9795e-01, -7.9977e-01,  ...,  2.5457e-01,
          2.0292e+00,  2.8061e-01],
        [-1.7325e+00,  9.5825e-02,  3.2361e-01,  ..., -4.5796e-04,
         -1.1509e-01,  5.3223e-01],
        ...,
        [-6.8459e-01, -9.7837e-01, -4.8688e-01,  ...,  9.2992e-01,
         -8.1525e-01,  1.5659e-01],
        [ 1.4869e+00, -9.2087e-02,  1.0076e+00,  ..., -1.0114e+00,
          6.7163e-01, -2.9542e+00],
        [-1.2746e+00, -6.8976e-01,  6.1937e-01,  ...,  1.8670e+00,
         -8.8855e-01, -1.1423e-01]])


In [7]:
param_list = list(model_dict.keys())[:-6]
param_list

['embed_enc.weight',
 'rnn.weight_ih_l0',
 'rnn.weight_hh_l0',
 'rnn.bias_ih_l0',
 'rnn.bias_hh_l0',
 'rnn.weight_ih_l0_reverse',
 'rnn.weight_hh_l0_reverse',
 'rnn.bias_ih_l0_reverse',
 'rnn.bias_hh_l0_reverse',
 'rnn.weight_ih_l1',
 'rnn.weight_hh_l1',
 'rnn.bias_ih_l1',
 'rnn.bias_hh_l1',
 'rnn.weight_ih_l1_reverse',
 'rnn.weight_hh_l1_reverse',
 'rnn.bias_ih_l1_reverse',
 'rnn.bias_hh_l1_reverse',
 'rnn.weight_ih_l2',
 'rnn.weight_hh_l2',
 'rnn.bias_ih_l2',
 'rnn.bias_hh_l2',
 'rnn.weight_ih_l2_reverse',
 'rnn.weight_hh_l2_reverse',
 'rnn.bias_ih_l2_reverse',
 'rnn.bias_hh_l2_reverse',
 'rnn.weight_ih_l3',
 'rnn.weight_hh_l3',
 'rnn.bias_ih_l3',
 'rnn.bias_hh_l3',
 'rnn.weight_ih_l3_reverse',
 'rnn.weight_hh_l3_reverse',
 'rnn.bias_ih_l3_reverse',
 'rnn.bias_hh_l3_reverse',
 'rnn.weight_ih_l4',
 'rnn.weight_hh_l4',
 'rnn.bias_ih_l4',
 'rnn.bias_hh_l4',
 'rnn.weight_ih_l4_reverse',
 'rnn.weight_hh_l4_reverse',
 'rnn.bias_ih_l4_reverse',
 'rnn.bias_hh_l4_reverse',
 'rnn.weight_ih_l5'

In [8]:

for param in param_list:
    print(param)
    model2.state_dict()[param].copy_(model_dict[param])
    

embed_enc.weight
rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
rnn.weight_ih_l2
rnn.weight_hh_l2
rnn.bias_ih_l2
rnn.bias_hh_l2
rnn.weight_ih_l2_reverse
rnn.weight_hh_l2_reverse
rnn.bias_ih_l2_reverse
rnn.bias_hh_l2_reverse
rnn.weight_ih_l3
rnn.weight_hh_l3
rnn.bias_ih_l3
rnn.bias_hh_l3
rnn.weight_ih_l3_reverse
rnn.weight_hh_l3_reverse
rnn.bias_ih_l3_reverse
rnn.bias_hh_l3_reverse
rnn.weight_ih_l4
rnn.weight_hh_l4
rnn.bias_ih_l4
rnn.bias_hh_l4
rnn.weight_ih_l4_reverse
rnn.weight_hh_l4_reverse
rnn.bias_ih_l4_reverse
rnn.bias_hh_l4_reverse
rnn.weight_ih_l5
rnn.weight_hh_l5
rnn.bias_ih_l5
rnn.bias_hh_l5
rnn.weight_ih_l5_reverse
rnn.weight_hh_l5_reverse
rnn.bias_ih_l5_reverse
rnn.bias_hh_l5_reverse
rnn.weight_ih_l6
rnn.we

In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}.")

def eval_model(model, env, total_timesteps=10000):
    done = False
    obs = env.reset()
    total_reward = 0.0
    actions = []
    seen_samples = 0
    for _ in tqdm(range(total_timesteps)):
        action, _states = model.predict(obs)
        action = action.item()
        obs, rewards, done, info = env.step(action)
        action = env.action_space.ix_to_action(action)
        if action in env.pool.possible_actions:
            seen_samples += 1
        if done:
            obs = env.reset()
        actions.append(action)
        total_reward += rewards
    print("---------------------------------------------");
    print(f"Total Steps and seen samples: {len(actions), seen_samples}")
    print(f"Total reward: {total_reward}")
    print(f"Stats:  {calculate_stats_from_cm(env.confusion_matrix)}")
    acts = list(Counter(actions).keys())
    freqs = list(Counter(actions).values())
    total = len(actions)
    print(f"Action stats --  {[{acts[ii]: freqs[ii]/total} for ii in range(len(acts))]}")
    print("---------------------------------------------")





"""policy_kwargs = dict(
    features_extractor_class=pn.CNN1DExtractor,
    features_extractor_kwargs=dict(vocab_size = VOCAB_SIZE, embed_dim = 50, 
                                n_filter_list = [128, 64, 64, 32, 32, 16], kernel_size = 4, features_dim = 256),
)"""


Using device: cuda:0.


'policy_kwargs = dict(\n    features_extractor_class=pn.CNN1DExtractor,\n    features_extractor_kwargs=dict(vocab_size = VOCAB_SIZE, embed_dim = 50, \n                                n_filter_list = [128, 64, 64, 32, 32, 16], kernel_size = 4, features_dim = 256),\n)'

In [14]:
import stable_baselines3
import torch.nn as nn
class MyModel(nn.Module):
  def __init__(self, trained_nn: nn.Module, trained_out_dim: int, action_dim: int):
    super(MyModel, self).__init__()
    self.trained_nn = trained_nn
    self.trained_out_dim = trained_out_dim
    self.fc = nn.Linear(trained_out_dim, action_dim)
    
  def forward(self, observations):
    features = self.trained_nn(observations)
    actions = self.fc(features)
    return actions

# Create a new instance of your reinforcement learning model
model = MyModel(model2, 256, 3)

a2c = A2C(MyModel, train_env)


TypeError: __init__() got an unexpected keyword argument 'use_sde'

In [10]:
policy_kwargs = dict(
    features_extractor_class=model2,
)

# model = DQN("CnnPolicy", train_env, policy_kwargs=policy_kwargs, verbose=1, batch_size=64)
model = A2C(policy = "MlpPolicy",
            env = train_env,
            gae_lambda = 0.9,
            gamma = 0.99,
            learning_rate = 0.01,
            max_grad_norm = 0.5,
            n_steps = 1000,
            vf_coef = 0.4,
            ent_coef = 0.0,
            policy_kwargs=policy_kwargs,
            normalize_advantage=False,
            verbose=0, 
            use_rms_prop=False)
# model = PPO("MlpPolicy", train_env, policy_kwargs=policy_kwargs)

train_env.set_train_mode(False)
val_env.set_train_mode(False)
for i in range(int(5)):
    model.learn(total_timesteps=int(1e+4 )*5, reset_num_timesteps=True, progress_bar=True)
    print("======= On train env: ===============")
    eval_model(model, train_env)
    print("======= On val env: ===============")
    eval_model(model, val_env)

TypeError: unsqueeze(): argument 'input' (position 1) must be Tensor, not Box