### Model free learning

In [78]:
import numpy as np

# Define the agent class
class Agent:
    def __init__(self, num_actions, alpha=0.1, epsilon=0.1):
        self.num_actions = num_actions  # Number of possible actions
        self.alpha = alpha  # Learning rate
        self.epsilon = epsilon  # Exploration rate
        self.values = np.zeros(num_actions)  # Action values

    def choose_action(self):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.num_actions)  # Explore
        else:
            return np.argmax(self.values)  # Exploit

    def update_values(self, reward):
        self.values += self.alpha * (reward - np.mean(self.values))


# Cultural environment class
class CulturalEnvironment:
    def __init__(self, num_cultural_products):
        self.num_cultural_products = num_cultural_products  # Number of cultural products
        self.products = np.random.rand(num_cultural_products)  # Initialize cultural products
        self.products = np.array([10.9, 10.5, -90.5, 5.2, -50.4])
        self.experience = 0  # Initialize experience
        
    def interact_with_agent(self, agent):
        action = agent.choose_action()
        reward = self.products[action]  # Reward based on cultural product
        agent.update_values(reward)  # Update agent's values
        self.experience += reward  # Accumulate experience


# Define the main function for simulation
def main(num_episodes, num_actions, num_cultural_products):
    agent = Agent(num_actions)
    cultural_env = CulturalEnvironment(num_cultural_products)

    for episode in range(num_episodes):
        cultural_env.interact_with_agent(agent)

        print(f"Episode {episode + 1}: Experience = {cultural_env.experience}")

In [79]:
# Run the simulation
num_episodes = 50  # Number of episodes
num_actions = 5  # Number of possible actions
num_cultural_products = 5  # Number of cultural products

main(num_episodes, num_actions, num_cultural_products)


Episode 1: Experience = 10.9
Episode 2: Experience = 21.8
Episode 3: Experience = 32.7
Episode 4: Experience = 43.6
Episode 5: Experience = 54.5
Episode 6: Experience = 65.4
Episode 7: Experience = 76.30000000000001
Episode 8: Experience = 87.20000000000002
Episode 9: Experience = 98.10000000000002
Episode 10: Experience = 109.00000000000003
Episode 11: Experience = 18.50000000000003
Episode 12: Experience = 29.400000000000027
Episode 13: Experience = 40.300000000000026
Episode 14: Experience = 51.200000000000024
Episode 15: Experience = 62.10000000000002
Episode 16: Experience = 73.00000000000003
Episode 17: Experience = 83.90000000000003
Episode 18: Experience = 94.80000000000004
Episode 19: Experience = 105.70000000000005
Episode 20: Experience = 116.60000000000005
Episode 21: Experience = 127.50000000000006
Episode 22: Experience = 138.40000000000006
Episode 23: Experience = 148.90000000000006
Episode 24: Experience = 159.80000000000007
Episode 25: Experience = 170.70000000000007
E

### Model-based learning with RNNs and VAEs
- We use a VAE to learn a latent representation of the cultural environment or the agent's experiences.
- An RNN is employed to model sequential dependencies in the agent's experiences over time.

In [109]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence

# Agent class
class Agent(nn.Module):
    def __init__(self, num_actions, alpha=0.1, epsilon=0.1):
        super(Agent, self).__init__()
        self.num_actions = num_actions  # Number of possible actions
        self.alpha = alpha  # Learning rate
        self.epsilon = epsilon  # Exploration rate
        self.values = np.zeros(num_actions)  # Action values

    def choose_action(self):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.num_actions)  # Explore
        else:
            return np.argmax(self.values)  # Exploit

    def update_values(self, reward):
        self.values += self.alpha * (reward - np.mean(self.values))

# RNN-based agent class
class RNNAgent(Agent):
    def __init__(self, num_actions, input_size, hidden_size, alpha=0.1, epsilon=0.1):
        super().__init__(num_actions, alpha, epsilon)
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_actions)

    def forward(self, inputs):
        outputs, _ = self.rnn(inputs)
        outputs = self.fc(outputs[:, -1, :])  # Take the last output
        return outputs


# VAE model
class VAE(nn.Module):
    def __init__(self, input_size, latent_size):
        super(VAE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, latent_size * 2)  # 2 times latent_size for mean and variance
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_size, 64),
            nn.ReLU(),
            nn.Linear(64, input_size),
            nn.Sigmoid()  # Sigmoid for binary data, remove for continuous data
        )
    
    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std
    
    def forward(self, x):
        x = x.view(-1, input_size)
        x = self.encoder(x)
        mu, logvar = x[:, :latent_size], x[:, latent_size:]
        z = self.reparameterize(mu, logvar)
        return self.decoder(z), mu, logvar

def normalize_tensor(tensor):
    # Find minimum and maximum values
    min_value = torch.min(tensor)
    max_value = torch.max(tensor)
    
    # Shift the range so that min_value becomes 0
    shifted_tensor = tensor - min_value
    
    # Scale the range to 0-1
    normalized_tensor = shifted_tensor / (max_value - min_value)
    
    return normalized_tensor


# Define the cultural environment class
class CulturalEnvironment:
    def __init__(self, num_cultural_products):
        self.num_cultural_products = num_cultural_products  # Number of cultural products
        self.products = torch.rand(num_cultural_products)  # Initialize cultural products
        print(self.products)
        self.experience = 0  # Initialize experience

    def interact_with_agent(self, agent, vae_model):
        # Convert cultural products to tensor
        cultural_tensor = torch.tensor(self.products).unsqueeze(0).unsqueeze(0)
        
        # Use RNN-based agent to predict action
        rnn_action_logits = agent(cultural_tensor)
        action = torch.argmax(rnn_action_logits).item()
        
        reward = self.products[action]  # Reward based on cultural product
#         agent.update_values(reward)  # Update agent's values
        
        # Use VAE model to reconstruct cultural products
        reconstructed_products, _, _ = vae_model(cultural_tensor.view(-1))
        target = cultural_tensor.view(-1).expand_as(reconstructed_products)
        
        # Accumulate experience based on both reconstruction error and reward
        reconstruction_loss = nn.functional.binary_cross_entropy(reconstructed_products, target)
        self.experience += reconstruction_loss.item() + reward

# Define the main function for simulation
def main(num_episodes, num_actions, num_cultural_products, input_size, hidden_size, latent_size):
    rnn_agent = RNNAgent(num_actions, input_size, hidden_size)
    vae_model = VAE(input_size, latent_size)
    
    # Define optimizer for RNN-based agent
    rnn_optimizer = optim.Adam(rnn_agent.parameters(), lr=0.1)
    
    # Define optimizer for VAE model
    vae_optimizer = optim.Adam(vae_model.parameters(), lr=0.1)
    cultural_env = CulturalEnvironment(num_cultural_products)

    for episode in range(num_episodes):        
        # Convert cultural products to tensor
        cultural_tensor = torch.tensor(cultural_env.products).unsqueeze(0).unsqueeze(0)
        
        # Train RNN-based agent
        rnn_optimizer.zero_grad()
        rnn_action_logits = rnn_agent(cultural_tensor)
        action = torch.argmax(rnn_action_logits).item()
        
        # Calculate reward based on the predicted action
        reward = cultural_env.products[action]
        
        # Calculate loss based on the negative reward
        rnn_loss = -torch.tensor(reward, dtype=torch.float, requires_grad=True)  # Convert reward to tensor
        
        # Backpropagate and update RNN-based agent's weights
        rnn_loss.backward()
        rnn_optimizer.step()
        
        # Train VAE model
        vae_optimizer.zero_grad()
        reconstructed_products, mu, logvar = vae_model(cultural_tensor.view(-1))
        target = cultural_tensor.view(-1).expand_as(reconstructed_products)
        vae_loss = nn.functional.binary_cross_entropy(reconstructed_products, target)
        vae_loss += -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        vae_loss.backward()
        vae_optimizer.step()
        
        # Interact with the cultural environment
        cultural_env.interact_with_agent(rnn_agent, vae_model)

        print(f"Episode {episode + 1}: Experience = {cultural_env.experience}")


In [110]:
num_episodes = 1000  # Number of episodes
num_actions = 5  # Number of possible actions
num_cultural_products = 5  # Number of cultural products
input_size = num_cultural_products
hidden_size = 64
latent_size = 16

main(num_episodes, num_actions, num_cultural_products, input_size, hidden_size, latent_size)

tensor([0.2371, 0.8024, 0.2419, 0.5803, 0.4890])
Episode 1: Experience = 1.2347447872161865
Episode 2: Experience = 2.465806007385254
Episode 3: Experience = 3.784947395324707
Episode 4: Experience = 5.171415328979492
Episode 5: Experience = 6.605747699737549
Episode 6: Experience = 7.955357074737549
Episode 7: Experience = 9.804990768432617
Episode 8: Experience = 11.242969512939453
Episode 9: Experience = 12.719045639038086
Episode 10: Experience = 14.79166030883789
Episode 11: Experience = 15.98777961730957
Episode 12: Experience = 17.536455154418945
Episode 13: Experience = 18.914997100830078
Episode 14: Experience = 20.199386596679688
Episode 15: Experience = 21.933517456054688
Episode 16: Experience = 23.532747268676758
Episode 17: Experience = 25.19172477722168
Episode 18: Experience = 26.63846778869629
Episode 19: Experience = 28.028663635253906
Episode 20: Experience = 29.306621551513672
Episode 21: Experience = 31.186582565307617
Episode 22: Experience = 32.593360900878906
Ep

  cultural_tensor = torch.tensor(cultural_env.products).unsqueeze(0).unsqueeze(0)
  rnn_loss = -torch.tensor(reward, dtype=torch.float, requires_grad=True)  # Convert reward to tensor
  cultural_tensor = torch.tensor(self.products).unsqueeze(0).unsqueeze(0)


Episode 235: Experience = 345.6907958984375
Episode 236: Experience = 346.86572265625
Episode 237: Experience = 348.0833740234375
Episode 238: Experience = 349.2586975097656
Episode 239: Experience = 350.5083312988281
Episode 240: Experience = 352.4148864746094
Episode 241: Experience = 353.59228515625
Episode 242: Experience = 354.7670593261719
Episode 243: Experience = 355.9439392089844
Episode 244: Experience = 357.1185302734375
Episode 245: Experience = 358.3800048828125
Episode 246: Experience = 359.6277160644531
Episode 247: Experience = 360.8028564453125
Episode 248: Experience = 361.9930419921875
Episode 249: Experience = 363.1926574707031
Episode 250: Experience = 364.380859375
Episode 251: Experience = 365.5575866699219
Episode 252: Experience = 366.7651062011719
Episode 253: Experience = 367.9941101074219
Episode 254: Experience = 369.199951171875
Episode 255: Experience = 370.3858642578125
Episode 256: Experience = 371.5624084472656
Episode 257: Experience = 372.79660034179

Episode 522: Experience = 688.6367797851562
Episode 523: Experience = 689.8116455078125
Episode 524: Experience = 690.9861450195312
Episode 525: Experience = 692.16064453125
Episode 526: Experience = 693.3351440429688
Episode 527: Experience = 694.5143432617188
Episode 528: Experience = 695.6915283203125
Episode 529: Experience = 696.8666381835938
Episode 530: Experience = 698.0827026367188
Episode 531: Experience = 699.2572631835938
Episode 532: Experience = 700.4412841796875
Episode 533: Experience = 701.6174926757812
Episode 534: Experience = 702.7920532226562
Episode 535: Experience = 703.9666137695312
Episode 536: Experience = 705.1411743164062
Episode 537: Experience = 706.3157348632812
Episode 538: Experience = 707.4902954101562
Episode 539: Experience = 708.6764526367188
Episode 540: Experience = 709.8576049804688
Episode 541: Experience = 711.0326538085938
Episode 542: Experience = 712.2349243164062
Episode 543: Experience = 713.4141845703125
Episode 544: Experience = 714.5888

Episode 814: Experience = 1033.31005859375
Episode 815: Experience = 1034.4853515625
Episode 816: Experience = 1035.705078125
Episode 817: Experience = 1036.8807373046875
Episode 818: Experience = 1038.0555419921875
Episode 819: Experience = 1039.2303466796875
Episode 820: Experience = 1040.406005859375
Episode 821: Experience = 1041.580810546875
Episode 822: Experience = 1042.755615234375
Episode 823: Experience = 1043.930419921875
Episode 824: Experience = 1045.105224609375
Episode 825: Experience = 1046.2801513671875
Episode 826: Experience = 1047.4552001953125
Episode 827: Experience = 1048.6318359375
Episode 828: Experience = 1049.807373046875
Episode 829: Experience = 1051.0001220703125
Episode 830: Experience = 1052.1759033203125
Episode 831: Experience = 1053.3519287109375
Episode 832: Experience = 1054.52783203125
Episode 833: Experience = 1055.7037353515625
Episode 834: Experience = 1056.8792724609375
Episode 835: Experience = 1058.0548095703125
Episode 836: Experience = 1059