In [3]:
import numpy as np
import torch
import torch.nn as nn

import gensim
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [4]:
glove_input_file = 'weights/glove.6B.100d.txt'
word2vec_output_file = 'weights/glove.6B.100d.txt.word2vec'

glove2word2vec(glove_input_file, word2vec_output_file)
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

  glove2word2vec(glove_input_file, word2vec_output_file)


In [6]:
class LanguageModel(nn.Module):
    def __init__(self, word2vec_model, embedding_dim):
        super(LanguageModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(word2vec_model.vectors))
        self.embedding_dim = embedding_dim

    def forward(self, input_text):
        return self.embedding(input_text)


In [7]:
class VisionModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(VisionModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_image):
        x = torch.relu(self.fc1(input_image))
        return self.fc2(x)


In [8]:
class QNetwork(nn.Module):
    def __init__(self, language_model, vision_model, output_dim):
        super(QNetwork, self).__init__()
        self.language_model = language_model
        self.vision_model = vision_model
        self.fc1 = nn.Linear(language_model.embedding_dim + output_dim, 64)
        self.fc2 = nn.Linear(64, output_dim)

    def forward(self, language_input, vision_input):
        language_embedding = self.language_model(language_input)
        x = torch.relu(self.fc1(language_embedding))
        x = torch.cat((x, vision_input), dim=1)
        x = torch.relu(self.fc2(x))
        return x


In [9]:
class Environment:
    def __init__(self):
        self.state = torch.tensor([1.0, 2.0], dtype=torch.float32)

    def step(self, action):
        if action == 0:
            reward = 1.0
        else:
            reward = 0.0
        return self.state, reward

class QLearningAgent:
    def __init__(self, q_network, num_actions):
        self.q_network = q_network
        self.num_actions = num_actions
        self.optimizer = torch.optim.Adam(q_network.parameters(), lr=0.001)

    def select_action(self, state):
        q_values = self.q_network(state)
        return torch.argmax(q_values).item()

    def update_q_function(self, state, action, reward, next_state):
        predicted_q_values = self.q_network(state)
        next_q_values = self.q_network(next_state)
        target_q = predicted_q_values.clone()
        target_q[0, action] = reward + 0.9 * torch.max(next_q_values)

        loss = torch.nn.functional.mse_loss(predicted_q_values, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


In [10]:
language_model = LanguageModel(word2vec_model, embedding_dim=100)
vision_model = VisionModel(input_dim=10, hidden_dim=32, output_dim=8)
q_network = QNetwork(language_model, vision_model, output_dim=2)
agent = QLearningAgent(q_network, num_actions=2)
env = Environment()

for episode in range(100):
    state = env.state
    total_reward = 0

    for _ in range(100):  # Maximum of 100 steps per episode
        action = agent.select_action(state)
        next_state, reward = env.step(action)
        agent.update_q_function(state, action, reward, next_state)
        state = next_state
        total_reward += reward

    print(f"Episode {episode + 1}, Total Reward: {total_reward}")


TypeError: QNetwork.forward() missing 1 required positional argument: 'vision_input'

---