In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import itertools
from pulp import LpProblem, LpMinimize, LpVariable, lpSum, PULP_CBC_CMD




# based on mask and multiply operation
def Validate( X,m,t,rectified = False, includeValidTag = False):
    # mask and multiply
    X = np.array(X,dtype=int)
    m_nr,t_nr = X.shape
    mm = np.zeros(t_nr)
    for tag in range(t_nr):
        mask = m[np.where(X[:,tag] == 1)] # mask for tag
        mm[tag] = np.prod(mask) # multiply the mask
    A = np.matmul(X,(mm*t).transpose())

    A = np.array( [1 if x > 1 else x for x in A]) if rectified else A
    return (A, mm*t) if includeValidTag else A


def Latency( X,m,t,lost_penalty = 40):
    X = np.array(X,dtype=int)
    m_nr,t_nr = X.shape
    A,validTags = Validate(X,m,t,includeValidTag=True)
    L = [np.where(X[:,np.where((X[msg,:]*validTags)>0)[0][0]] == 1)[0][-1]-msg if A[msg] >0 else lost_penalty for msg in range(m_nr)]

    return np.array(L)




def  Get_Strength_Number(X):
    X = np.array(X,dtype=int)

    n_msg, n_tag = X.shape
    A = np.transpose(X)
    B = np.ones(n_tag)
    C = np.ones(n_msg)

    prob = LpProblem("Binary_LP_Problem", LpMinimize)

    # Define the variables
    x = [LpVariable(f'x{i}', cat='Binary') for i in range(len(C))]

    # Define the objective function
    prob += lpSum(C[i] * x[i] for i in range(len(C)))

    # Define the constraints
    for i in range(len(A)):
        prob += lpSum(A[i][j] * x[j] for j in range(len(C))) >= B[i]

    # Solve the problem
    prob.solve(PULP_CBC_CMD(msg=0))

    # Print the results
    print("Status:", prob.status)
    print("Objective value:", prob.objective.value())

    result = []
    for i in range(len(C)):
        #print(f"x{i+1}:", x[i].value())
        if x[i].value() == 1:
            result.append(i+1)
    return len(result)

def Reward(X,m,t,rectified_A = False, lost_penalty = 40, a = 1,l = 1, o = 100, s=1):
    X = np.array(X,dtype=int)
    A = Validate(X,m,t,rectified=rectified_A)
    L = Latency(X,A,t,lost_penalty=lost_penalty)

    m_nr,t_nr = X.shape
    r = a*np.sum(A) - l*np.sum(L) - o * t_nr/m_nr + s*Get_Strength_Number(X) 

    return r



class MatrixEnvironment:
    def __init__(self, m, n, p):
        self.m = m
        self.n = n
        self.p = p

    def reset(self):
        self.state = np.random.choice([0, 1], size=(self.m,), p=[1-self.p, self.p])
        return self.state

    def step(self, action_matrix):
        random_vector = np.random.choice([0, 1], size=(self.m,), p=[1-self.p, self.p])
        reward = self.calculate_reward(random_vector, action_matrix)
        done = True  # Episode is done after one step in this simple environment
        return random_vector, reward, done

    def calculate_reward(self, random_vector, action_matrix):
        # User-defined reward function
        reward = Reward(X = action_matrix, m = random_vector, t = np.ones(self.n) , rectified_A=False, lost_penalty=40, a=1, l=1, o=100, s=1)
        return reward

In [2]:
class PolicyNetwork(nn.Module):
    def __init__(self, m, n):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(m * n, 128)
        self.fc2 = nn.Linear(128, m * n)
        self.m = m
        self.n = n

    def forward(self, x):
        x = torch.flatten(x)
        x = torch.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x.view(self.m, self.n)

# Example usage
m, n = 100, 100
p = 0.99
env = MatrixEnvironment(m, n, p)
policy_net = PolicyNetwork(m, n)


In [3]:
def train(env, policy_net, num_episodes=1000, learning_rate=0.01):
    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
    loss_fn = nn.MSELoss()  # Using MSE as a placeholder; adapt as needed for the reward structure

    for episode in range(num_episodes):
        env.reset()
        
        optimizer.zero_grad()
        
        # Generate an action matrix from the policy network
        action_matrix = torch.bernoulli(policy_net(torch.zeros(env.m, env.n)))
        
        random_vector, reward, done = env.step(action_matrix.detach().numpy())
        
        reward = torch.tensor(reward, dtype=torch.float32)
        
        loss = loss_fn(action_matrix, torch.zeros(env.m, env.n))  # Placeholder loss; adapt based on reward
        loss.backward()
        optimizer.step()
        
        if episode % 100 == 0:
            print(f"Episode {episode}, Loss: {loss.item()}")

train(env, policy_net)


Status: 1
Objective value: 4.0
Episode 0, Loss: 0.49239999055862427
Status: 1
Objective value: 3.0
Status: 1
Objective value: 4.0
Status: 1
Objective value: 4.0
Status: 1
Objective value: 4.0
Status: 1
Objective value: 3.0
Status: 1
Objective value: 4.0
Status: 1
Objective value: 4.0
Status: 1
Objective value: 4.0
