# Power Simulation

In [2]:
import numpy as np
import random

In [3]:
def init_state_matrix(number_states:int, number_actions:int):
    return np.zeros((number_states, number_actions))

def encode_state(power_level, data_flag, time):
    return (power_level * N_MINUTES * N_DATA_COLLECT_FLAG) + (time * N_DATA_COLLECT_FLAG) + data_flag

def decode_state(state):
    data_flag = state % N_DATA_COLLECT_FLAG
    state = state // N_DATA_COLLECT_FLAG
    time = state % N_MINUTES
    power_level = state // N_MINUTES
    return power_level, data_flag, time

def transition(state, action):
    power_level, data_flag, time = decode_state(state)
    power_level = max(0, power_level - power_usage[action])

    # Update data_flag based on action
    if action == 2:  # Transmit data
        data_flag = 2  # Data transmitted
    elif action in [1, 3] and data_flag != 2:  # Collect data but not if data already transmitted
        data_flag = 1

    # Increment time and reset flags as necessary
    time = (time + 1) % N_MINUTES
    if time % 10 == 0:  # Reset data flag every 10 minutes
        data_flag = 0

    return encode_state(power_level, data_flag, time)

def get_reward(state, action):
    power_level, data_flag, time = decode_state(state)
    reward = 0

    if power_level <= 0:
        reward -= 10  # Penalty for running out of power
    if action == 2 and data_flag == 1:  # Reward for transmitting data
        reward += 1
    if data_flag == 0:  # Penalty for not collecting data in the last 10 minutes
        reward -= 1
    if action in [1, 3] and data_flag == 2:  # Penalize for collecting data when it's not needed
        reward -= 2

    return reward


action_list = ['sleep', 'measure', 'transmit', 'measure + transmit']

N_DATA_COLLECT_FLAG = 2
N_ACTIONS = len(action_list)
N_POWER_LEVELS = 100
N_MINUTES = 60*24

N_STATES = N_POWER_LEVELS * N_MINUTES * N_DATA_COLLECT_FLAG

power_usage = {0: 1, 1:5, 2:15, 3:20}




# Learning parameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate
num_episodes = 100  # Total number of episodes


Q_matrix = init_state_matrix(N_STATES, N_ACTIONS)



for episode in range(num_episodes):
    current_state = encode_state(50, 0, 0)  # Starting state: 50 power, no data, at midnight

    for t in range(N_MINUTES * 3):  # Loop for 3 days
        if random.uniform(0, 1) < epsilon:
            # Exploration: choose a random action
            action = random.randint(0, N_ACTIONS - 1)
        else:
            # Exploitation: choose the best action based on current Q-values
            action = np.argmax(Q_matrix[current_state])

        # Perform the action
        next_state = transition(current_state, action)
        reward = get_reward(next_state, action)

        # Q-learning update
        best_next_action = np.argmax(Q_matrix[next_state])
        td_target = reward + gamma * Q_matrix[next_state][best_next_action]
        td_error = td_target - Q_matrix[current_state][action]
        Q_matrix[current_state][action] += alpha * td_error

        # Move to the next state
        current_state = next_state







In [4]:
Q_matrix

array([[-27.99710871, -28.34697314, -28.37695739, -27.95124024],
       [  0.        ,   0.        ,   0.        ,   0.        ],
       [-24.45364891, -24.51989884, -24.01787865, -24.11224495],
       ...,
       [  0.        ,   0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ]])