In [1]:
import random
import numpy as np
import pandas as pd
import copy

# 소요시간 설정
times = {
    ('SP', 'A'): 5, ('SP', 'B'): 8, ('SP', 'C'): 14, ('SP', 'D'): 16, ('SP', 'CP'): 7,
    ('A', 'A'): 0, ('A', 'B'): 7, ('A', 'C'): 13, ('A', 'D'): 14, ('A', 'CP'): 7, ('A', 'EP'): 11,
    ('B', 'A'): 6, ('B', 'B'): 0, ('B', 'C'): 11, ('B', 'D'): 12, ('B', 'CP'): 7, ('B', 'EP'): 9,
    ('C', 'A'): 10, ('C', 'B'): 9, ('C', 'C'): 0, ('C', 'D'): 8, ('C', 'CP'): 2, ('C', 'EP'): 6,
    ('D', 'A'): 13, ('D', 'B'): 8, ('D', 'C'): 10, ('D', 'D'): 0, ('D', 'CP'): 8, ('D', 'EP'): 3,
    ('CP', 'A'): 10, ('CP', 'B'): 11, ('CP', 'C'): 8, ('CP', 'D'): 12, ('CP', 'CP'): 0, ('CP', 'EP'): 7}

max_energy = 50
process_nodes = ['A','B','C','D','CP']

In [2]:
class State:
    def __init__(self, node, process_nodes, current_energy):
        self.node = node
        self.process_nodes = process_nodes
        self.current_energy = current_energy

    def __repr__(self):
        return f"current_node: {self.node}, remaining_nodes: {self.process_nodes}, current_energy: {self.current_energy}"

In [3]:
class QLearning:
    def __init__(self, alpha, gamma, epsilon, decay):
        self.alpha = alpha  # 학습률
        self.gamma = gamma  # 할인 계수
        self.epsilon = epsilon  # 탐험 비율
        self.decay = decay
        self.q_table = {}  # 상태-행동 값(Q-값)을 저장하는 테이블
        self.rewards_table = {}  # 보상을 추적하는 테이블
        
    def param_update(self):
        self.alpha = self.alpha
        self.gamma = self.gamma*self.decay  # 할인 계수
        self.epsilon = self.epsilon*self.decay  # 탐험 비율

    def get_q_value(self, state, next_node):
        # 만약 상태 또는 행동이 Q-테이블에 없으면 기본값 0을 반환
        return self.q_table.get((state, next_node), 0.0)

    def action(self,state):
        if np.random.uniform() < self.epsilon:
            return random.choice(state.process_nodes)
        else:
            # 가장 높은 Q-값을 가진 행동 선택
            max_q_value = float('-inf')
            best_action = None
            best_next_node = None
            for action in state.process_nodes:
                q_value = self.get_q_value(state, action)
                if q_value > max_q_value:
                    max_q_value = q_value
                    best_action = action
            return best_action

    def learn(self, state, next_node, reward, next_state):
        if next_state.process_nodes:
            # 공정 처리 노드가 남아 있을 때
            max_q_value = max([self.get_q_value(next_state, next_action) for next_action in next_state.process_nodes])
        else:
            # 처리할 노드가 없을 때
            max_q_value = 0.0
        td_target = reward + self.gamma * max_q_value
        td_error = td_target - self.get_q_value(state, next_node)
        q_key = (state, next_node)
        if q_key not in self.q_table:
            self.q_table[q_key] = 0.0
        self.q_table[q_key] += self.alpha * td_error
        self.rewards_table[q_key] = reward

In [4]:
q_learning = QLearning(0.1,0.9,0.9,0.9)
for episode in range(10):
    node = 'SP'
    process_nodes = ['A', 'B', 'C', 'D', 'CP']
    current_energy = 20
    total_spent_time = 0
    
    state = State(node, process_nodes, current_energy)
    while state.process_nodes:
        next_node = q_learning.action(state)
        
        process_nodes = state.process_nodes.copy()
        process_nodes.remove(next_node)
        
        if next_node =='CP':
            spent_time = times[(state.node,next_node)] + 28
        else:
            spent_time = times[(state.node,next_node)]
        
        total_spent_time += spent_time
        reward =100-(spent_time + total_spent_time)
            
        next_state = State(next_node,process_nodes,current_energy)
            
        if next_state.current_energy < 0:
            reward -= 100
            
        q_learning.learn(state,next_node,reward,next_state)
        state = next_state
        
    next_node = 'EP'
    spent_time = times[(state.node, next_node)]
    total_spent_time +=spent_time
    state.current_energy -= spent_time
    
    next_state = State(next_node,process_nodes,current_energy)
    
    reward =100-(spent_time + total_spent_time)
    
    if state.current_energy <10:
        reward -= 100
    
    q_learning.learn(state,next_node,reward,next_state)
    print(next_state)
        
        
        
    # Q-learning 에이전트로부터 다음 동작 선택

current_node: EP, remaining_nodes: [], current_energy: 20
current_node: EP, remaining_nodes: [], current_energy: 20
current_node: EP, remaining_nodes: [], current_energy: 20
current_node: EP, remaining_nodes: [], current_energy: 20
current_node: EP, remaining_nodes: [], current_energy: 20
current_node: EP, remaining_nodes: [], current_energy: 20
current_node: EP, remaining_nodes: [], current_energy: 20
current_node: EP, remaining_nodes: [], current_energy: 20
current_node: EP, remaining_nodes: [], current_energy: 20
current_node: EP, remaining_nodes: [], current_energy: 20


In [31]:
a = q_learning.q_table.item()
a

AttributeError: 'dict' object has no attribute 'item'

In [30]:
state = State('SP',process_nodes,20)
action = 'A'

print(Q_value(state,action,q_learning))

None
