In [None]:
import pandas as pd
import random
import anthropic
import os
import pickle
import ast
import json
import re
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from env import CellTowerEnv
from agent import SingleAgent

In [None]:
# Power-level -> capacity lookup tables (MAPPING1 has 4 levels, MAPPING2 has 8).
MAPPING1 = {1: 25, 2: 35, 3: 43, 4: 50}
MAPPING2 = {
    1: 8, 2: 11, 3: 14, 4: 17,
    5: 21, 6: 25, 7: 28, 8: 30,
}

### test

In [None]:
# Empty per-step logs for the LLM-assisted runs: two independent frames, one schema.
llm_ag1, llm_ag2 = (
    pd.DataFrame(columns=[
        'state', 'rl_action', 'llm_action', 'decision', 'reward',
        'handled', 'dropped', 'trust_before', 'trust_after',
    ])
    for _ in range(2)
)

In [None]:
# Take the key from the environment rather than embedding it in the notebook;
# when ANTHROPIC_API_KEY is unset this is the same empty string as before.
c_client = anthropic.Anthropic(api_key=os.environ.get('ANTHROPIC_API_KEY', ''))

def claude_action(ag1_state, max_p1, ag2_state, max_p2):
    """Ask Claude for a (power_level, offload_flag) action for the TARGET station.

    Args:
        ag1_state: State of the station being controlled. Indices 0-3 are
            rendered into the prompt as power, coverage, capacity and drops.
        max_p1: Upper bound of the power range quoted in the prompt.
        ag2_state: State of the neighbouring station, same layout.
        max_p2: Accepted for call-site symmetry; not used in the prompt.

    Returns:
        A tuple of two ints when the reply contains ``(int, int)``; otherwise
        whatever ``ast.literal_eval`` yields for the whole reply.

    Raises:
        ValueError, SyntaxError: From ``ast.literal_eval`` when the reply holds
            no integer pair and is not a Python literal.
    """
    # `{{0,1}}` is escaped on purpose: a bare `{0,1}` inside an f-string is an
    # expression and would be rendered as the tuple "(0, 1)".
    prompt = f"""
    You control the TARGET base station. Output ONLY (power_level,offload_flag) as integers, do not output thinking process.

    - power_level ∈ [1,{max_p1}]
    - offload_flag ∈ {{0,1}} where 1=offload to neighbor (free), 0=keep local.
    - Offloading is FREE (no cost, no energy penalty).
    - Coverage: 0=good, 1=fair, 2=poor. Capacity: 0=headroom, 1=maxed.

    NEIGHBOR: p={ag2_state[0]}, cov={ag2_state[1]}, cap={ag2_state[2]}, drops={ag2_state[3]}
    TARGET:   p={ag1_state[0]}, cov={ag1_state[1]}, cap={ag1_state[2]}, drops={ag1_state[3]}

    Priority: Avoid drops > everything. If uncertain, default offload_flag=1.

    Hard rules:
    - If TARGET cap==1 and NEIGHBOR cap==0 → offload_flag=1.
    - If TARGET drops>0 → offload_flag=1 and power_level = min(p+1,{max_p1}).
    - If TARGET cap==0 and drops==0 → you MAY reduce power by 1 if still avoids drops.
    - Otherwise keep current power.

    Return only the tuple: (power_level,offload_flag)
    """

    message = c_client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=100,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ]
    )

    output = message.content[0].text
    print(output)

    # Tolerate code fences or stray prose around the tuple; a clean "(p,f)"
    # reply parses to exactly the same value as before.
    pair = re.search(r"\(\s*(-?\d+)\s*,\s*(-?\d+)\s*\)", output)
    if pair is not None:
        return int(pair.group(1)), int(pair.group(2))
    return ast.literal_eval(output)

In [None]:
def testing_llm(df1, df2):
    """Replay two traffic traces with the combined LLM/RL controller, without learning.

    At every step each station asks its ``SingleAgent`` (``choose_action``) and
    Claude (``claude_action``) for an action, and ``make_decision`` selects the
    one that is applied. Unserved calls may be handed to the other station.
    Only the agents' trust values are updated; no Q-learning step is run.
    One row per step is appended to the module-level ``llm_ag1`` / ``llm_ag2``.

    Args:
        df1: Trace for station 1; each row's ``calls`` entry must be sliceable.
        df2: Trace for station 2; needs at least ``len(df1)`` rows.
    """
    shared_hparams = {
        'learning_rate': 0.7,
        'discount_factor': 0.95,
        'epsilon': 1.0,
        'epsilon_decay': 0.995,
        'epsilon_min': 0.01,
    }

    tower_a = CellTowerEnv(MAPPING1, 50)  # 43-46 dbm
    tower_b = CellTowerEnv(MAPPING2, 30)  # 30-38 dbm

    pilot_a = SingleAgent(num_power_level=tower_a.max_power,
                          action_size=tower_a.action_space_size,
                          table='ag1_qtable.npy',
                          **shared_hparams)
    pilot_b = SingleAgent(num_power_level=tower_b.max_power,
                          action_size=tower_b.action_space_size,
                          table='ag2_qtable.npy',
                          **shared_hparams)

    for step in range(len(df1)):
        print(step)
        calls_a = df1.iloc[step]['calls']
        calls_b = df2.iloc[step]['calls']

        # Both observations are taken before either station acts.
        obs_a = tower_a.get_state()
        obs_b = tower_b.get_state()

        rl_a = pilot_a.choose_action(obs_a)
        llm_a = claude_action(obs_a, tower_a.max_power, obs_b, tower_b.max_power)
        pick_a = pilot_a.make_decision(obs_a, llm_a, rl_a)
        rl_b = pilot_b.choose_action(obs_b)
        llm_b = claude_action(obs_b, tower_b.max_power, obs_a, tower_a.max_power)
        pick_b = pilot_b.make_decision(obs_b, llm_b, rl_b)

        served_a, lost_a = tower_a.apply_action(pick_a, calls_a)
        served_b, lost_b = tower_b.apply_action(pick_b, calls_b)

        # Handoff: calls beyond the served prefix go to the other station
        # when the offload flag (second action element) is set.
        taken_by_b = 0
        if lost_a >= 0 and pick_a[1] == 1:
            taken_by_b, _ = tower_b.add_requests(calls_a[served_a:])
        taken_by_a = 0
        if lost_b >= 0 and pick_b[1] == 1:
            taken_by_a, _ = tower_a.add_requests(calls_b[served_b:])

        # A call absorbed by the neighbour counts as handled there and is
        # no longer a drop for the station that offloaded it.
        served_a += taken_by_a
        lost_a -= taken_by_b
        served_b += taken_by_b
        lost_b -= taken_by_a

        gain_a = tower_a.compute_reward(pick_a, served_a, lost_a)
        gain_b = tower_b.compute_reward(pick_b, served_b, lost_b)

        trust_a_old, trust_a_new = pilot_a.update_trust(obs_a, pick_a, gain_a, llm_a, rl_a)
        trust_b_old, trust_b_new = pilot_b.update_trust(obs_b, pick_b, gain_b, llm_b, rl_b)

        llm_ag1.loc[len(llm_ag1)] = [obs_a, rl_a, llm_a, pick_a, gain_a,
                                     served_a, lost_a, trust_a_old, trust_a_new]
        llm_ag2.loc[len(llm_ag2)] = [obs_b, rl_b, llm_b, pick_b, gain_b,
                                     served_b, lost_b, trust_b_old, trust_b_new]

In [None]:
# Load both `htraffic` traces, turn the stringified `calls` column back into
# Python objects, and replay the first 200 rows.
df1 = pd.read_csv('data/htraffic_station1.csv')
df2 = pd.read_csv('data/htraffic_station2.csv')
df1['calls'] = df1['calls'].map(ast.literal_eval)
df2['calls'] = df2['calls'].map(ast.literal_eval)
testing_llm(df1.iloc[:200], df2.iloc[:200])

In [None]:
# Outer double quotes: reusing the same quote character inside an f-string
# replacement field only parses on Python 3.12+.
print(f"Agent1 handled: {llm_ag1['handled'].sum()}")
print(f"Agent2 handled: {llm_ag2['handled'].sum()}")
print(f"Agent1 dropped: {llm_ag1['dropped'].sum()}")
print(f"Agent2 dropped: {llm_ag2['dropped'].sum()}")

In [None]:
# Empty per-step logs for the RL-only runs: two independent frames, one schema.
rl_ag1, rl_ag2 = (
    pd.DataFrame(columns=['state', 'action', 'reward', 'handled', 'dropped'])
    for _ in range(2)
)

In [None]:
def testing_rl(df1, df2):
    """Replay two traffic traces with the RL agents only, without learning.

    Each station observes, picks an action via ``choose_action`` and applies it
    before the other station is observed. Unserved calls may then be handed to
    the other station. No Q-learning step is run. One row per step is appended
    to the module-level ``rl_ag1`` / ``rl_ag2``.

    Args:
        df1: Trace for station 1; each row's ``calls`` entry must be sliceable.
        df2: Trace for station 2; needs at least ``len(df1)`` rows.
    """
    shared_hparams = {
        'learning_rate': 0.7,
        'discount_factor': 0.95,
        'epsilon': 1.0,
        'epsilon_decay': 0.995,
        'epsilon_min': 0.01,
    }

    tower_a = CellTowerEnv(MAPPING1, 50)  # 43-46 dbm
    tower_b = CellTowerEnv(MAPPING2, 30)  # 30-38 dbm

    pilot_a = SingleAgent(num_power_level=tower_a.max_power,
                          action_size=tower_a.action_space_size,
                          table='ag1_qtable.npy',
                          **shared_hparams)
    pilot_b = SingleAgent(num_power_level=tower_b.max_power,
                          action_size=tower_b.action_space_size,
                          table='ag2_qtable.npy',
                          **shared_hparams)

    for step in range(len(df1)):
        calls_a = df1.iloc[step]['calls']
        calls_b = df2.iloc[step]['calls']

        # Station 1 observes and acts first, then station 2.
        obs_a = tower_a.get_state()
        act_a = pilot_a.choose_action(obs_a)
        served_a, lost_a = tower_a.apply_action(act_a, calls_a)

        obs_b = tower_b.get_state()
        act_b = pilot_b.choose_action(obs_b)
        served_b, lost_b = tower_b.apply_action(act_b, calls_b)

        # Handoff: calls beyond the served prefix go to the other station
        # when the offload flag (second action element) is set.
        taken_by_b = 0
        if lost_a >= 0 and act_a[1] == 1:
            taken_by_b, _ = tower_b.add_requests(calls_a[served_a:])
        taken_by_a = 0
        if lost_b >= 0 and act_b[1] == 1:
            taken_by_a, _ = tower_a.add_requests(calls_b[served_b:])

        # Credit absorbed calls to the receiver and remove them from the
        # sender's drop count.
        served_a += taken_by_a
        lost_a -= taken_by_b
        served_b += taken_by_b
        lost_b -= taken_by_a

        gain_a = tower_a.compute_reward(act_a, served_a, lost_a)
        gain_b = tower_b.compute_reward(act_b, served_b, lost_b)

        rl_ag1.loc[len(rl_ag1)] = [obs_a, act_a, gain_a, served_a, lost_a]
        rl_ag2.loc[len(rl_ag2)] = [obs_b, act_b, gain_b, served_b, lost_b]

In [None]:
# Load both `ntraffic` traces, parse the stringified `calls` column, and
# replay rows 800-999 with the RL-only controller.
df1 = pd.read_csv('data/ntraffic_station1.csv')
df2 = pd.read_csv('data/ntraffic_station2.csv')
df1['calls'] = df1['calls'].map(ast.literal_eval)
df2['calls'] = df2['calls'].map(ast.literal_eval)
testing_rl(df1.iloc[800:1000], df2.iloc[800:1000])

### delayed-reward

In [None]:
# Reset the LLM-run logs to empty frames before the delayed-reward experiment.
llm_ag1, llm_ag2 = (
    pd.DataFrame(columns=[
        'state', 'rl_action', 'llm_action', 'decision', 'reward',
        'handled', 'dropped', 'trust_before', 'trust_after',
    ])
    for _ in range(2)
)

In [None]:
# Take the key from the environment rather than embedding it in the notebook;
# when ANTHROPIC_API_KEY is unset this is the same empty string as before.
c_client = anthropic.Anthropic(api_key=os.environ.get('ANTHROPIC_API_KEY', ''))

def claude_action(ag1_state, max_p1, ag2_state, max_p2):
    """Ask Claude for a (power_level, offload_flag) action for the TARGET station.

    Running this cell shadows any earlier definition of the same name.

    Args:
        ag1_state: State of the station being controlled. Indices 0-3 are
            rendered into the prompt as power, coverage, capacity and drops.
        max_p1: Upper bound of the power range quoted in the prompt.
        ag2_state: State of the neighbouring station, same layout.
        max_p2: Accepted for call-site symmetry; not used in the prompt.

    Returns:
        A tuple of two ints when the reply contains ``(int, int)``; otherwise
        whatever ``ast.literal_eval`` yields for the whole reply.

    Raises:
        ValueError, SyntaxError: From ``ast.literal_eval`` when the reply holds
            no integer pair and is not a Python literal.
    """
    # `{{0,1}}` is escaped on purpose: a bare `{0,1}` inside an f-string is an
    # expression and would be rendered as the tuple "(0, 1)".
    prompt = f"""
    You control the TARGET base station. Output ONLY (power_level,offload_flag) as integers, do not output thinking process.

    - power_level ∈ [1,{max_p1}]
    - offload_flag ∈ {{0,1}} where 1=offload to neighbor (free), 0=keep local.
    - Offloading is FREE (no cost, no energy penalty).
    - Coverage: 0=good, 1=fair, 2=poor. Capacity: 0=headroom, 1=maxed.

    NEIGHBOR: p={ag2_state[0]}, cov={ag2_state[1]}, cap={ag2_state[2]}, drops={ag2_state[3]}
    TARGET:   p={ag1_state[0]}, cov={ag1_state[1]}, cap={ag1_state[2]}, drops={ag1_state[3]}

    Priority: Avoid drops > everything. If uncertain, default offload_flag=1.

    Hard rules:
    - If TARGET cap==1 and NEIGHBOR cap==0 → offload_flag=1.
    - If TARGET drops>0 → offload_flag=1 and power_level = min(p+1,{max_p1}).
    - If TARGET cap==0 and drops==0 → you MAY reduce power by 1 if still avoids drops.
    - Otherwise keep current power.

    Return only the tuple: (power_level,offload_flag)
    """

    message = c_client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=100,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ]
    )

    output = message.content[0].text
    print(f'LLM action: {output}')

    # Tolerate code fences or stray prose around the tuple; a clean "(p,f)"
    # reply parses to exactly the same value as before.
    pair = re.search(r"\(\s*(-?\d+)\s*,\s*(-?\d+)\s*\)", output)
    if pair is not None:
        return int(pair.group(1)), int(pair.group(2))
    return ast.literal_eval(output)


def llm_reward(ag1_hist, ag2_hist):
    """Ask Claude to score both agents over a window of recorded steps.

    Args:
        ag1_hist: Records for agent 1; interpolated verbatim into the prompt.
        ag2_hist: Records for agent 2 over the same steps.

    Returns:
        ``(reward1, reward2)`` as floats, read from the ``agent_1`` and
        ``agent_2`` keys of the first JSON object found in the reply.

    Raises:
        ValueError: If the reply has no ``{...}`` span, the span is not valid
            JSON, or either key is missing or non-numeric.
    """
    # The "two separate lists" section names the agents agent_1/agent_2 so it
    # matches the record labels and the JSON keys requested below.
    prompt = f'''
        You are evaluating the performance of two cellular base stations (agent_1 and agent_2).

        The agents operate for several time steps. Each agent's record for a time step contains:
        - "state": a tuple (power_level, cover_index, capacity, unserved_last_time)
            * power_level is an integer, higher values mean more transmit power (more energy use, but more coverage).
            * coverage quality: 0 - good, 1 - fair, 2 - poor.
            * capacity: whether or not the station had reach its maximum capacity: 0 - no, 1 - yes.
            * unserved_last_time: how many requests were not handled due to the capacity limit
        - "action": a tuple (power_level, handoff_decision)
            * power_level is an integer, higher values mean more transmit power (more energy use, but more coverage).
            * handoff_decision is 0 or 1, where 1 means the agent tries to hand off unserved requests to another station.
        - "reward": the environment's immediate reward at this step (already computed).
        - "total": total number of user requests received at this step.
        - "handled": how many of those requests this agent successfully served.

        You will receive two separate lists:
        - One list for agent_1 (its records across k steps)
        - One list for agent_2 (its records across the same k steps)

        Task:
        - Evaluate each agent's performance *individually* over the interval.
        - Consider throughput (handled/total), fairness between agents, and efficiency 
        (avoid rewarding unnecessary power use).
        - Based on this, assign a **correctional reward** for each agent between -1.0 (very poor) 
        and +1.0 (excellent). The reward can be decimals such as 0.37 or -0.15.

        Agent records:
        - agent_1: {ag1_hist}
        - agent_2: {ag2_hist}
        
        Return ONLY valid JSON with the following structure:
        {{
        "agent_1": numeric score,
        "agent_2": numeric score
        }}. 
        Do not include thinking process.
        '''
    
    message = c_client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=100,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ]
    )

    # Strip surrounding spaces/backticks, then take the first {...} span.
    output = message.content[0].text.strip(" `")
    match = re.search(r"\{[\s\S]*?\}", output)
    if match is None:
        # No JSON object at all: report the raw reply rather than
        # dereferencing a missing match.
        raise ValueError(f"Could not parse LLM response: {output}")

    try:
        result = json.loads(match.group(0))
        reward1 = float(result["agent_1"])
        reward2 = float(result["agent_2"])
    except Exception as e:
        # The pattern has no capture group, so report group(0).
        raise ValueError(f"Could not parse LLM response: {match.group(0)}") from e

    print(f'Delayed-rewards: {reward1}, {reward2}')
    return reward1, reward2
    

def split_reward(total_reward, records):
    """Distribute one window-level reward across the steps in that window.

    Each step gets half of an equal share plus half of a share proportional
    to its ``handled`` count, so the shares add up to ``total_reward``.

    Args:
        total_reward: Reward to distribute.
        records: Sequence of dict-like step records; ``handled`` is read with
            a default of 0, cast to ``int`` and clamped at 0.

    Returns:
        A list of per-step rewards, one per record. Empty input gives ``[]``;
        if no step handled anything the reward is split evenly.
    """
    k = len(records)
    if k == 0:
        return []

    # Coerce and clamp once so the numerators use exactly the same values as
    # the denominator; otherwise negative or fractional counts would make the
    # shares drift away from total_reward.
    weights = [max(0, int(r.get("handled", 0))) for r in records]
    handled_sum = sum(weights)
    uniform = total_reward / k

    if handled_sum == 0:
        return [uniform] * k

    return [
        0.5 * uniform + 0.5 * total_reward * (w / handled_sum)
        for w in weights
    ]


In [None]:
def with_delayed_reward(df1, df2, log1, log2):
    """Run the combined LLM/RL controller and apply an LLM-scored delayed reward.

    Per step this mirrors the plain LLM test loop: both stations observe,
    ``make_decision`` picks between the RL and Claude actions, calls are
    served and optionally handed off, rewards and trust are updated, and a row
    is appended to the module-level ``llm_ag1`` / ``llm_ag2``. In addition,
    every 3 recorded steps ``llm_reward`` scores the window, ``split_reward``
    spreads each score over the steps, and ``apply_delayed_reward`` is called
    on each agent. Records left over at the end (fewer than 3) are not scored.

    Args:
        df1: Trace for station 1; each row's ``calls`` entry must be sliceable.
        df2: Trace for station 2; needs at least ``len(df1)`` rows.
        log1: Frame with three columns receiving ``[records, reward, splits]``
            for station 1, one row per scored window.
        log2: Same for station 2.
    """
    # 43-46 dbm
    env1 = CellTowerEnv(MAPPING1, 50)
    # 30-38 dbm
    env2 = CellTowerEnv(MAPPING2, 30)
    
    agent1 = SingleAgent(
        num_power_level=env1.max_power,
        action_size=env1.action_space_size,
        learning_rate=0.7,
        discount_factor=0.95,
        epsilon=1.0,
        epsilon_decay=0.995,
        epsilon_min=0.01,
        table='ag1_qtable.npy'
    )
    agent2 = SingleAgent(
        num_power_level=env2.max_power,
        action_size=env2.action_space_size,
        learning_rate=0.7,
        discount_factor=0.95,
        epsilon=1.0,
        epsilon_decay=0.995,
        epsilon_min=0.01,
        table='ag2_qtable.npy'
    )

    ag1_hist = []
    ag2_hist = []

    max_steps = len(df1)
    for t in range(max_steps):
        print(t)
        calls1 = df1.iloc[t]['calls']
        calls2 = df2.iloc[t]['calls']

        # --- stations handle their own calls ---
        state1 = env1.get_state()
        state2 = env2.get_state()

        rl_action1 = agent1.choose_action(state1)
        llm_action1 = claude_action(state1, env1.max_power, state2, env2.max_power)
        decision1 = agent1.make_decision(state1, llm_action1, rl_action1)
        rl_action2 = agent2.choose_action(state2)
        llm_action2 = claude_action(state2, env2.max_power, state1, env1.max_power)
        decision2 = agent2.make_decision(state2, llm_action2, rl_action2)

        handled1, dropped1 = env1.apply_action(decision1, calls1)
        handled2, dropped2 = env2.apply_action(decision2, calls2)

        # --- stations handle handoffs ---
        # Calls beyond the served prefix go to the other station when the
        # offload flag (second element of the decision) is set.
        added2 = 0
        if dropped1 >= 0 and decision1[1] == 1:
            added2, _ = env2.add_requests(calls1[handled1:])
        added1 = 0
        if dropped2 >= 0 and decision2[1] == 1:
            added1, _ = env1.add_requests(calls2[handled2:])

        # --- update how many requests got handled and dropped ---
        handled1 += added1
        dropped2 -= added1
        handled2 += added2
        dropped1 -= added2

        # --- compute rewards ---
        reward1 = env1.compute_reward(decision1, handled1, dropped1)
        reward2 = env2.compute_reward(decision2, handled2, dropped2)

        # --- update trusts ---
        t_old1, t_new1 = agent1.update_trust(state1, decision1, reward1, llm_action1, rl_action1)
        t_old2, t_new2 = agent2.update_trust(state2, decision2, reward2, llm_action2, rl_action2)

        # --- log ---
        llm_ag1.loc[len(llm_ag1)] = [state1, rl_action1, llm_action1, 
                                decision1, reward1, handled1, dropped1,
                                t_old1, t_new1]
        llm_ag2.loc[len(llm_ag2)] = [state2, rl_action2, llm_action2, 
                                decision2, reward2, handled2, dropped2,
                                t_old2, t_new2]
        
        # --- delayed rewards ---
        # "total" is the number of requests, which is what the llm_reward
        # prompt documents; the raw call list is not sent.
        # NOTE(review): confirm apply_delayed_reward does not read "total".
        ag1_hist.append(dict(state=state1, action=decision1, reward=reward1, total=len(calls1), handled=handled1))
        ag2_hist.append(dict(state=state2, action=decision2, reward=reward2, total=len(calls2), handled=handled2))

        if len(ag1_hist) == 3:
            d_reward1, d_reward2 = llm_reward(ag1_hist, ag2_hist)
            split1 = split_reward(d_reward1, ag1_hist)
            agent1.apply_delayed_reward(ag1_hist, split1, 0.15)
            split2 = split_reward(d_reward2, ag2_hist)
            agent2.apply_delayed_reward(ag2_hist, split2, 0.15)

            # Each log is indexed by its own length so both start at row 0.
            log1.loc[len(log1)] = [ag1_hist, d_reward1, split1]
            log2.loc[len(log2)] = [ag2_hist, d_reward2, split2]
            # Rebind instead of clearing so the logged lists stay intact.
            ag1_hist = []
            ag2_hist = []


In [None]:
# Load both `ltraffic` traces, parse the stringified `calls` column, create
# two empty window logs, and run the delayed-reward loop on the first 200 rows.
df1 = pd.read_csv('data/ltraffic_station1.csv')
df2 = pd.read_csv('data/ltraffic_station2.csv')
df1['calls'] = df1['calls'].map(ast.literal_eval)
df2['calls'] = df2['calls'].map(ast.literal_eval)
log1, log2 = (
    pd.DataFrame(columns=['Records', 'Rewards', 'Splits'])
    for _ in range(2)
)
with_delayed_reward(df1.iloc[:200], df2.iloc[:200], log1, log2)

In [None]:
# Outer double quotes: reusing the same quote character inside an f-string
# replacement field only parses on Python 3.12+.
print(f"Agent1 handled: {llm_ag1['handled'].sum()}")
print(f"Agent2 handled: {llm_ag2['handled'].sum()}")
print(f"Agent1 dropped: {llm_ag1['dropped'].sum()}")
print(f"Agent2 dropped: {llm_ag2['dropped'].sum()}")

### adapt

In [None]:
def adapt_llm(df1, df2):
    """Replay two traffic traces with the combined LLM/RL controller while learning.

    Same step structure as the non-learning LLM loop, plus a ``learn`` call on
    each agent after rewards are computed and before trust is updated. One row
    per step is appended to the module-level ``llm_ag1`` / ``llm_ag2``.

    Args:
        df1: Trace for station 1; each row's ``calls`` entry must be sliceable.
        df2: Trace for station 2; needs at least ``len(df1)`` rows.

    Returns:
        ``(agent1, agent2)``: the two ``SingleAgent`` instances after the run.
    """
    shared_hparams = {
        'learning_rate': 0.7,
        'discount_factor': 0.95,
        'epsilon': 1.0,
        'epsilon_decay': 0.995,
        'epsilon_min': 0.01,
    }

    tower_a = CellTowerEnv(MAPPING1, 50)  # 43-46 dbm
    tower_b = CellTowerEnv(MAPPING2, 30)  # 30-38 dbm

    pilot_a = SingleAgent(num_power_level=tower_a.max_power,
                          action_size=tower_a.action_space_size,
                          table='ag1_qtable.npy',
                          **shared_hparams)
    pilot_b = SingleAgent(num_power_level=tower_b.max_power,
                          action_size=tower_b.action_space_size,
                          table='ag2_qtable.npy',
                          **shared_hparams)

    for step in range(len(df1)):
        print(step)
        calls_a = df1.iloc[step]['calls']
        calls_b = df2.iloc[step]['calls']

        # Both observations are taken before either station acts.
        obs_a = tower_a.get_state()
        obs_b = tower_b.get_state()

        rl_a = pilot_a.choose_action(obs_a)
        llm_a = claude_action(obs_a, tower_a.max_power, obs_b, tower_b.max_power)
        pick_a = pilot_a.make_decision(obs_a, llm_a, rl_a)
        rl_b = pilot_b.choose_action(obs_b)
        llm_b = claude_action(obs_b, tower_b.max_power, obs_a, tower_a.max_power)
        pick_b = pilot_b.make_decision(obs_b, llm_b, rl_b)

        served_a, lost_a = tower_a.apply_action(pick_a, calls_a)
        served_b, lost_b = tower_b.apply_action(pick_b, calls_b)

        # Handoff: calls beyond the served prefix go to the other station
        # when the offload flag (second action element) is set.
        taken_by_b = 0
        if lost_a >= 0 and pick_a[1] == 1:
            taken_by_b, _ = tower_b.add_requests(calls_a[served_a:])
        taken_by_a = 0
        if lost_b >= 0 and pick_b[1] == 1:
            taken_by_a, _ = tower_a.add_requests(calls_b[served_b:])

        # Credit absorbed calls to the receiver and remove them from the
        # sender's drop count.
        served_a += taken_by_a
        lost_a -= taken_by_b
        served_b += taken_by_b
        lost_b -= taken_by_a

        gain_a = tower_a.compute_reward(pick_a, served_a, lost_a)
        gain_b = tower_b.compute_reward(pick_b, served_b, lost_b)

        # Q-learning update; the next state is read after both stations acted.
        pilot_a.learn(obs_a, pick_a, gain_a, tower_a.get_state())
        pilot_b.learn(obs_b, pick_b, gain_b, tower_b.get_state())

        trust_a_old, trust_a_new = pilot_a.update_trust(obs_a, pick_a, gain_a, llm_a, rl_a)
        trust_b_old, trust_b_new = pilot_b.update_trust(obs_b, pick_b, gain_b, llm_b, rl_b)

        llm_ag1.loc[len(llm_ag1)] = [obs_a, rl_a, llm_a, pick_a, gain_a,
                                     served_a, lost_a, trust_a_old, trust_a_new]
        llm_ag2.loc[len(llm_ag2)] = [obs_b, rl_b, llm_b, pick_b, gain_b,
                                     served_b, lost_b, trust_b_old, trust_b_new]

    return pilot_a, pilot_b

In [None]:
# Reset the LLM-run logs to empty frames before the adaptation experiment.
llm_ag1, llm_ag2 = (
    pd.DataFrame(columns=[
        'state', 'rl_action', 'llm_action', 'decision', 'reward',
        'handled', 'dropped', 'trust_before', 'trust_after',
    ])
    for _ in range(2)
)

In [None]:
# Load both `htraffic` traces, parse the stringified `calls` column, and run
# the learning LLM/RL loop on the first 200 rows, keeping the trained agents.
df1 = pd.read_csv('data/htraffic_station1.csv')
df2 = pd.read_csv('data/htraffic_station2.csv')
df1['calls'] = df1['calls'].map(ast.literal_eval)
df2['calls'] = df2['calls'].map(ast.literal_eval)
llm1, llm2 = adapt_llm(df1.iloc[:200], df2.iloc[:200])

In [None]:
# Outer double quotes: reusing the same quote character inside an f-string
# replacement field only parses on Python 3.12+.
print(f"Agent1 handled: {llm_ag1['handled'].sum()}")
print(f"Agent2 handled: {llm_ag2['handled'].sum()}")
print(f"Agent1 dropped: {llm_ag1['dropped'].sum()}")
print(f"Agent2 dropped: {llm_ag2['dropped'].sum()}")
print('\n')
print(f"Avg Reward1: {llm_ag1['reward'].mean()}")
print(f"Avg Reward2: {llm_ag2['reward'].mean()}")

In [None]:
def adapt_rl(df1, df2):
    """Replay two traffic traces with the RL agents only, while learning.

    Each station observes, picks an action via ``choose_action`` and applies it
    before the other station is observed. After handoffs and rewards, each
    agent runs a ``learn`` step. One row per step is appended to the
    module-level ``rl_ag1`` / ``rl_ag2``.

    Args:
        df1: Trace for station 1; each row's ``calls`` entry must be sliceable.
        df2: Trace for station 2; needs at least ``len(df1)`` rows.

    Returns:
        ``(agent1, agent2)``: the two ``SingleAgent`` instances after the run.
    """
    shared_hparams = {
        'learning_rate': 0.7,
        'discount_factor': 0.95,
        'epsilon': 1.0,
        'epsilon_decay': 0.995,
        'epsilon_min': 0.01,
    }

    tower_a = CellTowerEnv(MAPPING1, 50)  # 43-46 dbm
    tower_b = CellTowerEnv(MAPPING2, 30)  # 30-38 dbm

    pilot_a = SingleAgent(num_power_level=tower_a.max_power,
                          action_size=tower_a.action_space_size,
                          table='ag1_qtable.npy',
                          **shared_hparams)
    pilot_b = SingleAgent(num_power_level=tower_b.max_power,
                          action_size=tower_b.action_space_size,
                          table='ag2_qtable.npy',
                          **shared_hparams)

    for step in range(len(df1)):
        calls_a = df1.iloc[step]['calls']
        calls_b = df2.iloc[step]['calls']

        # Station 1 observes and acts first, then station 2.
        obs_a = tower_a.get_state()
        act_a = pilot_a.choose_action(obs_a)
        served_a, lost_a = tower_a.apply_action(act_a, calls_a)

        obs_b = tower_b.get_state()
        act_b = pilot_b.choose_action(obs_b)
        served_b, lost_b = tower_b.apply_action(act_b, calls_b)

        # Handoff: calls beyond the served prefix go to the other station
        # when the offload flag (second action element) is set.
        taken_by_b = 0
        if lost_a >= 0 and act_a[1] == 1:
            taken_by_b, _ = tower_b.add_requests(calls_a[served_a:])
        taken_by_a = 0
        if lost_b >= 0 and act_b[1] == 1:
            taken_by_a, _ = tower_a.add_requests(calls_b[served_b:])

        # Credit absorbed calls to the receiver and remove them from the
        # sender's drop count.
        served_a += taken_by_a
        lost_a -= taken_by_b
        served_b += taken_by_b
        lost_b -= taken_by_a

        gain_a = tower_a.compute_reward(act_a, served_a, lost_a)
        gain_b = tower_b.compute_reward(act_b, served_b, lost_b)

        # Q-learning update; the next state is read after both stations acted.
        pilot_a.learn(obs_a, act_a, gain_a, tower_a.get_state())
        pilot_b.learn(obs_b, act_b, gain_b, tower_b.get_state())

        rl_ag1.loc[len(rl_ag1)] = [obs_a, act_a, gain_a, served_a, lost_a]
        rl_ag2.loc[len(rl_ag2)] = [obs_b, act_b, gain_b, served_b, lost_b]

    return pilot_a, pilot_b

In [None]:
# Reset the RL-only logs to empty frames before the adaptation experiment.
rl_ag1, rl_ag2 = (
    pd.DataFrame(columns=['state', 'action', 'reward', 'handled', 'dropped'])
    for _ in range(2)
)

In [None]:
# Load both `htraffic` traces, parse the stringified `calls` column, and run
# the learning RL-only loop on the first 200 rows, keeping the trained agents.
df1 = pd.read_csv('data/htraffic_station1.csv')
df2 = pd.read_csv('data/htraffic_station2.csv')
df1['calls'] = df1['calls'].map(ast.literal_eval)
df2['calls'] = df2['calls'].map(ast.literal_eval)
rl1, rl2 = adapt_rl(df1.iloc[:200], df2.iloc[:200])

In [None]:
# Outer double quotes: reusing the same quote character inside an f-string
# replacement field only parses on Python 3.12+.
print(f"Agent1 handled: {rl_ag1['handled'].sum()}")
print(f"Agent2 handled: {rl_ag2['handled'].sum()}")
print(f"Agent1 dropped: {rl_ag1['dropped'].sum()}")
print(f"Agent2 dropped: {rl_ag2['dropped'].sum()}")
print('\n')
print(f"Avg Reward1: {rl_ag1['reward'].mean()}")
print(f"Avg Reward2: {rl_ag2['reward'].mean()}")

In [None]:
def plot_lines(df1, col1, df2, col2):
    """Overlay one column from each frame against its index and show the figure.

    Args:
        df1: Frame plotted with the legend label "LLM".
        col1: Column of ``df1`` to draw.
        df2: Frame plotted with the legend label "RL".
        col2: Column of ``df2`` to draw.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    for frame, column, tag in ((df1, col1, "LLM"), (df2, col2, "RL")):
        ax.plot(frame.index, frame[column], label=tag)
    ax.set_xlabel("Step")
    ax.set_ylabel("Reward")
    ax.legend()
    plt.show()

In [None]:
# Per-step reward of station 2: LLM-assisted log against RL-only log.
plot_lines(df1=llm_ag2, col1='reward', df2=rl_ag2, col2='reward')

In [None]:
# Start from empty LLM-run logs, then repeat the learning LLM/RL loop on the
# first 200 rows of the `ltraffic` traces.
llm_ag1, llm_ag2 = (
    pd.DataFrame(columns=[
        'state', 'rl_action', 'llm_action', 'decision', 'reward',
        'handled', 'dropped', 'trust_before', 'trust_after',
    ])
    for _ in range(2)
)

df1 = pd.read_csv('data/ltraffic_station1.csv')
df2 = pd.read_csv('data/ltraffic_station2.csv')
df1['calls'] = df1['calls'].map(ast.literal_eval)
df2['calls'] = df2['calls'].map(ast.literal_eval)
llm1, llm2 = adapt_llm(df1.iloc[:200], df2.iloc[:200])

In [None]:
# Outer double quotes: reusing the same quote character inside an f-string
# replacement field only parses on Python 3.12+.
print(f"Agent1 handled: {llm_ag1['handled'].sum()}")
print(f"Agent2 handled: {llm_ag2['handled'].sum()}")
print(f"Agent1 dropped: {llm_ag1['dropped'].sum()}")
print(f"Agent2 dropped: {llm_ag2['dropped'].sum()}")
print('\n')
print(f"Avg Reward1: {llm_ag1['reward'].mean()}")
print(f"Avg Reward2: {llm_ag2['reward'].mean()}")

In [None]:
# Start from empty RL-only logs, then repeat the learning RL loop on the
# first 200 rows of the `ltraffic` traces.
rl_ag1, rl_ag2 = (
    pd.DataFrame(columns=['state', 'action', 'reward', 'handled', 'dropped'])
    for _ in range(2)
)

df1 = pd.read_csv('data/ltraffic_station1.csv')
df2 = pd.read_csv('data/ltraffic_station2.csv')
df1['calls'] = df1['calls'].map(ast.literal_eval)
df2['calls'] = df2['calls'].map(ast.literal_eval)
rl1, rl2 = adapt_rl(df1.iloc[:200], df2.iloc[:200])

In [None]:
# Outer double quotes: reusing the same quote character inside an f-string
# replacement field only parses on Python 3.12+.
print(f"Agent1 handled: {rl_ag1['handled'].sum()}")
print(f"Agent2 handled: {rl_ag2['handled'].sum()}")
print(f"Agent1 dropped: {rl_ag1['dropped'].sum()}")
print(f"Agent2 dropped: {rl_ag2['dropped'].sum()}")
print('\n')
print(f"Avg Reward1: {rl_ag1['reward'].mean()}")
print(f"Avg Reward2: {rl_ag2['reward'].mean()}")

In [None]:
# Persist the RL-only step logs as CSV, then each trained agent's `Q` array.
for frame, path in zip((rl_ag1, rl_ag2),
                       ('learning/l_rl_ag1.csv', 'learning/l_rl_ag2.csv')):
    frame.to_csv(path, index=False)

for trained, path in zip((rl1, rl2),
                         ('learning/l_rl_ag1.npy', 'learning/l_rl_ag2.npy')):
    np.save(path, trained.Q)