# Q Learning Approach

In this notebook, we simplify our problem as MDP where the states are fully observable and solve our task using q-learning. 

In [23]:
#We load the necessary libraries
import pandas as pd
import sys
#We set the path to the project folder
sys.path.append('/Users/mariehuynh/Desktop/PhD/Year 1/Fall Quarter/CS238/cs238_final_project/')
from src.preprocessing import * 
from src.environment import *
from src.policy_generator import *
import json
import numpy as np
from src.evaluation import *

In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### I. We split our data and hold out a test set to evaluate our policy at the end.

In [15]:
transition_df = pd.read_csv('data/transition_matrix_train_int.csv', index_col=0)

In [16]:
transition_df.head()

Unnamed: 0,s,a,r,sp
0,1,1.0,0,2
1,2,1.0,2,2
2,2,1.0,2,3
3,3,1.0,0,3
4,3,1.0,4,2


In [17]:
num_states = transition_df['s'].nunique()
num_actions = transition_df['a'].nunique()
gamma = 0.9
print(f"Number of states: {num_states}")
print(f"Number of actions: {num_actions}")

Number of states: 20
Number of actions: 6


In [18]:
ql = QLearning(num_states, num_actions, gamma=gamma)
#We train the Q-learning object
ql.train(transition_df, num_iterations=10)
#We write the policy to a file
ql.write_policy("hiv")

Iteration 0/10
Iteration 1/10
Iteration 2/10
Iteration 3/10
Iteration 4/10
Iteration 5/10
Iteration 6/10
Iteration 7/10
Iteration 8/10
Iteration 9/10


### II. Now, we want to evaluate our policy. 

In [19]:
test = pd.read_csv('data/test.csv', index_col=0)

In [20]:
test

Unnamed: 0,VL,CD4,Base Drug Combo,PatientID,Timepoints,VL_Binned,CD4_Binned
0,1141.895800,1070.03560,0.0,0,0,200+,500-1500
1,134.190550,444.54190,0.0,0,1,75-200,300-500
2,47.274055,230.53404,0.0,0,2,20-75,200-300
3,120.055940,419.28403,1.0,0,3,75-200,300-500
4,27.249084,230.72127,1.0,0,4,20-75,200-300
...,...,...,...,...,...,...,...
534895,163.896420,232.44673,0.0,8914,55,75-200,200-300
534896,33.685238,813.52545,0.0,8914,56,20-75,500-1500
534897,50.742954,680.18713,0.0,8914,57,20-75,500-1500
534898,148.255130,222.81631,0.0,8914,58,75-200,200-300


In [21]:
transition_df_test = create_transitions(test)

Processed 0 patients...
Processed 500 patients...
Processed 700 patients...
Processed 800 patients...
Processed 1400 patients...
Processed 3100 patients...
Processed 4100 patients...
Processed 4700 patients...
Processed 4900 patients...
Processed 5100 patients...
Processed 5700 patients...
Processed 6000 patients...
Processed 7000 patients...
Processed 8100 patients...
Processed 8200 patients...
Processed 8500 patients...
Processed 8800 patients...


In [22]:
transition_df_test

Unnamed: 0,s,a,r,sp
0,"(500-1500, 200+)",1.0,2,"(300-500, 75-200)"
1,"(300-500, 75-200)",1.0,2,"(200-300, 20-75)"
2,"(200-300, 20-75)",1.0,0,"(300-500, 75-200)"
3,"(300-500, 75-200)",2.0,2,"(200-300, 20-75)"
4,"(200-300, 20-75)",2.0,-2,"(200-300, 20-75)"
...,...,...,...,...
105251,"(500-1500, 75-200)",1.0,-2,"(200-300, 75-200)"
105252,"(200-300, 75-200)",1.0,4,"(500-1500, 20-75)"
105253,"(500-1500, 20-75)",1.0,-2,"(500-1500, 20-75)"
105254,"(500-1500, 20-75)",1.0,-2,"(200-300, 75-200)"


In [33]:
#We load the mapping of states to int
with open('data/state_to_int.json', 'r') as f:
    state_mapping = json.load(f)
len(state_mapping)

20

In [73]:
def get_state_mapping(state, state_mapping):
    return state_mapping[str(state)]

In [74]:
#We apply this mapping to the test data
transition_df_test['s'] = transition_df_test['s'].apply(get_state_mapping, args=(state_mapping,))
transition_df_test['sp'] = transition_df_test['sp'].apply(get_state_mapping, args=(state_mapping,))
transition_df_test.head()

Unnamed: 0,s,a,r,sp
0,2,1.0,2,10
1,10,1.0,2,14
2,14,1.0,0,10
3,10,2.0,2,14
4,14,2.0,-2,14


In [112]:
#We read the policy from the file
policy = read_policy("hiv.policy")
policy

{1: 6,
 2: 6,
 3: 6,
 4: 6,
 5: 2,
 6: 1,
 7: 4,
 8: 4,
 9: 6,
 10: 6,
 11: 5,
 12: 3,
 13: 5,
 14: 6,
 15: 4,
 16: 6,
 17: 6,
 18: 6,
 19: 2,
 20: 4}

In [113]:
# Suppose `heldout_data` is your dataset
# Initialize a dictionary to store action counts for each state
from collections import defaultdict
action_counts = defaultdict(lambda: defaultdict(int))
state_counts = defaultdict(int)

# Count occurrences of each action for each state
for i in range(len(transition_df_test)):
    row = transition_df_test.iloc[i]
    state = row['s']
    action = row['a']
    action_counts[state][action] += 1
    state_counts[state] += 1

# Compute behavior policy probabilities
behavior_policy = {}
for state, actions in action_counts.items():
    behavior_policy[state] = {action: count / state_counts[state] for action, count in actions.items()}

# Now `behavior_policy` maps (state, action) pairs to probabilities
print(behavior_policy)

{2.0: {1.0: 0.9035260055837039, 6.0: 0.009409575018095336, 2.0: 0.06896908282494055, 4.0: 0.012098025023265433, 3.0: 0.005790507703443284, 5.0: 0.00020680384655154586}, 10.0: {1.0: 0.8501369180980831, 2.0: 0.07966143888473985, 4.0: 0.04356484939009211, 6.0: 0.013940751804829475, 5.0: 0.009210853871048046, 3.0: 0.0034851879512073688}, 14.0: {1.0: 0.7528854080791426, 2.0: 0.06801319043693323, 4.0: 0.052967848309975266, 6.0: 0.07893652102225886, 5.0: 0.04513602638087387, 3.0: 0.002061005770816158}, 7.0: {2.0: 0.16747572815533981, 4.0: 0.06844660194174758, 1.0: 0.7063106796116505, 6.0: 0.04951456310679612, 3.0: 0.005339805825242718, 5.0: 0.002912621359223301}, 19.0: {4.0: 0.06606942889137737, 2.0: 0.254630028426221, 1.0: 0.5491429063657507, 5.0: 0.036609527091050045, 6.0: 0.06348522697906797, 3.0: 0.030062882246532863}, 18.0: {4.0: 0.06867311596799236, 1.0: 0.5616863728651619, 2.0: 0.16111310163621162, 5.0: 0.12528364982682433, 6.0: 0.05661053385883196, 3.0: 0.026633225844977904}, 16.0: {4

In [119]:
def evaluate_deterministic_policy_heldout_wis(heldout_data, deterministic_policy, behavior_policy):
    weighted_rewards = []
    weights = []

    for i in range(len(heldout_data)):
        data = heldout_data.iloc[i]
        state = data['s']
        action = data['a']
        reward = data['r']
        deterministic_policy_action = deterministic_policy[state]

        if (deterministic_policy_action == action):
            behavior_prob = behavior_policy[state][action]
            weight = 1 / behavior_prob if behavior_prob > 0 else 0
            weights.append(weight)
            weighted_rewards.append(weight * reward)
        else:
            weights.append(0)
            weighted_rewards.append(0)

    # Normalize by sum of weights
    return sum(weighted_rewards) / sum(weights) if sum(weights) > 0 else 0

In [116]:
transition_df_test['s']

0          2
1         10
2         14
3         10
4         14
          ..
105251     6
105252    12
105253     9
105254     9
105255    12
Name: s, Length: 105256, dtype: int64

In [117]:
transition_df_test

Unnamed: 0,s,a,r,sp
0,2,1.0,2,10
1,10,1.0,2,14
2,14,1.0,0,10
3,10,2.0,2,14
4,14,2.0,-2,14
...,...,...,...,...
105251,6,1.0,-2,12
105252,12,1.0,4,9
105253,9,1.0,-2,9
105254,9,1.0,-2,12


In [120]:
# Example usage
policy_value_wis = evaluate_deterministic_policy_heldout_wis(transition_df_test, policy, behavior_policy)
print(f"Estimated Policy Value (WIS): {policy_value_wis}")

Estimated Policy Value (WIS): 2.0937152477033094


In [121]:
#Can we do a random policy?
random_policy = {}
for state in action_counts.keys():
    random_policy[state] = np.random.choice(list(action_counts[state].keys()))

random_policy

{2.0: 6.0,
 10.0: 3.0,
 14.0: 2.0,
 7.0: 1.0,
 19.0: 6.0,
 18.0: 6.0,
 16.0: 4.0,
 15.0: 5.0,
 13.0: 6.0,
 9.0: 5.0,
 17.0: 2.0,
 4.0: 4.0,
 1.0: 2.0,
 12.0: 1.0,
 20.0: 1.0,
 3.0: 4.0,
 6.0: 4.0,
 5.0: 1.0,
 8.0: 2.0,
 11.0: 3.0}

In [122]:
# Example usage
random_policy_wis = evaluate_deterministic_policy_heldout_wis(transition_df_test, random_policy, behavior_policy)
print(f"Estimated Random Policy Value (WIS): {random_policy_wis}")

Estimated Random Policy Value (WIS): 1.1972979143885312
