In [3]:
import numpy as np
import pandas as pd

# Load the raw dataset from the Excel workbook
df = pd.read_excel('dataset2_for_mp.xlsx')

# Standardize every remaining column after excluding the two columns below
# (the original note describes them as non-numeric)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
excluded_columns = ['FSHB c.-211 G>T', 'FSHR c.2039 A>G']
df_normalized = scaler.fit_transform(df.drop(columns=excluded_columns))

# State space (one state per standardized row), action space, and Q-table
states = df_normalized
actions = ['protocol_A', 'protocol_B', 'protocol_C']  # Example actions
num_states, num_actions = len(states), len(actions)
Q = np.zeros(shape=(num_states, num_actions))  # rows: states, columns: actions

# Hyperparameters: learning rate, discount factor, exploration probability
alpha, gamma, epsilon = 0.1, 0.9, 0.1
num_episodes = 1000

# Reward function (simplified for illustration)
def get_reward(state, action):
    """Return the illustrative reward for taking ``action`` in ``state``.

    Only ``'protocol_A'`` is rewarded (with 1); any other action yields 0.
    ``state`` is accepted for interface symmetry but is not consulted by
    this simplified rule.
    """
    return 1 if action == 'protocol_A' else 0

# Tabular Q-learning over row-indexed states
for episode in range(num_episodes):
    state = np.random.choice(num_states)  # each episode starts at a random state index
    done = False

    while not done:
        # Epsilon-greedy selection: random action with probability epsilon,
        # otherwise the currently best-valued action for this state.
        explore = np.random.rand() < epsilon
        action = np.random.choice(num_actions) if explore else np.argmax(Q[state, :])

        reward = get_reward(state, actions[action])
        next_state = (state + 1) % num_states  # simplified transition: step to the next index, wrapping around

        # Move Q[state, action] a fraction alpha toward the one-step bootstrapped target
        td_target = reward + gamma * np.max(Q[next_state, :])
        Q[state, action] += alpha * (td_target - Q[state, action])

        state = next_state
        done = state == 0  # the episode ends once the index wraps back to 0

# Dynamic Programming for Policy Evaluation and Improvement
def policy_evaluation(policy, states, actions, Q, gamma):
    """Run one in-place policy-evaluation sweep over the Q-table.

    For each state index, the entry for the action selected by ``policy`` is
    replaced by the expected one-step return under the transition model
    supplied by ``next_states_rewards``. The backup bootstraps from the value
    of the action the policy itself takes in each successor state
    (``Q[next_state, policy[next_state]]``), which is what evaluating a fixed
    policy requires; reusing the current state's action in the successor
    state is only correct when both states happen to share that action.

    Args:
        policy: Sequence mapping each state index to an action index.
        states: Not used by this function; kept for interface compatibility.
        actions: Not used directly by this function; kept for interface
            compatibility.
        Q: 2-D array indexed as ``Q[state, action]``; updated in place.
        gamma: Discount factor applied to successor values.

    Returns:
        None. ``Q`` is modified in place.
    """
    # Iterate over the policy itself rather than a module-level state count,
    # so the sweep always matches the policy that was passed in.
    for state, action in enumerate(policy):
        Q[state, action] = sum(
            prob * (reward + gamma * Q[next_state, policy[next_state]])
            for next_state, prob, reward in next_states_rewards(state, action)
        )

def policy_improvement(states, actions, Q):
    """Return the greedy policy with respect to the Q-table.

    The number of states is taken from ``Q`` itself rather than from a
    module-level counter, so the function works for any table it is given.

    Args:
        states: Not used by this function; kept for interface compatibility.
        actions: Not used by this function; kept for interface compatibility.
        Q: 2-D array-like indexed as ``Q[state, action]``.

    Returns:
        1-D integer NumPy array holding, for each state index, the index of
        the highest-valued action (the first one on ties).
    """
    q_table = np.asarray(Q)
    # Row-wise argmax picks the best action per state in one call; the cast
    # keeps the same integer dtype as an array created with ``dtype=int``.
    return np.argmax(q_table, axis=1).astype(int)

def next_states_rewards(state, action):
    """Return the simplified transition model for a state/action pair.

    The result is a list of ``(next_state, probability, reward)`` tuples.
    This placeholder model has a single outcome: the process stays in
    ``state`` with probability 1 and receives the reward that ``get_reward``
    assigns to the action's label.
    """
    action_label = actions[action]
    immediate_reward = get_reward(state, action_label)
    return [(state, 1, immediate_reward)]

# Start from a random policy: one randomly drawn action index per state
policy = np.random.choice(num_actions, size=num_states)

# Alternate an evaluation sweep with a greedy improvement step for a fixed
# number of rounds
num_policy_iterations = 100
for _ in range(num_policy_iterations):
    policy_evaluation(policy, states, actions, Q, gamma)
    policy = policy_improvement(states, actions, Q)

# Show the resulting action index for every state
print("Learned Policy:", policy)

Learned Policy: [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
