In [4]:
from random import randint
import numpy as np
import random
import datetime
from math import log, sqrt
import copy
import shortuuid
from farmgame import Farm, Transition, Action

class PaperAlignedMCTS:
    def __init__(self, **kwargs):
        self.id = shortuuid.uuid()
        self.identity = kwargs.get('color', 'red')
        self.policy = kwargs.get('policy', "selfish")
        self.calculation_time = datetime.timedelta(seconds=kwargs.get('time', 5))
        self.max_sims = kwargs.get('nsims', 1000000)
        self.max_moves = kwargs.get('max_moves')
        self.exploration_weight = kwargs.get('C', 1.41)
        
        # State tracking
        self.states = []
        self.visit_counts = {}  # (player, state_hash) -> visits
        self.action_values = {}  # (player, state_hash, action_id) -> value
        self.state_hash_table = {}  # hash -> state
        
    def get_action_value(self, state: Farm, action: Action, player: str) -> float:
        """Calculate action value incorporating cost-benefit analysis."""
        state_hash = self.hash_state(state)
        action_key = (player, state_hash, action.id)
        
        if action_key not in self.action_values:
            # Initialize based on action properties
            self.action_values[action_key] = self.evaluate_action_potential(state, action)
            
        # Calculate UCB value
        value = self.action_values[action_key]
        visits = self.visit_counts.get((player, state_hash), 0)
        action_visits = sum(1 for k in self.action_values if k[0] == player and k[1] == state_hash)
        
        if action_visits == 0:
            exploration = float('inf')
        else:
            exploration = self.exploration_weight * sqrt(log(visits + 1) / action_visits)
            
        return value + exploration
        
    def evaluate_action_potential(self, state: Farm, action: Action) -> float:
        """Evaluate action potential considering costs and future benefits."""
        current_player = state.whose_turn()
        steps_cost = state.get_steps(action) * state.stepcost
        is_helping = Transition(state, action).is_helping()
        
        # Base value starts negative with movement cost
        value = -steps_cost
        
        # Add potential immediate rewards
        if action.color == current_player["color"]:
            value += 100  # Potential reward for own vegetable
        
        # Evaluate helping value
        if is_helping:
            if self.policy == "altruistic":
                value += 100  # Direct value for helping
            elif self.policy == "collaborative":
                value += 50   # Moderate value for helping
            else:  # selfish
                # Value helping based on:
                # 1. How close we are to the helping action
                # 2. Whether we've completed our own tasks
                # 3. Whether the other player has helped us
                if state.all_objects_picked_up(current_player["color"]):
                    value += 30  # Higher value if we're done
                elif steps_cost < 3:  # Low cost helping opportunity
                    value += 20
                elif state.opponent_has_helped(current_player["color"]):
                    value += 25  # Reciprocate help
                    
        return value

    def choose_action(self):
        """Select best action using MCTS."""
        if not self.states:
            return None
            
        current_state = self.states[-1]
        legal_actions = current_state.legal_actions()
        
        if not legal_actions:
            return None
        if len(legal_actions) == 1:
            return legal_actions[0]
            
        # Run simulations
        num_sims = 0
        start_time = datetime.datetime.utcnow()
        
        while (datetime.datetime.utcnow() - start_time < self.calculation_time and 
               num_sims < self.max_sims):
            self.run_simulation()
            num_sims += 1
            
        # Select action based on average value
        return max(legal_actions, 
                  key=lambda a: self.action_values.get(
                      (self.identity, self.hash_state(current_state), a.id), 
                      float('-inf')
                  ))
        
    def run_simulation(self):
        """Run a single MCTS simulation."""
        states = [copy.deepcopy(self.states[-1])]
        actions = []
        players = []
        total_cost = 0
        
        # Selection and Expansion
        while len(states) < self.max_moves:
            current_state = states[-1]
            current_player = current_state.whose_turn()["name"]
            legal_actions = current_state.legal_actions()
            
            if not legal_actions:
                break
                
            # Select action
            action = self.select_action(current_state, legal_actions)
            
            # Track simulation path
            actions.append(action)
            players.append(current_player)
            
            # Update total cost
            total_cost += current_state.get_cost(action)
            
            # Take action
            next_state = current_state.take_action(action, inplace=False)
            states.append(next_state)
            
            # Check if game is done
            if next_state.is_done():
                break
                
        # Get final rewards
        final_state = states[-1]
        red_reward, _ = final_state.reward("red")
        purple_reward, _ = final_state.reward("purple")
        
        # Backpropagate
        self.backpropagate(
            states, actions, players,
            {
                "red": red_reward,
                "purple": purple_reward,
                "cost": total_cost
            }
        )
        
    def select_action(self, state: Farm, legal_actions: list) -> Action:
        """Select action during simulation."""
        # With small probability, select random action for exploration
        if random.random() < 0.1:
            return random.choice(legal_actions)
            
        current_player = state.whose_turn()["name"]
        state_hash = self.hash_state(state)
        
        # Calculate action values
        values = {
            action: self.get_action_value(state, action, current_player)
            for action in legal_actions
        }
        
        return max(legal_actions, key=lambda a: values[a])
        
    def backpropagate(self, states: list, actions: list, players: list, rewards: dict):
        """Backpropagate results through visited states."""
        for i in range(len(actions)):
            state = states[i]
            action = actions[i]
            player = players[i]
            state_hash = self.hash_state(state)
            
            # Update visit counts
            self.visit_counts[(player, state_hash)] = \
                self.visit_counts.get((player, state_hash), 0) + 1
                
            # Calculate reward based on policy
            if self.policy == "selfish":
                value = rewards[player] - rewards["cost"]
            elif self.policy == "altruistic":
                other_player = "purple" if player == "red" else "red"
                value = rewards[other_player] - rewards["cost"]
            else:  # collaborative
                value = rewards["red"] + rewards["purple"] - rewards["cost"]
                
            # Update action value with exponential moving average
            key = (player, state_hash, action.id)
            if key not in self.action_values:
                self.action_values[key] = value
            else:
                old_value = self.action_values[key]
                self.action_values[key] = old_value + 0.1 * (value - old_value)
                
    def update(self, state):
        """Add new state to history."""
        self.states.append(state)
        
    def hash_state(self, state: Farm) -> str:
        """Hash and store state."""
        state_hash = str(hash(state))
        self.state_hash_table[state_hash] = state
        return state_hash

In [None]:
# imports
from random import randint
import numpy as np
import random
import datetime
import math
import copy
import matplotlib.pyplot as plt
import csv
import sys
import pandas as pd

import farmgame

# First define the policies
policies = ["selfish", "altruistic", "collaborative"]

def simulate_game_with_helping(red_policy, purple_policy):
    print(f"Simulating a game: Red Policy = {red_policy}, Purple Policy = {purple_policy}")
    game = farmgame.configure_game(
        layer="Items00", 
        resourceCond="even", 
        costCond="low", 
        visibilityCond="full", 
        redFirst=True
    )
    
    # Create agents with new implementation
    red_agent = PaperAlignedMCTS(
        time=2.0,
        max_sims=5000000,
        exploration_weight=1.41,
        max_moves=10,
        color="red",
        policy=red_policy
    )
    
    purple_agent = PaperAlignedMCTS(
        time=2.0,
        max_sims=5000000,
        exploration_weight=1.41,
        max_moves=10,
        color="purple",
        policy=purple_policy
    )
    
    # Initialize agents
    red_agent.update(game)
    purple_agent.update(game)
    
    state = game
    done = False
    turn_count = 0
    red_rewards = []
    purple_rewards = []
    
    helping_actions = {
        "red": {"before": 0, "after": 0},
        "purple": {"before": 0, "after": 0}
    }
    
    objects_picked_up = {"red": False, "purple": False}
    
    while not done:
        current_player = state.players[state.turn]["name"]
        
        if current_player == "red":
            action = red_agent.choose_action()
        else:
            action = purple_agent.choose_action()
            
        # Check if helping action
        transition = farmgame.Transition(state, action)
        is_helping = transition.is_helping()
        
        if is_helping:
            if not objects_picked_up[current_player]:
                helping_actions[current_player]["before"] += 1
            else:
                helping_actions[current_player]["after"] += 1
                
        # Take action
        state = state.take_action(action, inplace=True)
        red_agent.update(state)
        purple_agent.update(state)
        
        # Check object pickup completion
        if not objects_picked_up[current_player]:
            if state.all_objects_picked_up(current_player):
                objects_picked_up[current_player] = True
                
        # Get rewards
        red_rwd, red_done = state.reward("red")
        purple_rwd, purple_done = state.reward("purple")
        done = red_done and purple_done
        
        red_rewards.append(red_rwd)
        purple_rewards.append(purple_rwd)
        
        turn_count += 1
        
    print(f"Game completed: Red Policy = {red_policy}, Purple Policy = {purple_policy}")
    print(f"Total Red Reward: {sum(red_rewards)}, Total Purple Reward: {sum(purple_rewards)}")
    print(f"Helping Actions Before - Red: {helping_actions['red']['before']}, Purple: {helping_actions['purple']['before']}")
    print(f"Helping Actions After - Red: {helping_actions['red']['after']}, Purple: {helping_actions['purple']['after']}")
    
    return red_rewards, purple_rewards, helping_actions

# Run simulations
results = []
for red_policy in policies:
    for purple_policy in policies:
        print(f"\nStarting simulations: Red Policy = {red_policy}, Purple Policy = {purple_policy}")
        for i in range(30):  # Run 100 games for each policy combination
            print(f"  Simulation {i+1}: Red Policy = {red_policy}, Purple Policy = {purple_policy}")
            red_rewards, purple_rewards, helping_actions = simulate_game_with_helping(red_policy, purple_policy)
            results.append({
                "Red Policy": red_policy,
                "Purple Policy": purple_policy,
                "Red Reward": sum(red_rewards),
                "Purple Reward": sum(purple_rewards),
                "Total Reward": sum(red_rewards) + sum(purple_rewards),
                "Red Helping Actions Before": helping_actions["red"]["before"],
                "Purple Helping Actions Before": helping_actions["purple"]["before"],
                "Red Helping Actions After": helping_actions["red"]["after"],
                "Purple Helping Actions After": helping_actions["purple"]["after"]
            })

# Convert to DataFrame and display summary
results_df = pd.DataFrame(results)

print("\nResults Summary by Policy Combination:")
print(results_df.groupby(['Red Policy', 'Purple Policy']).mean())

print("\nAverage Helping Actions by Policy:")
for policy in policies:
    print(f"\n{policy} policy:")
    print("As Red Player:")
    red_data = results_df[results_df['Red Policy'] == policy]
    print(f"  Before: {red_data['Red Helping Actions Before'].mean():.2f}")
    print(f"  After: {red_data['Red Helping Actions After'].mean():.2f}")
    print("As Purple Player:")
    purple_data = results_df[results_df['Purple Policy'] == policy]
    print(f"  Before: {purple_data['Purple Helping Actions Before'].mean():.2f}")
    print(f"  After: {purple_data['Purple Helping Actions After'].mean():.2f}")


Starting simulations: Red Policy = selfish, Purple Policy = selfish
  Simulation 1: Red Policy = selfish, Purple Policy = selfish
Simulating a game: Red Policy = selfish, Purple Policy = selfish


  start_time = datetime.datetime.utcnow()
  while (datetime.datetime.utcnow() - start_time < self.calculation_time and


Game completed: Red Policy = selfish, Purple Policy = selfish
Total Red Reward: 160, Total Purple Reward: 168
Helping Actions Before - Red: 0, Purple: 0
Helping Actions After - Red: 0, Purple: 0
  Simulation 2: Red Policy = selfish, Purple Policy = selfish
Simulating a game: Red Policy = selfish, Purple Policy = selfish
Game completed: Red Policy = selfish, Purple Policy = selfish
Total Red Reward: 224, Total Purple Reward: 220
Helping Actions Before - Red: 1, Purple: 1
Helping Actions After - Red: 0, Purple: 0
  Simulation 3: Red Policy = selfish, Purple Policy = selfish
Simulating a game: Red Policy = selfish, Purple Policy = selfish
Game completed: Red Policy = selfish, Purple Policy = selfish
Total Red Reward: 208, Total Purple Reward: 236
Helping Actions Before - Red: 0, Purple: 0
Helping Actions After - Red: 0, Purple: 0
  Simulation 4: Red Policy = selfish, Purple Policy = selfish
Simulating a game: Red Policy = selfish, Purple Policy = selfish
Game completed: Red Policy = selfi