# ST449 Final Project

## Connect4 Best Bots and Modifications

In [9]:
# imports
import numpy as np
import sys
import pygame
import math
import random
import pandas as pd
from collections import namedtuple, defaultdict, deque
import time
import multiprocessing
# from aima_python_master.utils4e  import *
# from aima_python_master.games4e  import *


### Generating the Board

#### Creating Connect Four Game class

In [10]:
# Immutable snapshot of a game position: whose turn it is, the cached
# utility of the position, the occupied squares, and the still-empty squares.
GameState = namedtuple('GameState', ['to_move', 'utility', 'board', 'moves'])

class Game:
    """Abstract base for the turn-based games used by the search routines.

    A subclass must implement `actions`, `result` and `utility`, and must
    set `self.initial` to the starting state (typically in its constructor).
    `terminal_test`, `to_move`, `display` and `play_game` come with default
    implementations that a subclass is free to override."""

    def actions(self, state):
        """List the moves that are allowed in `state`."""
        raise NotImplementedError

    def result(self, state, move):
        """Give the state reached by playing `move` in `state`."""
        raise NotImplementedError

    def utility(self, state, player):
        """Give the worth of the finished position `state` to `player`."""
        raise NotImplementedError

    def terminal_test(self, state):
        """Tell whether the game is over; by default, when no move is left."""
        if self.actions(state):
            return False
        return True

    def to_move(self, state):
        """Give the player who has to act in `state`."""
        return state.to_move

    def display(self, state):
        """Show `state`; the default simply prints it."""
        print(state)

    def __repr__(self):
        return '<' + self.__class__.__name__ + '>'

    def play_game(self, *players):
        """Let the given player callables take turns until the game ends.

        Each player is called as `player(game, state)` and must return a
        move. When a terminal state is reached it is displayed, and the
        utility of that state for the player who moves first in
        `self.initial` is returned."""
        state = self.initial
        while True:
            for choose_move in players:
                state = self.result(state, choose_move(self, state))
                if not self.terminal_test(state):
                    continue
                self.display(state)
                return self.utility(state, self.to_move(self.initial))
                    

In [11]:
class C4(Game):
    """Connect Four on an h-row by v-column grid where k in a line wins.

    Squares are addressed as (x, y) with x the row (1 at the top, h at the
    bottom) and y the column (1..v). A disc may only be placed on the bottom
    row or directly above an occupied square. 'X' moves first."""

    def __init__(self, h=6, v=7, k=4):
        self.h, self.v, self.k = h, v, k
        every_square = []
        for x in range(1, h + 1):
            for y in range(1, v + 1):
                every_square.append((x, y))
        self.initial = GameState(to_move='X', utility=0, board={}, moves=every_square)

    def actions(self, state):
        """Return the empty squares a disc can currently land on.

        A square qualifies when it lies on the bottom row (x == self.h) or
        the square right below it, (x + 1, y), is already a key of
        state.board (i.e. occupied)."""
        playable = []
        for (x, y) in state.moves:
            on_bottom_row = x == self.h
            if on_bottom_row or (x + 1, y) in state.board:
                playable.append((x, y))
        return playable

    def result(self, state, move):
        """Return the state after the player to move plays `move`.

        A move that is not among the empty squares leaves the state
        untouched. Only emptiness is checked here, not support from below."""
        if move not in state.moves:
            return state
        mover = state.to_move
        board = state.board.copy()
        board[move] = mover
        remaining = list(state.moves)
        del remaining[remaining.index(move)]
        upcoming = 'X' if mover != 'X' else 'O'
        return GameState(to_move=upcoming,
                         utility=self.compute_utility(board, move, mover),
                         board=board, moves=remaining)

    def utility(self, state, player):
        """Return state.utility as seen by `player`: unchanged for 'X',
        negated for anyone else (1 win, -1 loss, 0 otherwise)."""
        if player == 'X':
            return state.utility
        return -state.utility

    def terminal_test(self, state):
        """The game ends once somebody has won or no empty square is left."""
        if state.utility != 0:
            return True
        return len(state.moves) == 0

    def display(self, state):
        """Print the grid row by row, using '.' for empty squares."""
        for x in range(1, self.h + 1):
            cells = [str(state.board.get((x, y), '.')) + ' '
                     for y in range(1, self.v + 1)]
            print(''.join(cells))

    def compute_utility(self, board, move, player):
        """Return +1 if `move` completes a line for 'X', -1 if it completes
        one for another player, and 0 if it completes no line."""
        directions = ((0, 1), (1, 0), (1, -1), (1, 1))
        if any(self.k_in_row(board, move, player, step) for step in directions):
            return +1 if player == 'X' else -1
        return 0

    def k_in_row(self, board, move, player, delta_x_y):
        """Tell whether `player` owns at least self.k consecutive squares
        through `move` along the direction `delta_x_y`."""
        delta_x, delta_y = delta_x_y
        run = -1  # both sweeps below count the square at `move` itself
        for sign in (1, -1):
            x, y = move
            while board.get((x, y)) == player:
                run += 1
                x, y = x + sign * delta_x, y + sign * delta_y
        return run >= self.k
        

#### Evaluation function

In [12]:
def generate_segments(h=6, v=7, k=4):
    """Enumerate every straight run of k squares on an h-row, v-column board.

    Each segment is a list of k (x, y) tuples. Segments are returned grouped
    by direction: vertical, horizontal, rising diagonal (bottom-left to
    top-right), then falling diagonal (top-left to bottom-right)."""
    offsets = range(k)

    vertical = [[(x + t, y) for t in offsets]
                for y in range(1, v + 1)
                for x in range(1, h - k + 2)]

    horizontal = [[(x, y + t) for t in offsets]
                  for x in range(1, h + 1)
                  for y in range(1, v - k + 2)]

    rising = [[(x - t, y + t) for t in offsets]
              for x in range(k, h + 1)
              for y in range(1, v - k + 2)]

    falling = [[(x + t, y + t) for t in offsets]
               for y in range(1, v - k + 2)
               for x in range(1, h - k + 2)]

    return vertical + horizontal + rising + falling

# Segments of the default 6x7 board, computed once and reused by eval_fn.
all_segments = generate_segments()

def count_in_segment(segment, state):
    """Return (number of 'X' squares, number of 'O' squares) in `segment`,
    looking each (x, y) square up in state.board."""
    marks = [state.board.get((x, y)) for x, y in segment]
    return marks.count('X'), marks.count('O')

def eval_segment(segment, state, player):
    """Score one segment from `player`'s point of view.

    A segment holding both 'X' and 'O' is worth 0. Otherwise its magnitude
    grows with the number of discs in it (1, 10, 100, 100000 for 1..4) and
    the sign is positive when those discs belong to `player`."""
    X_count, O_count = count_in_segment(segment, state)
    if X_count > 0 and O_count > 0:
        return 0   # blocked for both sides

    # magnitude by number of discs; any other count (e.g. 0) is worth nothing
    magnitude_by_count = {1: 1, 2: 10, 3: 100, 4: 100000}
    magnitude = magnitude_by_count.get(max(X_count, O_count), 0)

    owner = 'X' if X_count > O_count else 'O'
    return magnitude if owner == player else -magnitude

def eval_fn(state, player):
    """Heuristic value of `state` for `player`: the sum of eval_segment
    over every segment of the default board (all_segments)."""
    return sum(eval_segment(segment, state, player) for segment in all_segments)


### Search algorithms

#### Alpha-beta cutoff search

In [13]:
def alpha_beta_cutoff_search(state, game, d=4, cutoff_test=None, eval_fn=None):
    """Pick a move for the player to act in `state` by depth-limited
    alpha-beta search.

    cutoff_test(state, depth) decides where the search stops; by default it
    stops past depth `d` or at a terminal state. eval_fn(state, player)
    scores the positions where the search stops; by default it is
    game.utility. Returns the best action found, or None when `state`
    offers no action."""

    player = game.to_move(state)

    def default_cutoff(node, depth):
        return depth > d or game.terminal_test(node)

    def default_eval(node, who):
        return game.utility(node, who)

    should_stop = cutoff_test or default_cutoff
    evaluate = eval_fn or default_eval

    def search(node, alpha, beta, depth, maximizing):
        """Value of `node` for `player`; `maximizing` says whose ply it is."""
        if should_stop(node, depth):
            return evaluate(node, player)
        if maximizing:
            best = -np.inf
            for move in game.actions(node):
                best = max(best, search(game.result(node, move), alpha, beta, depth + 1, False))
                if best >= beta:
                    return best  # the minimizer above will never allow this line
                alpha = max(alpha, best)
            return best
        worst = np.inf
        for move in game.actions(node):
            worst = min(worst, search(game.result(node, move), alpha, beta, depth + 1, True))
            if worst <= alpha:
                return worst  # the maximizer above already has something better
            beta = min(beta, worst)
        return worst

    # Root level: keep the first action whose value strictly beats the rest.
    top_score, chosen = -np.inf, None
    for move in game.actions(state):
        value = search(game.result(state, move), top_score, np.inf, 1, False)
        if value > top_score:
            top_score, chosen = value, move
    return chosen

#### Monte Carlo tree search

In [14]:
class MCT_Node:
    """One node of the Monte Carlo search tree.

    Holds the game state it stands for, a link to its parent, the
    accumulated utility U, the visit count N, and a dict of child nodes
    (filled in by the search, mapping child node -> action)."""

    def __init__(self, parent=None, state=None, U=0, N=0):
        self.parent = parent
        self.state = state
        self.U = U
        self.N = N
        self.children = {}
        self.actions = None


def ucb(n, C=1.4):
    """Upper-confidence score of node `n`: infinite for an unvisited node,
    otherwise mean utility plus an exploration bonus weighted by `C`."""
    if n.N == 0:
        return np.inf
    exploitation = n.U / n.N
    exploration = np.sqrt(np.log(n.parent.N) / n.N)
    return exploitation + C * exploration

def monte_carlo_tree_search(state, game, N=20000):
    """Pick a move for `state` by running N rounds of Monte Carlo tree search
    (select, expand, random playout, back-propagate) and returning the
    action that leads to the most visited child of the root."""

    def select(n):
        """Walk down from `n`, always taking the child with the best UCB
        score, until a node without children is reached."""
        node = n
        while node.children:
            node = max(node.children.keys(), key=ucb)
        return node

    def expand(n):
        """Give the leaf `n` one child per legal action (unless it is
        terminal), then select again from `n`."""
        if not n.children and not game.terminal_test(n.state):
            grown = {}
            for action in game.actions(n.state):
                grown[MCT_Node(state=game.result(n.state, action), parent=n)] = action
            n.children = grown
        return select(n)

    def simulate(game, state):
        """Play uniformly random moves from `state` to the end and return
        the negated utility for the player who was to move in `state`."""
        player = game.to_move(state)
        position = state
        while not game.terminal_test(position):
            position = game.result(position, random.choice(list(game.actions(position))))
        return -game.utility(position, player)

    def backprop(n, utility):
        """Climb from `n` to the root, counting a visit at every node and
        adding the reward where it is positive; the sign flips each level."""
        node, value = n, utility
        while True:
            if value > 0:
                node.U += value
            node.N += 1
            if not node.parent:
                break
            node, value = node.parent, -value

    root = MCT_Node(state=state)

    for _ in range(N):
        child = expand(select(root))
        backprop(child, simulate(game, child.state))

    def visit_count(node):
        return node.N

    favourite = max(root.children, key=visit_count)
    return root.children.get(favourite)


#### Define the bots

In [15]:
def standard_MC_bot(game, state):
    """Bot: Monte Carlo tree search with 1000 iterations."""
    return monte_carlo_tree_search(state=state, game=game, N=1000)

def standard_alpha_beta_bot(game, state):
    """Bot: alpha-beta search cut off at depth 5, scored by game.utility."""
    return alpha_beta_cutoff_search(state=state, game=game, d=5)

def standard_alpha_beta_eval_bot(game, state):
    """Bot: alpha-beta search cut off at depth 5, scored by eval_fn."""
    return alpha_beta_cutoff_search(state=state, game=game, d=5, eval_fn=eval_fn)
    

### Play Game

#### Standard

In [16]:
# Game instance with the constructor defaults (h=6, v=7, k=4).
testC4game = C4()

#### Hyperparameter Tuning

In [17]:
#players = [test_MC_bot]

number_of_games = 100

# --- Monte Carlo tree search bots, differing only in iteration budget ---

def test_MC_bot_1000(game, state):
    """MCTS bot with 1000 iterations."""
    return monte_carlo_tree_search(state=state, game=game, N=1000)

def test_MC_bot_10000(game, state):
    """MCTS bot with 10000 iterations."""
    return monte_carlo_tree_search(state=state, game=game, N=10000)

def test_MC_bot_5000(game, state):
    """MCTS bot with 5000 iterations."""
    return monte_carlo_tree_search(state=state, game=game, N=5000)

# --- Alpha-beta bots scored by game.utility, differing in cutoff depth ---

def test_alpha_beta_bot_3(game, state):
    """Alpha-beta bot, cutoff depth 3."""
    return alpha_beta_cutoff_search(state=state, game=game, d=3)

def test_alpha_beta_bot_4(game, state):
    """Alpha-beta bot, cutoff depth 4."""
    return alpha_beta_cutoff_search(state=state, game=game, d=4)

def test_alpha_beta_bot_5(game, state):
    """Alpha-beta bot, cutoff depth 5."""
    return alpha_beta_cutoff_search(state=state, game=game, d=5)

# --- Alpha-beta bots scored by eval_fn, differing in cutoff depth ---

def test_alpha_beta_eval_bot_3(game, state):
    """Alpha-beta bot with the segment heuristic, cutoff depth 3."""
    return alpha_beta_cutoff_search(state=state, game=game, d=3, eval_fn=eval_fn)

def test_alpha_beta_eval_bot_4(game, state):
    """Alpha-beta bot with the segment heuristic, cutoff depth 4."""
    return alpha_beta_cutoff_search(state=state, game=game, d=4, eval_fn=eval_fn)

def test_alpha_beta_eval_bot_5(game, state):
    """Alpha-beta bot with the segment heuristic, cutoff depth 5."""
    return alpha_beta_cutoff_search(state=state, game=game, d=5, eval_fn=eval_fn)


In [None]:
import numpy as np
import pandas as pd
from collections import deque
import time

# Function to calculate heuristic score
def calculate_heuristic(wins, total_time):
    """Wins per unit of time; 0 when `total_time` is not positive."""
    if total_time > 0:
        return wins / total_time
    return 0

# Simulation cache and results DataFrame
# Maps (bot1_label, bot2_label) -> (bot1_heuristic, bot2_heuristic); filled
# by run_simulation_and_calculate_scores so a pairing is simulated only once.
simulation_cache = {}
# One row is appended per simulated pairing by run_simulation_and_calculate_scores.
results_df = pd.DataFrame(columns=[
    "Bot1", "Bot2", "Bot1_Wins", "Bot2_Wins", "Bot1_Time", "Bot2_Time", "Bot1_Heuristic", "Bot2_Heuristic"
])

# Function to run simulation and calculate heuristic scores
def run_simulation_and_calculate_scores(bot1, bot2, num_games=30):
    """Play `num_games` single games between two (function, label) bots and
    return their heuristic scores as (bot1_heuristic, bot2_heuristic).

    Results are memoised in simulation_cache under the ordered label pair
    (bot1 label, bot2 label), so the reversed pairing is a separate entry.
    A fresh pairing also appends one summary row to results_df. The time
    used for the heuristic is the sum of each game's average time per move."""
    label1, label2 = bot1[1], bot2[1]
    pair_key = (label1, label2)

    # Serve a previously simulated pairing straight from the cache
    try:
        return simulation_cache[pair_key]
    except KeyError:
        pass

    entry1 = (bot1[0], label1)
    entry2 = (bot2[0], label2)
    wins1 = wins2 = 0
    time1 = time2 = 0

    # Swap the listing order every other game so both bots get to start
    for game_index in range(num_games):
        lineup = [entry1, entry2] if game_index % 2 == 0 else [entry2, entry1]
        outcome = run_simulation(1, lineup)

        wins1 += outcome[label1]["wins"]
        wins2 += outcome[label2]["wins"]
        time1 += outcome[label1]["time_per_move"]
        time2 += outcome[label2]["time_per_move"]

    scores = (calculate_heuristic(wins1, time1), calculate_heuristic(wins2, time2))

    # Remember the outcome, then log it as a new last row of results_df
    simulation_cache[pair_key] = scores
    results_df.loc[len(results_df)] = [
        label1, label2,
        wins1, wins2,
        time1, time2,
        scores[0], scores[1]
    ]

    return scores[0], scores[1]

# Function to create heuristic matrix and format as DataFrame
def create_heuristic_matrix_df(bots, opponents):
    """Build a table of heuristic scores of each bot against each opponent.

    Parameters:
    - bots: list of (function, label) tuples; one output row per bot.
    - opponents: list of (function, label) tuples; one output column each.

    Returns a DataFrame whose first column "Bot" holds the bot labels,
    followed by one column per opponent label and a final "Average" column.
    A bot is never matched against an identical (function, label) entry;
    that cell stays 0 and is left out of the average, so the average covers
    only the pairings that were actually played (0 when there were none).
    """
    bot_names = [bot[1] for bot in bots]
    opponent_names = [opponent[1] for opponent in opponents]

    # One extra trailing column holds the per-bot average
    matrix = np.zeros((len(bots), len(opponents) + 1))

    for i, bot in enumerate(bots):
        played_scores = []
        for j, opponent in enumerate(opponents):
            if bot == opponent:
                continue  # no self-play
            bot_heuristic, _ = run_simulation_and_calculate_scores(bot, opponent)
            matrix[i, j] = bot_heuristic
            played_scores.append(bot_heuristic)

        # Average over the opponents actually faced, not over skipped ones
        if played_scores:
            matrix[i, -1] = sum(played_scores) / len(played_scores)

    heuristic_df = pd.DataFrame(matrix, columns=opponent_names + ["Average"])
    heuristic_df.insert(0, "Bot", bot_names)  # bot labels become the first column

    return heuristic_df

# Function to run a simulation of games
def run_simulation(num_games, bot_functions_with_labels):
    """Play `num_games` two-player Connect Four games on a 6x7 board.

    `bot_functions_with_labels` is a list of two (function, label) tuples.
    The order is rotated before every game (including the first), so in
    game 1 the second listed bot plays 'X'. Returns a dict keyed by label
    with "wins", "total_time", "total_moves" and "time_per_move"."""
    results = {}
    for _, label in bot_functions_with_labels:
        results[label] = {"wins": 0, "total_time": 0, "total_moves": 0}
    bot_order = deque(bot for bot, _ in bot_functions_with_labels)
    label_order = deque(label for _, label in bot_functions_with_labels)

    for _ in range(num_games):
        # Swap who starts
        bot_order.rotate(-1)
        label_order.rotate(-1)
        X_bot, O_bot = bot_order
        X_label, O_label = label_order
        bots = {'X': X_bot, 'O': O_bot}
        labels = {'X': X_label, 'O': O_label}

        game = C4(h=6, v=7, k=4)
        state = game.initial
        mover = 'X'

        while True:
            bot = bots[mover]
            started = time.time()
            move = bot(game, state)
            state = game.result(state, move)
            elapsed = time.time() - started
            stats = results[labels[mover]]
            stats["total_time"] += elapsed
            stats["total_moves"] += 1

            if game.terminal_test(state):
                # utility is taken from the first mover's ('X') point of view
                outcome = game.utility(state, game.to_move(game.initial))
                if outcome == 1:
                    results[X_label]["wins"] += 1
                elif outcome == -1:
                    results[O_label]["wins"] += 1
                break

            mover = 'O' if mover == 'X' else 'X'

    for stats in results.values():
        moves_made = stats["total_moves"]
        stats["time_per_move"] = stats["total_time"] / moves_made if moves_made > 0 else 0

    return results

# Define the bots
# Each entry is (bot function, label); the label is used as cache key and
# as row/column name in the heuristic matrices.
monte_carlo_bots = [
    (test_MC_bot_1000, "MC_1000"),
    (test_MC_bot_10000, "MC_10000"),
    (test_MC_bot_5000, "MC_5000")
]

# Plain alpha-beta bots (scored by game.utility). "AB_3" must use the plain
# depth-3 bot: pairing it with the eval variant would just duplicate "ABE_3".
alpha_beta_bots = [
    (test_alpha_beta_bot_4, "AB_4"),
    (test_alpha_beta_bot_5, "AB_5"),
    (test_alpha_beta_bot_3, "AB_3")
]

# Alpha-beta bots that score cutoff positions with eval_fn.
alpha_beta_eval_bots = [
    (test_alpha_beta_eval_bot_4, "ABE_4"),
    (test_alpha_beta_eval_bot_5, "ABE_5"),
    (test_alpha_beta_eval_bot_3, "ABE_3")
]

# Create matrices for each type of bots
# Each family of bots (rows) is scored against the bots of the two other
# families (columns). This triggers the actual game simulations.
monte_carlo_df = create_heuristic_matrix_df(monte_carlo_bots, alpha_beta_bots + alpha_beta_eval_bots)
alpha_beta_df = create_heuristic_matrix_df(alpha_beta_bots, monte_carlo_bots + alpha_beta_eval_bots)
alpha_beta_eval_df = create_heuristic_matrix_df(alpha_beta_eval_bots, monte_carlo_bots + alpha_beta_bots)

# Display the results
print("\nMonte Carlo Bots Heuristic Matrix:")
print(monte_carlo_df)

print("\nAlpha Beta Bots Heuristic Matrix:")
print(alpha_beta_df)

print("\nAlpha Beta Eval Bots Heuristic Matrix:")
print(alpha_beta_eval_df)

In [None]:
# Show the per-pairing results table. `display` is not defined in this file;
# presumably it is the notebook (IPython) built-in -- confirm when running as a script.
display(results_df)

#### Different sized boards

In [None]:
h_test = 6 # number of rows of the new board (passed to C4 as h)
v_test = 10 # number of columns of the new board (passed to C4 as v)

In [None]:
# Dynamically update the utility function used for evaluation in the alpha_beta_eval_bot to account for different sized boards
def generate_segments_diff_size(h=h_test, v=v_test, k=4):
    """Enumerate every length-k segment of an h-row, v-column board.

    This used to be a line-for-line copy of generate_segments; it now
    delegates to it so the two cannot drift apart. Only the defaults differ:
    they are the values h_test and v_test held when this function was
    defined. Returns a list of segments, each a list of k (x, y) tuples."""
    return generate_segments(h=h, v=v, k=k)

# Segments of the h_test x v_test board, computed once for eval_fn_diff_size.
all_segments_diff_size = generate_segments_diff_size()

def count_in_segment_diff_size(segment, state):
    """Return (number of 'X' squares, number of 'O' squares) in `segment`.

    Counting does not depend on the board size, so this simply forwards to
    count_in_segment instead of repeating its body."""
    return count_in_segment(segment, state)

def eval_segment_diff_size(segment, state, player):
    """Score one segment from `player`'s point of view.

    The scoring of a single segment does not depend on the board size, so
    this forwards to eval_segment (0 for mixed segments, otherwise
    1/10/100/100000 for 1..4 discs, negative when they are not `player`'s)
    instead of repeating its body."""
    return eval_segment(segment, state, player)

def eval_fn_diff_size(state, player):
    """Heuristic value of `state` for `player` on the resized board: the sum
    of eval_segment_diff_size over all_segments_diff_size."""
    return sum(eval_segment_diff_size(segment, state, player)
               for segment in all_segments_diff_size)


In [None]:
# Game instance on the resized board (h_test rows, v_test columns, default k=4).
testC4game_diff_size = C4(h = h_test, v = v_test)

def diff_size_MC_bot(game, state):
    """Bot for the resized board: MCTS with 1000 iterations."""
    return monte_carlo_tree_search(state=state, game=game, N=1000)

def diff_size_alpha_beta_bot(game, state):
    """Bot for the resized board: alpha-beta, cutoff depth 5, scored by
    game.utility."""
    return alpha_beta_cutoff_search(state=state, game=game, d=5)

def diff_size_alpha_beta_eval_bot(game, state):
    """Bot for the resized board: alpha-beta, cutoff depth 5, scored by
    eval_fn_diff_size."""
    return alpha_beta_cutoff_search(state=state, game=game, d=5, eval_fn=eval_fn_diff_size)

In [None]:
# One demo game on the resized board: the MCTS bot moves first ('X'), the
# heuristic alpha-beta bot second. play_game prints the final board.
testC4game_diff_size.play_game(diff_size_MC_bot, diff_size_alpha_beta_eval_bot)

In [None]:
## This will all be in a loop, testing various values for 'h' and 'v' both in the initialization of the C4() class,
## and in the generation of the evaluation function for alpha_beta_eval_bot

#### 3-players

In [None]:
class C4_3_player(Game):
    """
    Connect Four for three players, 'X', 'O' and '3', who move in that
    order and then start over. The grid has h rows and v columns, and k in a
    line wins. Squares are (x, y) with x the row (h is the bottom row) and
    y the column; a disc must rest on the bottom row or on another disc.
    """

    def __init__(self, h=6, v=7, k=4):
        self.h = h
        self.v = v
        self.k = k
        cells = [(x, y) for x in range(1, h + 1) for y in range(1, v + 1)]
        self.initial = GameState(to_move='X', utility=0, board={}, moves=cells)

    def actions(self, state):
        """Return the empty squares that are on the bottom row (x == self.h)
        or sit directly above an occupied square (a key of state.board)."""
        def supported(x, y):
            return x == self.h or (x + 1, y) in state.board

        return [(x, y) for (x, y) in state.moves if supported(x, y)]

    def result(self, state, move):
        """Return the state after the player to move plays `move`; a move
        that is not an empty square leaves the state unchanged."""
        if move not in state.moves:
            return state
        mover = state.to_move
        new_board = state.board.copy()
        new_board[move] = mover
        open_cells = list(state.moves)
        open_cells.remove(move)
        return GameState(to_move=self.get_next_player(mover),
                         utility=self.compute_utility(new_board, move, mover),
                         board=new_board, moves=open_cells)

    def utility(self, state, player):
        """Return the value of `state` for `player`.

        state.utility encodes the winner (1: 'X', -1: 'O', 2: '3'). A win by
        'X' or 'O' is worth +1 to the winner and -1 to everyone else; a win
        by '3' is worth +2 to '3' and -2 to everyone else. No winner: 0."""
        # winner code -> (winning player, magnitude of the payoff)
        outcomes = {1: ('X', 1), -1: ('O', 1), 2: ('3', 2)}
        if state.utility not in outcomes:
            return 0
        winner, magnitude = outcomes[state.utility]
        return magnitude if player == winner else -magnitude

    def terminal_test(self, state):
        """The game ends once somebody has won or no empty square is left."""
        still_running = state.utility == 0 and len(state.moves) != 0
        return not still_running

    def display(self, state):
        """Print the grid row by row, using '.' for empty squares."""
        for x in range(1, self.h + 1):
            line = ''
            for y in range(1, self.v + 1):
                line += str(state.board.get((x, y), '.')) + ' '
            print(line)

    def compute_utility(self, board, move, player):
        """Return the winner code if `move` completes a line for `player`
        (1 for 'X', -1 for 'O', 2 for '3'), else 0."""
        # horizontal, vertical, and the two diagonals
        lines = ((0, 1), (1, 0), (1, -1), (1, 1))
        won = any(self.k_in_row(board, move, player, step) for step in lines)
        if not won:
            return 0
        return {'X': 1, 'O': -1, '3': 2}.get(player, 0)

    def k_in_row(self, board, move, player, delta_x_y):
        """Tell whether `player` owns at least self.k consecutive squares
        through `move` along the direction `delta_x_y`."""
        step_x, step_y = delta_x_y

        def reach(dx, dy):
            # squares owned by `player` from `move` (inclusive) going (dx, dy)
            x, y = move
            owned = 0
            while board.get((x, y)) == player:
                owned += 1
                x, y = x + dx, y + dy
            return owned

        # `move` itself is counted by both sweeps, hence the -1
        return reach(step_x, step_y) + reach(-step_x, -step_y) - 1 >= self.k

    def get_next_player(self, current_player):
        """Return who moves after `current_player`: X -> O -> 3 -> X."""
        successor = {'X': 'O', 'O': '3', '3': 'X'}
        return successor[current_player]

#### Evaluation function for 3 players

In [None]:
def generate_segments_3_player(h=6, v=7, k=4):
    """Enumerate every length-k segment of an h-row, v-column board.

    The set of segments does not depend on the number of players, and this
    used to be a line-for-line copy of generate_segments; it now delegates
    to it so the two cannot drift apart. Returns a list of segments, each a
    list of k (x, y) tuples."""
    return generate_segments(h=h, v=v, k=k)

# Segments of the default 6x7 board, computed once for eval_fn_three.
all_segments_three = generate_segments_3_player()

def count_in_segment_three(segment, state):
    """Return (number of 'X', number of 'O', number of '3') squares in
    `segment`, looking each (x, y) square up in state.board."""
    tally = {'X': 0, 'O': 0, '3': 0}
    for x, y in segment:
        mark = state.board.get((x, y))
        if mark in tally:
            tally[mark] += 1
    return tally['X'], tally['O'], tally['3']

def eval_segment_three(segment, state, player):
    """Score one segment from `player`'s point of view in the 3-player game.

    A segment holding discs of two or more players is worth 0. Otherwise its
    magnitude grows with the number of discs in it (1, 10, 100, 100000 for
    1..4) and the sign is positive when those discs belong to `player`."""
    counts = dict(zip(('X', 'O', '3'), count_in_segment_three(segment, state)))
    present = [who for who, how_many in counts.items() if how_many > 0]
    if len(present) > 1:
        return 0   # blocked: more than one player has a disc here

    magnitude_by_count = {1: 1, 2: 10, 3: 100, 4: 100000}
    magnitude = magnitude_by_count.get(max(counts.values()), 0)

    # an empty segment falls through to '3', but its magnitude is 0 anyway
    owner = present[0] if present else '3'
    if owner == player:
        return magnitude
    return -magnitude

def eval_fn_three(state, player):
    """Heuristic value of `state` for `player` in the 3-player game: the sum
    of eval_segment_three over all_segments_three."""
    return sum(map(lambda segment: eval_segment_three(segment, state, player),
                   all_segments_three))


In [None]:
def ucb(n, C=1.4):
    """UCB score used to pick which child to descend into: unvisited nodes
    score infinity; visited ones score U/N plus C * sqrt(ln(parent N) / N)."""
    visits = n.N
    if visits == 0:
        return np.inf
    return n.U / visits + C * np.sqrt(np.log(n.parent.N) / visits)

def monte_carlo_tree_search_three(state, game, N=20000):
    """Pick a move for `state` by Monte Carlo tree search in a game with
    more than two players.

    Runs N rounds of select / expand / random playout / back-propagation
    and returns the action leading to the most visited child of the root,
    or None when the root has no children (terminal state or N == 0).

    Rewards are credited by winner identity: after a playout, every node on
    the path whose incoming move was made by the winner gets +1 in U. The
    two-player trick of flipping the reward's sign at each level is not
    used, because with three players the turn order repeats every three
    plies while a sign flip repeats every two, so it would credit the wrong
    players.

    Assumes a game can only be won by the player who made the last move
    (true for C4_3_player.compute_utility, which only checks the mover).
    """

    def select(n):
        """Descend by best UCB score until a node without children."""
        while n.children:
            n = max(n.children, key=ucb)
        return n

    def expand(n):
        """Add one child per legal action to the leaf `n` (unless it is
        terminal), then select again from `n`."""
        if not n.children and not game.terminal_test(n.state):
            n.children = {MCT_Node(state=game.result(n.state, action), parent=n): action
                          for action in game.actions(n.state)}
        return select(n)

    def mover_into(n):
        """Player whose move produced node `n`; None for the root."""
        return game.to_move(n.parent.state) if n.parent is not None else None

    def simulate(state, last_mover):
        """Play random moves from `state` to the end.

        `last_mover` is the player who produced `state`. Returns the winning
        player, or None for a draw / no winner."""
        while not game.terminal_test(state):
            last_mover = game.to_move(state)
            state = game.result(state, random.choice(list(game.actions(state))))
        # utility > 0 covers both the +1 and the +2 winner payoffs
        if last_mover is not None and game.utility(state, last_mover) > 0:
            return last_mover
        return None

    def backprop(n, winner):
        """Count a visit on every node from `n` up to the root and credit
        the nodes reached by one of the winner's own moves."""
        while n is not None:
            n.N += 1
            if winner is not None and mover_into(n) == winner:
                n.U += 1
            n = n.parent

    root = MCT_Node(state=state)

    for _ in range(N):
        leaf = select(root)
        child = expand(leaf)
        winner = simulate(child.state, mover_into(child))
        backprop(child, winner)

    if not root.children:
        return None  # nothing to choose from

    most_visited = max(root.children, key=lambda p: p.N)
    return root.children[most_visited]


In [157]:
def test_MC_bot_three(game, state):
    """3-player bot: Monte Carlo tree search with 1000 iterations."""
    return monte_carlo_tree_search_three(state=state, game=game, N=1000)

def test_alpha_beta_bot_three(game, state):
    """3-player bot: alpha-beta, cutoff depth 5, scored by game.utility.

    NOTE(review): alpha_beta_cutoff_search alternates max and min levels,
    i.e. it treats every other ply as a single opponent -- confirm this
    approximation is intended for three players."""
    return alpha_beta_cutoff_search(state=state, game=game, d=5)

def test_alpha_beta_eval_bot_three(game, state):
    """3-player bot: alpha-beta, cutoff depth 5, scored by eval_fn_three."""
    return alpha_beta_cutoff_search(state=state, game=game, d=5, eval_fn=eval_fn_three)

# def random_bot(game, state):
#     return random.choice


    

In [None]:
# One demo game of the 3-player variant on the default 6x7 board. Seats follow
# the argument order: MCTS bot = 'X', heuristic alpha-beta = 'O', plain alpha-beta = '3'.
testC4game_3 = C4_3_player()
testC4game_3.play_game(test_MC_bot_three, test_alpha_beta_eval_bot_three, test_alpha_beta_bot_three)

In [None]:
import time
from collections import deque, defaultdict
from itertools import permutations

def run_simulation(num_games, bot_functions_with_labels):
    """
    Play `num_games` three-player games, cycling through every seating order.

    Game ``i`` uses seating permutation ``i % P`` of the supplied bots, where
    ``P`` is the number of permutations (6 for three bots).  Every seating is
    therefore used equally often only when `num_games` is a multiple of ``P``.

    Parameters:
    - num_games: The number of games to simulate.
    - bot_functions_with_labels: A list of exactly three ``(bot, label)``
      tuples; ``bot`` is called as ``bot(game, state)`` and returns a move.

    Returns:
    - results: A dictionary keyed by bot label.  Each value holds "wins",
      "total_time" (seconds spent choosing and applying moves),
      "total_moves" and "time_per_move" (0 when the bot never moved).

    Raises:
    - ValueError: if a game is played with a number of bots other than three
      (the seating cannot be unpacked into X, O and 3).
    """
    # Initialize results structure for each bot
    results = {label: {"wins": 0, "total_time": 0, "total_moves": 0}
               for _, label in bot_functions_with_labels}

    # Generate all permutations of bot positions
    bot_permutations = list(permutations(bot_functions_with_labels))
    num_permutations = len(bot_permutations)

    for i in range(num_games):
        # Assign bots to positions (X, O, 3) according to the current permutation
        seating = bot_permutations[i % num_permutations]
        (X_bot, X_label), (O_bot, O_label), (T_bot, T_label) = seating

        # Print the player assignments for this game
        print(f"Game {i+1}: X = {X_label}, O = {O_label}, 3 = {T_label}")

        game = C4_3_player(h=6, v=7, k=4)  # Standard Connect 4 board with 6x7 grid
        state = game.initial

        current_player = 'X'
        player_to_bot = {'X': X_bot, 'O': O_bot, '3': T_bot}
        player_to_label = {'X': X_label, 'O': O_label, '3': T_label}

        while True:
            bot = player_to_bot[current_player]

            # perf_counter is monotonic and high-resolution, which makes it the
            # appropriate clock for measuring short intervals.
            start_time = time.perf_counter()
            move = bot(game, state)
            state = game.result(state, move)
            move_time = time.perf_counter() - start_time

            stats = results[player_to_label[current_player]]
            stats["total_time"] += move_time
            stats["total_moves"] += 1

            if game.terminal_test(state):
                # Utility is read from the point of view of the first player.
                # NOTE(review): the codes below (1 -> X, -1 -> O, -2 -> 3)
                # must match what C4_3_player.utility returns - confirm there.
                winner = game.utility(state, game.to_move(game.initial))
                if winner == 1:
                    results[X_label]["wins"] += 1
                elif winner == -1:
                    results[O_label]["wins"] += 1
                elif winner == -2:
                    results[T_label]["wins"] += 1
                break

            # Rotate players: X -> O -> 3 -> X
            current_player = game.get_next_player(current_player)

        # Print the final board state
        print("Final board state:")
        game.display(state)
        print("-" * 40)

    # After all games, calculate average times per move
    for stats in results.values():
        if stats["total_moves"] > 0:
            stats["time_per_move"] = stats["total_time"] / stats["total_moves"]
        else:
            stats["time_per_move"] = 0

    return results

# Function to display results
def display_results(results):
    """Print the win count and average time per move of every bot.

    `results` maps a bot label to a dict that provides at least the keys
    "wins" and "time_per_move".
    """
    divider = "-" * 40
    print("Simulation Results:")
    for label in results:
        stats = results[label]
        print(f"Bot {label}:")
        print(f"  Wins: {stats['wins']}")
        print(f"  Average Time per Move: {stats['time_per_move']:.4f} seconds")
        print(divider)

# Run the three-player simulation with one bot of each kind.
bot_functions_with_labels = [
    (test_MC_bot_three, "MCT"),
    # Use the three-player alpha-beta wrapper defined with the other *_three
    # bots, rather than the wrapper written for the two-player game.
    (test_alpha_beta_bot_three, "AlphaBeta"),
    (test_alpha_beta_eval_bot_three, "AlphaBetaEval"),
]

# Three bots have 3! = 6 seating orders, so 6 games use each order once.
num_games = 6
results = run_simulation(num_games, bot_functions_with_labels)

# Display the results
display_results(results)

#### Random "blocks"

In [None]:
class C4_obstacles(Game):
    """Connect Four variant in which some cells are permanently blocked.

    Cells are addressed as (x, y): x is the row, with row 1 displayed at the
    top and row `h` at the bottom, and y is the column.  A piece may be placed
    in a free cell only when every cell below it in the same column holds a
    piece or an obstacle.  Obstacle cells can never be played, belong to no
    player (so they interrupt any line), and support the cell above them.
    Traditionally played on a 6*7 board and requiring 4 in a row.
    """

    def __init__(self, h=6, v=7, k=4, obstacles=None):
        """Create the game.

        h, v, k   -- number of rows, number of columns, line length needed to win.
        obstacles -- optional iterable of blocked (x, y) cells.
        """
        self.h = h
        self.v = v
        self.k = k
        # Normalise to a list of tuples so membership tests against (x, y)
        # cells also work when positions are supplied as lists.
        self.obstacles = [tuple(cell) for cell in (obstacles or [])]
        blocked = set(self.obstacles)
        # Obstacle cells are never offered as moves, so result() rejects them
        # like any other unavailable cell.
        moves = [(x, y)
                 for x in range(1, h + 1)
                 for y in range(1, v + 1)
                 if (x, y) not in blocked]
        self.initial = GameState(to_move='X', utility=0, board={}, moves=moves)

    def actions(self, state):
        """Return the cells a piece can be dropped into, in `state.moves` order.

        The bottom row is x == self.h.  state.board is a dict whose keys are
        the occupied cells.  A free, non-obstacle cell is playable when every
        cell beneath it (rows x + 1 .. self.h of the same column) is either
        occupied or an obstacle; for the bottom row that condition is
        trivially true.
        """
        blocked = set(self.obstacles)
        board = state.board
        valid_moves = []
        for (x, y) in state.moves:
            if (x, y) in blocked:
                continue
            supported = all((row, y) in board or (row, y) in blocked
                            for row in range(x + 1, self.h + 1))
            if supported:
                valid_moves.append((x, y))
        return valid_moves

    def result(self, state, move):
        """Return the state after the player to move plays `move`.

        Only membership in `state.moves` is checked here (gravity is enforced
        by actions()); a move that is not available returns `state` unchanged.
        """
        if move not in state.moves:
            return state  # Illegal move has no effect
        board = state.board.copy()
        board[move] = state.to_move
        moves = list(state.moves)
        moves.remove(move)
        return GameState(to_move=('O' if state.to_move == 'X' else 'X'),
                         utility=self.compute_utility(board, move, state.to_move),
                         board=board, moves=moves)

    def utility(self, state, player):
        """Return the value to player; 1 for win, -1 for loss, 0 otherwise."""
        return state.utility if player == 'X' else -state.utility

    def terminal_test(self, state):
        """A state is terminal if it is won or no playable cell remains."""
        return state.utility != 0 or not self.actions(state)

    def display(self, state):
        """Print the board row by row: '#' obstacle, '.' empty, else the piece."""
        board = state.board
        blocked = set(self.obstacles)
        for x in range(1, self.h + 1):
            for y in range(1, self.v + 1):
                if (x, y) in blocked:
                    print('#', end=' ')  # Display the obstacles as '#'s
                else:
                    print(board.get((x, y), '.'), end=' ')
            print()

    def compute_utility(self, board, move, player):
        """If 'X' wins with this move, return 1; if 'O' wins return -1; else return 0."""
        directions = ((0, 1), (1, 0), (1, -1), (1, 1))
        if any(self.k_in_row(board, move, player, delta) for delta in directions):
            return +1 if player == 'X' else -1
        return 0

    def k_in_row(self, board, move, player, delta_x_y):
        """Return true if there is a line of at least k through move for player.

        Obstacles are not stored in `board`, so they end a line exactly like
        an empty cell does.
        """
        (delta_x, delta_y) = delta_x_y
        n = 0  # n is number of moves in row
        # Walk away from `move` in both directions along the line.
        for step_x, step_y in ((delta_x, delta_y), (-delta_x, -delta_y)):
            x, y = move
            while board.get((x, y)) == player:
                n += 1
                x, y = x + step_x, y + step_y
        n -= 1  # Because we counted move itself twice
        return n >= self.k

    def play_game(self, *players):
        """Play an n-person, move-alternating game.

        Prints the final board dict and the rendered board, then returns the
        utility of the final state for the player who moved first.
        """
        state = self.initial
        while True:
            for player in players:
                move = player(self, state)
                state = self.result(state, move)
                if self.terminal_test(state):
                    print(state.board)
                    self.display(state)
                    return self.utility(state, self.to_move(self.initial))
        

In [None]:
# Dynamically update the utility function used for evaluation in the alpha_beta_eval_bot to account for the "obstacles"
def generate_segments_obstacles(h=6, v=7, k=4):
    """Return every straight run of k cells on an h-by-v board.

    Each segment is a list of k (x, y) tuples.  Segments are listed in four
    groups, in this order: vertical, horizontal, bottom-left to top-right
    diagonals, top-left to bottom-right diagonals.
    """
    offsets = range(k)
    start_rows = range(1, h - k + 2)   # rows from which k cells fit going down
    start_cols = range(1, v - k + 2)   # columns from which k cells fit going right

    vertical = [[(x + t, y) for t in offsets]
                for y in range(1, v + 1) for x in start_rows]
    horizontal = [[(x, y + t) for t in offsets]
                  for x in range(1, h + 1) for y in start_cols]
    # bottom left to top right: the row index falls while the column rises
    rising = [[(x - t, y + t) for t in offsets]
              for x in range(k, h + 1) for y in start_cols]
    # top left to bottom right: row and column rise together
    falling = [[(x + t, y + t) for t in offsets]
               for y in start_cols for x in start_rows]

    return vertical + horizontal + rising + falling


all_segments_obstacles = generate_segments_obstacles()

def count_in_segment_obstacles(segment, state, game):
    """Count the X pieces, O pieces and obstacle cells inside a segment.

    Returns a tuple (X_count, O_count, obstacle_count).  A cell is counted as
    an obstacle only when the board does not hold an 'X' or an 'O' there.
    """
    x_total = o_total = blocked_total = 0
    for cell in segment:
        piece = state.board.get(cell)
        if piece == 'X':
            x_total += 1
        elif piece == 'O':
            o_total += 1
        elif cell in game.obstacles:
            blocked_total += 1
    return x_total, o_total, blocked_total

def eval_segment_obstacles(segment, state, player, game):
    """Score one segment from `player`'s point of view.

    A segment containing an obstacle, or pieces of both players, scores 0.
    Otherwise the score grows with the number of pieces in the segment
    (1, 10, 100, 100000 for one to four pieces) and is negated when the
    pieces do not belong to `player`.
    """
    x_total, o_total, blocked_total = count_in_segment_obstacles(segment, state, game)

    # Blocked or contested segments can never be completed by either side.
    contested = x_total > 0 and o_total > 0
    if blocked_total > 0 or contested:
        return 0

    weights = {1: 1, 2: 10, 3: 100, 4: 100000}
    magnitude = weights.get(max(x_total, o_total), 0)

    owner = 'X' if x_total > o_total else 'O'
    return magnitude if owner == player else -magnitude

def eval_fn_obstacles(state, player, game):
    """Evaluate `state` for `player` as the sum of all segment scores."""
    return sum(
        eval_segment_obstacles(segment, state, player, game)
        for segment in all_segments_obstacles
    )


In [None]:
# Blocked cells as (row, column) pairs; the board's bottom row is row 6.
# NOTE(review): (6, 2) appears twice. That is harmless for membership tests,
# but may be a typo for a different cell - confirm the intended layout.
obstacle_list = [(2,3), (5,1), (6,6), (6,1), (6,3), (2,2), (5, 3), (2, 6), (4, 6), (4, 5), (4, 4), (6, 2), (6, 2)]
# Default 6x7, connect-4 board with the obstacles above.
testC4game_obstacles = C4_obstacles(obstacles = obstacle_list)
# Show the empty board; obstacles are printed as '#'.
testC4game_obstacles.display(testC4game_obstacles.initial)

def obstacles_MC_bot(game, state):
    """Bot: choose a move with `monte_carlo_tree_search` using N=1000."""
    chosen_move = monte_carlo_tree_search(state, game, N=1000)
    return chosen_move


def obstacles_alpha_beta_bot(game, state):
    """Bot: choose a move with `alpha_beta_cutoff_search` at depth 5."""
    chosen_move = alpha_beta_cutoff_search(state, game, d=5)
    return chosen_move


def obstacles_alpha_beta_eval_bot(game, state):
    """Bot: depth-5 alpha-beta scored with the obstacle-aware evaluation."""

    def obstacle_aware_eval(state, player):
        # Bind the current game so the evaluation can see its obstacles.
        return eval_fn_obstacles(state, player, game)

    return alpha_beta_cutoff_search(state, game, d=5, eval_fn=obstacle_aware_eval)
    # return alpha_beta_cutoff_search(state, game, d = 5, eval_fn = eval_fn)

In [None]:
# Play one game on the obstacle board; the MCTS bot is passed first and the
# obstacle-aware alpha-beta bot second.
testC4game_obstacles.play_game(obstacles_MC_bot, obstacles_alpha_beta_eval_bot)

## Reinforcement Learning 

In [8]:
import numpy as np
import pandas as pd
from collections import deque
import time

# Function to calculate heuristic score
def calculate_heuristic(wins, total_time):
    """Return wins per unit of time, or 0 when `total_time` is not positive."""
    if total_time > 0:
        return wins / total_time
    return 0

# Simulation cache and results DataFrame.
# simulation_cache maps (bot1_label, bot2_label) -> (bot1_heuristic, bot2_heuristic).
simulation_cache = {}
results_df = pd.DataFrame(columns=[
    "Bot1", "Bot2", "Bot1_Wins", "Bot2_Wins", "Bot1_Time", "Bot2_Time", "Bot1_Heuristic", "Bot2_Heuristic"
])

# Function to run simulation and calculate heuristic scores
def run_simulation_and_calculate_scores(bot1, bot2, num_games=10):
    """Play `bot1` against `bot2` and return their heuristic scores.

    Parameters:
    - bot1, bot2: ``(bot_function, label)`` tuples.
    - num_games: number of single games to play on a cache miss.

    Returns:
    - ``(bot1_heuristic, bot2_heuristic)``, each computed by
      `calculate_heuristic` from the bot's total wins and the sum of its
      per-game average move times.

    Results are cached by label pair.  A matchup already simulated in the
    opposite order is answered from the cache with the two scores swapped,
    so the same pairing is never simulated twice.  On a cache miss one row
    is appended to `results_df`.  The cache key does not include `num_games`.
    """
    bot1_label, bot2_label = bot1[1], bot2[1]
    bot_pair_key = (bot1_label, bot2_label)  # Use bot labels as cache key

    # Check if results are already cached
    if bot_pair_key in simulation_cache:
        return simulation_cache[bot_pair_key]

    # Same matchup requested in the opposite order: reuse it, swapping scores.
    reversed_key = (bot2_label, bot1_label)
    if reversed_key in simulation_cache:
        cached_bot2, cached_bot1 = simulation_cache[reversed_key]
        return cached_bot1, cached_bot2

    results = {"bot1_wins": 0, "bot2_wins": 0, "bot1_time": 0, "bot2_time": 0}

    # Swap the listing order every other game so the starting side alternates
    for i in range(num_games):
        first, second = (bot1, bot2) if i % 2 == 0 else (bot2, bot1)
        game_results = run_simulation(1, [(first[0], first[1]), (second[0], second[1])])

        results["bot1_wins"] += game_results[bot1_label]["wins"]
        results["bot2_wins"] += game_results[bot2_label]["wins"]
        results["bot1_time"] += game_results[bot1_label]["time_per_move"]
        results["bot2_time"] += game_results[bot2_label]["time_per_move"]

    bot1_heuristic = calculate_heuristic(results["bot1_wins"], results["bot1_time"])
    bot2_heuristic = calculate_heuristic(results["bot2_wins"], results["bot2_time"])

    # Cache the results
    simulation_cache[bot_pair_key] = (bot1_heuristic, bot2_heuristic)

    # Append results to DataFrame
    results_df.loc[len(results_df)] = [
        bot1_label, bot2_label,
        results["bot1_wins"], results["bot2_wins"],
        results["bot1_time"], results["bot2_time"],
        bot1_heuristic, bot2_heuristic
    ]

    return bot1_heuristic, bot2_heuristic

# Function to create heuristic matrix and format as DataFrame
def create_heuristic_matrix_df(bots, opponents):
    """Build a table of heuristic scores for `bots` against `opponents`.

    Parameters:
    - bots, opponents: lists of ``(bot_function, label)`` tuples.

    Returns:
    - A DataFrame with one row per bot: a "Bot" column holding the label, one
      column per opponent label holding the bot's heuristic score against it,
      and an "Average" column.

    A bot is never matched against an identical ``(function, label)`` entry;
    that cell stays 0 and the matchup is left out of the average, so
    "Average" is the mean over the matchups actually played (0 if none).
    """
    bot_names = [bot[1] for bot in bots]
    opponent_names = [opponent[1] for opponent in opponents]

    # Initialize matrix with zeros; the extra last column holds the average
    matrix = np.zeros((len(bots), len(opponents) + 1))

    for i, bot in enumerate(bots):
        played_scores = []
        for j, opponent in enumerate(opponents):
            if bot == opponent:
                continue
            bot_heuristic, _ = run_simulation_and_calculate_scores(bot, opponent)
            matrix[i, j] = bot_heuristic
            played_scores.append(bot_heuristic)

        # Average only over matchups that were played
        if played_scores:
            matrix[i, -1] = sum(played_scores) / len(played_scores)

    # Create DataFrame with column headers
    column_headers = opponent_names + ["Average"]
    heuristic_df = pd.DataFrame(matrix, columns=column_headers)
    heuristic_df.insert(0, "Bot", bot_names)  # Insert bot names as the first column

    return heuristic_df

# Function to run a simulation of games
def run_simulation(num_games, bot_functions_with_labels):
    """Play `num_games` two-player Connect 4 games between two labelled bots.

    Parameters:
    - num_games: number of games to play.
    - bot_functions_with_labels: list of exactly two ``(bot, label)`` tuples;
      ``bot`` is called as ``bot(game, state)`` and returns a move.

    Returns:
    - A dict keyed by label with "wins", "total_time" (seconds spent choosing
      and applying moves), "total_moves" and "time_per_move" (0 when the bot
      never moved).

    The seating is rotated before every game, including the first, so in
    game 1 the second listed bot plays 'X' and the sides swap each game.

    NOTE: an earlier cell of this file defines a three-player function with
    the same name; whichever definition ran last is the one in effect.
    """
    results = {label: {"wins": 0, "total_time": 0, "total_moves": 0}
               for _, label in bot_functions_with_labels}
    bot_order = deque([bot for bot, _ in bot_functions_with_labels])
    label_order = deque([label for _, label in bot_functions_with_labels])

    for _ in range(num_games):
        bot_order.rotate(-1)
        label_order.rotate(-1)
        X_bot, O_bot = bot_order
        X_label, O_label = label_order
        game = C4(h=6, v=7, k=4)
        state = game.initial
        current_player = 'X'
        player_to_bot = {'X': X_bot, 'O': O_bot}
        player_to_label = {'X': X_label, 'O': O_label}

        while True:
            bot = player_to_bot[current_player]

            # perf_counter is monotonic and high-resolution, which makes it the
            # appropriate clock for measuring short intervals.
            start_time = time.perf_counter()
            move = bot(game, state)
            state = game.result(state, move)
            move_time = time.perf_counter() - start_time

            stats = results[player_to_label[current_player]]
            stats["total_time"] += move_time
            stats["total_moves"] += 1

            if game.terminal_test(state):
                # Utility from the first player's ('X') point of view:
                # 1 is counted as a win for X, -1 as a win for O, else a draw.
                winner = game.utility(state, game.to_move(game.initial))
                if winner == 1:
                    results[X_label]["wins"] += 1
                elif winner == -1:
                    results[O_label]["wins"] += 1
                break

            current_player = 'O' if current_player == 'X' else 'X'

    for stats in results.values():
        if stats["total_moves"] > 0:
            stats["time_per_move"] = stats["total_time"] / stats["total_moves"]
        else:
            stats["time_per_move"] = 0

    return results

# Define the bots: each entry is (bot_function, label); labels become the
# row/column names of the heuristic matrices and the simulation cache keys.
monte_carlo_bots = [
    (test_MC_bot_1000, "MC_1000"),
    (test_MC_bot_10000, "MC_10000"),
    (test_MC_bot_5000, "MC_5000")
]

alpha_beta_bots = [
    (test_alpha_beta_bot_4, "AB_4"),
    (test_alpha_beta_bot_5, "AB_5"),
    # Plain alpha-beta at depth 3; this entry previously pointed at the
    # eval-function bot, duplicating "ABE_3" under the "AB_3" label.
    (test_alpha_beta_bot_3, "AB_3")
]

alpha_beta_eval_bots = [
    (test_alpha_beta_eval_bot_4, "ABE_4"),
    (test_alpha_beta_eval_bot_5, "ABE_5"),
    (test_alpha_beta_eval_bot_3, "ABE_3")
]

# Create matrices for each type of bots: every family is scored against the
# bots of the two other families.
monte_carlo_df = create_heuristic_matrix_df(monte_carlo_bots, alpha_beta_bots + alpha_beta_eval_bots)
alpha_beta_df = create_heuristic_matrix_df(alpha_beta_bots, monte_carlo_bots + alpha_beta_eval_bots)
alpha_beta_eval_df = create_heuristic_matrix_df(alpha_beta_eval_bots, monte_carlo_bots + alpha_beta_bots)

# Display the results
print("\nMonte Carlo Bots Heuristic Matrix:")
print(monte_carlo_df)

print("\nAlpha Beta Bots Heuristic Matrix:")
print(alpha_beta_df)

print("\nAlpha Beta Eval Bots Heuristic Matrix:")
print(alpha_beta_eval_df)

NameError: name 'test_MC_bot_1000' is not defined

In [None]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Moves are (x, y) tuples where y is the 1-based column.  For every board
    state the Q-table stores a NumPy vector with one entry per column; the
    entry for a move is found at index ``y - 1``.
    """

    def __init__(self, actions, alpha=0.1, gamma=0.9, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        """Create the agent.

        actions       -- the action list; only its length is used, to size
                         each state's Q-value vector
        alpha         -- learning rate
        gamma         -- discount factor
        epsilon       -- initial exploration rate
        epsilon_decay -- multiplicative decay applied by decay_epsilon()
        epsilon_min   -- lower bound for epsilon
        """
        self.actions = actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        # Q-table: state key -> vector of Q-values, created as zeros on first use
        n_actions = len(actions)
        self.q_table = defaultdict(lambda: np.zeros(n_actions))

    def get_action(self, state, valid_actions):
        """Choose an action from `valid_actions` using an epsilon-greedy policy.

        With probability epsilon a random valid action is returned; otherwise
        the valid action with the highest Q-value (first one on ties).
        """
        if random.uniform(0, 1) < self.epsilon:  # Exploration
            return random.choice(valid_actions)

        # Exploitation
        state_tuple = self._get_state_tuple(state)
        # Convert the (x, y) moves to 0-based column indices
        valid_column_indices = [y - 1 for _, y in valid_actions]
        q_values = self.q_table[state_tuple][valid_column_indices]
        best_action_idx = np.argmax(q_values)

        # q_values is ordered like valid_actions, so the index maps back directly
        return valid_actions[best_action_idx]

    def update_q_value(self, state, action, reward, next_state, valid_actions):
        """Update Q(state, action) using the Q-learning update rule.

        `valid_actions` selects the columns of `next_state` considered for the
        best future value; when it is empty the future value is taken as 0.
        NOTE(review): the training loop in this file passes the moves that
        were legal in `state`, not in `next_state` - confirm that is intended.
        """
        state_tuple = self._get_state_tuple(state)
        next_state_tuple = self._get_state_tuple(next_state)

        # Convert the (x, y) moves to 0-based column indices
        valid_column_indices = [y - 1 for _, y in valid_actions]

        # Best future Q-value among the selected columns (0 if there are none,
        # since np.max raises on an empty selection)
        if valid_column_indices:
            future_q = np.max(self.q_table[next_state_tuple][valid_column_indices])
        else:
            future_q = 0.0

        # Current Q-value of the action, addressed by its 0-based column
        action_column_index = action[1] - 1
        current_q = self.q_table[state_tuple][action_column_index]

        # Q-learning update rule
        td_target = reward + self.gamma * future_q
        self.q_table[state_tuple][action_column_index] = current_q + self.alpha * (td_target - current_q)

    def decay_epsilon(self):
        """Decay epsilon to reduce exploration over time."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def _get_state_tuple(self, state):
        """Convert the board state to a hashable key for the Q-table.

        The key holds every occupied cell together with the piece on it, so
        two boards with the same occupied cells but different owners get
        different keys.
        """
        return tuple(sorted(state.board.items()))

In [None]:
import matplotlib.pyplot as plt


def train(agent, game, episodes, episode_rewards=None):
    """Train `agent` by letting it choose every move of `episodes` games.

    Parameters:
    - agent: object providing get_action(state, valid_actions),
      update_q_value(state, action, reward, next_state, valid_actions) and
      decay_epsilon().
    - game: game object providing initial, actions, result and terminal_test.
    - episodes: number of games to play.
    - episode_rewards: optional list to append each episode's total reward
      to; a new list is created when omitted.

    Returns:
    - The list of per-episode total rewards.

    Each episode runs until the game reaches a terminal state.  The reward of
    a move is the resulting state's `utility` when that state is terminal and
    0 otherwise.  The agent's exploration rate is decayed once per episode.
    NOTE(review): `state.utility` is presumably stored from X's point of view
    (as in the game classes of this file) while the agent moves for both
    sides - confirm this reward signal is what is intended.
    """
    if episode_rewards is None:
        episode_rewards = []

    for _ in range(episodes):
        state = game.initial
        total_reward = 0
        done = game.terminal_test(state)

        while not done:
            valid_actions = game.actions(state)
            action = agent.get_action(state, valid_actions)  # Agent's move

            next_state = game.result(state, action)  # Apply the move
            done = game.terminal_test(next_state)
            reward = next_state.utility if done else 0  # Reward calculation
            total_reward += reward

            # Update the Q-table
            agent.update_q_value(state, action, reward, next_state, valid_actions)

            state = next_state

        episode_rewards.append(total_reward)

        # Decay the agent's own epsilon after each episode
        agent.decay_epsilon()

    print("Finished training")
    return episode_rewards

In [None]:
# Example usage
game = C4()  # Create a Connect 4 game
actions = list(range(1, game.v + 1))  # Column numbers 1..game.v, one action per column
agent = QLearningAgent(actions)  # Initialize the Q-learning agent

train(agent, game, episodes=10000)  # Train the agent for 10000 episodes

In [None]:
# Inspect what the agent stored for the empty starting board.
state = game.initial
state_tuple = agent._get_state_tuple(state)  # Q-table key for that state

# Print the Q-values for the initial state
print(f"Q-values for state {state_tuple}: {agent.q_table[state_tuple]}")