<a href="https://colab.research.google.com/github/makaronaaa/DataSciencePython/blob/main/TicTacToe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import numpy as np
import tensorflow as tf
#environment
class TicTacToe:
    """Tic-Tac-Toe environment on a 3x3 board.

    Cells hold 0 (empty), 1 (first player) or -1 (second player).
    ``current_player`` is the marker that the next successful move will place.
    """

    def __init__(self):
        """Create an empty board with player 1 to move."""
        self.board = np.zeros((3, 3), dtype=np.int8)
        self.current_player = 1

    def reset(self):
        """Clear the board and give the move back to player 1."""
        self.board = np.zeros((3, 3), dtype=np.int8)
        self.current_player = 1

    def get_state(self):
        """Return a copy of the board so callers cannot mutate the game."""
        return np.copy(self.board)

    def make_move(self, row, col):
        """Place the current player's marker at (row, col).

        Returns True and hands the turn to the other player when the move is
        legal. Returns False, leaving the game untouched, when the target is
        off the board or already occupied.
        """
        n_rows, n_cols = self.board.shape
        # Reject off-board coordinates explicitly: negative indices would
        # otherwise wrap around silently and large ones would raise IndexError.
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            return False
        if self.board[row][col] != 0:
            return False

        self.board[row][col] = self.current_player
        self.current_player = -self.current_player
        return True

    def is_game_over(self):
        """Report the outcome of the game.

        Returns the winner's marker (1 or -1) if a row, column or diagonal is
        filled by one player, 0 if the board is full without a winner (draw),
        and None while the game is still in progress.

        NOTE: a draw is reported as 0, which is falsy -- callers must compare
        the result against None rather than test its truthiness.
        """
        board = self.board
        # Same priority order as checking rows, then columns, then the main
        # diagonal, then the anti-diagonal.
        lines = [
            *board,
            *board.T,
            board.diagonal(),
            np.fliplr(board).diagonal(),
        ]
        for line in lines:
            if line[0] != 0 and line[0] == line[1] == line[2]:
                return line[0]

        if np.count_nonzero(board) == 9:
            return 0  # draw
        return None  # game is not over yet

In [15]:
#     Reinforcement Learning Agent
class RLAgent:
    """Neural-network agent trained to imitate human Tic-Tac-Toe moves.

    The model maps a flattened 3x3 board (9 values) to a softmax distribution
    over the 9 cells.
    """

    def __init__(self):
        """Create the optimizer and the network."""
        # The optimizer has to exist before build_model() runs, because
        # build_model() passes self.optimizer to model.compile().
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the 9 -> 64 -> 64 -> 9 dense network."""
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation='relu', input_shape=(9,)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(9, activation='softmax')
        ])
        model.compile(loss='categorical_crossentropy', optimizer=self.optimizer)
        return model

    def get_action(self, state):
        """Return the index (0-8) of the cell with the highest model output.

        The result is not filtered for occupied cells; callers must check
        that the chosen cell is free.
        """
        q_values = self.model.predict(np.expand_dims(state.flatten(), axis=0))
        return np.argmax(q_values)

    def train(self, states, target_values):
        """Run one gradient step on the mean squared error.

        states: array-like of flattened boards, one row of 9 values each.
        target_values: array-like with one row of 9 target values per state.

        NOTE: this custom step minimises MSE, not the categorical
        cross-entropy passed to compile(); compile()'s loss is only used by
        the built-in Keras fit/evaluate methods.
        """
        states = np.asarray(states, dtype=np.float32)
        target_values = np.asarray(target_values, dtype=np.float32)
        if states.size == 0:
            return  # nothing to learn from

        with tf.GradientTape() as tape:
            predictions = self.model(states, training=True)
            loss = tf.reduce_mean(tf.square(target_values - predictions))
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables)
        )

    def train_with_human_feedback(self, states, human_labels):
        """Train the model to reproduce the moves a human chose.

        states: array-like of flattened boards.
        human_labels: cell indices (0-8), one per state, of the human's move.
        """
        labels = np.asarray(human_labels, dtype=int)
        # One-hot encode the chosen cell for every recorded state.
        target_values = np.zeros((len(states), 9))
        target_values[np.arange(len(labels)), labels] = 1
        self.train(states, target_values)

def get_human_move():
    """Prompt until the user enters a cell number from 1 to 9.

    Cells are numbered row by row, left to right (1 is the top-left cell,
    9 the bottom-right one). Returns the matching zero-based (row, col)
    tuple. Non-numeric or out-of-range input is rejected and the prompt is
    repeated; whether the cell is free is not checked here.
    """
    while True:
        try:
            move = int(input("Enter your move (1-9): "))
        except ValueError:
            print("Invalid input. Try again.")
            continue
        # Without this check 0 would map to row -1 and 10 to row 3.
        if not 1 <= move <= 9:
            print("Invalid input. Try again.")
            continue
        return divmod(move - 1, 3)

In [16]:
class RLAgent:
    """Neural-network agent trained to imitate human Tic-Tac-Toe moves.

    The model maps a flattened 3x3 board (9 values) to a softmax distribution
    over the 9 cells.
    """

    def __init__(self, optimizer=None):
        """Build the network around the given Keras optimizer.

        optimizer: optimizer used both for compile() and for the custom
            training step; defaults to Adam with learning rate 0.001.
        """
        if optimizer is None:
            optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        self.optimizer = optimizer
        self.model = self.build_model(optimizer)

    def build_model(self, optimizer):
        """Build and compile the 9 -> 64 -> 64 -> 9 dense network."""
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation='relu', input_shape=(9,)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(9, activation='softmax')
        ])
        model.compile(loss='categorical_crossentropy', optimizer=optimizer)
        return model

    def get_action(self, state):
        """Return the index (0-8) of the cell with the highest model output.

        The result is not filtered for occupied cells; callers must check
        that the chosen cell is free.
        """
        q_values = self.model.predict(np.expand_dims(state.flatten(), axis=0))
        return np.argmax(q_values)

    def train(self, states, target_values):
        """Run one gradient step on the mean squared error.

        states: array-like of flattened boards, one row of 9 values each.
        target_values: array-like with one row of 9 target values per state.
        """
        states = np.asarray(states, dtype=np.float32)
        target_values = np.asarray(target_values, dtype=np.float32)
        if states.size == 0:
            return  # nothing to learn from

        with tf.GradientTape() as tape:
            predictions = self.model(states, training=True)
            loss = tf.reduce_mean(tf.square(target_values - predictions))
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables)
        )

    def train_with_human_feedback(self, states, human_labels):
        """Train the model to reproduce the moves a human chose.

        states: array-like of flattened boards.
        human_labels: cell indices (0-8), one per state, of the human's move.
        """
        labels = np.asarray(human_labels, dtype=int)
        # One-hot encode the chosen cell for every recorded state.
        target_values = np.zeros((len(states), 9))
        target_values[np.arange(len(labels)), labels] = 1
        self.train(states, target_values)

In [None]:
#main
# Interactive training loop: the human plays against the agent, and after each
# game the agent is trained to reproduce the human's moves.
env = TicTacToe()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
agent = RLAgent(optimizer)

num_episodes = 1
epsilon = 0.2  # probability that the agent plays a random legal move

for episode in range(num_episodes):
    env.reset()
    done = False
    # Outcome reported by is_game_over(): None while playing, 0 for a draw,
    # 1 / -1 for a win. A draw (0) is falsy, so the outcome is always compared
    # against None instead of being used as a truth value.
    result = None
    states = []
    human_labels = []

    while not done:
        #human's turn (the human moves first and therefore plays as 1)
        state = env.get_state()
        while True:
            human_row, human_col = get_human_move()
            on_board = 0 <= human_row < 3 and 0 <= human_col < 3
            if on_board and env.make_move(human_row, human_col):
                break
            print("That square is not available. Try again.")
        # Record the position the human saw together with the move chosen.
        states.append(state.flatten())
        human_labels.append(human_row * 3 + human_col)
        print("Human's move:")
        print(env.board)

        result = env.is_game_over()
        done = result is not None
        if done:
            break

        #agent makes a move (plays as -1); only empty cells are considered
        valid_moves = np.argwhere(env.board == 0)
        if np.random.random() < epsilon:
            rl_row, rl_col = valid_moves[np.random.randint(len(valid_moves))]
        else:
            # Evaluate the board as it is now, i.e. after the human's move.
            current = env.get_state().flatten().astype(np.float32)
            q_values = agent.model.predict(np.expand_dims(current, axis=0))
            valid_q_values = [q_values[0][r * 3 + c] for r, c in valid_moves]
            rl_row, rl_col = valid_moves[np.argmax(valid_q_values)]

        env.make_move(int(rl_row), int(rl_col))
        print("RL Agent's move:")
        print(env.board)

        result = env.is_game_over()
        done = result is not None

    #update RL Agent with human feedback
    agent.train_with_human_feedback(np.array(states), np.array(human_labels))

    if result == 1:
        print("real human wins!")
    elif result == -1:
        print("agent wins!")
    else:
        print("draw!")

    if episode % 100 == 0:
        print(f"Episode: {episode}")

print("Training Finito!")
