In [2]:
from tensorflow.keras import layers, models
import numpy as np
import random

In [3]:
# Sanity check: draw a single continuous "die roll" from the range [1, 6]
# to see what kind of values random.uniform produces.
sample_roll = random.uniform(1,6)
print(sample_roll)

2.7129553183846196


# Dice Blackjack class (continuous)

In [4]:
class DiceBlackjack:
    """Single-player dice game with continuous rolls.

    The player repeatedly chooses to HIT (add a roll to the running sum) or
    STAY (stop). If the sum exceeds THRESHOLD the player busts: the sum is
    reset to 0 and the status becomes LOST.
    """

    DICE = 6        # upper bound of a single roll
    THRESHOLD = 11  # a sum strictly above this value is a bust
    HIT = 1         # move: roll again / status: game still running
    STAY = 0        # move: stop / status: player stopped
    LOST = -1       # status: player went over THRESHOLD


    def __init__(self):
        """Start a new game with an empty sum and the game still running."""
        self.sum = 0
        self.status = self.HIT


    def roll_dice(self):
        """Return one roll, a float drawn uniformly from [1, DICE]."""
        return random.uniform(1, self.DICE) # was randint in the previous version


    def make(self, move: int):
        """Apply a move to the game.

        Args:
            move: ``STAY`` ends the game keeping the current sum; any other
                value is treated as a hit and adds one roll to the sum.
        """
        if move == self.STAY:
            self.status = self.STAY
            return

        self.sum += self.roll_dice()
        if self.sum > self.THRESHOLD:
            # Bust: the accumulated sum is forfeited.
            self.sum = 0
            self.status = self.LOST
        elif self.sum == self.THRESHOLD:
            # Exactly on the threshold: stop automatically. With continuous
            # rolls an exact float match is extremely unlikely.
            self.status = self.STAY
        return


    def clone(self):
        """Return an independent copy of this game (same sum and status)."""
        c = DiceBlackjack()
        c.sum = self.sum
        c.status = self.status
        return c

    def is_game_ended(self):
        """Return True once the player has stayed or lost."""
        return self.status != self.HIT


# Q-network model creation (fully connected network, not a CNN)
### 1 input (the normalized sum), 1 output (the estimated value of hitting)

In [5]:
def create_q_model():
    """Build and compile the network that estimates the value of hitting.

    Returns:
        A compiled Keras ``Sequential`` model taking one scalar input, with
        two 32-unit ReLU hidden layers and a single linear output, using the
        Adam optimizer and mean-squared-error loss.
    """
    stack = [layers.Input(shape=(1,))]
    # Two identical hidden layers.
    for _ in range(2):
        stack.append(layers.Dense(32, activation='relu'))
    # Unbounded scalar output.
    stack.append(layers.Dense(1, activation='linear'))

    q_network = models.Sequential(stack)
    q_network.compile(optimizer='adam', loss='mse')
    return q_network

# Model training
### Note: the state is normalized as sum / 11; training takes about 20 minutes

In [6]:
NUM_EPISODES = 5000
EPSILON = 0.1  # probability of taking a random action instead of the greedy one

# Sums are divided by the bust threshold before being fed to the network.
STATE_SCALE = float(DiceBlackjack.THRESHOLD)

model = create_q_model()

print("Training started...")

for episode in range(NUM_EPISODES):
    game = DiceBlackjack()
    hit_states = []  # sums at which HIT was chosen during this episode

    while not game.is_game_ended():
        if random.random() < EPSILON:
            # Explore: pick a move at random.
            action = random.choice([game.HIT, game.STAY])
        else:
            # Exploit: hit only if the predicted outcome of hitting beats the
            # reward for staying, which is the current sum.
            state_input = np.array([[game.sum / STATE_SCALE]])
            # Call the model directly instead of model.predict(): predict()
            # carries per-call overhead that dominates for a single sample.
            q_hit_value = model(state_input, training=False).numpy()[0][0]
            action = game.HIT if q_hit_value > game.sum else game.STAY

        if action == game.HIT:
            hit_states.append(game.sum)

        game.make(action)

    # The episode's outcome: the final sum, or 0 after a bust.
    final_reward = game.sum

    if hit_states:
        # Explicit (n, 1) shapes to match the model's declared input/output.
        X = np.array(hit_states, dtype=float).reshape(-1, 1) / STATE_SCALE
        y = np.full((len(hit_states), 1), final_reward, dtype=float)
        model.train_on_batch(X, y)  # one gradient update per finished game

    if (episode + 1) % 500 == 0:
        print(f"Episode {episode + 1}/{NUM_EPISODES} completed.")

print("Training finished!")

Training started...
Episode 500/5000 completed.
Episode 1000/5000 completed.
Episode 1500/5000 completed.
Episode 2000/5000 completed.
Episode 2500/5000 completed.
Episode 3000/5000 completed.
Episode 3500/5000 completed.
Episode 4000/5000 completed.
Episode 4500/5000 completed.
Episode 5000/5000 completed.
Training finished!


# Evaluation


In [15]:
EVAL_EPISODES = 100
total_rewards = 0

print("\nStarting Evaluation (", EVAL_EPISODES," episodes)...")

# Play full games with the greedy policy only: no random moves, no training.
for i in range(EVAL_EPISODES):
    game = DiceBlackjack()

    while True:
        if game.is_game_ended():
            break
        # Network input is the current sum scaled by 11.
        state_input = np.array([[game.sum / 11.0]])
        q_hit_value = model(state_input, training=False).numpy()[0][0]
        # Hit only when the predicted value of hitting beats the current sum.
        action = game.HIT if q_hit_value > game.sum else game.STAY
        game.make(action)

    # The final sum is the episode reward (0 after a bust).
    total_rewards = total_rewards + game.sum

avg_reward = total_rewards / EVAL_EPISODES
print(f"Average Reward over {EVAL_EPISODES} episodes: {avg_reward}")


Starting Evaluation ( 100  episodes)...
Average Reward over 100 episodes: 8.366180211944638


### Pretty-printed table of the learned policy (code generated by an LLM)

In [26]:
print("\n Learned Policy")
print(f"{'Sum (S)':}    | {'Q(S, hit)':}    | {'Value(Stay)':}  | {'Decision'}")
print("-" * 55)

# Probe the model on sums 0.0, 0.5, ..., 11.0.
test_sums = np.linspace(0, 11, 23)

# Smallest probed sum at which the model prefers STAY (None until one is seen).
threshold_found = None

for s in test_sums:
    state_input = np.array([[s / 11.0]])
    q_hit = model.predict(state_input, verbose=0)[0][0]

    # Staying is worth exactly the current sum.
    val_stay = s

    decision = "HIT" if q_hit > val_stay else "STAY"
    if decision == "STAY" and threshold_found is None:
        threshold_found = s

    print(f"{s:<10.1f} | {q_hit:<12.3f} | {val_stay:<12.1f} | {decision}")

print("-" * 55)
print(f"\nEstimated Threshold: The agent stops hitting around {threshold_found}")


 Learned Policy
Sum (S)    | Q(S, hit)    | Value(Stay)  | Decision
-------------------------------------------------------
0.0        | 7.354        | 0.0          | HIT
0.5        | 7.415        | 0.5          | HIT
1.0        | 7.476        | 1.0          | HIT
1.5        | 7.536        | 1.5          | HIT
2.0        | 7.558        | 2.0          | HIT
2.5        | 7.563        | 2.5          | HIT
3.0        | 7.558        | 3.0          | HIT
3.5        | 7.535        | 3.5          | HIT
4.0        | 7.500        | 4.0          | HIT
4.5        | 7.464        | 4.5          | HIT
5.0        | 7.393        | 5.0          | HIT
5.5        | 7.297        | 5.5          | HIT
6.0        | 7.202        | 6.0          | HIT
6.5        | 7.106        | 6.5          | HIT
7.0        | 7.011        | 7.0          | HIT
7.5        | 6.915        | 7.5          | STAY
8.0        | 6.820        | 8.0          | STAY
8.5        | 6.724        | 8.5          | STAY
9.0        | 6.629        