In [11]:
import gymnasium as gym
from stable_baselines3 import DQN, PPO
import numpy as np

class ShapedLunarLander(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self.prev_obs = None
        
    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        self.prev_obs = obs
        return obs, info
    
    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        
        if self.prev_obs is not None:
            x, y, vx, vy = obs[:4]
            velocity_mag = np.sqrt(vx**2 + vy**2)
            
            # Hover penalty (çok sabit kalmasın)
        if y > 0.15 and velocity_mag < 0.05:
            reward -= 0.02   # küçük ceza, -4 çok fazlaydı

        # Fuel penalty (gerçekçi yakıt kullanımı)
        if action == 2:      # main engine
            reward -= 0.1
        elif action in [1,3]: # side engines
            reward -= 0.015

        
        self.prev_obs = obs
        return obs, reward, terminated, truncated, info
# Environment oluştur
normal_env = gym.make("LunarLander-v3")
rnormal_env = gym.make("LunarLander-v3",render_mode="human")
shaped_env = ShapedLunarLander(gym.make("LunarLander-v3"))
rshaped_env = ShapedLunarLander(gym.make("LunarLander-v3",render_mode="human"))

model = PPO(
        policy="MlpPolicy",
        env=env,
        verbose=1,
        learning_rate=3e-4,        # PPO için tipik
        n_steps=2048,              # rollout buffer length
        batch_size=64,
        n_epochs=10,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
    )
# Eğitilmiş agenti yükle (veya halihazırda model değişkenindeyse onu kullanabilirsin)
#model = PPO.load(r"E:\UNITY\BombermanTower\python\Jupyter\PPO\fresh_51200_model", env)
model.learn(total_timesteps=50000,reset_num_timesteps=True)
size = model.num_timesteps
model.save(rf"E:\UNITY\BombermanTower\python\Jupyter\PPO\fresh_{size}_model")
rshaped_env.close()

obs, _ = rnormal_env.reset(seed=42)
done = False

while not done:
    # Agent aksiyonu tahmin eder (deterministic=True, yani rastgelelik yok)
    action, _ = model.predict(obs, deterministic=True)
    
    # Ortamda aksiyonu uygula
    obs, reward, terminated, truncated, _ = rnormal_env.step(action)
    
    # Oyunun bitip bitmediğini kontrol et
    done = terminated or truncated

    # Render ile ekranda göster
    rnormal_env.render()
rnormal_env.close()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 96       |
|    ep_rew_mean     | -173     |
| time/              |          |
|    fps             | 2109     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 95.8        |
|    ep_rew_mean          | -172        |
| time/                   |             |
|    fps                  | 1504        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007470076 |
|    clip_fraction        | 0.0448      |
|    clip_range           | 0.2         |
|    entropy_loss   

In [5]:
print(f"Model epsilon: {model.exploration_rate}")

# Modeli yükledikten sonra
print(f"Toplam eğitim adımı: {model.num_timesteps}")
print(f"Exploration rate: {model.exploration_rate}")

# Eğer num_timesteps düşükse, model yeterince eğitilmemiş olabilir

AttributeError: 'PPO' object has no attribute 'exploration_rate'

In [11]:
# Model değişkeni hala var mı kontrol edin
try:
    print(f"Model timesteps: {model.num_timesteps}")
    print("✅ Model hala bellekte")
    size = model.num_timesteps
    # Hemen save edin
    model.save(rf"E:\UNITY\BombermanTower\python\Jupyter\fresh_{size}_model")
    print("Acil durum save yapıldı!")
    
except NameError:
    print("❌ Model değişkeni kaybolmuş")
except Exception as e:
    print(f"❌ Model bozulmuş: {e}")


Model timesteps: 51200
✅ Model hala bellekte
Acil durum save yapıldı!


In [None]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.evaluation import evaluate_policy
class AntiHoveringLunarLander(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self.prev_obs = None
        self.hovering_count = 0
        self.step_count = 0
        self.altitude_history = []
        
    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        self.prev_obs = obs
        self.hovering_count = 0
        self.step_count = 0
        self.altitude_history = []
        return obs, info
    
    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        self.step_count += 1
        
        if self.prev_obs is not None:
            x, y, vx, vy, angle, angular_vel, leg1, leg2 = obs
            velocity_mag = np.sqrt(vx**2 + vy**2)
            
            # Altitude history tutla
            self.altitude_history.append(y)
            if len(self.altitude_history) > 20:
                self.altitude_history.pop(0)
            
            # AGRESIF HOVERING DETECTION
            is_hovering = (
                y > 0.15 and                    # Yeterince yüksekte
                velocity_mag < 0.08 and         # Çok yavaş
                abs(vx) < 0.05 and             # Yatay hareket az
                abs(vy) < 0.05                 # Dikey hareket az
            )
            
            if is_hovering:
                self.hovering_count += 1
                
                # Hovering süresine göre artan ceza
                base_penalty = -0.1
                time_penalty = -0.02 * self.hovering_count
                
                # Uzun süre hovering = episode sonlandır
                if self.hovering_count > 30:  # 30 step = yaklaşık 1 saniye
                    reward = -200  # Büyük ceza
                    terminated = True
                    info['hovering_termination'] = True
                else:
                    reward += base_penalty + time_penalty
                
            else:
                self.hovering_count = max(0, self.hovering_count - 2)  # Yavaşça azalt
            
            # FUEL AGRESIF CEZA
            if action == 2:  # Ana motor
                reward -= 0.03
            elif action in [1, 3]:  # Yan motorlar
                reward -= 0.015
            
            # PROGRESS ZORLAMA
            target_x = 0.0
            distance_to_target = abs(x - target_x)
            
            # Çok uzak kalırsa ceza
            if distance_to_target > 0.8:
                reward -= 0.05
            
            # Aşağı gitmeyi teşvik et
            if len(self.altitude_history) >= 10:
                recent_avg = np.mean(self.altitude_history[-10:])
                older_avg = np.mean(self.altitude_history[-20:-10]) if len(self.altitude_history) >= 20 else recent_avg
                
                if recent_avg < older_avg:  # Aşağı iniyor
                    reward += 0.05
                elif recent_avg > older_avg:  # Yukarı çıkıyor
                    reward -= 0.03
            
            # MAX EPISODE LIMIT
            if self.step_count > 500:  # Normal limit 1000, biz 500 yapalım
                reward -= 0.1
                if self.step_count > 600:
                    terminated = True
                    info['time_limit_termination'] = True
        
        self.prev_obs = obs
        return obs, reward, terminated, truncated, info
class ShapedLunarLander(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self.prev_obs = None
        
    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        self.prev_obs = obs
        return obs, info
    
    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        
        if self.prev_obs is not None:
            x, y, vx, vy = obs[:4]
            velocity_mag = np.sqrt(vx**2 + vy**2)
            
            # Hovering penalty
            if y > 0.1 and velocity_mag < 0.05:
                reward -= 0.02
            
            # Fuel penalty
            if action == 2:  # Main engine
                reward -= 0.01
            elif action in [1, 3]:  # Side engines
                reward -= 0.005
        
        self.prev_obs = obs
        return obs, reward, terminated, truncated, info

# 25K modelinin üzerine shaped env ile eğitim
def train_on_shaped_env():
    # Shaped environment oluştur
    shaped_env = ShapedLunarLander(gym.make("LunarLander-v3"))
    
    # 25K modelini yükle
    model = DQN.load(r"E:\UNITY\BombermanTower\python\Jupyter\fresh_100000_model", env=shaped_env)
    
    print(f"Başlangıç timesteps: {model.num_timesteps}")
    
    # 25K step daha eğit
    model.learn(total_timesteps=200000, reset_num_timesteps=False)  # 10K + 25K = 35K
    
    print(f"Final timesteps: {model.num_timesteps}")
    
    # Test
    mean_reward, _ = evaluate_policy(model, shaped_env, n_eval_episodes=5)
    print(f"Shaped env performansı: {mean_reward:.2f}")
    
    # Kaydet
    model.save(r"E:\UNITY\BombermanTower\python\Jupyter\shaped_model")
    
    shaped_env.close()
    return model

def test_anti_hovering():
    """Anti-hovering environment'ları test et"""
    
    from stable_baselines3.common.evaluation import evaluate_policy
    
    # 1. Normal shaped env (sizin mevcut)
    shaped_env = AntiHoveringLunarLander(gym.make("LunarLander-v3"))
    
    # 2. Force descent env
    force_env = ForceDescentLunarLander(gym.make("LunarLander-v3"))
    
    print("=== ANTI-HOVERING TEST ===")
    
    # Mevcut modelinizi test edin
    model_path = r"E:\UNITY\BombermanTower\python\Jupyter\shaped_model"
    
    try:
        # Shaped env ile test
        model1 = DQN.load(model_path, env=shaped_env)
        perf1, _ = evaluate_policy(model1, shaped_env, n_eval_episodes=3)
        print(f"Anti-hovering env: {perf1:.2f}")
        
        # Force env ile test
        model2 = DQN.load(model_path, env=force_env)
        perf2, _ = evaluate_policy(model2, force_env, n_eval_episodes=3)
        print(f"Force descent env: {perf2:.2f}")
        
    except:
        print("Model yüklenemedi, yeni model eğitiliyor...")
        
        # Anti-hovering ile eğitim
        model = DQN("MlpPolicy", shaped_env, verbose=1)
        model.learn(total_timesteps=20000)
        
        perf, _ = evaluate_policy(model, shaped_env, n_eval_episodes=5)
        print(f"Yeni anti-hovering model: {perf:.2f}")
        
        model.save("anti_hovering_model")
    
    shaped_env.close()
    force_env.close()
class ForceDescentLunarLander(gym.Wrapper):
    """Daha da agresif - iniş zorlaması"""
    def __init__(self, env):
        super().__init__(env)
        self.prev_obs = None
        self.step_count = 0
        
    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        self.prev_obs = obs
        self.step_count = 0
        return obs, info
    
    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        self.step_count += 1
        
        if self.prev_obs is not None:
            x, y, vx, vy = obs[:4]
            prev_y = self.prev_obs[1]
            
            # BASIT AMA ETKİLİ: Step sayısına göre ceza
            altitude_penalty = -0.01 * self.step_count * y
            reward += altitude_penalty
            
            # Aşağı gitme ödülü
            if y < prev_y:  # Aşağı iniyor
                reward += 0.1 * (prev_y - y)
            
            # Yukarı gitme cezası
            elif y > prev_y:  # Yukarı çıkıyor
                reward -= 0.2 * (y - prev_y)
            
            # Yükseklik threshold'u
            if y > 1.0 and self.step_count > 100:
                reward -= 1.0  # Çok yüksekte çok uzun süre
            
            # Episode sonlandırma
            if self.step_count > 400:
                terminated = True
        
        self.prev_obs = obs
        return obs, reward, terminated, truncated, info
def fresh_shaped_training():
    """Sıfırdan shaped environment ile eğitim"""
    
    print("\n=== SIFIRDAN SHAPED EĞİTİMİ ===")
    
    shaped_env = AntiHoveringLunarLander(gym.make("LunarLander-v3"))
    
    # Yeni model oluştur
    fresh_model = DQN("MlpPolicy", shaped_env, verbose=1)
    
    # Eğit
    fresh_model.learn(total_timesteps=150000)
    
    # Test
    fresh_perf, _ = evaluate_policy(fresh_model, shaped_env, n_eval_episodes=5)
    print(f"Sıfırdan shaped model performansı: {fresh_perf:.2f}")
    
    # Kaydet
    fresh_model.save(r"E:\UNITY\BombermanTower\python\Jupyter\fresh_shaped_model")
    
    shaped_env.close()
    return fresh_model, fresh_perf

def quick_fix():
    """Hızlı hovering düzeltmesi"""
    
    # Environment
    env = AntiHoveringLunarLander(gym.make("LunarLander-v3"))
    
    # Model yükle (yeni model oluşturmaya gerek yok)
    #model = DQN.load(r"E:\UNITY\BombermanTower\python\Jupyter\PPO\final_anti_hovering_model", env=env)
    
    model = PPO(
        policy="MlpPolicy",
        env=env,
        verbose=1,
        learning_rate=3e-4,        # PPO için tipik
        n_steps=2048,              # rollout buffer length
        batch_size=64,
        n_epochs=10,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
    )

    print(f"Başlangıç timesteps: {model.num_timesteps}")
    
    model.learn(total_timesteps=50000, reset_num_timesteps=False)
    
    print(f"Final timesteps: {model.num_timesteps}")
    
    # Test
    from stable_baselines3.common.evaluation import evaluate_policy
    perf, _ = evaluate_policy(model, env, n_eval_episodes=5)
    print(f"Anti-hovering model performansı: {perf:.2f}")
    
    # Kaydet
    model.save("PPO\AntiHoveringLunarLander_model")
    env.close()
    
    return model

# Çalıştır
model = quick_fix()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Başlangıç timesteps: 0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 93.3     |
|    ep_rew_mean     | -212     |
| time/              |          |
|    fps             | 2064     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 98.2        |
|    ep_rew_mean          | -214        |
| time/                   |             |
|    fps                  | 1474        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009106861 |
|    clip_fraction        | 0.0551      |
|    clip_range           | 0.2        



Anti-hovering model performansı: -306.54


In [29]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import DQN

class ExtremeAntiHoveringLunarLander(gym.Wrapper):
    """HOVERING = INSTANT DEATH"""
    def __init__(self, env):
        super().__init__(env)
        self.prev_obs = None
        self.hovering_steps = 0
        self.step_count = 0
        self.max_altitude_seen = 0
        
    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        self.prev_obs = obs
        self.hovering_steps = 0
        self.step_count = 0
        self.max_altitude_seen = obs[1]
        return obs, info
    
    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        self.step_count += 1
        
        x, y, vx, vy, angle, angular_vel, leg1, leg2 = obs
        velocity_mag = np.sqrt(vx**2 + vy**2)
        
        # Max altitude güncelle
        self.max_altitude_seen = max(self.max_altitude_seen, y)
        
        # HOVERING DETECTION (çok hassas)
        is_hovering = (
            y > 0.1 and 
            velocity_mag < 0.1 and 
            abs(vy) < 0.03
        )
        
        if is_hovering:
            self.hovering_steps += 1
            
            # SADECE 5 STEP HOVERING İZİN VER
            if self.hovering_steps > 5:
                reward = -500  # EXTREME PENALTY
                terminated = True
                info['hovering_death'] = True
                print(f"🚨 HOVERING DEATH at step {self.step_count}")
        else:
            self.hovering_steps = 0
        
        # ALTITUDE-BASED EXTREME PENALTIES
        if y > 0.5:
            reward -= 2.0  # Çok yüksekte olmak = büyük ceza
        
        if y > 0.8:
            reward -= 5.0  # Aşırı yüksekte = çok büyük ceza
        
        # YUKAR ÇIKMA = ÖLÜM
        if self.prev_obs is not None:
            prev_y = self.prev_obs[1]
            if y > prev_y + 0.02:  # Yukarı çıkıyor
                reward -= 10.0
                print(f"⬆️ UPWARD MOVEMENT PENALTY: {y:.3f} -> {prev_y:.3f}")
        
        # PROGRESS FORCE (aşağı gitmeyi zorla)
        if self.prev_obs is not None:
            prev_y = self.prev_obs[1]
            if y < prev_y:  # Aşağı iniyor
                reward += 5.0 * (prev_y - y)  # Büyük ödül
        
        # TIME PRESSURE (zamanla artan ceza)
        time_penalty = -0.1 * (self.step_count / 100)
        reward += time_penalty
        
        # EPISODE HARD LIMIT
        if self.step_count > 200:  # Çok kısa episode
            reward = -1000
            terminated = True
            info['time_limit_death'] = True
        
        # ALTITUDE REGRESSION REWARD
        altitude_progress = self.max_altitude_seen - y
        if altitude_progress > 0:
            reward += altitude_progress * 10  # Ne kadar aşağı indiyse o kadar ödül
        
        self.prev_obs = obs
        return obs, reward, terminated, truncated, info

class ForcedDescentLunarLander(gym.Wrapper):
    """Baştan aşağı inmeye zorla"""
    def __init__(self, env):
        super().__init__(env)
        self.step_count = 0
        
    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        self.step_count = 0
        return obs, info
    
    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        self.step_count += 1
        
        y = obs[1]  # altitude
        
        # BASIT BRUTAL APPROACH: Altitude = instant penalty
        altitude_penalty = -y * 10  # Her metre yükseklik = -10 reward
        reward += altitude_penalty
        
        # Step penalty
        reward -= 0.1
        
        # Hard time limit
        if self.step_count > 150:
            terminated = True
            reward = -500
        
        return obs, reward, terminated, truncated, info

def nuclear_option():
    """NUCLEAR OPTION: Sıfırdan tamamen yeni approach"""
    
    print("🚨 NUCLEAR OPTION: EXTREME ANTI-HOVERING 🚨")
    
    # En extreme environment
    env = ExtremeAntiHoveringLunarLander(gym.make("LunarLander-v3"))

    model = DQN.load(r"E:\UNITY\BombermanTower\python\Jupyter\fresh_100000_model", env=env)

    model.learn(total_timesteps=200000, reset_num_timesteps=False)  # 10K + 25K = 35K
    # Yeni model - sıfırdan başla
    # model = DQN(
    #     "MlpPolicy", 
    #     env, 
    #     verbose=1,
    #     learning_rate=1e-3,
    #     exploration_initial_eps=1.0,    # Tam exploration
    #     exploration_final_eps=0.01,     # Minimum exploration  
    #     exploration_fraction=0.5,       # Yarıya kadar explore
    #     target_update_interval=500,     # Daha sık update
    #     train_freq=1,                   # Her step train
    #     buffer_size=10000               # Küçük buffer
    # )
    
    print("🔥 EXTREME TRAINING BAŞLIYOR...")
    
    # Çok kısa ama yoğun eğitim
    #model.learn(total_timesteps=100000)
    
    print("✅ EXTREME TRAINING BİTTİ")
    
    # Test
    from stable_baselines3.common.evaluation import evaluate_policy
    
    print("🧪 TESTING...")
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
    print(f"EXTREME Model Performance: {mean_reward:.2f} ± {std_reward:.2f}")
    
    # Manuel test
    print("\n🎮 MANUEL TEST:")
    obs, _ = env.reset()
    total_reward = 0
    steps = 0
    
    while steps < 300:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        steps += 1
        
        y = obs[1]
        print(f"Step {steps}: Action={action}, Altitude={y:.3f}, Reward={reward:.2f}")
        
        if terminated or truncated:
            break
    
    print(f"Manuel test sonucu: {total_reward:.2f} in {steps} steps")
    
    if 'hovering_death' in info:
        print("💀 HOVERING DEATH!")
    elif 'time_limit_death' in info:
        print("⏰ TIME LIMIT DEATH!")
    elif terminated:
        print("🎯 MISSION COMPLETE!")
    
    # Son çare kaydet
    model.save("nuclear_anti_hovering_model")
    env.close()
    
    return model

def simple_brutal_fix():
    """En basit brutal çözüm"""
    
    print("💥 SIMPLE BRUTAL FIX")
    
    env = ForcedDescentLunarLander(gym.make("LunarLander-v3"))
    model = DQN.load(r"E:\UNITY\BombermanTower\python\Jupyter\brutal_model", env=env)
    #model = DQN("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=100000, reset_num_timesteps=False)
    
    from stable_baselines3.common.evaluation import evaluate_policy
    perf, _ = evaluate_policy(model, env, n_eval_episodes=5)
    print(f"Brutal model: {perf:.2f}")
    
    model.save("brutal_model2")
    env.close()
    
    return model

# SON ÇARE TEST
def debug_hovering():
    """Hovering davranışını debug et"""
    
    print("🔍 HOVERING DEBUG")
    
    # Mevcut modelinizi yükle
    env = gym.make("LunarLander-v3")
    model = DQN.load(r"E:\UNITY\BombermanTower\python\Jupyter\shaped_model", env=env)
    
    # 10 step takip et
    obs, _ = env.reset()
    for step in range(50):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        
        x, y, vx, vy = obs[:4]
        velocity = np.sqrt(vx**2 + vy**2)
        
        print(f"Step {step:2d}: Action={action}, Y={y:.3f}, Vel={velocity:.3f}, Reward={reward:.2f}")
        
        if terminated or truncated:
            break
    
    env.close()

# KULLANIM:
print("SEÇENEKLER:")
print("1. nuclear_option()      - EN EXTREME ÇÖZÜM")
print("2. simple_brutal_fix()   - BASIT BRUTAL")  
print("3. debug_hovering()      - MEVCUT MODELİ DEBUG ET")

# EN EXTREME ÇÖZÜM:
model = debug_hovering()

SEÇENEKLER:
1. nuclear_option()      - EN EXTREME ÇÖZÜM
2. simple_brutal_fix()   - BASIT BRUTAL
3. debug_hovering()      - MEVCUT MODELİ DEBUG ET
🔍 HOVERING DEBUG
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Step  0: Action=2, Y=1.386, Vel=0.739, Reward=1.31
Step  1: Action=2, Y=1.375, Vel=0.718, Reward=2.40
Step  2: Action=2, Y=1.363, Vel=0.718, Reward=0.32
Step  3: Action=2, Y=1.352, Vel=0.708, Reward=1.12
Step  4: Action=2, Y=1.341, Vel=0.672, Reward=3.68
Step  5: Action=2, Y=1.331, Vel=0.646, Reward=2.55
Step  6: Action=2, Y=1.322, Vel=0.633, Reward=1.16
Step  7: Action=2, Y=1.312, Vel=0.644, Reward=-1.20
Step  8: Action=2, Y=1.303, Vel=0.626, Reward=1.61
Step  9: Action=2, Y=1.294, Vel=0.639, Reward=-1.37
Step 10: Action=2, Y=1.285, Vel=0.647, Reward=-0.85
Step 11: Action=2, Y=1.277, Vel=0.646, Reward=-0.01
Step 12: Action=1, Y=1.268, Vel=0.654, Reward=-0.39
Step 13: Action=2, Y=1.260, Vel=0.668, Reward=-1.12
Step 14: Action=1, Y=1.251, Vel=0.673, R