## 📦 1. Installation et Setup

Installation des dépendances nécessaires

In [None]:
# Installation des packages requis
!pip install -q torch torchvision gymnasium pygame numpy matplotlib seaborn pyyaml tqdm scipy pandas tensorboard

In [None]:
# Report whether PyTorch can see a CUDA GPU, and describe it if so
import torch

has_cuda = torch.cuda.is_available()
print(f"üî• CUDA disponible: {has_cuda}")
if not has_cuda:
    # No GPU: everything will run on the CPU
    print("   ‚ö†Ô∏è  CPU mode (sera plus lent)")
else:
    # Name and total memory (in GB) of device 0
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"   GPU: {gpu_name}")
    print(f"   M√©moire: {gpu_mem_gb:.2f} GB")

## 📁 2. Cloner le Projet

Clone ton repository GitHub

In [None]:
# Cloner le projet depuis GitHub
!git clone https://github.com/mohhajji-1111/projet_RL.git
%cd projet_RL

print("‚úÖ Projet clon√© avec succ√®s!")

In [None]:
# List the contents of the current directory to check the project layout
!ls -la

## 🔧 3. Configuration de l'Environnement

Setup de l'environnement Python

⚠️ **Important**: Cette cellule configure le Python path pour que les imports fonctionnent correctement.

---

In [None]:
import sys
import os
from pathlib import Path

# Ajouter le projet au Python path
project_root = Path.cwd()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# V√©rifier que les modules sont accessibles
print(f"‚úÖ Project root: {project_root}")
print(f"‚úÖ Python path configur√©")
print(f"\nüìÅ Structure du projet:")
!ls -la src/
!ls -la src/agents/

## 🎮 4. Test de l'Environnement

Vérifier que l'environnement fonctionne

In [None]:
from src.environment.navigation_env import NavigationEnv
import numpy as np

# Build the environment with rendering disabled (render_mode=None)
env = NavigationEnv(width=800, height=600, render_mode=None)

print(f"‚úÖ Environnement cr√©√© avec succ√®s!")
print(f"   Observation space: {env.observation_space.shape}")
print(f"   Action space: {env.action_space.n} actions")

# Quick smoke test: reset once and look at the initial observation
state, _ = env.reset()
print(f"\nüìä √âtat initial: {state}")

# Take a few random actions
for i in range(5):
    action = env.action_space.sample()
    state, reward, done, truncated, info = env.step(action)
    print(f"   Step {i+1}: action={action}, reward={reward:.2f}, done={done}")
    # Once an episode has ended the environment must be reset before stepping
    # again (assumes NavigationEnv follows the Gymnasium step/reset contract,
    # which its 5-tuple step() and 2-tuple reset() suggest).
    if done or truncated:
        state, _ = env.reset()

print("\n‚úÖ Environnement fonctionne correctement!")

## 🧠 5. Initialiser l'Agent avec Curiosité

Créer l'agent CuriosityAgent avec le module ICM

In [None]:
from src.agents.curiosity_agent import CuriosityAgent
import yaml


def _count_params(module):
    """Return the total number of elements over module.parameters()."""
    return sum(p.numel() for p in module.parameters())


# Load the configuration. The encoding is given explicitly so that reading the
# file does not depend on the platform's default locale encoding.
with open('configs/curiosity_config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

agent_config = config['agent']

# Dimensions are taken from the environment created in the previous section
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Prefer the GPU when PyTorch reports one
device = 'cuda' if torch.cuda.is_available() else 'cpu'

agent = CuriosityAgent(
    state_dim=state_dim,
    action_dim=action_dim,
    config=agent_config,
    device=device
)

print(f"‚úÖ Agent cr√©√© avec succ√®s!")
print(f"   Device: {device}")
print(f"   State dim: {state_dim}")
print(f"   Action dim: {action_dim}")
print(f"   Feature dim: {agent_config['feature_dim']}")
print(f"   Curiosity beta: {agent_config['curiosity_beta']}")

# Parameter count of each network exposed by the agent
print("\nüìä Architecture des r√©seaux ICM:")
print(f"   Feature Network: {_count_params(agent.feature_net)} params")
print(f"   Inverse Model: {_count_params(agent.inverse_model)} params")
print(f"   Forward Model: {_count_params(agent.forward_model)} params")
print(f"   Q-Network: {_count_params(agent.q_network)} params")

## 🚀 6. Entraînement de l'Agent

Lancer l'entraînement complet avec suivi en temps réel

In [None]:
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import deque

# Training parameters
num_episodes = 1500
eval_interval = 50  # NOTE(review): defined but not used anywhere in this cell
save_interval = 100

# Output directories for checkpoints, figures and logs
os.makedirs('results/models/curiosity', exist_ok=True)
os.makedirs('results/plots', exist_ok=True)
os.makedirs('results/logs', exist_ok=True)

# Per-episode metrics (one entry per episode)
episode_rewards = []
episode_lengths = []
intrinsic_rewards = []
# Rolling windows over the last 100 episodes
success_rate = deque(maxlen=100)
avg_rewards = deque(maxlen=100)

# ICM losses, sampled once per episode
forward_losses = []
inverse_losses = []

best_reward = -float('inf')

print("üöÄ D√©but de l'entra√Ænement...\n")

# Progress bar over episodes
pbar = tqdm(range(num_episodes), desc="Training")

for episode in pbar:
    state, _ = env.reset()
    episode_reward = 0
    episode_intrinsic = 0.0
    episode_length = 0
    done = False
    truncated = False

    while not (done or truncated):
        # Choose an action for the current state
        action = agent.select_action(state)

        # Advance the environment by one step
        next_state, reward, done, truncated, info = env.step(action)

        # Intrinsic (curiosity) reward for this transition; inputs are batched
        # tensors of size 1 placed on the agent's device
        intrinsic_reward = agent.compute_intrinsic_reward(
            torch.FloatTensor(state).unsqueeze(0).to(device),
            torch.LongTensor([action]).to(device),
            torch.FloatTensor(next_state).unsqueeze(0).to(device)
        )

        # Store the transition; either termination or truncation ends it
        agent.store_transition(state, action, reward, next_state, done or truncated)

        # Train once the buffer holds more than one batch of transitions
        if len(agent.replay_buffer) > agent.batch_size:
            agent.train_step()

        episode_reward += reward
        # compute_intrinsic_reward is defined outside this notebook and may
        # return a (possibly CUDA) tensor. Accumulate a plain Python float so
        # the metric lists stay usable by NumPy/SciPy afterwards and no tensor
        # is kept alive across steps.
        episode_intrinsic += float(intrinsic_reward)
        episode_length += 1
        state = next_state

    # Record the episode metrics
    episode_rewards.append(episode_reward)
    episode_lengths.append(episode_length)
    intrinsic_rewards.append(episode_intrinsic)
    avg_rewards.append(episode_reward)
    # `info` is the dict returned by the last step of the episode
    success_rate.append(1.0 if info.get('success', False) else 0.0)

    # ICM statistics reported by the agent
    icm_stats = agent.get_icm_stats()
    forward_losses.append(icm_stats['forward_loss'])
    inverse_losses.append(icm_stats['inverse_loss'])

    # Refresh the progress-bar summary
    pbar.set_postfix({
        'Reward': f"{episode_reward:.2f}",
        'Avg': f"{np.mean(avg_rewards):.2f}",
        'Success': f"{np.mean(success_rate):.2%}",
        'Epsilon': f"{agent.epsilon:.3f}"
    })

    # Keep the checkpoint of the best single-episode reward seen so far
    if episode_reward > best_reward:
        best_reward = episode_reward
        agent.save_checkpoint('results/models/curiosity/best.pth')

    # Periodic checkpoints
    if (episode + 1) % save_interval == 0:
        agent.save_checkpoint(f'results/models/curiosity/checkpoint_{episode+1}.pth')
        print(f"\nüíæ Checkpoint sauvegard√©: episode {episode+1}")

print("\n‚úÖ Entra√Ænement termin√©!")
print(f"   Meilleure r√©compense: {best_reward:.2f}")
print(f"   Taux de succ√®s final: {np.mean(success_rate):.2%}")
print(f"   R√©compense moyenne (100 derniers): {np.mean(avg_rewards):.2f}")

## 📊 7. Visualisation des Résultats

Analyse des performances de l'agent

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.ndimage import gaussian_filter1d

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (16, 12)


def _smooth(values, sigma=20):
    """Gaussian-smooth a metric series, always computing in floating point.

    gaussian_filter1d returns an array of the input's dtype by default, so an
    integer series (such as per-episode step counts) would come back as
    truncated integers. Converting to float first avoids that.
    """
    return gaussian_filter1d(np.asarray(values, dtype=float), sigma=sigma)


fig, axes = plt.subplots(3, 2, figsize=(16, 12))

# 1. Episode rewards: raw series plus a smoothed curve
ax = axes[0, 0]
ax.plot(episode_rewards, alpha=0.3, label='Raw', color='blue')
smoothed = _smooth(episode_rewards)
ax.plot(smoothed, label='Smoothed (œÉ=20)', color='red', linewidth=2)
ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
ax.set_title('üìà R√©compenses d\'√âpisode')
ax.legend()
ax.grid(True, alpha=0.3)

# 2. Moving average
ax = axes[0, 1]
window = 100
# Trailing mean over at most `window` episodes, i.e. indices i-window+1 .. i
# (the start index is clamped at 0 for the first episodes)
moving_avg = [
    np.mean(episode_rewards[max(0, i - window + 1):i + 1])
    for i in range(len(episode_rewards))
]
ax.plot(moving_avg, color='green', linewidth=2)
ax.set_xlabel('Episode')
ax.set_ylabel('Average Reward')
ax.set_title(f'üìä Moyenne Mobile (window={window})')
ax.grid(True, alpha=0.3)

# 3. Intrinsic (curiosity) rewards
ax = axes[1, 0]
ax.plot(intrinsic_rewards, alpha=0.4, color='orange')
smoothed_int = _smooth(intrinsic_rewards)
ax.plot(smoothed_int, color='red', linewidth=2)
ax.set_xlabel('Episode')
ax.set_ylabel('Intrinsic Reward')
ax.set_title('üîç R√©compenses Intrins√®ques (Curiosit√©)')
ax.grid(True, alpha=0.3)

# 4. ICM losses (smoothed only)
ax = axes[1, 1]
ax.plot(_smooth(forward_losses), label='Forward Loss', color='purple', linewidth=2)
ax.plot(_smooth(inverse_losses), label='Inverse Loss', color='brown', linewidth=2)
ax.set_xlabel('Episode')
ax.set_ylabel('Loss')
ax.set_title('üß† ICM Losses')
ax.legend()
ax.grid(True, alpha=0.3)

# 5. Episode lengths (integer step counts, smoothed in floating point)
ax = axes[2, 0]
ax.plot(episode_lengths, alpha=0.3, color='cyan')
smoothed_len = _smooth(episode_lengths)
ax.plot(smoothed_len, color='darkblue', linewidth=2)
ax.set_xlabel('Episode')
ax.set_ylabel('Steps')
ax.set_title('‚è±Ô∏è Longueur des √âpisodes')
ax.grid(True, alpha=0.3)

# 6. Reward distribution with mean and median markers
ax = axes[2, 1]
ax.hist(episode_rewards, bins=50, alpha=0.7, color='green', edgecolor='black')
ax.axvline(np.mean(episode_rewards), color='red', linestyle='--', linewidth=2, label=f'Mean: {np.mean(episode_rewards):.2f}')
ax.axvline(np.median(episode_rewards), color='blue', linestyle='--', linewidth=2, label=f'Median: {np.median(episode_rewards):.2f}')
ax.set_xlabel('Reward')
ax.set_ylabel('Frequency')
ax.set_title('üìä Distribution des R√©compenses')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('results/plots/training_summary.png', dpi=150, bbox_inches='tight')
plt.show()

print("‚úÖ Visualisations sauvegard√©es dans results/plots/")

## 📈 8. Statistiques Détaillées

In [None]:
# Final statistics, printed as a plain-text summary
banner = "=" * 60
print(banner)
print("üìä STATISTIQUES FINALES")
print(banner)

# Rewards over the whole run
rewards = np.asarray(episode_rewards)
print(f"\nüéØ R√©compenses:")
print(f"   Moyenne totale: {rewards.mean():.2f} ¬± {rewards.std():.2f}")
print(f"   M√©diane: {np.median(rewards):.2f}")
print(f"   Minimum: {rewards.min():.2f}")
print(f"   Maximum: {rewards.max():.2f}")
print(f"   Meilleur: {best_reward:.2f}")

# Last 100 episodes (success_rate is a rolling window of at most 100 entries)
last_100 = episode_rewards[-100:]
recent_success = list(success_rate)
print(f"\nüìä Performance (100 derniers √©pisodes):")
print(f"   Moyenne: {np.mean(last_100):.2f} ¬± {np.std(last_100):.2f}")
print(f"   Taux de succ√®s: {np.mean(recent_success):.2%}")

# Episode lengths in environment steps
lengths = np.asarray(episode_lengths)
print(f"\n‚è±Ô∏è √âpisodes:")
print(f"   Longueur moyenne: {lengths.mean():.1f} steps")
print(f"   Longueur m√©diane: {np.median(lengths):.1f} steps")

# Curiosity module: mean intrinsic reward and the last recorded ICM losses
print(f"\nüîç Curiosit√© (ICM):")
print(f"   R√©compense intrins√®que moyenne: {np.mean(intrinsic_rewards):.4f}")
print(f"   Forward loss finale: {forward_losses[-1]:.4f}")
print(f"   Inverse loss finale: {inverse_losses[-1]:.4f}")

# Agent state at the end of training
print(f"\nüß† Agent:")
print(f"   Epsilon final: {agent.epsilon:.4f}")
print(f"   Replay buffer size: {len(agent.replay_buffer)}")

print("\n" + banner)

## 💾 9. Télécharger les Modèles

Télécharger les modèles entraînés vers Google Drive

In [None]:
# Monter Google Drive (optionnel)
from google.colab import drive
drive.mount('/content/drive')

# Copier les r√©sultats vers Drive
!cp -r results /content/drive/MyDrive/projet_RL_results

print("‚úÖ R√©sultats copi√©s vers Google Drive: MyDrive/projet_RL_results/")

In [None]:
# Alternative to the Drive copy: download the results directly
# (google.colab is only available inside Colab)
from google.colab import files

# Compress the results directory into results.zip
!zip -r results.zip results/

# Hand the archive to files.download
files.download('results.zip')

print("‚úÖ Archive results.zip t√©l√©charg√©e!")

## 🎮 10. Test de l'Agent Entraîné

Tester l'agent sur quelques épisodes

In [None]:
# Restore the best checkpoint and turn exploration off
agent.load_checkpoint('results/models/curiosity/best.pth')
agent.epsilon = 0.0  # pure exploitation

print("üéÆ Test de l'agent entra√Æn√© (10 √©pisodes)...\n")

test_rewards, test_lengths = [], []

for ep in range(10):
    state, _ = env.reset()
    episode_reward = 0
    steps = 0
    done = truncated = False

    # Play until the episode ends or the 500-step cap is reached
    while steps < 500:
        action = agent.select_action(state)
        state, reward, done, truncated, info = env.step(action)
        episode_reward += reward
        steps += 1
        if done or truncated:
            break

    test_rewards.append(episode_reward)
    test_lengths.append(steps)
    # Success flag comes from the info dict of the last step
    success = "‚úÖ" if info.get('success', False) else "‚ùå"
    print(f"Episode {ep+1}: {success} Reward={episode_reward:.2f}, Steps={steps}")

# Aggregate results over the test episodes
print(f"\nüìä R√©sultats du test:")
print(f"   Moyenne: {np.mean(test_rewards):.2f} ¬± {np.std(test_rewards):.2f}")
print(f"   M√©diane: {np.median(test_rewards):.2f}")
print(f"   Longueur moyenne: {np.mean(test_lengths):.1f} steps")

## 📝 11. Sauvegarder le Rapport Final

In [None]:
# Build a plain-text report from the training and test metrics.
# The last-100 window is recomputed here so the cell does not rely on the
# statistics cell having been executed first.
last_100 = episode_rewards[-100:]

report = f"""
RAPPORT D'ENTRA√éNEMENT - CURIOSITY AGENT
{'='*60}

Configuration:
- √âpisodes: {num_episodes}
- Device: {device}
- Feature dim: {config['agent']['feature_dim']}
- Curiosity beta: {config['agent']['curiosity_beta']}

R√©sultats:
- Meilleure r√©compense: {best_reward:.2f}
- R√©compense moyenne: {np.mean(episode_rewards):.2f} ¬± {np.std(episode_rewards):.2f}
- R√©compense m√©diane: {np.median(episode_rewards):.2f}
- Taux de succ√®s (final): {np.mean(list(success_rate)):.2%}

Performance (100 derniers √©pisodes):
- R√©compense moyenne: {np.mean(last_100):.2f} ¬± {np.std(last_100):.2f}
- Longueur moyenne: {np.mean(episode_lengths[-100:]):.1f} steps

Curiosit√© (ICM):
- R√©compense intrins√®que moyenne: {np.mean(intrinsic_rewards):.4f}
- Forward loss finale: {forward_losses[-1]:.4f}
- Inverse loss finale: {inverse_losses[-1]:.4f}

Test (10 √©pisodes):
- Moyenne: {np.mean(test_rewards):.2f} ¬± {np.std(test_rewards):.2f}
- M√©diane: {np.median(test_rewards):.2f}
- Longueur moyenne: {np.mean(test_lengths):.1f} steps

{'='*60}
"""

# Save the report. The text contains non-ASCII characters, so the encoding is
# fixed to UTF-8 instead of depending on the platform's default.
with open('results/training_report.txt', 'w', encoding='utf-8') as f:
    f.write(report)

print(report)
print("‚úÖ Rapport sauvegard√©: results/training_report.txt")

## 🎉 Fin de l'Entraînement!

### Fichiers générés:
- `results/models/curiosity/best.pth` - Meilleur modèle
- `results/models/curiosity/checkpoint_*.pth` - Checkpoints
- `results/plots/training_summary.png` - Visualisations
- `results/training_report.txt` - Rapport détaillé

### Prochaines étapes:
1. Télécharger les modèles entraînés
2. Tester localement avec l'interface GUI
3. Comparer avec DQN baseline
4. Optimiser les hyperparamètres si nécessaire

**Bon courage! 🚀🤖**