# 🔥 Stress Testing et Power Capping sur Grid'5000


## 🔧 Réservation de Ressources (Mode Standalone)

In [None]:
import os
import enoslib as en
from pathlib import Path

# --- Reservation settings ---
# Grid'5000 site to use (rennes, lille, nancy, etc.).
G5K_SITE = "lille"
# Name given to the reservation job.
JOB_NAME = "StressPowerTest"
# Length of the reservation.
WALLTIME = "05:00:00"
# OS environment to use on the node.
OS_ENV = "ubuntu2204-min"
# Name of the machine to reserve.
NODE_NAME = "chifflot"

# Recap of the settings above, framed by two 80-character rulers.
# Joining with newlines prints exactly one line per entry.
_reservation_recap = [
    "⚙️ Configuration de la réservation standalone",
    "=" * 80,
    f"📍 Site Grid5000 : {G5K_SITE}",
    f"🏷️  Nom du job    : {JOB_NAME}",
    f"⏱️  Durée         : {WALLTIME}",
    "",
    "💡 Cette réservation est pour 1 machine uniquement (tests stress/power)",
    "=" * 80,
]
print("\n".join(_reservation_recap))

In [None]:
# Grid'5000 resource description: a "deploy" job using the settings chosen
# in the configuration cell.
_job_settings = en.G5kConf.from_settings(
    job_name=JOB_NAME,
    job_type=["deploy"],
    walltime=WALLTIME,
    env_name=OS_ENV,
)
# One node from the NODE_NAME cluster, registered under the "master" role.
conf = _job_settings.add_machine(roles=["master"], cluster=NODE_NAME, nodes=1)

# Recap of what is about to be reserved.
for _recap_line in (
    "📋 Configuration de la réservation :",
    f"   • 1 nœud sur le cluster {NODE_NAME} ({G5K_SITE})",
    "   • Réseau : prod",
    f"   • Image : {OS_ENV}",
    "",
    "✅ Configuration créée",
):
    print(_recap_line)

In [None]:
# Start the reservation
print(
    "🚀 Lancement de la réservation sur Grid'5000...",
    "⏳ Cette opération peut prendre quelques minutes...",
    "",
    sep="\n",
)

# Build the provider from the configuration and initialise it; init()
# returns the roles and networks of the reservation.
provider = en.G5k(conf)
roles, networks = provider.init()

print(
    "",
    "✅ Réservation effectuée avec succès !",
    "=" * 80,
    "",
    sep="\n",
)

# The test node(s) are the hosts registered under the "master" role.
worker_nodes = roles["master"]
_test_node = worker_nodes[0]

print(f"📍 Nœud réservé : {_test_node.address}")
print(f"🔗 Hostname     : {_test_node.alias}")




### 📦 Installation des Outils Nécessaires

Le nœud réservé a besoin de quelques outils pour les tests de stress et de power capping.

In [None]:
print("📦 Installation des outils sur le nœud de test...")
print("="*80)
print("")

# Packages to install
packages = [
    "stress-ng",       # Stress tests
    "powercap-utils",  # Intel RAPL power capping
    "sysstat",         # System statistics
]

print("📋 Paquets à installer :")
for pkg in packages:
    print(f"   • {pkg}")
print("")
print("⏳ Installation en cours (cela peut prendre 1-2 minutes)...")
print("")

# Install the packages. on_error_continue=True keeps a failed task from
# raising, so the failure is reported per host below instead of aborting
# the cell before any diagnostic is printed.
with en.actions(roles=worker_nodes, on_error_continue=True) as p:
    p.apt(
        name=packages,
        state="present",
        update_cache=True
    )

# The node is declared ready only if at least one result came back and
# every result is OK.
install_ok = bool(p.results)
for result in p.results or ():
    print(f"📍 Nœud : {result.host}")
    if result.status == "OK":
        if result.payload.get('changed', False):
            print("  ✅ Paquets installés avec succès")
        else:
            print("  ℹ️  Paquets déjà installés")
    else:
        install_ok = False
        # NOTE(review): module failures presumably carry their text under
        # 'msg' rather than 'stderr' -- fall back to it before giving up.
        error = (
            result.payload.get('stderr')
            or result.payload.get('msg')
            or 'Erreur inconnue'
        )
        print(f"  ❌ Erreur : {error}")

print("")
print("="*80)
if install_ok:
    print("✅ Installation terminée ! Le nœud est prêt pour les tests")
else:
    print("❌ Installation incomplète : le nœud n'est pas prêt pour les tests")
print("="*80)

In [None]:
print("🔍 Vérification des outils installés...")
print("="*80)
print("")

# (command, tool label) pairs. powercap-info receives the control type as a
# positional argument, the same syntax as every other powercap-* call in
# this notebook.
verification_commands = [
    ("stress-ng --version", "stress-ng"),
    ("powercap-info intel-rapl", "Intel RAPL"),
]

all_ok = True

for cmd, tool_name in verification_commands:
    print(f"📌 Vérification de {tool_name}...")
    # on_error_continue=True: a failing command must be reported below as
    # "not available" rather than raise and stop the whole verification.
    with en.actions(roles=worker_nodes, on_error_continue=True) as p:
        p.shell(cmd)

    # A tool counts as available only when results came back and all of
    # them are OK; an empty result set is treated as a failure.
    tool_ok = bool(p.results) and all(
        result.status == "OK" for result in p.results
    )
    if tool_ok:
        print(f"   ✅ {tool_name} est disponible")
    else:
        print(f"   ❌ {tool_name} n'est pas disponible")
        all_ok = False
    print("")

print("="*80)
if all_ok:
    print("✅ Tous les outils sont correctement installés et fonctionnels !")
else:
    print("⚠️  Certains outils ne sont pas disponibles. Vérifiez l'installation.")
print("="*80)

# LSCPU

In [None]:

# Run lscpu on the test node(s) and print its output host by host.
with en.actions(roles=worker_nodes) as p:
    p.shell("lscpu")

# NOTE(review): en.actions presumably raises when a task fails unless
# on_error_continue=True is passed, in which case the error branch below
# would never be reached -- confirm against the EnOSlib documentation.
for _res in p.results or ():
    print(f"Worker: {_res.host}")
    print("-" * 80)
    if _res.status != "OK":
        print(f"Erreur: {_res.payload.get('stderr', 'Erreur inconnue')}")
        continue
    print(_res.stdout)

## 🎬 Scénarios d'Expérimentation

### ⚙️ Configuration Machine-Spécifique

In [None]:
# Machine profile configuration
import json

# Load the profiles from the JSON file. The encoding is explicit so that
# decoding does not depend on the locale of the machine running the notebook.
with open("configs/machine_profiles.json", encoding="utf-8") as f:
    MACHINE_PROFILES = json.load(f)

# Select the active profile (EDIT THIS LINE to switch machines)
ACTIVE_PROFILE = "chifflot"
if ACTIVE_PROFILE not in MACHINE_PROFILES:
    # Still a KeyError, as a plain lookup would raise, but naming the valid choices.
    raise KeyError(
        f"Unknown machine profile {ACTIVE_PROFILE!r}; "
        f"available profiles: {sorted(MACHINE_PROFILES)}"
    )
machine_config = MACHINE_PROFILES[ACTIVE_PROFILE]

# Summary of the hardware description stored in the active profile.
print("⚙️ Configuration machine active:")
print("="*80)
print(f"📍 Profil      : {ACTIVE_PROFILE}")
print(f"🏛️  Cluster    : {machine_config['cluster']}")
print(f"🌍 Site       : {machine_config['site']}")
print(f"🔢 CPU Threads: {machine_config['cpu_threads']} ({machine_config['cpu_cores']} cores × {machine_config['threads_per_core']} threads)")
print(f"📦 Sockets    : {machine_config['sockets']}")
print(f"💾 RAM        : {machine_config['memory_gb']} GB")
print(f"🖥️  CPU Model  : {machine_config['cpu_model']}")
print(f"⚡ CPU Freq   : {machine_config['cpu_base_mhz']}-{machine_config['cpu_max_mhz']} MHz")
print("")
# Stress parameters: these are the keys read later by run_stress_test.
print("🔥 Paramètres de stress :")
print(f"   • CPU Threads : {machine_config['stress_cpu_threads']}")
print(f"   • CPU Method  : {machine_config['cpu_method']}")
print(f"   • VM Workers  : {machine_config['stress_vm_workers']}")
print(f"   • VM Memory   : {machine_config['stress_vm_memory']}")
print("="*80)
print("")
print("💡 Pour changer de machine, modifiez ACTIVE_PROFILE = 'nom_cluster'")
print("📖 Voir configs/MACHINE_CONFIG_GUIDE.md pour ajouter une nouvelle machine")
print("="*80)

In [None]:
# Show every available machine profile and flag the active one.
_ruler = "=" * 80
print("📋 Profils machines disponibles :")
print(_ruler)
for _name, _spec in MACHINE_PROFILES.items():
    # Only the profile selected by ACTIVE_PROFILE gets the marker.
    _flag = "✓ ACTIF" if _name == ACTIVE_PROFILE else ""
    print(f"  • {_name:15s} - {_spec['site']:10s} - {_spec['cpu_threads']:3d} threads - {_spec['memory_gb']:3d} GB  {_flag}")
print(_ruler)

In [None]:
import json
from datetime import datetime

def load_scenario(scenario_path):
    """Load a scenario description from a JSON file.

    Args:
        scenario_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode as UTF-8 explicitly: with the locale default, non-ASCII text
    # in a scenario could fail to load or be mis-decoded on some machines.
    with open(scenario_path, encoding="utf-8") as f:
        return json.load(f)

def apply_rapl_config(node, package_id, config):
    """
    Apply a RAPL power capping configuration to one package zone.

    A chain of ``powercap-set`` commands is built and run as a single shell
    command joined with ``&&``, so the chain stops at the first failing
    command. Package (CPU) constraints present in ``config`` come first,
    then the DRAM subzone (``<package_id>:0``), so the CPU limits are
    already in place when DRAM variations are tested.

    Args:
        node: Hosts/roles passed to ``en.actions(roles=...)``.
        package_id: Index of the package zone to configure.
        config: Dict of settings. Every key is optional:
            ``long_term_power_uw`` / ``long_term_time_us`` (constraint 0),
            ``short_term_power_uw`` / ``short_term_time_us`` (constraint 1),
            ``dram_enabled`` (default False), and ``dram_power_uw`` /
            ``dram_time_us``, which are only used when ``dram_enabled`` is
            true. When ``dram_enabled`` is false the DRAM subzone is
            disabled.

    Returns:
        True when at least one result came back and every result is OK,
        False otherwise.
    """
    control_type = "intel-rapl"
    package_zone = f"powercap-set {control_type} -z {package_id}"
    dram_zone = f"{package_zone}:0"

    # (config key, constraint index, powercap-set flag). Constraint 0 is the
    # long_term one, constraint 1 the short_term one; "-l" carries the
    # *_power_uw value and "-s" the *_time_us value.
    package_settings = (
        ('long_term_power_uw', 0, '-l'),
        ('long_term_time_us', 0, '-s'),
        ('short_term_power_uw', 1, '-l'),
        ('short_term_time_us', 1, '-s'),
    )
    cmds = [
        f"{package_zone} -c {constraint} {flag} {config[key]}"
        for key, constraint, flag in package_settings
        if key in config
    ]

    # DRAM configuration (applied AFTER the CPU constraints)
    if config.get('dram_enabled', False):
        cmds.append(f"{dram_zone} -e 1")
        if 'dram_power_uw' in config:
            cmds.append(f"{dram_zone} -c 0 -l {config['dram_power_uw']}")
        if 'dram_time_us' in config:
            cmds.append(f"{dram_zone} -c 0 -s {config['dram_time_us']}")
    else:
        cmds.append(f"{dram_zone} -e 0")

    cmd = " && ".join(cmds)
    # on_error_continue=True: a failing command must surface as a False
    # return value (which the caller checks) rather than as an exception.
    with en.actions(roles=node, on_error_continue=True) as p:
        p.shell(cmd, become=True)

    # Every result has to be OK, not just the first one.
    return bool(p.results) and all(
        result.status == "OK" for result in p.results
    )

def _scale_stress_workers(count, ratio):
    """Scale a worker count by ``ratio`` without truncating a positive count to 0."""
    scaled = int(count * ratio)
    # stress-ng reads a worker count of 0 as "one worker per CPU", so a small
    # positive count truncated to 0 would maximise the load instead of
    # reducing it. Clamp to 1 in that case only.
    if count > 0 and scaled < 1:
        return 1
    return scaled


def run_stress_test(node, duration=600, stress_type='cpu', config=None):
    """Run one stress-ng workload on the given node(s).

    Args:
        node: Hosts/roles passed to ``en.actions(roles=...)``.
        duration: stress-ng timeout, in seconds.
        stress_type: ``'cpu'``, ``'memory'`` or ``'mixed'``. Any other value
            runs the CPU workload.
        config: Dict providing ``stress_cpu_threads``, ``cpu_method``,
            ``stress_vm_workers`` and ``stress_vm_memory``. Defaults to the
            module-level ``machine_config``.

    Returns:
        The first result of the run, whatever its status; callers inspect
        ``.status``.
    """
    if config is None:
        config = machine_config

    if stress_type == 'memory':
        workload = (
            f"--vm {config['stress_vm_workers']} "
            f"--vm-bytes {config['stress_vm_memory']} --vm-method write64"
        )
    elif stress_type == 'mixed':
        # Mixed load: 75% of the CPU workers and 50% of the VM workers.
        cpu_threads = _scale_stress_workers(config['stress_cpu_threads'], 0.75)
        vm_workers = _scale_stress_workers(config['stress_vm_workers'], 0.5)
        workload = (
            f"--cpu {cpu_threads} --cpu-method {config['cpu_method']} "
            f"--vm {vm_workers} --vm-bytes 50% --vm-method write64"
        )
    else:
        # 'cpu', and the fallback for any unrecognised stress_type.
        workload = (
            f"--cpu {config['stress_cpu_threads']} "
            f"--cpu-method {config['cpu_method']}"
        )

    cmd = f"stress-ng {workload} --timeout {duration}s --metrics-brief"

    # on_error_continue=True: a failing stress-ng run is returned as a
    # non-OK result instead of raising.
    with en.actions(roles=node, on_error_continue=True) as p:
        p.shell(cmd, become=True)
    return p.results[0]

def run_scenario_experiment(scenario_path, stress_duration=600, packages=(0, 1)):
    """Run every experiment of a scenario file on ``worker_nodes``.

    For each experiment, the scenario's ``default_config`` is merged with
    the experiment's own entries (the experiment wins), the merged
    configuration is applied to each package with ``apply_rapl_config``,
    and a stress test is run. An experiment whose RAPL configuration cannot
    be applied is skipped and does not appear in the returned list.

    Args:
        scenario_path: Path of the scenario JSON file. The document must
            provide ``name``, ``description`` and ``experiments`` (each with
            ``id`` and ``description``); ``stress_type`` (default ``'cpu'``)
            and ``default_config`` are optional.
        stress_duration: Duration of each stress test, in seconds.
        packages: Package zone indices to configure before each test.

    Returns:
        A list with one dict per experiment that was run, with the keys
        ``experiment_id``, ``config``, ``timestamp``, ``success``,
        ``stress_type`` and ``duration``.
    """
    scenario = load_scenario(scenario_path)
    results = []

    print(f"\n{'='*80}")
    print(f"🎬 {scenario['name']}")
    print(f"📝 {scenario['description']}")
    print(f"{'='*80}\n")

    stress_type = scenario.get('stress_type', 'cpu')

    for exp in scenario['experiments']:
        print(f"🧪 Test: {exp['id']} - {exp['description']}")

        config = {**scenario.get('default_config', {}), **exp}

        # Apply the configuration package by package; the generator is lazy,
        # so nothing more is applied after the first failure.
        failed_pkg = next(
            (
                pkg for pkg in packages
                if not apply_rapl_config(worker_nodes, pkg, config)
            ),
            None,
        )
        if failed_pkg is not None:
            print(f"  ❌ Failed package {failed_pkg}")
            continue

        print(f"  ⏱️  Stress test ({stress_duration}s, type={stress_type})...")
        result = run_stress_test(worker_nodes, stress_duration, stress_type)
        succeeded = result.status == "OK"

        results.append({
            'experiment_id': exp['id'],
            'config': config,
            'timestamp': datetime.now().isoformat(),
            'success': succeeded,
            'stress_type': stress_type,
            'duration': stress_duration
        })

        # Report the real outcome instead of an unconditional "Done".
        if succeeded:
            print("  ✅ Done\n")
        else:
            print(f"  ❌ Stress test failed (status={result.status})\n")

    print(f"{'='*80}")
    print(f"✅ {len(results)} experiments done")
    print(f"{'='*80}\n")

    return results

# Confirm in the cell output that the scenario helpers above are defined.
print("✅ Scenario runner loaded")

In [None]:
# Scenario 1: Long Term Variations (stress tests of 300 s)
_scenario1_path = "scenarios/chifflot/scenario1_long_term_variations.json"
results_s1 = run_scenario_experiment(_scenario1_path, stress_duration=300)

In [None]:
# Scenario 2: Short Term Variations (stress tests of 180 s)
_scenario2_path = "scenarios/chifflot/scenario2_short_term_variations.json"
results_s2 = run_scenario_experiment(_scenario2_path, stress_duration=180)

In [None]:
# Scenario 3: Both Terms Variations (stress tests of 180 s)
_scenario3_path = "scenarios/chifflot/scenario3_both_terms_variations.json"
results_s3 = run_scenario_experiment(_scenario3_path, stress_duration=180)

In [None]:
# Scenario 4: DRAM Variations (stress tests of 126 s)
_scenario4_path = "scenarios/chifflot/scenario4_dram_variations.json"
results_s4 = run_scenario_experiment(_scenario4_path, stress_duration=126)

In [None]:
# Save the results of the four scenarios in one timestamped JSON file.
os.makedirs("results", exist_ok=True)

# NOTE(review): the file name says "chiroples" while the scenarios and the
# active profile in this notebook target "chifflot" -- confirm which label
# is intended.
_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_file = f"results/chiroples_experiments_{_stamp}.json"

all_results = dict(
    scenario1=results_s1,
    scenario2=results_s2,
    scenario3=results_s3,
    scenario4=results_s4,
)

with open(output_file, 'w') as f:
    json.dump(all_results, f, indent=2)

print(f"💾 Results saved to: {output_file}")

## 🔬 Commandes powercap-utils - Analyse RAPL

In [None]:
print("🔬 Exécution de commandes powercap-utils...")
print("")

# Simple command showing the RAPL information of the test node(s).
with en.actions(roles=worker_nodes) as p:
    p.shell("powercap-info intel-rapl")

_ruler = "=" * 80
# Workers are numbered from 1 in the output.
for _rank, _res in enumerate(p.results or (), start=1):
    print(f"\n{_ruler}")
    print(f"📍 WORKER {_rank}: {_res.host}")
    print(_ruler)

    if _res.status != "OK":
        print(f"❌ Erreur: {_res.payload.get('stderr', 'Erreur inconnue')}")
        continue
    print(_res.stdout)

print("\n✅ Terminé")

## 📖 Aide powercap-set

In [None]:
print("📖 Affichage de l'aide powercap-set sur le premier worker...")
print("=" * 80)
print("")

# Run `powercap-set --help` on the hosts in worker_nodes and show the
# output host by host.
with en.actions(roles=worker_nodes) as p:
    p.shell("powercap-set --help")

for _res in p.results or ():
    print(f"Worker: {_res.host}")
    print("-" * 80)
    _shown = (
        _res.stdout
        if _res.status == "OK"
        else f"Erreur: {_res.payload.get('stderr', 'Erreur inconnue')}"
    )
    print(_shown)

---

## 🧹 Libération des Ressources (Mode Standalone)



In [None]:
# Release the Grid'5000 reservation held by the provider, announcing the
# operation before and after.
_release_start, _release_done = (
    "Libération des ressources sur Grid'5000...",
    "Ressources libérées. ✅",
)
print(_release_start)
provider.destroy()
print(_release_done)