# Partie 1 : CodeBERT

## Pour le trainset

In [None]:
import os
import re
import json
import torch
from tqdm import tqdm
from datetime import datetime
from collections import Counter
from transformers import AutoTokenizer, AutoModel

# ========== CONFIGURATION ==========
# Folder holding the training-set graph files (one ".json" file per graph).
json_folder = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/SDC/Fichiers source/folder_training_set_2/folder_training_set"
# Destination folder for the embedding (.pt) files and the failure logs.
save_dir = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/instruction_embeddings_grouped_codebert_v2"
os.makedirs(save_dir, exist_ok=True)

# ========== MODEL AND DEVICE ==========
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
# NOTE(review): the weights are requested in float16 even when `device` is the
# CPU — confirm half precision is intended (and supported) in that case.
model = AutoModel.from_pretrained("microsoft/codebert-base", torch_dtype=torch.float16).to(device)
# Put the model in evaluation mode; it is only used for inference below.
model.eval()

# ========== FONCTIONS UTILITAIRES ==========
def is_valid_dot_file(filepath):
    """Return True when the first line of *filepath* looks like a DOT header.

    The first line is stripped and lower-cased, then searched for the
    substring ``"graph"`` (which also covers ``"digraph"``).

    Args:
        filepath: Path of the file to inspect.

    Returns:
        bool: ``True`` if the first line contains ``"graph"``; ``False`` when
        it does not, when the line is empty, or when the file cannot be read.
    """
    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            first_line = f.readline().strip().lower()
    except (OSError, ValueError):
        # Best effort: an unreadable file is simply reported as invalid.
        return False
    # `in` always yields a real bool (the previous `first_line and ...`
    # returned "" for an empty first line).
    return "graph" in first_line

def extract_labels_from_text(dot_text):
    """Return the quoted value of every ``label = "..."`` attribute in *dot_text*.

    The match is non-greedy, so each value stops at the next double quote.
    Values are returned in order of appearance.
    """
    label_pattern = r'label\s*=\s*"(.*?)"'
    return [found.group(1) for found in re.finditer(label_pattern, dot_text)]

# Version A: Traitement des labels par fréquence
def extract_distinct_payloads_version_a(node_labels):
    """Clean node labels and return the distinct payloads, rarest first.

    For each label, the text after the first ``:`` is kept (the whole label
    when it has no ``:``). Punctuation, ``0x...`` hexadecimal literals and the
    ``=``, ``+`` and ``-`` characters are then stripped, and whitespace is
    collapsed. Labels that end up empty are dropped.

    Args:
        node_labels: Iterable of raw label strings.

    Returns:
        list[str]: Distinct cleaned payloads sorted by ascending number of
        occurrences; payloads with equal counts keep their order of first
        appearance.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    cleaning_steps = (
        (r"[:,\"\[\],]", " "),
        (r"[^a-zA-Z0-9_\-+*/\\= ]+", " "),
        (r"\b0x[0-9a-fA-F]+\b", ''),
        (r'\b[a-zA-Z]+\s*=\s*0x[0-9a-fA-F]+\b', ''),
        (r"=", ""),
        (r"[+\-]", ""),
    )

    kept = []
    for raw_label in node_labels:
        _, colon, tail = raw_label.partition(":")
        text = tail.strip() if colon else raw_label.strip()
        for pattern, replacement in cleaning_steps:
            text = re.sub(pattern, replacement, text)
        text = re.sub(r"\s+", " ", text).strip()
        if text:
            kept.append(text)

    occurrences = Counter(kept)
    # dict.fromkeys keeps first-appearance order; the stable sort then only
    # reorders payloads whose counts differ.
    return sorted(dict.fromkeys(kept), key=occurrences.__getitem__)

# ========== FONCTIONS D'EMBEDDING ==========
def embed_instruction_batch(list_texts):
    """Embed a batch of texts by mean-pooling the model's last hidden state.

    The texts are tokenized together (padded, truncated to 512 tokens), run
    through the module-level ``model`` without gradients, and averaged over
    the positions selected by the attention mask.

    Args:
        list_texts: List of strings to embed.

    Returns:
        A CPU tensor with one pooled vector per input text.
    """
    batch = tokenizer(list_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        token_states = model(**batch).last_hidden_state
        # Expand the mask over the hidden dimension so masked-out positions
        # contribute zero to the sum and are not counted in the divisor.
        mask = batch["attention_mask"].unsqueeze(-1).expand(token_states.size())
        pooled = (token_states * mask).sum(dim=1) / mask.sum(dim=1)
    return pooled.cpu()

# ========== GESTION DES IDs ==========
def get_all_embedded_ids(save_dir):
    """Return the set of graph IDs stored in the ``embedding_*.pt`` files of *save_dir*.

    Each matching file is loaded on the CPU and the list under its ``"ids"``
    key (if any) is collected.

    Args:
        save_dir: Folder containing the saved embedding files.

    Returns:
        set: All IDs found. Files that cannot be read are skipped after a
        warning is printed.
    """
    all_ids = set()
    for fname in os.listdir(save_dir):
        if not (fname.startswith("embedding_") and fname.endswith(".pt")):
            continue
        try:
            data = torch.load(os.path.join(save_dir, fname), map_location="cpu")
            all_ids.update(data.get("ids", []))
        except Exception as e:
            # Still best effort, but no longer silent; a bare `except:` would
            # also have swallowed KeyboardInterrupt.
            print(f"Lecture impossible de {fname}: {e}")
    return all_ids

# ========== ENCODAGE PRINCIPAL ==========
def encode_graphs_in_size_range(json_folder, save_dir, min_kb, max_kb, batch_size=32,
                                custom_file_list=None, custom_output_name=None):
    """Embed a set of graph files and save the result as one ``.pt`` file.

    Args:
        json_folder: Folder scanned for ``.json`` files when no custom list
            is given.
        save_dir: Folder receiving the ``.pt`` output and the failure logs.
        min_kb: Inclusive lower size bound (KB) for the folder scan; also
            used in the default output name and the log name.
        max_kb: Exclusive upper size bound (KB); used like ``min_kb``.
        batch_size: Number of texts embedded per model call.
        custom_file_list: Optional list of file paths. When non-empty it is
            used as-is: the size bounds and the "already embedded" filter
            are not applied.
        custom_output_name: Optional output file name; defaults to
            ``embedding_{min_kb}_{max_kb}_KB.pt``.

    Returns:
        None. Writes ``{'ids', 'embeddings'}`` with ``torch.save`` when at
        least one graph was embedded, and a JSON log plus a text list when
        at least one file failed.
    """
    if custom_file_list:
        all_files = [(path, os.path.getsize(path) / 1024.0) for path in custom_file_list]
    else:
        # Only the folder scan needs the already-embedded IDs, so the .pt
        # files are no longer re-read when an explicit list is supplied.
        embedded_ids = get_all_embedded_ids(save_dir)
        all_files = []
        for f in os.listdir(json_folder):
            if not f.endswith(".json"):
                continue
            path = os.path.join(json_folder, f)
            size_kb = os.path.getsize(path) / 1024.0
            if min_kb <= size_kb < max_kb and f.replace(".json", "") not in embedded_ids:
                all_files.append((path, size_kb))

    all_files.sort(key=lambda x: x[1])
    print(f"Fichiers à traiter ({min_kb}-{max_kb}KB): {len(all_files)}")

    buffer_instr, buffer_ids, buffer_names = [], [], []
    embeddings, ids, failed = [], [], []

    def flush_buffer():
        """Embed the buffered texts; on failure mark the whole batch as failed."""
        if not buffer_instr:
            return
        try:
            batch_emb = embed_instruction_batch(buffer_instr)
        except Exception as e:
            print(f"Erreur avec le lot ({len(buffer_instr)} fichiers): {e}")
            failed.extend(buffer_names)
        else:
            embeddings.extend(batch_emb)
            ids.extend(buffer_ids)
        finally:
            # Always empty the buffers so a failing batch is never retried
            # together with the files that follow it.
            buffer_instr.clear()
            buffer_ids.clear()
            buffer_names.clear()

    for path, _ in tqdm(all_files):
        file_name = os.path.basename(path)
        if not is_valid_dot_file(path):
            failed.append(file_name)
            continue
        # Per-file errors (reading / label extraction) only affect this file.
        try:
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                dot_text = f.read()
            labels = extract_labels_from_text(dot_text)
            distinct_payloads = extract_distinct_payloads_version_a(labels)
        except Exception as e:
            print(f"Erreur avec {file_name}: {e}")
            failed.append(file_name)
            continue

        # Character-level cap applied before tokenization.
        buffer_instr.append(" ".join(distinct_payloads)[:2560])
        buffer_ids.append(file_name.replace(".json", ""))
        buffer_names.append(file_name)

        if len(buffer_instr) >= batch_size:
            flush_buffer()

    # Embed whatever is left in the last, partially filled batch.
    flush_buffer()

    if embeddings:
        tensor = torch.stack(embeddings)
        output_name = custom_output_name if custom_output_name else f"embedding_{min_kb}_{max_kb}_KB.pt"
        torch.save({'ids': ids, 'embeddings': tensor}, os.path.join(save_dir, output_name))
        print(f"Sauvegarde de {len(ids)} graphs dans {output_name}")

    if failed:
        # NOTE(review): both log files are opened in "w" mode with names that
        # do not depend on `custom_output_name`, so successive calls with the
        # same bounds overwrite each other's logs — confirm this is intended.
        log_path = os.path.join(save_dir, f"log_{min_kb}_{max_kb}_KB.json")
        with open(log_path, "w") as f:
            json.dump({"failed": failed, "timestamp": datetime.now().isoformat()}, f, indent=2)
        failed_txt_path = os.path.join(save_dir, "failed_graphs_remaining_test.txt")
        with open(failed_txt_path, "w") as ftxt:
            for fname in failed:
                ftxt.write(fname + "\n")
        print(f"Erreurs: {len(failed)} - log dans {log_path}")

# ========== EXECUTION ==========
if __name__ == "__main__":
    # Embed every training graph that is not yet stored in a .pt file, in
    # groups of `group_size` files ordered by size, one output file per group.
    max_new_pt_files = 100
    created_pt_count = 0
    start_group_num = 1

    # List the existing .pt files with the number of graphs each one holds.
    print("\nFichiers .pt existants:")
    pt_files = sorted([f for f in os.listdir(save_dir) if f.endswith(".pt")])
    for f in pt_files:
        f_path = os.path.join(save_dir, f)
        try:
            data = torch.load(f_path, map_location="cpu")
            n_graphs = len(data.get("ids", []))
            print(f"  - {f:<35} contient {n_graphs} graphs")
        except Exception as e:
            print(f"  Erreur lecture {f}: {e}")

    # Files still to process: every .json whose ID is not embedded yet.
    embedded_ids = get_all_embedded_ids(save_dir)
    remaining_files = []
    for fname in os.listdir(json_folder):
        if fname.endswith(".json"):
            graph_id = fname.replace(".json", "")
            if graph_id not in embedded_ids:
                full_path = os.path.join(json_folder, fname)
                size_kb = os.path.getsize(full_path) / 1024.0
                remaining_files.append((full_path, size_kb))

    print(f"\nGraphs restants: {len(remaining_files)}")
    remaining_files.sort(key=lambda x: x[1])

    # Process in groups.
    group_size = 50
    group_num = start_group_num
    for i in range(0, len(remaining_files), group_size):
        group = remaining_files[i:i+group_size]

        # `remaining_files` already excludes embedded graphs, so on a resumed
        # run group numbering must continue after the existing files. The
        # previous logic restarted at 01 and skipped every group whose file
        # name already existed, so those remaining graphs were never embedded.
        while True:
            output_name = f"embedding_50_graphs_{group_num:02d}.pt"
            if not os.path.exists(os.path.join(save_dir, output_name)):
                break
            group_num += 1

        print(f"\nTraitement groupe {group_num:02d} ({len(group)} fichiers) → {output_name}")
        file_list = [path for path, _ in group]
        encode_graphs_in_size_range(
            json_folder=json_folder,
            save_dir=save_dir,
            min_kb=0, max_kb=1_000_000,
            batch_size=32,
            custom_file_list=file_list,
            custom_output_name=output_name
        )
        group_num += 1

        created_pt_count += 1
        if created_pt_count >= max_new_pt_files:
            print("Nombre maximal de fichiers atteint")
            break

## Pour le testset

In [None]:
import os
import re
import json
import torch
from tqdm import tqdm
from datetime import datetime
from collections import Counter
from transformers import AutoTokenizer, AutoModel

# ========== CONFIGURATION ==========
# Folder holding the test-set graph files (one ".json" file per graph).
json_folder = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/folder_test_set"
# Destination folder for the test embedding (.pt) files and the failure logs.
save_dir = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/instruction_embeddings_test_codebert_v2"
os.makedirs(save_dir, exist_ok=True)

# ========== MODEL AND DEVICE ==========
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
# NOTE(review): the weights are requested in float16 even when `device` is the
# CPU — confirm half precision is intended (and supported) in that case.
model = AutoModel.from_pretrained("microsoft/codebert-base", torch_dtype=torch.float16).to(device)
# Put the model in evaluation mode; it is only used for inference below.
model.eval()

# ========== FONCTIONS UTILITAIRES ==========
def is_valid_dot_file(filepath):
    """Return True when the first line of *filepath* looks like a DOT header.

    The first line is stripped and lower-cased, then searched for the
    substring ``"graph"`` (which also covers ``"digraph"``).

    Args:
        filepath: Path of the file to inspect.

    Returns:
        bool: ``True`` if the first line contains ``"graph"``; ``False`` when
        it does not, when the line is empty, or when the file cannot be read.
    """
    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            first_line = f.readline().strip().lower()
    except (OSError, ValueError):
        # Best effort: an unreadable file is simply reported as invalid.
        return False
    # `in` always yields a real bool (the previous `first_line and ...`
    # returned "" for an empty first line).
    return "graph" in first_line

def extract_labels_from_text(dot_text):
    """Return the quoted value of every ``label = "..."`` attribute in *dot_text*.

    The match is non-greedy, so each value stops at the next double quote.
    Values are returned in order of appearance.
    """
    label_pattern = r'label\s*=\s*"(.*?)"'
    return [found.group(1) for found in re.finditer(label_pattern, dot_text)]

# Version A: Traitement des labels par fréquence
def extract_distinct_payloads_version_a(node_labels):
    """Clean node labels and return the distinct payloads, rarest first.

    For each label, the text after the first ``:`` is kept (the whole label
    when it has no ``:``). Punctuation, ``0x...`` hexadecimal literals and the
    ``=``, ``+`` and ``-`` characters are then stripped, and whitespace is
    collapsed. Labels that end up empty are dropped.

    Args:
        node_labels: Iterable of raw label strings.

    Returns:
        list[str]: Distinct cleaned payloads sorted by ascending number of
        occurrences; payloads with equal counts keep their order of first
        appearance.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    cleaning_steps = (
        (r"[:,\"\[\],]", " "),
        (r"[^a-zA-Z0-9_\-+*/\\= ]+", " "),
        (r"\b0x[0-9a-fA-F]+\b", ''),
        (r'\b[a-zA-Z]+\s*=\s*0x[0-9a-fA-F]+\b', ''),
        (r"=", ""),
        (r"[+\-]", ""),
    )

    kept = []
    for raw_label in node_labels:
        _, colon, tail = raw_label.partition(":")
        text = tail.strip() if colon else raw_label.strip()
        for pattern, replacement in cleaning_steps:
            text = re.sub(pattern, replacement, text)
        text = re.sub(r"\s+", " ", text).strip()
        if text:
            kept.append(text)

    occurrences = Counter(kept)
    # dict.fromkeys keeps first-appearance order; the stable sort then only
    # reorders payloads whose counts differ.
    return sorted(dict.fromkeys(kept), key=occurrences.__getitem__)

# ========== FONCTIONS D'EMBEDDING ==========
def embed_instruction_batch(list_texts):
    """Embed a batch of texts by mean-pooling the model's last hidden state.

    The texts are tokenized together (padded, truncated to 512 tokens), run
    through the module-level ``model`` without gradients, and averaged over
    the positions selected by the attention mask.

    Args:
        list_texts: List of strings to embed.

    Returns:
        A CPU tensor with one pooled vector per input text.
    """
    batch = tokenizer(list_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        token_states = model(**batch).last_hidden_state
        # Expand the mask over the hidden dimension so masked-out positions
        # contribute zero to the sum and are not counted in the divisor.
        mask = batch["attention_mask"].unsqueeze(-1).expand(token_states.size())
        pooled = (token_states * mask).sum(dim=1) / mask.sum(dim=1)
    return pooled.cpu()

# ========== GESTION DES IDs ==========
def get_all_embedded_ids(save_dir):
    """Return the set of graph IDs stored in the ``embedding_*.pt`` files of *save_dir*.

    Each matching file is loaded on the CPU and the list under its ``"ids"``
    key (if any) is collected.

    Args:
        save_dir: Folder containing the saved embedding files.

    Returns:
        set: All IDs found. Files that cannot be read are skipped after a
        warning is printed.
    """
    all_ids = set()
    for fname in os.listdir(save_dir):
        if not (fname.startswith("embedding_") and fname.endswith(".pt")):
            continue
        try:
            data = torch.load(os.path.join(save_dir, fname), map_location="cpu")
            all_ids.update(data.get("ids", []))
        except Exception as e:
            # Still best effort, but no longer silent; a bare `except:` would
            # also have swallowed KeyboardInterrupt.
            print(f"Lecture impossible de {fname}: {e}")
    return all_ids

# ========== ENCODAGE PRINCIPAL ==========
def encode_graphs_in_size_range(json_folder, save_dir, min_kb, max_kb, batch_size=32,
                                custom_file_list=None, custom_output_name=None):
    """Embed a set of graph files and save the result as one ``.pt`` file.

    Args:
        json_folder: Folder scanned for ``.json`` files when no custom list
            is given.
        save_dir: Folder receiving the ``.pt`` output and the failure logs.
        min_kb: Inclusive lower size bound (KB) for the folder scan; also
            used in the default output name and the log name.
        max_kb: Exclusive upper size bound (KB); used like ``min_kb``.
        batch_size: Number of texts embedded per model call.
        custom_file_list: Optional list of file paths. When non-empty it is
            used as-is: the size bounds and the "already embedded" filter
            are not applied.
        custom_output_name: Optional output file name; defaults to
            ``embedding_{min_kb}_{max_kb}_KB.pt``.

    Returns:
        None. Writes ``{'ids', 'embeddings'}`` with ``torch.save`` when at
        least one graph was embedded, and a JSON log plus a text list when
        at least one file failed.
    """
    if custom_file_list:
        all_files = [(path, os.path.getsize(path) / 1024.0) for path in custom_file_list]
    else:
        # Only the folder scan needs the already-embedded IDs, so the .pt
        # files are no longer re-read when an explicit list is supplied.
        embedded_ids = get_all_embedded_ids(save_dir)
        all_files = []
        for f in os.listdir(json_folder):
            if not f.endswith(".json"):
                continue
            path = os.path.join(json_folder, f)
            size_kb = os.path.getsize(path) / 1024.0
            if min_kb <= size_kb < max_kb and f.replace(".json", "") not in embedded_ids:
                all_files.append((path, size_kb))

    all_files.sort(key=lambda x: x[1])
    print(f"Fichiers a traiter ({min_kb}-{max_kb}KB): {len(all_files)}")

    buffer_instr, buffer_ids, buffer_names = [], [], []
    embeddings, ids, failed = [], [], []

    def flush_buffer():
        """Embed the buffered texts; on failure mark the whole batch as failed."""
        if not buffer_instr:
            return
        try:
            batch_emb = embed_instruction_batch(buffer_instr)
        except Exception as e:
            print(f"Erreur avec le lot ({len(buffer_instr)} fichiers): {e}")
            failed.extend(buffer_names)
        else:
            embeddings.extend(batch_emb)
            ids.extend(buffer_ids)
        finally:
            # Always empty the buffers so a failing batch is never retried
            # together with the files that follow it.
            buffer_instr.clear()
            buffer_ids.clear()
            buffer_names.clear()

    for path, _ in tqdm(all_files):
        file_name = os.path.basename(path)
        if not is_valid_dot_file(path):
            failed.append(file_name)
            continue
        # Per-file errors (reading / label extraction) only affect this file.
        try:
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                dot_text = f.read()
            labels = extract_labels_from_text(dot_text)
            distinct_payloads = extract_distinct_payloads_version_a(labels)
        except Exception as e:
            print(f"Erreur avec {file_name}: {e}")
            failed.append(file_name)
            continue

        # Character-level cap applied before tokenization.
        buffer_instr.append(" ".join(distinct_payloads)[:2560])
        buffer_ids.append(file_name.replace(".json", ""))
        buffer_names.append(file_name)

        if len(buffer_instr) >= batch_size:
            flush_buffer()

    # Embed whatever is left in the last, partially filled batch.
    flush_buffer()

    if embeddings:
        tensor = torch.stack(embeddings)
        output_name = custom_output_name if custom_output_name else f"embedding_{min_kb}_{max_kb}_KB.pt"
        torch.save({'ids': ids, 'embeddings': tensor}, os.path.join(save_dir, output_name))
        print(f"Sauvegarde de {len(ids)} graphs dans {output_name}")

    if failed:
        # NOTE(review): the text list below is opened in "w" mode under a
        # fixed name, so each call overwrites the list written by the
        # previous one — confirm this is intended.
        log_path = os.path.join(save_dir, f"log_{min_kb}_{max_kb}_KB.json")
        with open(log_path, "w") as f:
            json.dump({"failed": failed, "timestamp": datetime.now().isoformat()}, f, indent=2)
        failed_txt_path = os.path.join(save_dir, "failed_graphs_remaining_test.txt")
        with open(failed_txt_path, "w") as ftxt:
            for fname in failed:
                ftxt.write(fname + "\n")
        print(f"Erreurs: {len(failed)} - log dans {log_path}")

# ========== EXECUTION ==========
if __name__ == "__main__":
    # Embed the not-yet-embedded test graphs, grouped into file-size buckets
    # with one output .pt file per bucket.
    max_new_pt_files = 200
    created_pt_count = 0

    embedded_ids = get_all_embedded_ids(save_dir)
    remaining_files = [
        os.path.join(json_folder, fname)
        for fname in os.listdir(json_folder)
        if fname.endswith(".json") and fname.replace(".json", "") not in embedded_ids
    ]

    print(f"\nGraphs restants: {len(remaining_files)}")

    # Bucket layout as (first lower bound, stop, width) in KB; each bucket is
    # keyed "<lower>_<upper>_KB" and created in ascending order.
    overflow_key = "over_400000_KB"
    bucket_specs = [
        (0, 100, 20),
        (100, 500, 50),
        (500, 1000, 50),
        (1000, 10000, 1000),
        (10000, 200000, 20000),
        (200000, 400000, 50000),
    ]
    categorized_files = {}
    for first, stop, width in bucket_specs:
        for lower in range(first, stop, width):
            categorized_files[f"{lower}_{lower+width}_KB"] = []
    categorized_files[overflow_key] = []

    # Numeric bounds parsed once from the keys, in bucket order.
    bucket_bounds = [
        (key, float(key.split("_")[0]), float(key.split("_")[1]))
        for key in categorized_files
        if key != overflow_key
    ]

    # A file goes to the first bucket whose [lower, upper) range holds its
    # size, or to the overflow bucket when none does.
    for file_path in remaining_files:
        size_kb = os.path.getsize(file_path) / 1024.0
        target = next(
            (key for key, low, high in bucket_bounds if low <= size_kb < high),
            overflow_key,
        )
        categorized_files[target].append(file_path)

    for key, file_list in categorized_files.items():
        if not file_list:
            print(f"Groupe {key}: aucun fichier")
            continue

        output_name = f"embedding_{key}.pt"
        if os.path.exists(os.path.join(save_dir, output_name)):
            print(f"Groupe {key} deja traite: {output_name}")
            continue

        print(f"\n=== Traitement groupe {key} ({len(file_list)} fichiers) ===")
        if key == overflow_key:
            min_kb, max_kb = 400000, 1_000_000
        else:
            low_txt, high_txt = key.split("_")[:2]
            min_kb, max_kb = int(low_txt), int(high_txt)

        encode_graphs_in_size_range(
            json_folder=json_folder,
            save_dir=save_dir,
            min_kb=min_kb,
            max_kb=max_kb,
            batch_size=32,
            custom_file_list=file_list,
            custom_output_name=output_name
        )

        created_pt_count += 1
        print(f"Groupe {key} traite et sauvegarde dans {output_name}")

        if created_pt_count >= max_new_pt_files:
            print("Nombre maximal de fichiers atteint")
            break

## TRAIN : Vérifier la base d'embedding finale - IDs manquants dans metadata

In [None]:
import os
import torch
import pandas as pd

# ========== CONFIGURATION ==========
embedding_dir = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/instruction_embeddings_grouped_codebert_v2"
metadata_csv_path = "/content/drive/My Drive/Colab Notebooks/Data/SDC/training_set_metadata.csv"

# ========== INSPECT THE .pt FILES ==========
pt_files = [
    f for f in os.listdir(embedding_dir)
    if f.startswith("embedding_") and f.endswith(".pt")
]
print(f"Total fichiers .pt: {len(pt_files)}\n")

all_ids, duplicate_ids = set(), set()

for fname in sorted(pt_files):
    try:
        data = torch.load(os.path.join(embedding_dir, fname), map_location="cpu")
        ids = data.get("ids", [])
        emb = data.get("embeddings")
        shape_text = 'Non trouve' if emb is None else emb.shape

        print(f"Fichier {fname}:")
        print(f"   - Graphs: {len(ids)}")
        print(f"   - Dimensions embeddings: {shape_text}")
        print(f"   - Exemples IDs: {ids[:3]}")

        # An ID seen before (in this file or an earlier one) is a duplicate;
        # otherwise it is recorded as seen.
        for gid in ids:
            (duplicate_ids if gid in all_ids else all_ids).add(gid)

    except Exception as e:
        print(f"Erreur lecture {fname}: {e}")

print("\nTotal IDs uniques:", len(all_ids))
if not duplicate_ids:
    print("Aucun ID duplique trouve.")
else:
    print(f"IDs dupliques trouves: {len(duplicate_ids)}")
    print("   Exemples:", list(duplicate_ids)[:10])

# ========== READ THE METADATA ==========
print("\nLecture metadata CSV...")
metadata_df = pd.read_csv(metadata_csv_path, sep=";")

if 'name' in metadata_df.columns:
    metadata_ids = set(metadata_df["name"].astype(str))
    print(f"Total IDs dans metadata: {len(metadata_ids)}")

    # Embedded IDs that have no row in the metadata file.
    missing_from_metadata = all_ids - metadata_ids
    print(f"\nIDs manquants dans metadata: {len(missing_from_metadata)}")
    if not missing_from_metadata:
        print("Tous les IDs sont presents dans metadata.")
    else:
        print("   Exemples:", list(missing_from_metadata)[:10])
else:
    print("Colonne 'name' absente. Colonnes disponibles:", metadata_df.columns.tolist())

In [None]:
# Éliminer tous les IDs qui ne sont pas présents dans metadata.

import os
import torch
import pandas as pd

# ========== CONFIGURATION ==========
embedding_dir = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/instruction_embeddings_grouped_codebert_v2"
metadata_csv_path = "/content/drive/My Drive/Colab Notebooks/Data/SDC/training_set_metadata.csv"

# ========== READ THE METADATA ==========
metadata_df = pd.read_csv(metadata_csv_path, sep=";")
metadata_ids = set(metadata_df["name"].astype(str))
print(f"Metadata charges : {len(metadata_ids)} IDs\n")

# ========== REWRITE THE .pt FILES ==========
pt_files = [
    f for f in os.listdir(embedding_dir)
    if f.startswith("embedding_") and f.endswith(".pt")
]

for fname in sorted(pt_files):
    fpath = os.path.join(embedding_dir, fname)
    try:
        data = torch.load(fpath, map_location="cpu")
        ids = data.get("ids", [])
        emb = data.get("embeddings")

        if emb is None or not ids:
            print(f"{fname}: Aucun embedding ou ID trouve")
            continue

        # Positions of the IDs that also appear in the metadata.
        valid_indices = [pos for pos, gid in enumerate(ids) if gid in metadata_ids]
        removed = len(ids) - len(valid_indices)

        if removed == 0:
            print(f"{fname}: Tous les IDs valides ({len(ids)})")
            continue

        # Keep the ID list and the embedding rows aligned.
        new_ids = [ids[pos] for pos in valid_indices]
        new_emb = emb[valid_indices]
        data["ids"] = new_ids
        data["embeddings"] = new_emb

        # Write to a temporary file first, then swap it in with os.replace.
        tmp_path = f"{fpath}.tmp"
        torch.save(data, tmp_path)
        os.replace(tmp_path, fpath)

        print(f"{fname}: {removed} IDs invalides supprimes. Reste {len(new_ids)}")

    except Exception as e:
        print(f"Erreur traitement {fname}: {e}")

# Partie 2 : Feature extraction

## Pour le trainset

In [None]:
import os
import re
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

# ========== CONFIGURATION ==========
# Metadata CSV of the training set (read further down with sep=';').
TRAIN_CSV = "/content/drive/My Drive/Colab Notebooks/Data/SDC/training_set_metadata.csv"
# Folder holding the training graph files (".json").
TRAIN_GRAPH_DIR = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/SDC/Fichiers source/folder_training_set_2/folder_training_set"
# Folder of embedding .pt files; their "ids" lists select the graphs to process.
EMBEDDING_DIR = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/instruction_embeddings_grouped_codebert_v2"
# Not referenced in the visible part of this cell.
SAMPLE_SIZE = None

# ========== FONCTIONS UTILITAIRES ==========
def entropy(vals):
    """Return the base-2 Shannon entropy of the counts stored in *vals*.

    Args:
        vals: Mapping whose values are counts; they are normalised by their
            sum to obtain probabilities.

    Returns:
        The entropy as a NumPy float. ``1e-9`` is added inside the logarithm
        to avoid ``log2(0)``.
    """
    counts = np.array(list(vals.values()))
    probabilities = counts / sum(vals.values())
    weighted_logs = probabilities * np.log2(probabilities + 1e-9)
    return -np.sum(weighted_logs)

def extract_features(filepath):
    """Extract the instruction text and hand-crafted statistics of one graph file.

    Args:
        filepath: Path of the graph file to parse.

    Returns:
        tuple: ``(all_text, stats)`` where ``all_text`` is every captured
        ``INST`` entry joined by spaces and ``stats`` is a dict with:

        - ``instr_<m>`` for call/jmp/ret/mov/push/pop/lea: number of captured
          entries equal to exactly ``<m>``;
        - ``count_control``: sum of the jmp/call/ret counts;
        - ``nb_tokens`` / ``unique_instructions``: total / distinct entries;
        - ``instruction_entropy``: base-2 entropy of the entry distribution;
        - ``has_xor_zeroing`` / ``has_getproc``: 0/1 pattern flags;
        - ``nb_nodes`` / ``nb_edges`` / ``max_out_degree``: edge-list stats;
        - ``depth_max``: deepest level reached by a DFS from the entry node.
    """
    # Explicit encoding, and undecodable bytes are ignored, so a stray byte
    # cannot abort feature extraction (same policy as the embedding step).
    with open(filepath, 'r', encoding="utf-8", errors="ignore") as f:
        content = f.read()

    # NOTE(review): the capture runs up to the next ':' or end of line, so an
    # entry may include operands; the instr_* counters below only count
    # entries equal to the bare mnemonic — confirm against the file format.
    inst_matches = re.findall(r'INST\s+:\s+([^:\n]+)', content)
    all_text = " ".join(inst_matches)
    inst_lower = [w.lower() for w in inst_matches]
    c = Counter(inst_lower)

    # Common instructions
    top_instrs = ['call','jmp','ret','mov','push','pop','lea']
    stats = {f'instr_{k}': c.get(k, 0) for k in top_instrs}

    # Semantic group: control flow
    control = {'jmp','call','ret'}
    stats['count_control'] = sum(c[i] for i in control)

    # Instruction statistics
    stats['nb_tokens'] = len(inst_lower)
    stats['unique_instructions'] = len(c)
    if inst_lower:
        probs = np.array(list(c.values())) / (len(inst_lower) + 1e-9)
        stats['instruction_entropy'] = -np.sum(probs * np.log2(probs + 1e-9))
    else:
        stats['instruction_entropy'] = 0

    # Special patterns
    stats['has_xor_zeroing'] = int(bool(re.search(r'xor\s+[re]?[abcd]x,\s+[re]?[abcd]x', all_text, re.IGNORECASE)))
    stats['has_getproc'] = int(bool(re.search(r'getprocaddress', all_text, re.IGNORECASE)))

    # Graph structure, rebuilt from the quoted `"src" -> "dst"` edges.
    edges = re.findall(r'"([0-9a-fx]+)"\s*->\s*"([0-9a-fx]+)"', content)
    nodes = set()
    out_degree = defaultdict(int)
    graph = defaultdict(list)
    for src, dst in edges:
        nodes.add(src)
        nodes.add(dst)
        out_degree[src] += 1
        graph[src].append(dst)

    stats['nb_nodes'] = len(nodes)
    stats['nb_edges'] = len(edges)
    stats['max_out_degree'] = max(out_degree.values()) if out_degree else 0

    def dfs_iterative(start_node):
        """Return the largest depth reached by a DFS that visits each node once."""
        visited = set()
        stack = [(start_node, 0)]
        max_depth = 0
        while stack:
            node, depth = stack.pop()
            if node not in visited:
                visited.add(node)
                max_depth = max(max_depth, depth)
                for child in graph.get(node, []):
                    if child not in visited:
                        stack.append((child, depth + 1))
        return max_depth

    # Start from the source of the first edge in the file. The previous
    # `list(nodes)[0]` took an arbitrary element of a set of strings, whose
    # order changes between interpreter runs, so depth_max was not
    # reproducible.
    entry_node = edges[0][0] if edges else None
    stats['depth_max'] = dfs_iterative(entry_node) if entry_node else 0

    return all_text, stats

# ========== LOAD THE EMBEDDED IDs ==========
# Gather the "ids" list of every embedding_*.pt file; only these graphs get
# features extracted below.
print("Analyse des fichiers .pt...")
pt_files = [
    entry for entry in os.listdir(EMBEDDING_DIR)
    if entry.startswith("embedding_") and entry.endswith(".pt")
]
selected_ids = set()
for fname in tqdm(pt_files):
    try:
        data = torch.load(os.path.join(EMBEDDING_DIR, fname), map_location="cpu")
        selected_ids.update(data.get("ids", []))
    except Exception as e:
        print(f"Erreur lecture {fname}: {e}")
print(f"Total IDs embeddés: {len(selected_ids)}\n")

# ========== GRAPH PROCESSING ==========
def process_graphs(folder, selected_ids):
    """Run ``extract_features`` on every selected ``.json`` graph of *folder*.

    Args:
        folder: Folder containing the graph files.
        selected_ids: Collection of graph IDs (file names without ``.json``)
            to process; other files are skipped.

    Returns:
        tuple: ``(texts, stats)``, two dicts keyed by graph ID holding the
        instruction text and the statistics dict of each graph.
    """
    print(f"Traitement de {len(selected_ids)} graphes depuis: {folder}")
    texts, stats = {}, {}
    for fname in tqdm(os.listdir(folder)):
        if not fname.endswith('.json'):
            continue
        graph_id = fname.replace('.json', '')
        if graph_id not in selected_ids:
            continue
        texts[graph_id], stats[graph_id] = extract_features(os.path.join(folder, fname))
    print(f"Extraction terminée: {len(texts)} graphes")
    return texts, stats

# ========== LOAD THE LABELS ==========
print("Chargement metadata...")
train_labels = pd.read_csv(TRAIN_CSV, sep=';')
train_labels['name'] = train_labels['name'].astype(str)
# Keep only the metadata rows whose graph has an embedding.
has_embedding = train_labels['name'].isin(selected_ids)
sampled = train_labels[has_embedding].copy()
print(f"Labels correspondants: {len(sampled)}")

# ========== FEATURE EXTRACTION ==========
train_texts, train_stats = process_graphs(TRAIN_GRAPH_DIR, selected_ids)

# ========== DATA MERGE ==========
# One row per graph: text, then statistics, then metadata, joined on 'name'.
print("Fusion text + stats + labels...")
X_text = pd.Series(train_texts).rename_axis('name').reset_index(name='text')
X_stats = pd.DataFrame.from_dict(train_stats, orient='index').reset_index().rename(columns={'index': 'name'})
df = X_text.merge(X_stats, on='name').merge(sampled, on='name')
print(f"Fusion terminée: {df.shape[0]} graphes")

## Pour le testset

In [None]:
import os
import re
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter, defaultdict
import joblib

# ========== CONFIGURATION ==========
# Folder holding the test graph files (".json").
TEST_GRAPH_DIR = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/folder_test_set"
# Folder of test embedding .pt files; their "ids" lists select the graphs to process.
EMBEDDING_DIR_TEST = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/instruction_embeddings_test_codebert_v2"
# Output folder, created if missing; not otherwise used in the visible part of this cell.
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/Modelisation 4/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ========== FONCTIONS D'EXTRACTION ==========
def extract_features(filepath):
    """Extract the instruction text and hand-crafted statistics of one graph file.

    Args:
        filepath: Path of the graph file to parse.

    Returns:
        tuple: ``(all_text, stats)`` where ``all_text`` is every captured
        ``INST`` entry joined by spaces and ``stats`` is a dict with:

        - ``instr_<m>`` for call/jmp/ret/mov/push/pop/lea: number of captured
          entries equal to exactly ``<m>``;
        - ``count_control``: sum of the jmp/call/ret counts;
        - ``nb_tokens`` / ``unique_instructions``: total / distinct entries;
        - ``instruction_entropy``: base-2 entropy of the entry distribution;
        - ``has_xor_zeroing`` / ``has_getproc``: 0/1 pattern flags;
        - ``nb_nodes`` / ``nb_edges`` / ``max_out_degree``: edge-list stats;
        - ``depth_max``: deepest level reached by a DFS from the entry node.
    """
    # Explicit encoding, and undecodable bytes are ignored, so a stray byte
    # cannot abort feature extraction (same policy as the embedding step).
    with open(filepath, 'r', encoding="utf-8", errors="ignore") as f:
        content = f.read()

    # NOTE(review): the capture runs up to the next ':' or end of line, so an
    # entry may include operands; the instr_* counters below only count
    # entries equal to the bare mnemonic — confirm against the file format.
    inst_matches = re.findall(r'INST\s+:\s+([^:\n]+)', content)
    all_text = " ".join(inst_matches)
    inst_lower = [w.lower() for w in inst_matches]
    c = Counter(inst_lower)

    # Common instructions
    top_instrs = ['call','jmp','ret','mov','push','pop','lea']
    stats = {f'instr_{k}': c.get(k, 0) for k in top_instrs}

    # Semantic group: control flow
    control = {'jmp','call','ret'}
    stats['count_control'] = sum(c[i] for i in control)

    # Instruction statistics
    stats['nb_tokens'] = len(inst_lower)
    stats['unique_instructions'] = len(c)
    if inst_lower:
        probs = np.array(list(c.values())) / (len(inst_lower) + 1e-9)
        stats['instruction_entropy'] = -np.sum(probs * np.log2(probs + 1e-9))
    else:
        stats['instruction_entropy'] = 0

    # Special patterns
    stats['has_xor_zeroing'] = int(bool(re.search(r'xor\s+[re]?[abcd]x,\s+[re]?[abcd]x', all_text, re.IGNORECASE)))
    stats['has_getproc'] = int(bool(re.search(r'getprocaddress', all_text, re.IGNORECASE)))

    # Graph structure, rebuilt from the quoted `"src" -> "dst"` edges.
    edges = re.findall(r'"([0-9a-fx]+)"\s*->\s*"([0-9a-fx]+)"', content)
    nodes = set()
    out_degree = defaultdict(int)
    graph = defaultdict(list)
    for src, dst in edges:
        nodes.add(src)
        nodes.add(dst)
        out_degree[src] += 1
        graph[src].append(dst)

    stats['nb_nodes'] = len(nodes)
    stats['nb_edges'] = len(edges)
    stats['max_out_degree'] = max(out_degree.values()) if out_degree else 0

    def dfs_iterative(start_node):
        """Return the largest depth reached by a DFS that visits each node once."""
        visited = set()
        stack = [(start_node, 0)]
        max_depth = 0
        while stack:
            node, depth = stack.pop()
            if node not in visited:
                visited.add(node)
                max_depth = max(max_depth, depth)
                for child in graph.get(node, []):
                    if child not in visited:
                        stack.append((child, depth + 1))
        return max_depth

    # Start from the source of the first edge in the file. The previous
    # `list(nodes)[0]` took an arbitrary element of a set of strings, whose
    # order changes between interpreter runs, so depth_max was not
    # reproducible.
    entry_node = edges[0][0] if edges else None
    stats['depth_max'] = dfs_iterative(entry_node) if entry_node else 0

    return all_text, stats

# ========== LOAD THE EMBEDDED IDs ==========
# Gather the "ids" list of every embedding_*.pt file of the test set; only
# these graphs get features extracted below.
print("Recherche des fichiers .pt...")
pt_files = [
    entry for entry in os.listdir(EMBEDDING_DIR_TEST)
    if entry.startswith("embedding_") and entry.endswith(".pt")
]
selected_ids = set()
for fname in tqdm(pt_files):
    try:
        data = torch.load(os.path.join(EMBEDDING_DIR_TEST, fname), map_location="cpu")
        selected_ids.update(data.get("ids", []))
    except Exception as e:
        print(f"Erreur lecture {fname}: {e}")
print(f"Total IDs embeddés (test): {len(selected_ids)}\n")

# ========== GRAPH PREPROCESSING ==========
def process_graphs(folder, selected_ids):
    """Run ``extract_features`` on the selected graphs that exist in *folder*.

    Args:
        folder: Folder containing the graph files.
        selected_ids: Collection of graph IDs; ``<id>.json`` is looked up in
            *folder* and IDs without a matching file are skipped.

    Returns:
        tuple: ``(texts, stats)``, two dicts keyed by graph ID holding the
        instruction text and the statistics dict of each graph.
    """
    candidate_names = (f"{id_}.json" for id_ in selected_ids)
    json_files = [name for name in candidate_names if os.path.exists(os.path.join(folder, name))]

    texts, stats = {}, {}
    for fname in tqdm(json_files, desc="Traitement des graphes test"):
        graph_id = fname.replace('.json', '')
        texts[graph_id], stats[graph_id] = extract_features(os.path.join(folder, fname))
    return texts, stats

print("Extraction des features pour le test set...")
test_texts, test_stats = process_graphs(TEST_GRAPH_DIR, selected_ids)

# ========== BUILD THE DATAFRAME ==========
# One row per graph: the instruction text joined with its statistics on 'name'.
print("Création du DataFrame test...")
X_text_test = pd.Series(test_texts).rename_axis('name').reset_index(name='text')
X_stats_test = pd.DataFrame.from_dict(test_stats, orient='index').reset_index().rename(columns={'index': 'name'})
df_test = X_text_test.merge(X_stats_test, on='name')
print(f"DataFrame test créé: {df_test.shape[0]} graphes.")

# Partie 3 : TF-IDF & SVD

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import joblib
import time, psutil
import os

# ========== PATH CONFIGURATION ==========
# Folder for the cached frame and the fitted TF-IDF artefacts; created if missing.
SAVE_PATH = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/Modelisation 5/"
os.makedirs(SAVE_PATH, exist_ok=True)

# NOTE(review): the frame is loaded into `df_test`, but the steps below read
# `df['text_cleaned']` — confirm which variable is intended.
df_test = joblib.load(os.path.join(SAVE_PATH, "df_text_cleaned.pkl"))

# ========== MODEL PARAMETERS ==========
# Number of components passed to TruncatedSVD below.
n_components = 600
# `max_features` and `ngram_range` are passed to TfidfVectorizer below.
max_features = 20000
ngram_range = (1, 3)
max_text_len = 200000  # Maximum number of characters kept for each sample

def log_memory():
    """Print used RAM, total RAM (both in GB) and the usage percentage."""
    vm = psutil.virtual_memory()
    used_gb = vm.used / 1e9
    total_gb = vm.total / 1e9
    print(f"Memoire: {used_gb:.2f} GB utilise / {total_gb:.2f} GB total ({vm.percent}%)")

# ========== TEXT TRUNCATION ==========
print(f"Troncature du texte a {max_text_len} caracteres...")
start = time.time()
# Cap every document at max_text_len characters before vectorization.
df['text_short'] = df['text_cleaned'].str.slice(0, max_text_len)
print(f"Texte traite: {len(df)} documents — temps: {time.time() - start:.2f}s")
log_memory()

# ========== TF-IDF VECTORIZATION ==========
print("Initialisation du vectorizer TF-IDF...")
vectorizer = TfidfVectorizer(
    ngram_range=ngram_range,
    max_features=max_features,
    min_df=3,
    max_df=0.85,
    sublinear_tf=True,
    use_idf=True,
    norm='l2',
    analyzer='word',
    stop_words=None
)

print("Transformation TF-IDF...")
start = time.time()
X_tfidf = vectorizer.fit_transform(df['text_short'])
print(f"TF-IDF shape: {X_tfidf.shape} — temps: {time.time() - start:.2f}s")
log_memory()

# ========== SAVE TF-IDF ARTIFACTS ==========
joblib.dump(vectorizer, os.path.join(SAVE_PATH, "tfidf_vectorizer.pkl"))
joblib.dump(X_tfidf, os.path.join(SAVE_PATH, "X_tfidf_raw.pkl"))

# ========== DIMENSIONALITY REDUCTION ==========
print(f"Reduction SVD ({n_components} dimensions)...")
start = time.time()
svd = TruncatedSVD(n_components=n_components, random_state=42, algorithm='arpack')
X_svd = svd.fit_transform(X_tfidf)
print(f"SVD shape: {X_svd.shape} — temps: {time.time() - start:.2f}s — variance: {svd.explained_variance_ratio_.sum()*100:.2f}%")
log_memory()

# Persist the fitted SVD next to the vectorizer: the test-set preparation
# cell reloads it from this directory as "svd_model_tfidf.pkl", and the test
# features must be projected with the exact model fitted here.
joblib.dump(svd, os.path.join(SAVE_PATH, "svd_model_tfidf.pkl"))

In [None]:
# ========== MERGE SVD & df ==========
svd_cols = [f"svd_{i}" for i in range(n_components)]
df_svd = pd.DataFrame(X_svd, columns=svd_cols)
# X_svd rows follow df's row order, so the names are attached positionally.
df_svd["name"] = df["name"].values
df_merged = pd.merge(df, df_svd, on="name")

# ========== BUILD X_final & y ==========
embedding_cols = [col for col in df_merged.columns if col.startswith("emb_")]
stat_cols = [col for col in X_stats.columns if col != "name"]
tfidf_svd_cols = svd_cols
feature_cols = embedding_cols + stat_cols + tfidf_svd_cols

X_full_final = df_merged[feature_cols].values

# Everything that is neither an identifier/text column nor a feature is
# treated as a label. 'text_cleaned' (the column the TF-IDF step reads) must
# be removed too, otherwise it stays in the label matrix and the integer cast
# fails on it; it is dropped leniently so frames without it keep working.
non_label_cols = ["name", "text", "text_short"] + feature_cols
y_final = (
    df_merged
    .drop(columns=non_label_cols)
    .drop(columns=["text_cleaned"], errors="ignore")
    .astype(int)
)

# Partie 4 : Concaténer la base finale

In [None]:
import torch
from tqdm import tqdm

# ========== READ THE .pt FILES ==========
embedding_dir = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/instruction_embeddings_grouped_codebert_v2"
# Maps each graph ID to its embedding as a NumPy array.
embedding_dict = {}

# Only files named embedding_*.pt are considered.
pt_files = [f for f in os.listdir(embedding_dir) if f.endswith(".pt") and f.startswith("embedding_")]
print(f"Chargement des embeddings depuis {len(pt_files)} fichiers...")

for f in tqdm(pt_files):
    path = os.path.join(embedding_dir, f)
    data = torch.load(path, map_location="cpu")
    # Each file is expected to provide "ids" and "embeddings"; a missing key raises KeyError.
    ids = data["ids"]
    embeds = data["embeddings"]
    # IDs and embeddings are paired by position; a repeated ID overwrites the earlier entry.
    for id_, emb in zip(ids, embeds):
        embedding_dict[id_] = emb.numpy()

print(f"Total des graph_id avec embeddings: {len(embedding_dict)}")

📂 Đang load embeddings từ 150 file...


100%|██████████| 150/150 [00:17<00:00,  8.68it/s]

✅ Tổng số graph_id có embedding: 23049





In [None]:
import os
import torch
import joblib
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# ========== CONFIGURATION ==========
output_dir = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/Modelisation 4/"
embedding_dir = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/instruction_embeddings_grouped_codebert_v2"

# ========== DATA LOADING ==========
# Reload the previously saved feature matrix and label matrix from output_dir.
print("Chargement de X_full_final et y_final...")
X_full = joblib.load(os.path.join(output_dir, "X_full_final.pkl"))  # TF-IDF + stats
y_final = joblib.load(os.path.join(output_dir, "y_final.pkl"))
print(f"X_full shape (stats + TF-IDF): {X_full.shape}")
print(f"y_final shape: {y_final.shape}")

# ========== CHARGEMENT DES EMBEDDINGS ==========
def load_all_embeddings(embedding_dir):
    """Load every ``embedding_*.pt`` file of a directory into one DataFrame.

    A file is skipped (with a printed message) when it has no "embeddings"
    entry, when its number of IDs differs from its number of embedding rows,
    or when reading it raises an exception.

    Args:
        embedding_dir: Directory containing the ``embedding_*.pt`` files.

    Returns:
        A DataFrame with one row per graph and two columns: "name" (graph ID)
        and "embedding" (the vector as a NumPy array).
    """
    shard_names = [
        entry for entry in os.listdir(embedding_dir)
        if entry.startswith("embedding_") and entry.endswith(".pt")
    ]
    print(f"Chargement des embeddings depuis {len(shard_names)} fichiers...")

    records = []
    for shard in tqdm(shard_names):
        shard_path = os.path.join(embedding_dir, shard)
        try:
            payload = torch.load(shard_path, map_location="cpu")
            graph_ids = payload.get("ids", [])
            vectors = payload.get("embeddings", None)

            # Accept the file only when IDs and embedding rows line up one-to-one.
            if vectors is not None and len(graph_ids) == vectors.shape[0]:
                for pair in zip(graph_ids, vectors):
                    records.append((pair[0], pair[1].numpy()))
            else:
                print(f"Fichier {shard} invalide")
        except Exception as e:
            # Best effort: report the failing file and continue with the others.
            print(f"Erreur chargement {shard}: {e}")

    print(f"Total embeddings charges: {len(records)}")
    return pd.DataFrame(records, columns=["name", "embedding"])

# Load the embeddings and expand each vector into emb_0 ... emb_{d-1} columns.
embedding_df = load_all_embeddings(embedding_dir)
embedding_expanded = pd.DataFrame(embedding_df["embedding"].to_list())
embedding_expanded.columns = [f"emb_{i}" for i in range(embedding_expanded.shape[1])]
embedding_expanded["name"] = embedding_df["name"]

# ========== DATA PREPARATION ==========
# Keep only the graphs that have an embedding.
df = joblib.load(os.path.join(output_dir, "df_merged_text_stats_labels.pkl"))
df = df[df["name"].isin(embedding_expanded["name"])].reset_index(drop=True)

# ========== DATA FUSION ==========
# Attach the embedding columns to the graph names kept above.
df_merged = pd.merge(df[["name"]], embedding_expanded, on="name")
print(f"Fusion embeddings reussie - shape: {df_merged.shape}")

# ========== FINAL CONCATENATION ==========
X_embedding = df_merged.drop(columns=["name"]).values
# NOTE(review): this is a positional concatenation; it assumes the rows of
# X_full are in the same order as df_merged — nothing here checks it. Confirm.
X_combined = np.concatenate([X_embedding, X_full], axis=1)
print(f"Dimensions finales apres fusion: {X_combined.shape[1]}")

# ========== TRAIN/VAL SPLIT ==========
# 80/20 split with a fixed seed.
X_tr, X_val, y_tr, y_val = train_test_split(X_combined, y_final, test_size=0.2, random_state=42)
print(f"X_train: {X_tr.shape}, X_val: {X_val.shape}")
print(f"y_train: {y_tr.shape}, y_val: {y_val.shape}")

📥 Đang load X_full_final và y_final từ .pkl...
✅ X_full shape (stats + TF-IDF): (23011, 617)
✅ y_final shape: (23011, 453)
📥 Đang load toàn bộ embedding từ 150 file...


100%|██████████| 150/150 [00:01<00:00, 134.17it/s]


✅ Tổng số embedding: 23049
🔗 Merge thành công embedding — shape: (23011, 769)
📊 Tổng số chiều feature cuối cùng sau khi ghép embedding: 1385
✅ X_train: (18408, 1385), X_val: (4603, 1385)
✅ y_train: (18408, 453), y_val: (4603, 453)


# Partie 5 : Modélisation

## Sur le train/val 80-20 + les seuils optimisés

In [None]:
try:
    from bayes_opt import BayesianOptimization
except ImportError:
    !pip install bayesian-optimization
    from bayes_opt import BayesianOptimization

from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn import set_config
import numpy as np

# ========== CONVERSION TO NUMPY ==========
# Objects exposing `.values` (e.g. pandas DataFrames) are replaced by their
# underlying array so the label matrices can be indexed with [:, i] below.
y_tr = y_tr.values if hasattr(y_tr, "values") else y_tr
y_val = y_val.values if hasattr(y_val, "values") else y_val

# ========== OPTIMISATION BAYESIENNE PAR LABEL ==========
def optimize_thresholds_per_label(y_true, y_proba):
    """Search, for every label column, the decision threshold maximizing F1.

    Each column is tuned independently with Bayesian optimization over the
    interval (0.05, 0.95), using 5 initial points and 10 iterations with a
    fixed random state.

    Args:
        y_true: 2-D array of binary ground-truth labels, one column per label.
        y_proba: 2-D array of predicted scores, indexed like ``y_true``.

    Returns:
        A 1-D NumPy array holding the best threshold found for each label.
    """
    def tune_column(truth, scores):
        # Objective: F1 obtained when binarizing the scores at threshold t.
        def f1_at(t):
            return f1_score(truth, (scores > t).astype(int), zero_division=1)

        search = BayesianOptimization(f=f1_at, pbounds={'t': (0.05, 0.95)}, random_state=42, verbose=0)
        search.maximize(init_points=5, n_iter=10)
        return search.max['params']['t']

    n_labels = y_true.shape[1]
    return np.array([tune_column(y_true[:, j], y_proba[:, j]) for j in range(n_labels)])

# ========== SAMPLE WEIGHTS ==========
print("\nCalcul des poids d'echantillons...")
# Every sample starts with weight 1; each positive label it carries adds that
# label's 'balanced' positive-class weight.
sample_weights = np.ones(y_tr.shape[0])
for i in range(y_tr.shape[1]):
    col = y_tr[:, i]
    # Skip labels that take a single value in the training split.
    if len(np.unique(col)) < 2:
        continue
    weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=col)
    # col is 0/1, so only samples positive for this label receive the extra weight.
    sample_weights += col * weights[1]

# ========== MODEL TRAINING ==========
# NOTE(review): tree_method='gpu_hist' and use_label_encoder are deprecated in
# recent XGBoost releases (tree_method='hist' with device='cuda' is the newer
# form) — confirm against the installed version.
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='gpu_hist',
    reg_alpha=0.5,
    reg_lambda=1.0,
    gamma=0.2,
    use_label_encoder=False,
    verbosity=0
)
# One binary XGBoost classifier per label column.
clf = OneVsRestClassifier(xgb, n_jobs=-1)

# Sample-weight routing: enable scikit-learn metadata routing and request
# sample_weight on the base estimator so the weights passed to clf.fit are
# forwarded to each per-label fit.
set_config(enable_metadata_routing=True)
xgb.set_fit_request(sample_weight=True)

print("\nEntrainement du modele XGBoost...")
clf.fit(X_tr, y_tr, sample_weight=sample_weights)

# ========== PREDICTION ==========
print("\nPrediction sur l'ensemble de validation...")
y_pred_proba = clf.predict_proba(X_val)

# ========== PER-LABEL THRESHOLD OPTIMIZATION ==========
# NOTE(review): thresholds are tuned on the validation set and the metrics
# below are computed on that same set, so the scores are likely optimistic.
print("\nOptimisation des seuils par label...")
thresholds = optimize_thresholds_per_label(y_val, y_pred_proba)

# Apply the thresholds (one threshold per label column, broadcast over rows).
y_pred_bin = (y_pred_proba > thresholds).astype(int)

# ========== EVALUATION ==========
macro_f1 = f1_score(y_val, y_pred_bin, average="macro", zero_division=0)
roc_auc = roc_auc_score(y_val, y_pred_proba, average="macro")
precision = precision_score(y_val, y_pred_bin, average="macro", zero_division=0)
recall = recall_score(y_val, y_pred_bin, average="macro", zero_division=0)

print("\nResultats d'evaluation:")
print(f"  • Macro F1   : {macro_f1:.4f}")
print(f"  • ROC-AUC    : {roc_auc:.4f}")
print(f"  • Precision  : {precision:.4f}")
print(f"  • Recall     : {recall:.4f}")

# ========== WORST-PERFORMING LABELS ==========
# Note: zero_division=1 here, whereas the macro F1 above uses zero_division=0.
f1_per_label = f1_score(y_val, y_pred_bin, average=None, zero_division=1)
# Indices of the five labels with the lowest F1.
worst_idx = np.argsort(f1_per_label)[:5]
print("\nTop 5 labels avec F1 le plus bas:")
for idx in worst_idx:
    print(f"   Label {idx:3d}: F1 = {f1_per_label[idx]:.4f}")

## Sur 100 % de la base de train + les seuils sauvegardés plus haut

In [None]:
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn import set_config
import numpy as np
import os
import joblib

# ========== CONFIGURATION ==========
output_dir = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/Modelisation 4/"
output_dir_1 = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/Modelisation 5/"

# ========== LOAD THE OPTIMIZED THRESHOLDS ==========
# Thresholds are read from output_dir, while the data below comes from output_dir_1.
thresholds_path = os.path.join(output_dir, "optimized_thresholds_per_label.npy")
thresholds = np.load(thresholds_path)
print(f"Chargement de {len(thresholds)} seuils optimises depuis validation")

# ========== DATA LOADING ==========
print("Chargement de X_full_final et y_final...")
# X_full is only used for the shape printout in this cell; training uses X_combined.
X_full = joblib.load(os.path.join(output_dir_1, "X_full_final.pkl"))  # TF-IDF + stats
y_final = joblib.load(os.path.join(output_dir_1, "y_final.pkl"))
X_combined = joblib.load(os.path.join(output_dir_1, "X_combined.pkl"))
print(f"X_full shape (stats + TF-IDF): {X_full.shape}")
print(f"y_final shape: {y_final.shape}")

# ========== SAMPLE WEIGHTS ==========
print("\nCalcul des poids d'echantillons...")
# Every sample starts with weight 1; each positive label it carries adds that
# label's 'balanced' positive-class weight.
sample_weights = np.ones(y_final.shape[0])
# Use the underlying array when y_final exposes `.values` (e.g. a DataFrame).
y_arr = y_final.values if hasattr(y_final, "values") else y_final

for i in range(y_arr.shape[1]):
    col = y_arr[:, i]
    # Skip labels that take a single value over the whole training set.
    if len(np.unique(col)) < 2:
        continue
    weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=col)
    # col is 0/1, so only samples positive for this label receive the extra weight.
    sample_weights += col * weights[1]

# ========== FINAL TRAINING ==========
# Same hyper-parameters as the train/validation run.
# NOTE(review): tree_method='gpu_hist' and use_label_encoder are deprecated in
# recent XGBoost releases — confirm against the installed version.
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='gpu_hist',
    reg_alpha=0.5,
    reg_lambda=1.0,
    gamma=0.2,
    use_label_encoder=False,
    verbosity=0
)

# One binary XGBoost classifier per label column.
clf = OneVsRestClassifier(xgb, n_jobs=-1)

# Sample-weight routing: enable scikit-learn metadata routing and request
# sample_weight on the base estimator so the weights passed to clf.fit are
# forwarded to each per-label fit.
set_config(enable_metadata_routing=True)
xgb.set_fit_request(sample_weight=True)

print("\nEntrainement final du modele XGBoost...")
# Fit on the full training matrix (no validation hold-out).
clf.fit(X_combined, y_arr, sample_weight=sample_weights)

# Save the model, and copy the thresholds loaded above next to it in output_dir_1.
joblib.dump(clf, os.path.join(output_dir_1, "xgb_model_full.pkl"))
np.save(os.path.join(output_dir_1, "optimized_thresholds_per_label.npy"), thresholds)

print("\nModele final entraine avec succes.")
print("Note: Les seuils sont conserves depuis la validation, pas de re-optimisation.")

# Partie 6 : Prédiction sur le test

## Préparation de la base pour le testset

In [None]:
import os
import torch
import joblib
import pandas as pd
from tqdm import tqdm
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# ========== CONFIGURATION ==========
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/Modelisation 4/"
OUTPUT_DIR_1 = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/Modelisation 5/"
EMBEDDING_DIR_TEST = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/instruction_embeddings_test_codebert_v2/"
TFIDF_PATH = os.path.join(OUTPUT_DIR_1, "tfidf_vectorizer.pkl")
SVD_PATH = os.path.join(OUTPUT_DIR_1, "svd_model_tfidf.pkl")

# ========== MODEL LOADING ==========
# Reuse the previously saved vectorizer and SVD: they are only applied
# (transform) to the test data, never refitted.
print("Chargement des modèles vectorizer et SVD...")
vectorizer = joblib.load(TFIDF_PATH)
svd = joblib.load(SVD_PATH)

# ========== APPLY TF-IDF + SVD ==========
print("Application TF-IDF + SVD sur l'ensemble test...")
# NOTE(review): requires df_test to already have a 'text_cleaned' column; the
# 200000 cutoff matches max_text_len in the TF-IDF fitting cell.
df_test["text_short"] = df_test["text_cleaned"].str[:200000]  # Truncate to save RAM
X_tfidf_test = vectorizer.transform(df_test["text_short"])
# Rows of X_svd_test follow the row order of df_test at this point.
X_svd_test = svd.transform(X_tfidf_test)

# ========== CHARGEMENT DES EMBEDDINGS TEST ==========
def load_embeddings_test(embedding_dir):
    """Load every ``embedding_*.pt`` file of the test directory.

    Unlike the training loader, no validation is done: a file missing the
    "ids" or "embeddings" key raises ``KeyError`` and any load error
    propagates to the caller.

    Args:
        embedding_dir: Directory containing the ``embedding_*.pt`` files.

    Returns:
        A DataFrame with one row per graph and two columns: "name" (graph ID)
        and "embedding" (the vector as a NumPy array).
    """
    shard_names = [
        entry for entry in os.listdir(embedding_dir)
        if entry.startswith("embedding_") and entry.endswith(".pt")
    ]

    records = []
    for shard in tqdm(shard_names, desc="Chargement des embeddings test"):
        payload = torch.load(os.path.join(embedding_dir, shard), map_location="cpu")
        graph_ids = payload["ids"]
        vectors = payload["embeddings"]
        # IDs and vectors are paired by position.
        records.extend((gid, vec.numpy()) for gid, vec in zip(graph_ids, vectors))
    return pd.DataFrame(records, columns=["name", "embedding"])

# Load the test embeddings and expand each vector into emb_0 ... emb_{d-1} columns.
embedding_test_df = load_embeddings_test(EMBEDDING_DIR_TEST)
embedding_test_exp = pd.DataFrame(embedding_test_df["embedding"].to_list())
embedding_test_exp.columns = [f"emb_{i}" for i in range(embedding_test_exp.shape[1])]
# Put the graph names back as the first column, next to the expanded vectors.
embedding_test_final = pd.concat([embedding_test_df["name"], embedding_test_exp], axis=1)

# ========== DATA FUSION ==========
print("Fusion embedding + stats + tfidf_svd...")

# X_svd_test was computed from df_test *before* any filtering, so attach the
# graph names to the SVD rows now, while both still have the same number of
# rows in the same order. Doing it after the filter below breaks with a
# length mismatch as soon as one test graph has no embedding.
svd_cols = [f"svd_{i}" for i in range(X_svd_test.shape[1])]
df_svd_test = pd.DataFrame(X_svd_test, columns=svd_cols)
df_svd_test["name"] = df_test["name"].values

# Keep only the graphs that have an embedding, then attach the embeddings.
df_test = df_test[df_test["name"].isin(embedding_test_final["name"])].reset_index(drop=True)
df_test_merged = pd.merge(df_test, embedding_test_final, on="name")

# Attach the SVD components; joining on 'name' discards the SVD rows of
# graphs removed by the filter above.
df_test_merged = pd.merge(df_test_merged, df_svd_test, on="name")

# ========== FINAL TEST MATRIX ==========
# Column order: embeddings, then statistics, then TF-IDF/SVD components.
embedding_cols = [col for col in df_test_merged.columns if col.startswith("emb_")]
stat_cols = [col for col in X_stats_test.columns if col != "name"]
X_test_final = df_test_merged[embedding_cols + stat_cols + svd_cols].values

print(f"Matrice test finale: {X_test_final.shape}")

## Prédiction et génération des submissions

In [None]:
import joblib
import numpy as np
import pandas as pd
from google.colab import files
import os

# ========== PATH CONFIGURATION ==========
output_dir = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/Modelisation 4/"
output_dir_1 = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/Modelisation 5/"
submission_template_path = "/content/drive/MyDrive/Colab Notebooks/Data/SDC/test_set_metadata_to_predict.xlsx"

# ========== MODEL LOADING ==========
clf = joblib.load(os.path.join(output_dir_1, "xgb_model_full.pkl"))
thresholds = np.load(os.path.join(output_dir_1, "optimized_thresholds_per_label.npy"))

# ========== TEST DATA LOADING ==========
# NOTE(review): X_test comes from output_dir_1 while the names come from
# output_dir; the rows must be in the same order for the IDs assigned below to
# match the predictions — confirm both files come from the same run.
X_test = joblib.load(os.path.join(output_dir_1, "X_test_final.pkl"))
df_test = joblib.load(os.path.join(output_dir, "df_test_merged_text_stats.pkl"))  # holds the 'name' values of X_test
test_ids_predicted = df_test["name"].astype(str).tolist()

# ========== TEMPLATE LOADING ==========
# The template provides the full list of expected IDs and the label column names.
submission_df = pd.read_excel(submission_template_path)
test_ids_full = submission_df['name'].astype(str).tolist()

# ========== PREDICTION ==========
print("Prediction des probabilites sur l'ensemble test...")
y_pred_proba = clf.predict_proba(X_test)
# Binarize with the per-label thresholds (broadcast over rows).
y_pred = (y_pred_proba > thresholds).astype(int)

# ========== BUILD THE SUBMISSION FILE ==========
# Label columns are taken from the template, skipping its first column
# (assumed to be 'name' — TODO confirm against the template).
submission_array = pd.DataFrame(y_pred, columns=submission_df.columns[1:])
submission_array.insert(0, "name", test_ids_predicted)

# ========== HANDLE MISSING IDS ==========
# IDs present in the template but absent from the predictions get an all-zero row.
missing_ids = set(test_ids_full) - set(test_ids_predicted)
if missing_ids:
    print(f"Attention: {len(missing_ids)} graph_id sans embedding. Remplissage par des 0.")
    zero_df = pd.DataFrame(0, index=range(len(missing_ids)), columns=submission_df.columns)
    zero_df["name"] = list(missing_ids)
    submission_array = pd.concat([submission_array, zero_df], axis=0)

# ========== REORDERING ==========
# Put the rows back in the template's order.
submission_array = submission_array.set_index("name").reindex(test_ids_full).reset_index()

# ========== SAVE ==========
submission_file = "submission_xgb_08.xlsx"
submission_array.to_excel(submission_file, index=False)
print(f"Fichier de soumission cree: {submission_file}")

# ========== DOWNLOAD ==========
# Trigger a browser download from the Colab runtime.
files.download(submission_file)

## Evaluation par des métriques

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import pandas as pd
import numpy as np

# ===== PREDICTION ON THE VALIDATION SET =====
# NOTE(review): clf, thresholds, X_val and y_val come from earlier cells. If
# the cell that trains on 100% of the data (or the one that reloads
# xgb_model_full.pkl) ran last, clf has already seen the validation rows —
# confirm which model is meant to be evaluated here.
y_pred_proba = clf.predict_proba(X_val)
y_pred_bin = (y_pred_proba > thresholds).astype(int)

# ===== PER-LABEL METRICS =====
# average=None returns one score per label column.
f1 = f1_score(y_val, y_pred_bin, average=None, zero_division=0)
precision = precision_score(y_val, y_pred_bin, average=None, zero_division=0)
recall = recall_score(y_val, y_pred_bin, average=None, zero_division=0)
# Fraction of validation samples predicted correctly, computed per label.
accuracy = np.mean(y_val == y_pred_bin, axis=0)

# ===== LABEL NAMES =====
# Use the label column names when y_final has them, otherwise generic label_<i> names.
label_names = y_final.columns if hasattr(y_final, "columns") else [f"label_{i}" for i in range(y_val.shape[1])]

# ===== BUILD THE DATAFRAME =====
# One row per label, sorted from best to worst F1.
metrics_df = pd.DataFrame({
    "label": label_names,
    "f1_score": f1,
    "precision": precision,
    "recall": recall,
    "accuracy": accuracy
}).sort_values(by="f1_score", ascending=False).reset_index(drop=True)

# Show the top 10
print("Top 10 labels par F1-score:")
print(metrics_df.head(10))

# ===== EXCEL EXPORT =====
# `os` and `output_dir` are not defined in this cell; they come from earlier cells.
output_path = os.path.join(output_dir, "metrics_per_label_val_05.xlsx")
metrics_df.to_excel(output_path, index=False)
print(f"Metriques par label sauvegardees dans: {output_path}")