In [1]:
import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import os
from io import StringIO
import re

from networkx.drawing.nx_pydot import read_dot
from sklearn.feature_extraction.text import TfidfVectorizer



# 1. Exploration

In [None]:
import os

# Folder holding the .json files to inspect.
json_path = "./data"
files = os.listdir(json_path)

# Keep only the directory entries whose name ends with ".json".
json_files = list(filter(lambda entry: entry.endswith('.json'), files))

# Print a short preview of every file.
# Note: `file_content` stays defined after the loop and then holds the text
# of the last file that was read.
for json_file in json_files:
    json_file_path = os.path.join(json_path, json_file)

    # Read the whole file as plain text; the handle is closed before printing.
    with open(json_file_path, 'r') as file:
        file_content = file.read()

    if file_content.strip():
        print(f"Contenu du fichier {json_file} :")
        print(file_content[:1000])  # first 1000 characters only
    else:
        # The file is empty or contains nothing but whitespace.
        print(f"Le fichier {json_file} est vide.")


In [None]:
import os
import graphviz

# Make the Graphviz binaries discoverable. The directory is appended only when
# it is not already on PATH, so re-running this cell does not keep growing PATH.
# NOTE(review): this is a machine-specific Windows install location.
graphviz_bin = r"C:\Program Files (x86)\Graphviz\bin"
if graphviz_bin not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + graphviz_bin

# NOTE(review): `file_content` is not defined in this cell; it is whatever an
# earlier cell last read from disk. Confirm it is the DOT source to render.
dot_source = file_content

# Render the DOT source to output/graph.png and open it in the default viewer.
graph = graphviz.Source(dot_source)
graph.render('output/graph', format='png', view=True)
# graph.render('output/graph', format='pdf', view=True)  # fallback when the PNG is too heavy to render



'graph.pdf'

In [14]:
# Load the training-set metadata from a semicolon-separated CSV file.
# NOTE(review): the same file is read again by another cell of this notebook.
metadata_train = pd.read_csv("data/training_set_metadata.csv", sep = ";")

In [173]:
metadata_train.head(5)

Unnamed: 0,name,64-bit execution via heavens gate,64bits,PEB access,accept command line arguments,access the Windows event log,act as TCP client,allocate RW memory,allocate RWX memory,allocate memory,...,winzip,wise,worm,write and execute a file,write clipboard data,write file on Linux,write file on Windows,write pipe,xorcrypt,yoda
0,9fbf213113ba0a18dc2642f83b1201541428fd7951d6a8...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1b35c9dbf3cd9ac60015aaa6cd451c898defa6dac1ff43...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,bf8d307a136a936f7338c1f2eec773c4eb1c802cab77da...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1e51933903f0358c0b635f863368eb15a61cd3442bc5bf...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8a6503fe68d699f8a31531c157e9da931192cd7e3ec809...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Build a ydata-profiling report object from the metadata table.
from ydata_profiling import ProfileReport
profil = ProfileReport(metadata_train)

In [None]:
# Write the profiling report to an HTML file in the working directory.
profil.to_file("rapport_metadata_train.html")

# 2. Extraire informations binaires

In [3]:
# Load the training-set metadata from a semicolon-separated CSV file.
# NOTE(review): an earlier cell reads the same file into the same variable; one load should be enough.
metadata_train = pd.read_csv("data/training_set_metadata.csv", sep = ";")

In [4]:
# Names of every column except the first one.
# Presumably the first column is `name` and the rest are the binary behaviour labels — TODO confirm.
behaviors = metadata_train.columns.tolist()[1:]

# 3. Extraire information digraph

In [5]:
def read_graph_from_dot(dot_file_path):
    """Parse the DOT file at ``dot_file_path`` and return the resulting graph.

    Parsing is delegated to ``read_dot`` from ``networkx.drawing.nx_pydot``.
    """
    return read_dot(dot_file_path)


In [6]:
# Folder holding the training-set graph files.
# NOTE(review): absolute, machine-specific path. This rebinds `json_path` and
# `files`, which the exploration cell also assigns.
json_path = r"D:\cours\MOSEF\challenge\nexialog data\folder_training_set"

# Names of all entries in that folder (no filtering on extension).
files = os.listdir(json_path)


In [None]:
files

## 3.1 Utiliser l'information avec une représentation des données comme un graphe

In [10]:
# Split a directed graph into its strongly connected components.
def extract_strongly_connected_components(graph):
    """Return one independent subgraph copy per strongly connected component.

    The number of components found is printed before the subgraphs are built.
    """
    components = [*nx.strongly_connected_components(graph)]
    print(f"Nombre de composantes fortement connexes : {len(components)}")

    # Each component (a set of nodes) becomes its own copied subgraph.
    return [graph.subgraph(nodes).copy() for nodes in components]

# Compute structural features for each subgraph in a collection.
def extract_features_from_subgraphs(subgraphs):
    """Describe every graph in ``subgraphs`` with a dictionary of structural features.

    Parameters
    ----------
    subgraphs : iterable of networkx graphs
        Graphs to describe; each one is processed independently.

    Returns
    -------
    list of dict
        One dictionary per input graph, in input order, with the keys
        ``number_of_nodes``, ``number_of_edges``, ``density``,
        ``average_degree``, ``average_shortest_path_length`` (``None`` unless
        the graph is directed, non-empty and strongly connected),
        ``clustering_coefficient`` (0 for an empty graph),
        ``in_degree_centrality`` and ``out_degree_centrality`` (per-node
        dictionaries; empty dictionaries for undirected graphs).
    """
    features_list = []

    for subgraph in subgraphs:
        node_count = subgraph.number_of_nodes()
        is_multigraph = isinstance(subgraph, (nx.MultiGraph, nx.MultiDiGraph))
        # MultiDiGraph is also an instance of DiGraph, so this covers both.
        is_directed = isinstance(subgraph, nx.DiGraph)

        features = {}
        features['number_of_nodes'] = node_count
        features['number_of_edges'] = subgraph.number_of_edges()
        features['density'] = nx.density(subgraph)

        # Mean degree; 0 for a graph without nodes.
        features['average_degree'] = (
            sum(dict(subgraph.degree()).values()) / node_count if node_count > 0 else 0
        )

        # Strong connectivity is only defined for non-empty directed graphs;
        # guarding here avoids an exception for empty or undirected input.
        if is_directed and node_count > 0 and nx.is_strongly_connected(subgraph):
            features['average_shortest_path_length'] = nx.average_shortest_path_length(subgraph)
        else:
            features['average_shortest_path_length'] = None

        # Clustering is computed on a simple undirected copy for multigraphs.
        # A separate variable is used so `subgraph` itself stays directed for
        # the centrality computations below.
        if node_count == 0:
            features['clustering_coefficient'] = 0
        else:
            clustering_graph = nx.Graph(subgraph) if is_multigraph else subgraph
            features['clustering_coefficient'] = nx.average_clustering(clustering_graph)

        # Degree centralities only exist for directed graphs. Parallel edges are
        # collapsed first so that multi-edges are not counted several times.
        if is_directed:
            directed_graph = nx.DiGraph(subgraph) if is_multigraph else subgraph
            features['in_degree_centrality'] = nx.in_degree_centrality(directed_graph)
            features['out_degree_centrality'] = nx.out_degree_centrality(directed_graph)
        else:
            features['in_degree_centrality'] = {}
            features['out_degree_centrality'] = {}

        features_list.append(features)

    return features_list

def convert_to_digraph(graph):
    """Build a simple ``nx.DiGraph`` from ``graph``, collapsing parallel edges.

    Parameters
    ----------
    graph : networkx graph
        Source graph; it is not modified.

    Returns
    -------
    nx.DiGraph
        A new graph holding every node of ``graph`` (with its attributes) and
        at most one edge per ordered node pair. When several parallel edges
        exist, their attribute dictionaries are merged in iteration order, so
        a later edge overrides the keys it shares with an earlier one.
    """
    G = nx.DiGraph()

    # Copy the nodes explicitly: adding edges alone would silently drop
    # isolated nodes and every node attribute.
    G.add_nodes_from(graph.nodes(data=True))

    # Re-adding an existing (u, v) pair updates its attributes instead of
    # creating a second edge, which is what removes the multi-edges.
    for u, v, data in graph.edges(data=True):
        G.add_edge(u, v, **data)

    return G

def extract_graph_features(graph):
    """
    Compute graph-level structural features of a directed graph.

    A ``MultiDiGraph`` is first collapsed into a ``DiGraph``. The result is a
    dictionary with the keys ``number_of_nodes``, ``number_of_edges``,
    ``density``, ``average_degree``, ``num_strongly_connected_components``,
    ``largest_scc_size``, ``average_shortest_path_length``,
    ``clustering_coefficient``, ``in_degree_centrality`` and
    ``out_degree_centrality`` (the last two are averages over all nodes).
    For a graph without nodes every entry is 0 except
    ``average_shortest_path_length``, which is ``None``.
    """
    # Parallel edges are collapsed before anything is measured.
    if isinstance(graph, nx.MultiDiGraph):
        graph = nx.DiGraph(graph)

    node_count = graph.number_of_nodes()

    # Nothing to measure on an empty graph.
    if node_count == 0:
        return dict(
            number_of_nodes=0,
            number_of_edges=0,
            density=0,
            average_degree=0,
            largest_scc_size=0,
            average_shortest_path_length=None,
            clustering_coefficient=0,
            in_degree_centrality=0,
            out_degree_centrality=0,
            num_strongly_connected_components=0,
        )

    # Strongly connected components and the largest one among them.
    components = list(nx.strongly_connected_components(graph))
    biggest = max(components, key=len, default=[])

    # Mean shortest-path length, measured on the largest component only and
    # only when that component has more than one node.
    mean_path_length = None
    if len(biggest) > 1:
        core = graph.subgraph(biggest).copy()
        if nx.is_strongly_connected(core):
            mean_path_length = nx.average_shortest_path_length(core)

    # Mean clustering coefficient of the undirected version of the graph.
    if node_count > 1:
        clustering = nx.average_clustering(graph.to_undirected())
    else:
        clustering = 0

    # Degree centralities averaged over all nodes.
    mean_in_centrality = sum(nx.in_degree_centrality(graph).values()) / len(graph)
    mean_out_centrality = sum(nx.out_degree_centrality(graph).values()) / len(graph)

    return {
        'number_of_nodes': node_count,
        'number_of_edges': graph.number_of_edges(),
        'density': nx.density(graph),
        'average_degree': sum(dict(graph.degree()).values()) / node_count,
        'num_strongly_connected_components': len(components),
        'largest_scc_size': len(biggest),
        'average_shortest_path_length': mean_path_length,
        'clustering_coefficient': clustering,
        'in_degree_centrality': mean_in_centrality,
        'out_degree_centrality': mean_out_centrality,
    }


In [11]:
json_path

'D:\\cours\\MOSEF\\challenge\\nexialog data\\folder_training_set'

In [12]:
# One dict per metadata row: the graph-level features merged with that row's
# remaining columns.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# `features_and_labels` may hold only part of the rows.
features_and_labels = []
labels_list = []  # NOTE(review): never filled or read in this cell
for index, row in metadata_train.iterrows():
    graph_name = row['name']  # Name of the graph file, without extension
    labels = row.drop('name')  # Every value of the row except the name
    
    file_path = os.path.join(json_path, f"{graph_name}.json")

    # Read the graph
    graph = read_graph_from_dot(file_path)

    # Extract the graph-level features
    features = extract_graph_features(graph)

    # Merge the features with the row's label values
    combined = {**features, **labels.to_dict()}
    
    # Store the complete record
    features_and_labels.append(combined)


KeyboardInterrupt: 

In [14]:
features_and_labels[2]

{'number_of_nodes': 385,
 'number_of_edges': 385,
 'density': 0.0026041666666666665,
 'average_degree': 2.0,
 'num_strongly_connected_components': 385,
 'largest_scc_size': 1,
 'average_shortest_path_length': None,
 'clustering_coefficient': 0.0,
 'in_degree_centrality': 0.0026041666666666526,
 'out_degree_centrality': 0.0026041666666666513,
 '64-bit execution via heavens gate': 0,
 '64bits': 0,
 'PEB access': 0,
 'accept command line arguments': 0,
 'access the Windows event log': 0,
 'act as TCP client': 0,
 'allocate RW memory': 0,
 'allocate RWX memory': 0,
 'allocate memory': 0,
 'allocate or change RW memory': 0,
 'allocate or change RWX memory': 0,
 'allocate thread local storage': 0,
 'android': 0,
 'anorganix': 0,
 'apatch': 0,
 'apk': 0,
 'arm': 0,
 'armadillo': 0,
 'aspack': 0,
 'asprotect': 0,
 'assembly': 0,
 'attach user process memory': 0,
 'attachment': 0,
 'authenticate HMAC': 0,
 'block operations on executable memory pages using Arbitrary Code Guard': 0,
 'bobsoft': 

## 3.2 Utiliser l'information avec une représentation des données comme un texte

In [197]:
# Folder holding the training-set graph files.
# NOTE(review): absolute, machine-specific path, also assigned by an earlier cell.
json_path = r"D:\cours\MOSEF\challenge\nexialog data\folder_training_set"
# Names of all entries in that folder (no filtering on extension).
json_files = os.listdir(json_path)


In [137]:

def extract_text_features(digraph_text):
    """
    Extract textual features from the raw text of a digraph.

    Parameters
    ----------
    digraph_text : str
        Raw content of a graph file.

    Returns
    -------
    dict
        One entry per distinct word found between ``": "`` and ``" :"``
        (for example JCC, CALL, JMP, RET), mapped to its number of
        occurrences, in order of first appearance, followed by
        ``"text_length"``, the number of characters in ``digraph_text``.
    """
    # Local import so the function does not depend on a separate import cell.
    from collections import Counter

    # Instruction types are the words enclosed as ": WORD :".
    instructions = re.findall(r': (\w+) :', digraph_text)

    # Count in a single pass. Counter keeps first-seen order, so the key order
    # (and therefore the DataFrame column order) is the same on every run.
    features = dict(Counter(instructions))

    # Total length of the text.
    features["text_length"] = len(digraph_text)

    return features

In [198]:
# Compute the text features of every file listed in `json_files`.
# NOTE(review): the rows follow the order of `json_files` and carry no file
# identifier — confirm how they are meant to be aligned with the other tables.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt.
text_features_list = []
for json_file in json_files:
    with open(os.path.join(json_path, json_file), 'r') as file:
        digraph_text = file.read()
        text_features = extract_text_features(digraph_text)
        text_features_list.append(text_features)

# Convert to a DataFrame; a feature missing from a file becomes 0.
text_features_df = pd.DataFrame(text_features_list).fillna(0)


KeyboardInterrupt: 

In [200]:
def extract_instructions(text):
    """Return the instruction words found in the ``label = "..."`` entries of ``text``.

    For each label, the word captured is the one enclosed between two colons;
    all captured words are joined with single spaces. The result is an empty
    string when nothing matches.
    """
    label_pattern = r'label\s*=\s*".*?\s*:\s*([\w]+)\s*:'
    return " ".join(re.findall(label_pattern, text))

# """regex"""
# label\s*=\s*" → matches `label = "`, with or without spaces around the `=`.
# .*?\s*:\s*([\w]+)\s*: → lazily skips ahead to the first `: WORD :` group after the opening quote and captures WORD (the instruction type).
# All captured words are then combined into a single space-separated string.
# ***


In [None]:

def _read_text(path):
    """Return the full text of the file at ``path``, closing the file afterwards."""
    with open(path, 'r') as handle:
        return handle.read()

# Read each file and extract its instruction string (one string per file).
# Files are opened through `_read_text` so every handle is closed immediately
# instead of being left open by a bare open(...).read().
instructions_list = [extract_instructions(_read_text(os.path.join(json_path, f))) for f in json_files]

# Sanity check on the first few results
print("Exemple d'instructions extraites :", instructions_list[:5])

In [None]:

# Apply TF-IDF to the extracted instruction strings, one document per file.
# The documents must be the instruction strings in `instructions_list`:
# `json_files` only holds file names, so running the regex over it would
# vectorise the names instead of the file contents.
vectorizer = TfidfVectorizer()
tfidf_features = vectorizer.fit_transform(instructions_list)


TF-IDF Shape: (1, 5)
TF-IDF Features Preview:
    tfidf_call  tfidf_inst  tfidf_jcc  tfidf_jmp  tfidf_ret
0    0.472348    0.199436   0.748759    0.40237   0.120711


In [None]:
# Convert the sparse TF-IDF matrix to a dense DataFrame, one column per vocabulary term.
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=vectorizer.get_feature_names_out())


# 4. Prédiction

In [15]:
# Convert the list of records to DataFrames
features_df = pd.DataFrame(features_and_labels).drop(columns=behaviors)  # Keeps only the feature columns
labels_df = pd.DataFrame(features_and_labels)[behaviors]  # Keeps only the columns listed in `behaviors`


In [None]:
# final_features_df = pd.concat([features_df, text_features_df, tfidf_df], axis=1)

In [16]:

# Check that features and labels have the same number of rows
assert features_df.shape[0] == labels_df.shape[0], "Les features et labels ne correspondent pas en nombre de lignes."


In [20]:


# Split the data into train and test sets (80 % / 20 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features_df, labels_df, test_size=0.2, random_state=42)

# Multi-label model: one random forest per label via MultiOutputClassifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
multi_clf = MultiOutputClassifier(rf_clf, n_jobs=-1)

# Train the model
multi_clf.fit(X_train, y_train)

# Predict on the held-out set
y_pred = multi_clf.predict(X_test)

# Evaluate. Many labels have no positive sample in the test split, so their
# precision/recall are undefined; zero_division=0 reports them as 0 explicitly
# instead of emitting one warning per metric.
print(classification_report(y_test, y_pred, target_names=behaviors, zero_division=0))


                                                                        precision    recall  f1-score   support

                                     64-bit execution via heavens gate       0.00      0.00      0.00         0
                                                                64bits       0.00      0.00      0.00         1
                                                            PEB access       0.00      0.00      0.00         0
                                         accept command line arguments       0.00      0.00      0.00         0
                                          access the Windows event log       0.00      0.00      0.00         0
                                                     act as TCP client       0.00      0.00      0.00         0
                                                    allocate RW memory       0.00      0.00      0.00         0
                                                   allocate RWX memory       0.00      0.00      0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
