# CPU & Memory

- Modification : filtrage des valeurs égales à 0
- Ajout de `.cache()`

# Cleaning

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, BooleanType

# Get the active Spark session, or create one named for this analysis
spark = (
    SparkSession.builder
    .appName("Analyse Memory & CPU Usage (Jobs & Tasks)")
    .getOrCreate()
)

In [26]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    BooleanType,
    FloatType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)

# Get the active Spark session, or create one named for this analysis
spark = (
    SparkSession.builder
    .appName("Analyse Memory & CPU Usage (Jobs & Tasks)")
    .getOrCreate()
)

# Explicit schema for the header-less task_events CSV files (13 columns,
# all nullable).
#
# "timestamp", "job ID" and "machine ID" are declared as LongType (64-bit):
# with a 32-bit IntegerType, any value above 2**31 - 1 cannot be parsed and
# is read as null, and rows with a null "job ID" are then dropped by the
# cleaning step, silently losing data.
task_events_schema = StructType([
    StructField("timestamp", LongType(), True),
    StructField("missing info", IntegerType(), True),
    StructField("job ID", LongType(), True),
    StructField("task index", IntegerType(), True),
    StructField("machine ID", LongType(), True),
    StructField("event type", IntegerType(), True),
    StructField("user name", StringType(), True),
    StructField("scheduling class", IntegerType(), True),
    StructField("priority", IntegerType(), True),
    StructField("resource request for CPU cores", FloatType(), True),
    StructField("resource request for RAM", FloatType(), True),
    StructField("resource request for local disk space", FloatType(), True),
    # NOTE(review): if this column is stored as 0/1 rather than true/false,
    # BooleanType will presumably read it as null -- confirm against the data.
    StructField("different-machine constraint", BooleanType(), True),
])

# Glob pattern matching every CSV file in the task_events directory
csv_directory_task_events = "/mnt/lustre/scratch/nlsas/home/ulc/cursos/curso341/Big_Data_Frameworks/data/task_events/*.csv"

# Load the CSV files (no header row) using the explicit schema
df = (
    spark.read
    .schema(task_events_schema)
    .option("header", "false")
    .csv(csv_directory_task_events)
)


# Clean the data: drop rows whose job ID is null
df_cleaned = df.filter(F.col("job ID").isNotNull())

# Keep only the columns needed for the priority check
df_cleaned_selected = df_cleaned.select("job ID", "task index", "priority")

# Check whether each task (job ID, task index) has a single priority:
#   unique_priorities -> number of distinct priority values seen for the task
#   number of lines   -> number of rows with a non-null priority for the task
df_task_priority_check = (
    df_cleaned_selected
    .groupBy("job ID", "task index")
    .agg(
        F.countDistinct("priority").alias("unique_priorities"),
        F.count("priority").alias("number of lines"),
    )
    .orderBy("job ID", "task index")
    # Two actions run on this result below; caching lets Spark reuse the
    # aggregated rows instead of re-reading and re-shuffling the CSV data.
    .cache()
)

# Show the tasks that have more than one priority
df_task_priority_check.filter(F.col("unique_priorities") > 1).show()
# Show the first 100 tasks of the ordered result
df_task_priority_check.show(100)

                                                                                

+------+----------+-----------------+---------------+
|job ID|task index|unique_priorities|number of lines|
+------+----------+-----------------+---------------+
+------+----------+-----------------+---------------+





+-------+----------+-----------------+---------------+
| job ID|task index|unique_priorities|number of lines|
+-------+----------+-----------------+---------------+
|3418309|         0|                1|              5|
|3418309|         1|                1|              2|
|3418314|         0|                1|              2|
|3418314|         1|                1|              5|
|3418319|         0|                1|              8|
|3418319|         1|                1|              2|
|3418324|         0|                1|              8|
|3418324|         1|                1|              2|
|3418329|         0|                1|              5|
|3418329|         1|                1|              2|
|3418329|         2|                1|              8|
|3418334|         0|                1|              2|
|3418334|         1|                1|              2|
|3418339|         0|                1|              8|
|3418339|         1|                1|              2|
|3418339| 

                                                                                

In [3]:
# Stop the Spark session once the analysis is finished
spark.stop()