In [9]:
from utils import (
    _to_dict,
    _donut,
    _pie,
    _line_graph,
    _other_fields,
    _exitcode_info
)
from datetime import datetime, date, timedelta
from pyspark.sql.functions import (
    col,
    lit,
    when,
    sum as _sum,
    count as _count,
    first,
    date_format,
    from_unixtime,
    to_date
)
import numpy as np
import pandas as pd
from pyspark.sql.types import (
    StructType,
    LongType,
    StringType,
    StructField,
    DoubleType,
    IntegerType,
)

In [None]:
pip install plotly

Which are the 5 (or 10) most used datasets [DESIRED_CMSDataset] in the last week/month [RecordTime], etc.? How much CPU time ([CpuTimeHr] * [RequestCpus] = [CoreHr], [WallClockHr]) was spent on those? Which fraction of the total is that? How big are those datasets [Size]? How many users/tasks hit each dataset [User] [CRAB_LumiMask]?

In [10]:
def _get_schema():
    """Build the Spark schema used to read the condor JSON records.

    The returned schema has a single top-level struct column named ``data``
    whose fields are listed in ``field_specs`` below as
    ``(name, spark type, nullable)`` triples.
    """
    field_specs = [
        ("RecordTime", LongType(), False),
        ("InputData", StringType(), True),
        ("Status", StringType(), True),
        ("DESIRED_CMSDataset", StringType(), True),
        ("CpuTimeHr", DoubleType(), True),
        ("RequestCpus", LongType(), True),
        ("GlobalJobId", StringType(), False),
        ("CMS_SubmissionTool", StringType(), True),
    ]
    data_struct = StructType(
        [
            StructField(field_name, field_type, nullable=is_nullable)
            for field_name, field_type, is_nullable in field_specs
        ]
    )
    return StructType([StructField("data", data_struct)])

In [11]:
# Default base folder for the input records; daily sub-folders are addressed
# underneath it as <base>/YYYY/MM/DD by get_candidate_files.
_DEFAULT_HDFS_FOLDER = "/project/monitoring/archive/condor/raw/metric"

In [12]:
def get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER):
    """Return the existing daily folders that may hold records for a date range.

    The requested range is padded by three days on each side and one path of
    the form ``<base>/YYYY/MM/DD`` is generated per day. The padded upper
    bound is exclusive, so the last folder returned is for ``end_date`` plus
    two days. Only paths for which the HDFS glob returns a non-empty result
    are kept.

    Args:
        start_date: first day of interest (``datetime``/``date``-like).
        end_date: end of the period of interest (``datetime``/``date``-like).
        spark: active SparkSession; its JVM gateway is used to query HDFS.
        base: base folder containing the ``YYYY/MM/DD`` sub-folders.

    Returns:
        list[str]: the candidate folder paths that exist, in chronological order.
    """
    # Pad the window on both sides.
    # NOTE(review): presumably because records can be stored in folders of
    # neighbouring days — confirm the intended margin.
    st_date = start_date - timedelta(days=3)
    ed_date = end_date + timedelta(days=3)
    days = (ed_date - st_date).days

    candidate_files = [
        f"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}"
        for i in range(days)
    ]

    # Reach the Hadoop FileSystem API through the py4j gateway of the Spark context.
    sc = spark.sparkContext
    jvm = sc._gateway.jvm
    FileSystem = jvm.org.apache.hadoop.fs.FileSystem
    URI = jvm.java.net.URI
    Path = jvm.org.apache.hadoop.fs.Path
    fs = FileSystem.get(URI("hdfs:///"), sc._jsc.hadoopConfiguration())

    # Keep only the folders the glob actually matches.
    return [url for url in candidate_files if fs.globStatus(Path(url))]


In [13]:
# Explicit schema used when reading the JSON records.
schema = _get_schema()
# Analysis window; the RecordTime filter applied when building raw_df treats
# start_date as inclusive and end_date as exclusive.
# NOTE(review): these datetimes are naive, so .timestamp() interprets them in the
# driver's local timezone rather than UTC — confirm this is intended.
start_date = datetime(2022, 5, 1)
end_date = datetime(2022, 5, 8)

In [26]:
# Preview the daily input folders that exist for the (padded) analysis window.
get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)

['/project/monitoring/archive/condor/raw/metric/2022/04/28',
 '/project/monitoring/archive/condor/raw/metric/2022/04/29',
 '/project/monitoring/archive/condor/raw/metric/2022/04/30',
 '/project/monitoring/archive/condor/raw/metric/2022/05/01',
 '/project/monitoring/archive/condor/raw/metric/2022/05/02',
 '/project/monitoring/archive/condor/raw/metric/2022/05/03',
 '/project/monitoring/archive/condor/raw/metric/2022/05/04',
 '/project/monitoring/archive/condor/raw/metric/2022/05/05',
 '/project/monitoring/archive/condor/raw/metric/2022/05/06',
 '/project/monitoring/archive/condor/raw/metric/2022/05/07',
 '/project/monitoring/archive/condor/raw/metric/2022/05/08',
 '/project/monitoring/archive/condor/raw/metric/2022/05/09',
 '/project/monitoring/archive/condor/raw/metric/2022/05/10']

In [15]:
# Daily folders to read and the reader configured with the common base path.
input_paths = get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)
json_reader = spark.read.option("basePath", _DEFAULT_HDFS_FOLDER)

# RecordTime is compared against epoch milliseconds: [start_date, end_date).
# NOTE(review): start_date/end_date are naive datetimes, so .timestamp() uses the
# driver's local timezone while the session timezone below is UTC — confirm.
window_filter = f"""RecordTime >= {start_date.timestamp() * 1000}
          AND RecordTime < {end_date.timestamp() * 1000}
          """

# Flatten the "data" struct, keep records inside the window and keep a single
# row per GlobalJobId.
raw_df = (
    json_reader.json(input_paths, schema=schema)
    .select("data.*")
    .filter(window_filter)
    .drop_duplicates(["GlobalJobId"])
)

spark.conf.set("spark.sql.session.timeZone", "UTC")

In [27]:
# Show the flattened columns available after selecting "data.*".
raw_df.printSchema()

root
 |-- RecordTime: long (nullable = true)
 |-- InputData: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- DESIRED_CMSDataset: string (nullable = true)
 |-- CpuTimeHr: double (nullable = true)
 |-- RequestCpus: long (nullable = true)
 |-- GlobalJobId: string (nullable = true)
 |-- CMS_SubmissionTool: string (nullable = true)



In [18]:
# List the distinct submission tools present in the window.
# show() only prints the table and returns None, so its result is not assigned.
raw_df.select(col('CMS_SubmissionTool')).distinct().show()

+-------------------+
| CMS_SubmissionTool|
+-------------------+
|               CRAB|
|InstitutionalSchedd|
|         CMSConnect|
|            WMAgent|
|          Condor_SI|
+-------------------+



### df1 - time range 05/01 - 05/08 (7 days) - summed CpuTimeHr per day and dataset

In [25]:
# Two-digit day of month derived from RecordTime (epoch milliseconds).
# Only the day is kept, so the month is not part of the bucket key.
record_day = date_format(from_unixtime(col('RecordTime') / 1000), "dd")

# CPU hours summed per (day, dataset).
df1 = (
    raw_df.withColumn("timestamp", record_day)
    .select(col('timestamp'), col('DESIRED_CMSDataset'), col('CpuTimeHr'))
    .groupby(col('timestamp'), col('DESIRED_CMSDataset'))
    .agg(_sum("CpuTimeHr").alias("Sum_CpuTimeHr"))
)


In [28]:
# Show the columns of the per-day, per-dataset aggregate.
df1.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- DESIRED_CMSDataset: string (nullable = true)
 |-- Sum_CpuTimeHr: double (nullable = true)



In [59]:
import pyspark.sql.functions as F

# Total CPU hours over every day and dataset in df1 (single-row aggregate).
week_total_row = df1.agg(F.sum("Sum_CpuTimeHr")).collect()[0]
week_Sum = week_total_row[0]

In [98]:
# CPU hours per day, collected to the driver as Rows sorted by the day string.
daily_Sum_CpuTimeHr = (
    df1.groupby(col('timestamp'))
    .agg(F.sum("Sum_CpuTimeHr"))
    .orderBy(col('timestamp'))
    .collect()
)

In [99]:
# Display the collected per-day totals.
daily_Sum_CpuTimeHr

[Row(timestamp='01', sum(Sum_CpuTimeHr)=3237665.5605555526),
 Row(timestamp='02', sum(Sum_CpuTimeHr)=2405833.891111112),
 Row(timestamp='03', sum(Sum_CpuTimeHr)=2780124.6286111106),
 Row(timestamp='04', sum(Sum_CpuTimeHr)=1640770.8308333335),
 Row(timestamp='05', sum(Sum_CpuTimeHr)=1856863.838611111),
 Row(timestamp='06', sum(Sum_CpuTimeHr)=2425978.531666666),
 Row(timestamp='07', sum(Sum_CpuTimeHr)=2002801.1166666658),
 Row(timestamp='30', sum(Sum_CpuTimeHr)=570767.4969444439)]

In [41]:
# Register df1 as a temporary view so it can be queried by name from spark.sql.
df1.createOrReplaceTempView("df1")

In [47]:
# Day buckets (two-digit day of month) in the order they should appear in the
# result: 30 April first, then 1-7 May.
_top5_days = ['30', '01', '02', '03', '04', '05', '06', '07']

# One "top 5 datasets by CPU hours" sub-query per day, concatenated with
# UNION ALL. Generated from the day list instead of being written out by hand.
_top5_query = " UNION ALL ".join(
    f"(SELECT * FROM df1 WHERE df1.timestamp=='{day}' ORDER BY df1.Sum_CpuTimeHr DESC LIMIT 5)"
    for day in _top5_days
)

# NOTE(review): later cells rely on the rows coming back in sub-query order —
# confirm the UNION ALL output order is stable for this use.
daily_Top5_CpuTimeHr = spark.sql(_top5_query)

In [86]:
# Sunburst labels: the root, one label per day, then five "Top N" leaves per day.
day_codes = ['30', '01', '02', '03', '04', '05', '06', '07']
Labels = ['May 1st Week', '30/04', '01/05','02/05', '03/05', '04/05', '05/05', '06/05', '07/05']
Labels += [
    '%s - Top%d' % (day_code, rank)
    for day_code in day_codes
    for rank in range(1, 6)
]

In [119]:
# Sunburst values, in the same order as the labels: weekly total, then the
# daily totals for 30/04 and 01/05-07/05, then the per-day top-5 values.
# The daily totals are looked up by day instead of by list position, so the
# result does not depend on how daily_Sum_CpuTimeHr happens to be sorted; a
# missing day raises KeyError instead of silently misaligning the chart.
_day_order = ['30', '01', '02', '03', '04', '05', '06', '07']
_daily_by_day = {
    row['timestamp']: row['sum(Sum_CpuTimeHr)'] for row in daily_Sum_CpuTimeHr
}

Values = [week_Sum]
Values.extend(_daily_by_day[day] for day in _day_order)
Values.extend(daily_Top5_CpuTimeHr_list)

In [93]:
# Sunburst parents: the root has no parent, each of the 8 days hangs off the
# root, and each day owns 5 leaves.
day_labels = ['30/04', '01/05','02/05', '03/05', '04/05', '05/05', '06/05', '07/05']
Parents = (
    [""]
    + ["May 1st Week"] * 8
    + [day_label for day_label in day_labels for _ in range(5)]
)

In [116]:
# Hover text per sunburst node: the node name (week, day, or dataset name for
# the top-5 leaves) followed by its CPU time.
Hover = ['May 1st Week', '30/04', '01/05','02/05', '03/05', '04/05', '05/05', '06/05', '07/05']
Hover.extend(daily_Top5_Dataset_list)

# Every node needs exactly one value; fail loudly instead of relying on a
# hard-coded node count.
if len(Hover) != len(Values):
    raise ValueError(
        "Hover has %d entries but Values has %d" % (len(Hover), len(Values))
    )

Hover = [
    f'{node_name}{" - CPU Time: %.3f" % node_value}'
    for node_name, node_value in zip(Hover, Values)
]

In [67]:
# Sanity check: sum of every entry in Values except the first one.
x = sum(Values)-Values[0]

In [68]:
# Sanity check: how far that remainder is from the first entry of Values.
x - Values[0]

-97856.37749999762

In [71]:
# Flat Python list of the Sum_CpuTimeHr values of the top-5 rows.
daily_Top5_CpuTimeHr_list = [
    top_row[0]
    for top_row in daily_Top5_CpuTimeHr.select('Sum_CpuTimeHr').collect()
]

22/07/20 15:26:02 ERROR YarnScheduler: Lost executor 136 on ithdp1111.cern.ch: Container from a bad node: container_e136_1658221000868_4354_01_000178 on host: ithdp1111.cern.ch. Exit status: 137. Diagnostics: [2022-07-20 15:26:01.975]Container killed on request. Exit code is 137
[2022-07-20 15:26:02.023]Container exited with a non-zero exit code 137. 
[2022-07-20 15:26:02.027]Killed by external signal
.
22/07/20 15:26:02 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 136 for reason Container from a bad node: container_e136_1658221000868_4354_01_000178 on host: ithdp1111.cern.ch. Exit status: 137. Diagnostics: [2022-07-20 15:26:01.975]Container killed on request. Exit code is 137
[2022-07-20 15:26:02.023]Container exited with a non-zero exit code 137. 
[2022-07-20 15:26:02.027]Killed by external signal
.
22/07/20 15:26:02 WARN TaskSetManager: Lost task 80.0 in stage 47.0 (TID 9426) (ithdp1111.cern.ch executor 136): ExecutorLostFailure (executor 136

In [72]:
# Flat Python list of the dataset names of the top-5 rows.
daily_Top5_Dataset_list = [
    top_row[0]
    for top_row in daily_Top5_CpuTimeHr.select('DESIRED_CMSDataset').collect()
]

22/07/20 15:30:44 ERROR YarnScheduler: Lost executor 127 on ithdp5006.cern.ch: Container from a bad node: container_e136_1658221000868_4354_01_000165 on host: ithdp5006.cern.ch. Exit status: 137. Diagnostics: [2022-07-20 15:30:44.752]Container killed on request. Exit code is 137
[2022-07-20 15:30:44.808]Container exited with a non-zero exit code 137. 
[2022-07-20 15:30:44.811]Killed by external signal
.
22/07/20 15:30:44 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 127 for reason Container from a bad node: container_e136_1658221000868_4354_01_000165 on host: ithdp5006.cern.ch. Exit status: 137. Diagnostics: [2022-07-20 15:30:44.752]Container killed on request. Exit code is 137
[2022-07-20 15:30:44.808]Container exited with a non-zero exit code 137. 
[2022-07-20 15:30:44.811]Killed by external signal
.
22/07/20 15:30:44 WARN TaskSetManager: Lost task 83.0 in stage 63.0 (TID 11372) (ithdp5006.cern.ch executor 127): ExecutorLostFailure (executor 12

In [76]:
# Replace missing dataset names with the literal string "Null", in place.
for position, dataset_name in enumerate(daily_Top5_Dataset_list):
    if dataset_name is None:
        daily_Top5_Dataset_list[position] = "Null"
        

In [121]:
import plotly.graph_objects as go

# Sunburst of CPU time: week -> day -> top-5 datasets of that day.
# branchvalues="total" is passed so each parent's value is taken as given
# rather than recomputed from its children.
sunburst_trace = go.Sunburst(
    name="",
    labels=Labels,
    parents=Parents,
    values=Values,
    hovertemplate=Hover,
    branchvalues="total",
)

fig = go.Figure(sunburst_trace)
# Remove the outer margins so the chart fills the output area.
fig.update_layout(margin={"t": 0, "l": 0, "r": 0, "b": 0})

fig.show()