written by: Nutchaya Phumekham, Aug 2022

# DataFrame of DESIRED_CMSDataset and CRAB_DataBlock accessed within a given time range for CMS_SubmissionTool == 'CRAB'

In [1]:
from utils import (
    _to_dict,
    _donut,
    _pie,
    _line_graph,
    _other_fields,
    _exitcode_info,
    _better_label
)
from datetime import datetime, date, timedelta
from pyspark.sql.functions import (
    col,
    lit,
    when,
    sum as _sum,
    max as _max,
    min as _min,
    count as _count,
    first,
    date_format,
    from_unixtime,
    to_date,
    countDistinct
)
import numpy as np
import pandas as pd
from pyspark.sql.types import (
    StructType,
    LongType,
    StringType,
    StructField,
    DoubleType,
    IntegerType,
)
import matplotlib.pyplot as plt

In [3]:
def _get_schema():
    """Build the schema used to read the JSON job records.

    Only the attributes this notebook works with are declared; all of them
    are nested under a single top-level ``data`` struct.

    Returns:
        StructType: schema with one ``data`` struct column.
    """
    # (name, Spark type, nullable) of every attribute kept from ``data``.
    field_specs = [
        ("RecordTime", LongType(), False),
        ("DESIRED_CMSDataset", StringType(), True),
        ("GlobalJobId", StringType(), False),
        ("CMS_SubmissionTool", StringType(), True),
        ("CRAB_DataBlock", StringType(), True),
        ("CMSPrimaryDataTier", StringType(), True),
        ("CRAB_Workflow", StringType(), True),
    ]
    data_struct = StructType(
        [
            StructField(name, dtype, nullable=nullable)
            for name, dtype, nullable in field_specs
        ]
    )
    return StructType([StructField("data", data_struct)])

In [4]:
def get_candidate_files(start_date, end_date, spark, base):
    """List the existing daily HDFS folders under ``base`` around a date range.

    The range is widened by 3 days on each side: one ``{base}/YYYY/MM/DD``
    path is generated for every day from ``start_date - 3 days`` up to, but
    not including, ``end_date + 3 days``. Paths that HDFS does not report
    are dropped.

    Args:
        start_date: first day of interest (``date`` or ``datetime``).
        end_date: last day of interest (``date`` or ``datetime``).
        spark: active SparkSession; its JVM gateway is used to query HDFS.
        base: HDFS folder that contains the ``YYYY/MM/DD`` sub-folders.

    Returns:
        list[str]: the daily paths for which ``FileSystem.globStatus``
        returns a non-empty result, in chronological order.
    """
    first_day = start_date - timedelta(days=3)
    last_day = end_date + timedelta(days=3)
    n_days = (last_day - first_day).days

    daily_paths = [
        f"{base}/{(first_day + timedelta(days=offset)).strftime('%Y/%m/%d')}"
        for offset in range(n_days)
    ]

    # Reach the Hadoop FileSystem API through the py4j gateway of the context.
    sc = spark.sparkContext
    jvm = sc._gateway.jvm
    FileSystem = jvm.org.apache.hadoop.fs.FileSystem
    URI = jvm.java.net.URI
    Path = jvm.org.apache.hadoop.fs.Path
    fs = FileSystem.get(URI("hdfs:///"), sc._jsc.hadoopConfiguration())

    # globStatus yields nothing (None or an empty array) for a missing path.
    return [path for path in daily_paths if fs.globStatus(Path(path))]


In [5]:
# Base HDFS folder of the raw condor records; get_candidate_files appends
# one YYYY/MM/DD sub-folder per day to it.
_DEFAULT_HDFS_FOLDER = "/project/monitoring/archive/condor/raw/metric"
# DBS dump files, loaded as CSV further down to derive dataset and block sizes.
HDFS_DBS_FILES = '/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/FILES/part-m-00000'
HDFS_DBS_DATASETS = '/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/DATASETS/part-m-00000'
# NOTE(review): spelled "DFS" while its siblings use "DBS"; the name is kept
# because the cell that loads the BLOCKS dump references it.
HDFS_DFS_BLOCKS = '/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/BLOCKS/part-m-00000'

In [6]:
# Schema applied when the JSON records are read into raw_df.
schema = _get_schema()
# Analysis window [start_date, end_date). These datetimes are naive, so the
# `.timestamp()` calls in the raw_df filter interpret them in the kernel's
# local timezone.
start_date = datetime(2022, 5, 1)
end_date = datetime(2022, 5, 2)

In [7]:
# Sanity check: list the daily input folders that exist for this window.
get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)

['/project/monitoring/archive/condor/raw/metric/2022/04/28',
 '/project/monitoring/archive/condor/raw/metric/2022/04/29',
 '/project/monitoring/archive/condor/raw/metric/2022/04/30',
 '/project/monitoring/archive/condor/raw/metric/2022/05/01',
 '/project/monitoring/archive/condor/raw/metric/2022/05/02',
 '/project/monitoring/archive/condor/raw/metric/2022/05/03',
 '/project/monitoring/archive/condor/raw/metric/2022/05/04']

### Raw DF is filtered for CMS_SubmissionTool == 'CRAB'
- start_date = datetime(2022, 5, 1)
- end_date = datetime(2022, 5, 2)

In [8]:
# Read the candidate daily folders, flatten the `data` struct and keep one
# CRAB record per job inside [start_date, end_date).
raw_df = (
    spark.read
    .option("basePath", _DEFAULT_HDFS_FOLDER)
    .json(
        get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER),
        schema=schema,
    )
    .select("data.*")
    # RecordTime is compared against the window bounds in epoch milliseconds.
    .where(
        f"""CMS_SubmissionTool == 'CRAB'
          AND CMSPrimaryDataTier != 'Unknown'
          AND CRAB_DataBlock IS NOT NULL
          AND RecordTime >= {start_date.timestamp() * 1000}
          AND RecordTime < {end_date.timestamp() * 1000}
          """
    )
    # Keep a single record per GlobalJobId.
    .dropDuplicates(["GlobalJobId"])
)

### d_size_df - dataset size and b_size_df - block size
- HDFS_DBS_FILES = '/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/FILES/part-m-00000'
- HDFS_DBS_DATASETS = '/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/DATASETS/part-m-00000'
- HDFS_DFS_BLOCKS = '/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/BLOCKS/part-m-00000'

In [10]:
from CMSSpark.src.python.CMSSpark import schemas as cms_schemas
# One CSV reader configuration shared by all DBS dumps: the literal string
# 'null' is read as NULL and a malformed row aborts the load (FAILFAST).
csvreader = (
    spark.read.format('csv')
    .option('nullValue', 'null')
    .option('mode', 'FAILFAST')
)

# File-level records: file size plus the ids of the owning block and dataset.
dbs_files = (
    csvreader.schema(cms_schemas.schema_files())
    .load(HDFS_DBS_FILES)
    .select('f_file_size', 'f_block_id', 'f_dataset_id')
    .withColumnRenamed('f_block_id', 'BLOCK_ID')
    .withColumnRenamed('f_dataset_id', 'DATASET_ID')
)

# Dataset id -> dataset name.
dbs_datasets = (
    csvreader.schema(cms_schemas.schema_datasets())
    .load(HDFS_DBS_DATASETS)
    .select('d_dataset_id', 'd_dataset')
    .withColumnRenamed('d_dataset_id', 'DATASET_ID')
)

# Dataset size = sum of the sizes of its files. The left join keeps datasets
# that have no file rows; their dataset_size is NULL.
d_size_df = (
    dbs_datasets.join(dbs_files, on='DATASET_ID', how='left')
    .groupBy('d_dataset')
    .agg(_sum('f_file_size').alias('dataset_size'))
)

# Block name -> block size, read directly from the BLOCKS dump.
b_size_df = (
    csvreader.schema(cms_schemas.schema_blocks())
    .load(HDFS_DFS_BLOCKS)
    .select('b_block_name', 'b_block_size')
    .withColumnRenamed('b_block_size', 'block_size')
)

In [9]:
# Check the columns kept from the `data` struct and their types.
raw_df.printSchema()

root
 |-- RecordTime: long (nullable = true)
 |-- DESIRED_CMSDataset: string (nullable = true)
 |-- GlobalJobId: string (nullable = true)
 |-- CMS_SubmissionTool: string (nullable = true)
 |-- CRAB_DataBlock: string (nullable = true)
 |-- CMSPrimaryDataTier: string (nullable = true)
 |-- CRAB_Workflow: string (nullable = true)



In [11]:
# Check the dataset-size table: dataset name and summed file size.
d_size_df.printSchema()

root
 |-- d_dataset: string (nullable = true)
 |-- dataset_size: double (nullable = true)



In [12]:
# Check the block-size table: block name and block size.
b_size_df.printSchema()

root
 |-- b_block_name: string (nullable = true)
 |-- block_size: double (nullable = true)



### Table1: Dataset size of datasets that appear in raw_df
raw_df.DESIRED_CMSDataset == d_size_df.d_dataset AND drop_duplicates('DESIRED_CMSDataset')

In [16]:
# Distinct datasets accessed within the selected time range, regardless of
# which user or job accessed them (one row per DESIRED_CMSDataset).
datasets = (
    raw_df
    .select('CMSPrimaryDataTier', 'DESIRED_CMSDataset')
    .dropDuplicates(['DESIRED_CMSDataset'])
)

In [23]:
# Summary statistics of the distinct-dataset table (row counts, min/max values).
datasets.summary().show()

+-------+------------------+--------------------+
|summary|CMSPrimaryDataTier|  DESIRED_CMSDataset|
+-------+------------------+--------------------+
|  count|             12549|               12548|
|   mean|              null|                null|
| stddev|              null|                null|
|    min|          ALCARECO|/ADDGravToGG_NegI...|
|    25%|              null|                null|
|    50%|              null|                null|
|    75%|              null|                null|
|    max|           Unknown|/ttbb_4FS_ckm_amc...|
+-------+------------------+--------------------+



In [21]:
# Attach the total size to every accessed dataset. The inner join drops
# datasets whose name has no match in d_size_df.
dateset_size = (
    datasets
    .join(d_size_df, on=datasets.DESIRED_CMSDataset == d_size_df.d_dataset, how='inner')
    .select('CMSPrimaryDataTier', 'DESIRED_CMSDataset', 'dataset_size')
)

In [None]:
# Preview 10 rows; truncate=False keeps the long dataset names readable.
dateset_size.show(10, False)

In [24]:
# Summary statistics of the dataset sizes (count, mean, quartiles, min/max).
dateset_size.summary().show()

+-------+------------------+--------------------+--------------------+
|summary|CMSPrimaryDataTier|  DESIRED_CMSDataset|        dataset_size|
+-------+------------------+--------------------+--------------------+
|  count|             10819|               10819|               10819|
|   mean|              null|                null|2.308864295416406...|
| stddev|              null|                null|1.350402476414188E13|
|    min|          ALCARECO|/ADDGravToGG_NegI...|         7.7320687E7|
|    25%|              null|                null|       6.290462739E9|
|    50%|              null|                null|     2.5367147132E10|
|    75%|              null|                null|    4.99981753797E11|
|    max|              USER|/ttbb_4FS_ckm_amc...| 5.16713993514068E14|
+-------+------------------+--------------------+--------------------+



In [None]:
from pyspark.sql.functions import col,isnan,when,count
# For every column, count the values that contain a textual null marker
# ('None', 'NULL', 'null'), are empty, are NULL, or are NaN.
a = dateset_size.select(
    [
        count(
            when(
                col(c).contains('None')
                | col(c).contains('NULL')
                | col(c).contains('null')
                | (col(c) == '')
                | col(c).isNull()
                | isnan(c),
                c,
            )
        ).alias(c)
        for c in dateset_size.columns
    ]
)

a.show()

### Table2: Block size of blocks that appear in raw_df
raw_df.CRAB_DataBlock == b_size_df.b_block_name AND drop_duplicates('CRAB_DataBlock')


In [38]:
# blocks accessed within the specific time range, without considering different users/jobs/etc.
from pyspark.sql import Window
# One row per accessed block, with the first and last RecordTime at which it
# was accessed inside the selected window. A single aggregation replaces the
# previous "window min/max on every row, then drop duplicates" pipeline: it
# yields the same first_access / last_access per block without computing the
# window values for rows that were then discarded.
# DESIRED_CMSDataset and CRAB_Workflow are taken from an arbitrary record of
# the block, as they were when duplicates were dropped on CRAB_DataBlock.
blocks = (
    raw_df
    .groupBy('CRAB_DataBlock')
    .agg(
        first('DESIRED_CMSDataset').alias('DESIRED_CMSDataset'),
        first('CRAB_Workflow').alias('CRAB_Workflow'),
        _min('RecordTime').alias('first_access'),
        _max('RecordTime').alias('last_access'),
    )
    # Restore the column order used by the downstream cells.
    .select('DESIRED_CMSDataset', 'CRAB_DataBlock', 'CRAB_Workflow', 'first_access', 'last_access')
)

In [40]:
# Summary statistics of the per-block table, including the access-time range.
blocks.summary().show()

+-------+--------------------+--------------------+--------------------+--------------------+--------------------+
|summary|  DESIRED_CMSDataset|      CRAB_DataBlock|       CRAB_Workflow|        first_access|         last_access|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  count|               10485|               10486|               10487|               10487|               10487|
|   mean|                null|                null|                null|1.651402187300848...|1.651415701606751...|
| stddev|                null|                null|                null|2.2869054699137878E7| 1.897668727966448E7|
|    min|/BPlusToJpsiK_pTh...|/BPlusToJpsiK_pTh...|200226_221614:esc...|       1651356000000|       1651356001000|
|    25%|                null|                null|                null|       1651397762000|       1651403571000|
|    50%|                null|                null|                null|       1

In [41]:
# Attach the size to every accessed block. The inner join drops blocks whose
# name has no match in b_size_df.
block_size = (
    blocks
    .join(b_size_df, on=blocks.CRAB_DataBlock == b_size_df.b_block_name, how='inner')
    .select(
        'DESIRED_CMSDataset',
        'CRAB_DataBlock',
        'block_size',
        'CRAB_Workflow',
        'first_access',
        'last_access',
    )
)

In [42]:
# Summary statistics of the block sizes and access times.
block_size.summary().show()

+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|summary|  DESIRED_CMSDataset|      CRAB_DataBlock|          block_size|       CRAB_Workflow|        first_access|         last_access|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  count|                9972|                9972|                9972|                9972|                9972|                9972|
|   mean|                null|                null|4.863553386768151E10|                null|1.651401724591055E12|1.651415570126654...|
| stddev|                null|                null|1.914416664348531...|                null| 2.354649470811847E7| 1.949631900440551E7|
|    min|/BPlusToJpsiK_pTh...|/BPlusToJpsiK_pTh...|           2964934.0|201020_093216:anm...|       1651356000000|       1651356001000|
|    25%|                null|                nu

### Table3: Join the dataset and block tables

In [46]:
# Combine both tables: every accessed block next to the size of the dataset
# it belongs to, largest datasets first. Joining on the shared column name
# keeps a single DESIRED_CMSDataset column in the result.
df = (
    dateset_size
    .join(block_size, on='DESIRED_CMSDataset', how='inner')
    .select(
        'CMSPrimaryDataTier',
        'DESIRED_CMSDataset',
        'dataset_size',
        'CRAB_DataBlock',
        'block_size',
        'CRAB_Workflow',
        'first_access',
        'last_access',
    )
    .orderBy('dataset_size', ascending=False)
)

In [48]:
# Show the 40 largest-dataset rows; truncate=False prints full names.
df.show(40,False)

+------------------+------------------------------------------------------------------------------------------------------------------+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+---------------------------------------------------------------------------------------------------------+-------------+-------------+
|CMSPrimaryDataTier|DESIRED_CMSDataset                                                                                                |dataset_size       |CRAB_DataBlock                                                                                                                                         |block_size       |CRAB_Workflow                                                                                            |first_access |last_access  |
+------------------+------------------------------------------------------------