written by: Nutchaya Phumekham, Aug 2022

# DataFrame of CMS Dataset, DataBlock, and their sizes

This notebook shows how to construct a DataFrame of CMS datasets, CRAB data blocks, and their sizes from job records that satisfy:
- CMS_SubmissionTool == 'CRAB'
- CMSPrimaryDataTier != 'Unknown'
- CRAB_DataBlock IS NOT NULL

Information about when each dataset and data block appears in the job records can be used to determine its first and last access by users. Analysts can then look at the corresponding sizes to decide how long these datasets should be kept.

In [1]:
from datetime import datetime, date, timedelta
from pyspark.sql.functions import (
    col,
    lit,
    when,
    sum as _sum,
    max as _max,
    min as _min,
    count as _count,
    first,
    date_format,
    from_unixtime,
    to_date,
    countDistinct
)
import numpy as np
import pandas as pd
from pyspark.sql.types import (
    StructType,
    LongType,
    StringType,
    StructField,
    DoubleType,
    IntegerType,
)
import matplotlib.pyplot as plt
from pyspark.sql.window import Window

In [2]:
def _get_schema():
    """Build the schema applied when the raw JSON records are loaded.

    Every attribute of interest is nested under a single top-level ``data``
    struct; only the fields listed below are declared.

    Returns:
        StructType: a schema with one ``data`` struct column.
    """
    # (field name, Spark type, nullable) for each attribute kept from ``data``.
    field_specs = (
        ("RecordTime", LongType(), False),
        ("DESIRED_CMSDataset", StringType(), True),
        ("GlobalJobId", StringType(), False),
        ("CMS_SubmissionTool", StringType(), True),
        ("CRAB_DataBlock", StringType(), True),
        ("CMSPrimaryDataTier", StringType(), True),
        ("CRAB_Workflow", StringType(), True),
    )
    data_fields = [
        StructField(field_name, field_type, nullable=is_nullable)
        for field_name, field_type, is_nullable in field_specs
    ]
    return StructType([StructField("data", StructType(data_fields))])

In [3]:
def get_candidate_files(start_date, end_date, spark, base):
    """List the per-day folders under ``base`` that exist on HDFS for a date range.

    The requested range is widened by three days on each side before the
    ``{base}/YYYY/MM/DD`` folder names are generated. Only folders for which
    HDFS reports a match are returned.

    Args:
        start_date (datetime): beginning of the period of interest.
        end_date (datetime): end of the period of interest.
        spark: Spark session; its ``sparkContext`` JVM gateway is used to
            query the Hadoop file system.
        base (str): root folder that contains the ``YYYY/MM/DD`` sub-folders.

    Returns:
        list[str]: existing day folders in chronological order, covering
        ``start_date - 3 days`` up to and including ``end_date + 2 days``.
    """
    # Pad the window on both sides.
    # NOTE(review): presumably because records can be archived in a folder a
    # few days away from their RecordTime - confirm with the data owners.
    st_date = start_date - timedelta(days=3)
    ed_date = end_date + timedelta(days=3)
    days = (ed_date - st_date).days

    # One candidate folder per day of the padded, half-open window. Only the
    # plain day folders are considered; no ".tmp" variants are generated.
    candidate_files = [
        f"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}"
        for i in range(days)
    ]

    # Reach the Hadoop FileSystem API through the py4j gateway of the context.
    sc = spark.sparkContext
    jvm = sc._gateway.jvm
    hadoop_fs = jvm.org.apache.hadoop.fs
    fs = hadoop_fs.FileSystem.get(
        jvm.java.net.URI("hdfs:///"), sc._jsc.hadoopConfiguration()
    )

    # globStatus yields a falsy result (null / empty array) for a missing path.
    return [url for url in candidate_files if fs.globStatus(hadoop_fs.Path(url))]


In [4]:
# Period of interest as naive datetimes. The raw_df filter keeps records with
# start_date <= RecordTime < end_date.
start_date = datetime(year=2022, month=5, day=1)
end_date = datetime(year=2022, month=5, day=2)

# Schema used to parse the raw JSON records.
schema = _get_schema()

In [5]:
# Root HDFS folder of the raw records; it is passed as `base` to
# get_candidate_files and as the Spark reader's "basePath".
_DEFAULT_HDFS_FOLDER = "/project/monitoring/archive/condor/raw/metric"

In [6]:
# Preview the day folders that will be read for the configured date range.
get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)

['/project/monitoring/archive/condor/raw/metric/2022/04/28',
 '/project/monitoring/archive/condor/raw/metric/2022/04/29',
 '/project/monitoring/archive/condor/raw/metric/2022/04/30',
 '/project/monitoring/archive/condor/raw/metric/2022/05/01',
 '/project/monitoring/archive/condor/raw/metric/2022/05/02',
 '/project/monitoring/archive/condor/raw/metric/2022/05/03',
 '/project/monitoring/archive/condor/raw/metric/2022/05/04']

### Raw DF is filtered for CMS_SubmissionTool == 'CRAB' AND CMSPrimaryDataTier != 'Unknown' AND CRAB_DataBlock IS NOT NULL
- get the data from _DEFAULT_HDFS_FOLDER = "/project/monitoring/archive/condor/raw/metric"
- start_date = datetime(2022, 5, 1)
- end_date = datetime(2022, 5, 2)

In [7]:
# Day folders to load for the configured date range.
input_paths = get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)

# Row selection: CRAB jobs with a known data tier and a data block, whose
# RecordTime falls in [start_date, end_date). `timestamp() * 1000` converts the
# bounds from seconds to milliseconds.
selection = f"""CMS_SubmissionTool == 'CRAB'
          AND CMSPrimaryDataTier != 'Unknown'
          AND CRAB_DataBlock IS NOT NULL
          AND RecordTime >= {start_date.timestamp() * 1000}
          AND RecordTime < {end_date.timestamp() * 1000}
          """

raw_df = (
    spark.read.option("basePath", _DEFAULT_HDFS_FOLDER)
    .json(input_paths, schema=schema)
    .select("data.*")  # flatten the nested "data" struct into top-level columns
    .filter(selection)
    .drop_duplicates(["GlobalJobId"])  # keep a single row per job id
)

### dataset size and block size

In [8]:
# HDFS locations of the DBS table dumps (FILES, DATASETS, BLOCKS) used to
# compute dataset and block sizes.
HDFS_DBS_FILES = '/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/FILES/part-m-00000'
HDFS_DBS_DATASETS = '/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/DATASETS/part-m-00000'
HDFS_DBS_BLOCKS = '/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/BLOCKS/part-m-00000'

# Backward-compatible alias: the constant was originally spelled "DFS" and
# other cells still refer to it under that name.
HDFS_DFS_BLOCKS = HDFS_DBS_BLOCKS

In [10]:
from CMSSpark.src.python.CMSSpark import schemas as cms_schemas

# Shared CSV reader: the literal string 'null' is read as NULL, and FAILFAST
# aborts the read on a malformed row.
csvreader = (
    spark.read.format('csv')
    .option('nullValue', 'null')
    .option('mode', 'FAILFAST')
)

# FILES table: file size together with the block and dataset ids.
dbs_files = (
    csvreader.schema(cms_schemas.schema_files())
    .load(HDFS_DBS_FILES)
    .select(['f_file_size', 'f_block_id', 'f_dataset_id'])
    .withColumnRenamed('f_block_id', 'BLOCK_ID')
    .withColumnRenamed('f_dataset_id', 'DATASET_ID')
)

# DATASETS table: dataset id and dataset name.
dbs_datasets = (
    csvreader.schema(cms_schemas.schema_datasets())
    .load(HDFS_DBS_DATASETS)
    .select(['d_dataset_id', 'd_dataset'])
    .withColumnRenamed('d_dataset_id', 'DATASET_ID')
)

# Dataset size = sum of the sizes of its files. The left join keeps datasets
# that have no file rows (their Dataset_Size is NULL).
d_size_df = (
    dbs_datasets.join(dbs_files, ['DATASET_ID'], how='left')
    .groupby('d_dataset')
    .agg(_sum('f_file_size').alias('Dataset_Size'))
)

# BLOCKS table: block name and the block size taken from b_block_size.
b_size_df = (
    csvreader.schema(cms_schemas.schema_blocks())
    .load(HDFS_DFS_BLOCKS)
    .select(['b_block_name', 'b_block_size'])
    .withColumnRenamed('b_block_size', 'Block_Size')
)

In [11]:
# Inspect the columns of the filtered job records.
raw_df.printSchema()

root
 |-- RecordTime: long (nullable = true)
 |-- DESIRED_CMSDataset: string (nullable = true)
 |-- GlobalJobId: string (nullable = true)
 |-- CMS_SubmissionTool: string (nullable = true)
 |-- CRAB_DataBlock: string (nullable = true)
 |-- CMSPrimaryDataTier: string (nullable = true)
 |-- CRAB_Workflow: string (nullable = true)



In [12]:
# Inspect the dataset-size table (dataset name, summed file size).
d_size_df.printSchema()

root
 |-- d_dataset: string (nullable = true)
 |-- Dataset_Size: double (nullable = true)



In [13]:
# Inspect the block-size table (block name, block size).
b_size_df.printSchema()

root
 |-- b_block_name: string (nullable = true)
 |-- Block_Size: double (nullable = true)



### Table1: Dataset size of datasets that appear in raw_df
- datasets accessed within a specific time range, regardless of which user, job, etc. accessed them
- raw_df.DESIRED_CMSDataset == d_size_df.d_dataset AND drop_duplicates('DESIRED_CMSDataset')

In [14]:
# One row per dataset name seen in raw_df, together with its data tier.
datasets = (
    raw_df
    .select(['CMSPrimaryDataTier', 'DESIRED_CMSDataset'])
    .drop_duplicates(['DESIRED_CMSDataset'])
)

In [41]:
# Show the first 5 rows without truncating the long dataset names.
datasets.show(n=5, truncate=False)

+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
|CMSPrimaryDataTier|DESIRED_CMSDataset                                                                                                                                    |
+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
|GEN-SIM           |/BPlusToJpsiK_pThat5_Pbp-Embed_8p16TeV_TuneCUETP8M1_Pythia8_EvtGen/pPb816Spring16GS-PbP_80X_mcRun2_pA_v4-v4/GEN-SIM                                   |
|MINIAODSIM        |/BdToKStarMuMu_TagFilter_SoftQCDnonD_TuneCP5_13TeV-pythia8-evtgen/RunIIAutumn18MiniAOD-Custom_RK_BParking_102X_upgrade2018_realistic_v15-v1/MINIAODSIM|
|MINIAODSIM        |/BulkGravToWWToWhadWhad_narrow_M-1000_TuneCP5_13TeV-madgraph-pythia/RunIISummer20UL18MiniAODv2-106X_upgrade2018_realisti

In [15]:
# Summary statistics; the count row is the number of distinct datasets.
datasets.summary().show()

+-------+------------------+--------------------+
|summary|CMSPrimaryDataTier|  DESIRED_CMSDataset|
+-------+------------------+--------------------+
|  count|              1883|                1883|
|   mean|              null|                null|
| stddev|              null|                null|
|    min|               AOD|/BPlusToJpsiK_pTh...|
|    25%|              null|                null|
|    50%|              null|                null|
|    75%|              null|                null|
|    max|              USER|/ttHTobb_M125_Tun...|
+-------+------------------+--------------------+



Join the datasets df with the dataset size df

In [16]:
# Attach the size to every accessed dataset. Datasets without a matching name
# in d_size_df are dropped by the join.
# NOTE: the variable is spelled "dateset_size"; the name is kept because later
# cells refer to it.
dataset_name_match = datasets.DESIRED_CMSDataset == d_size_df.d_dataset
dateset_size = (
    datasets.join(d_size_df, dataset_name_match)
    .select(['CMSPrimaryDataTier', 'DESIRED_CMSDataset', 'Dataset_Size'])
)

In [47]:
# Show the first 10 rows without truncating the long dataset names.
dateset_size.show(n=10, truncate=False)

+------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|CMSPrimaryDataTier|DESIRED_CMSDataset                                                                                                                                     |Dataset_Size     |
+------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|MINIAODSIM        |/BulkGravToWWToWhadWhad_narrow_M-4500_TuneCP5_13TeV-madgraph-pythia/RunIISummer20UL16MiniAODAPV-106X_mcRun2_asymptotic_preVFP_v8-v2/MINIAODSIM         |1.1489723122E10  |
|MINIAODSIM        |/BulkGravToZZToZhadZhad_narrow_M-600_TuneCP5_13TeV-madgraph-pythia/RunIISummer20UL17MiniAODv2-106X_mc2017_realistic_v9-v2/MINIAODSIM                   |1.8172329256E10  |
|MINIAODSIM        |/GluGluToBulkGravitonToHH

In [44]:
# Summary statistics of the datasets that have a size in d_size_df.
dateset_size.summary().show()

+-------+------------------+--------------------+--------------------+
|summary|CMSPrimaryDataTier|  DESIRED_CMSDataset|        Dataset_Size|
+-------+------------------+--------------------+--------------------+
|  count|              1372|                1372|                1372|
|   mean|              null|                null|2.446466468466979...|
| stddev|              null|                null|1.022517745474014...|
|    min|               AOD|/BPlusToJpsiK_pTh...|       2.105073995E9|
|    25%|              null|                null|     1.1234449436E10|
|    50%|              null|                null|     2.3634062538E10|
|    75%|              null|                null|    9.10624135198E11|
|    max|              USER|/ttHTobb_M125_Tun...| 2.01098979361666E14|
+-------+------------------+--------------------+--------------------+



### Table2: Block size of blocks that appear in raw_df
- blocks accessed within a specific time range, regardless of which user, job, etc. accessed them
- raw_df.CRAB_DataBlock == b_size_df.b_block_name AND drop_duplicates('CRAB_DataBlock')


In [17]:
# Window over all job records that touch the same data block.
per_block = Window.partitionBy('CRAB_DataBlock')

# One row per data block with its earliest and latest RecordTime. The
# DESIRED_CMSDataset / CRAB_Workflow values come from whichever row
# drop_duplicates keeps for that block.
blocks = (
    raw_df
    .withColumn("First_Access", _min('RecordTime').over(per_block))
    .withColumn("Last_Access", _max('RecordTime').over(per_block))
    .select(['DESIRED_CMSDataset', 'CRAB_DataBlock', 'CRAB_Workflow', 'First_Access', 'Last_Access'])
    .drop_duplicates(['CRAB_DataBlock'])
)

In [49]:
# Summary statistics; the count row is the number of distinct data blocks.
blocks.summary().show()

+-------+--------------------+--------------------+--------------------+--------------------+--------------------+
|summary|  DESIRED_CMSDataset|      CRAB_DataBlock|       CRAB_Workflow|        First_Access|         Last_Access|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  count|               10484|               10484|               10484|               10484|               10484|
|   mean|                null|                null|                null|1.651402028118847...|1.651415617514689E12|
| stddev|                null|                null|                null|2.3080349578355484E7|1.9068649607181933E7|
|    min|/BPlusToJpsiK_pTh...|/BPlusToJpsiK_pTh...|201020_093216:anm...|       1651356000000|       1651356001000|
|    25%|                null|                null|                null|       1651397984000|       1651403521000|
|    50%|                null|                null|                null|       1

Join the blocks df with the block size df

In [18]:
# Attach the block size to every accessed block. Blocks without a matching
# name in b_size_df are dropped by the join.
block_name_match = blocks.CRAB_DataBlock == b_size_df.b_block_name
block_size = (
    blocks.join(b_size_df, block_name_match)
    .select([
        'DESIRED_CMSDataset',
        'CRAB_DataBlock',
        'Block_Size',
        'CRAB_Workflow',
        'First_Access',
        'Last_Access',
    ])
)

In [42]:
# Summary statistics of the blocks that have a size in b_size_df.
block_size.summary().show()

+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|summary|  DESIRED_CMSDataset|      CRAB_DataBlock|          block_size|       CRAB_Workflow|        first_access|         last_access|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  count|                9972|                9972|                9972|                9972|                9972|                9972|
|   mean|                null|                null|4.863553386768151E10|                null|1.651401724591055E12|1.651415570126654...|
| stddev|                null|                null|1.914416664348531...|                null| 2.354649470811847E7| 1.949631900440551E7|
|    min|/BPlusToJpsiK_pTh...|/BPlusToJpsiK_pTh...|           2964934.0|201020_093216:anm...|       1651356000000|       1651356001000|
|    25%|                null|                nu

### Table3: Join the dataset and block df

In [19]:
# Combine the dataset-level and block-level tables, one row per data block.
# Both inputs derive from raw_df, so the join is expressed on the shared column
# name rather than on `left.col == right.col`: this avoids ambiguous self-join
# column references and leaves a single DESIRED_CMSDataset column in the result.
df = (
    dateset_size.join(block_size, on='DESIRED_CMSDataset')
    .select([
        'CMSPrimaryDataTier',
        'DESIRED_CMSDataset',
        'Dataset_Size',
        'CRAB_DataBlock',
        'Block_Size',
        'CRAB_Workflow',
        'First_Access',
        'Last_Access',
    ])
    .orderBy(col('Dataset_Size').desc())  # largest datasets first
)

In [20]:
# Summary statistics of the combined dataset/block table.
df.summary().show()

+-------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|summary|CMSPrimaryDataTier|  DESIRED_CMSDataset|        Dataset_Size|      CRAB_DataBlock|          Block_Size|       CRAB_Workflow|        First_Access|         Last_Access|
+-------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  count|              9972|                9972|                9972|                9972|                9972|                9972|                9972|                9972|
|   mean|              null|                null|4.435160342171028E12|                null|4.863553386768151E10|                null|1.651401853364721...|1.651415590893802...|
| stddev|              null|                null|1.347145144039950...|                null|1.914416664348531...|        

In [21]:
# Show the 30 rows with the largest Dataset_Size, without truncating long names.
df.show(n=30, truncate=False)

+------------------+------------------------------------------------------------------------------------------------------------------+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+---------------------------------------------------------------------------+-------------+-------------+
|CMSPrimaryDataTier|DESIRED_CMSDataset                                                                                                |Dataset_Size       |CRAB_DataBlock                                                                                                                                         |Block_Size       |CRAB_Workflow                                                              |First_Access |Last_Access  |
+------------------+------------------------------------------------------------------------------------------------------------------+-----

In [23]:
# Convert an epoch timestamp given in milliseconds into a readable local time.
epoch_ms = 1651440241000
date_time = datetime.fromtimestamp(epoch_ms / 1000)
formatted_time = date_time.strftime('%Y-%m-%d %H:%M:%S')
print("Date & Time =>", formatted_time)

Date & Time => 2022-05-01 23:24:01


In [None]:
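# Date & Time => 2022-05-01 23:58:28   (appears to be pasted output of the timestamp conversion above for another value; kept as a comment so the cell is valid Python)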