written by: Nutchaya Phumekham, Aug 2022

### Get CMS dataset size and block size
for CRAB data analytics

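This notebook saves the active CRAB datasets and data blocks, together with their sizes, as Parquet files, one file per day. The time range is from 1 January 2022 to 30 June 2022 (6 months). The data can be accessed via the following HDFS path, where `month` and `day` are the unpadded month and day numbers (e.g. `2022_1_1`):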
- hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_month_day.parquet

In [2]:
# !git clone https://github.com/dmwm/CMSSpark.git
import utils as utils
from datetime import datetime, date, timedelta
from pyspark.sql.functions import (
    col,
    lit,
    when,
    sum as _sum,
    count as _count,
    min as _min,
    max as _max,
    first,
    date_format,
    from_unixtime,
    to_date,
    countDistinct
)
import numpy as np
import pandas as pd
from pyspark.sql.types import (
    StructType,
    LongType,
    StringType,
    StructField,
    DoubleType,
    IntegerType,
)
import matplotlib.pyplot as plt


# Use UTC as the Spark SQL session time zone.
spark.conf.set("spark.sql.session.timeZone", "UTC")
from CMSSpark.src.python.CMSSpark import schemas as cms_schemas
from pyspark.sql.window import Window

In [3]:
def _get_schema():
    """Build the Spark schema used when reading the raw JSON job records.

    The schema has a single top-level struct column named ``data`` whose
    nested fields are the job attributes this notebook needs.

    Returns:
        StructType: schema with one ``data`` struct field.
    """
    # (field name, Spark type, nullable) for every attribute kept from "data".
    field_specs = [
        ("RecordTime", LongType(), False),
        ("DESIRED_CMSDataset", StringType(), True),
        ("GlobalJobId", StringType(), False),
        ("CMS_SubmissionTool", StringType(), True),
        ("CRAB_DataBlock", StringType(), True),
        ("CMSPrimaryDataTier", StringType(), True),
        ("CRAB_Workflow", StringType(), True),
    ]
    data_struct = StructType(
        [
            StructField(field_name, field_type, nullable=is_nullable)
            for field_name, field_type, is_nullable in field_specs
        ]
    )
    return StructType([StructField("data", data_struct)])

In [8]:
# Note: this function relies on utils.get_candidate_files to select the input files.
def get_active_dataset_datablock_size_df(start_date, end_date):
    """Return the datasets and data blocks used by CRAB jobs in a time window, with sizes.

    Reads the JSON records returned by ``utils.get_candidate_files`` for the
    window, keeps records submitted with CRAB whose primary data tier is not
    'Unknown', whose data block is not null and whose ``RecordTime`` lies in
    ``[start_date, end_date)``, and keeps one record per ``GlobalJobId``.
    Datasets and blocks found in those records are joined with the
    notebook-level ``d_size_df`` and ``b_size_df`` size tables.

    Relies on the notebook-level names ``spark``, ``schema``,
    ``_DEFAULT_HDFS_FOLDER``, ``d_size_df`` and ``b_size_df``.

    Args:
        start_date: ``datetime`` giving the inclusive start of the window.
        end_date: ``datetime`` giving the exclusive end of the window.

    Returns:
        Spark DataFrame with the columns CMSPrimaryDataTier,
        DESIRED_CMSDataset, Dataset_Size, CRAB_DataBlock, Block_Size,
        CRAB_Workflow, First_Access and Last_Access, ordered by Dataset_Size
        in descending order.
    """
    reader = spark.read.option("basePath", _DEFAULT_HDFS_FOLDER)
    candidate_files = utils.get_candidate_files(start_date, end_date, spark, _DEFAULT_HDFS_FOLDER)

    # RecordTime is compared against the epoch timestamps of the bounds scaled by 1000.
    # NOTE(review): .timestamp() on a naive datetime is interpreted in the driver's
    # local time zone, while the Spark session is set to UTC - confirm the intended
    # day boundaries.
    record_filter = f"""CMS_SubmissionTool == 'CRAB'
          AND CMSPrimaryDataTier != 'Unknown'
          AND CRAB_DataBlock IS NOT NULL
          AND RecordTime >= {start_date.timestamp() * 1000}
          AND RecordTime < {end_date.timestamp() * 1000}
          """
    job_records = (
        reader.json(candidate_files, schema=schema)
        .select("data.*")
        .filter(record_filter)
        .dropDuplicates(["GlobalJobId"])
    )

    # One row per dataset seen in the window, joined with its size.
    used_datasets = job_records.select(
        "CMSPrimaryDataTier", "DESIRED_CMSDataset"
    ).dropDuplicates(["DESIRED_CMSDataset"])
    dataset_sizes = used_datasets.join(
        d_size_df, used_datasets.DESIRED_CMSDataset == d_size_df.d_dataset
    ).select("CMSPrimaryDataTier", "DESIRED_CMSDataset", "Dataset_Size")

    # One row per data block, carrying the earliest and latest RecordTime of
    # the records that reference the block.
    per_block = Window.partitionBy("CRAB_DataBlock")
    used_blocks = (
        job_records
        .withColumn("First_Access", _min("RecordTime").over(per_block))
        .withColumn("Last_Access", _max("RecordTime").over(per_block))
        .select("DESIRED_CMSDataset", "CRAB_DataBlock", "CRAB_Workflow", "First_Access", "Last_Access")
        .dropDuplicates(["CRAB_DataBlock"])
    )
    block_sizes = used_blocks.join(
        b_size_df, used_blocks.CRAB_DataBlock == b_size_df.b_block_name
    ).select(
        "DESIRED_CMSDataset",
        "CRAB_DataBlock",
        "Block_Size",
        "CRAB_Workflow",
        "First_Access",
        "Last_Access",
    )

    # Attach every block to its dataset row; largest datasets first.
    same_dataset = dataset_sizes.DESIRED_CMSDataset == block_sizes.DESIRED_CMSDataset
    return (
        dataset_sizes.join(block_sizes, same_dataset)
        .select(
            dataset_sizes.CMSPrimaryDataTier,
            dataset_sizes.DESIRED_CMSDataset,
            "Dataset_Size",
            "CRAB_DataBlock",
            "Block_Size",
            "CRAB_Workflow",
            "First_Access",
            "Last_Access",
        )
        .orderBy(col("Dataset_Size").desc())
    )

In [21]:
# HDFS folder used as the base path and passed to utils.get_candidate_files
# when reading the raw JSON records.
_DEFAULT_HDFS_FOLDER = "/project/monitoring/archive/condor/raw/metric"
# Schema applied when reading those JSON records (built by _get_schema).
schema = _get_schema()

In [22]:
# Size lookup tables from the utils module. get_active_dataset_datablock_size_df
# joins on their d_dataset / b_block_name columns and selects their
# Dataset_Size / Block_Size columns.
d_size_df = utils.d_size_df(spark)
b_size_df = utils.b_size_df(spark)

In [23]:
# Month of 2022 to export; the export loop writes one file per day of this month.
month = 6

In [None]:
# Export one parquet file per calendar day of `month` (year 2022).
#
# The number of days is derived from the calendar instead of being fixed at 30,
# and each window ends exactly one day after it starts. This covers 28- and
# 31-day months and the December -> January rollover, where day + 1 / month + 1
# arithmetic would either skip the last day or raise ValueError.
output_path_template = "hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_%s_%s.parquet"

first_of_month = datetime(2022, month, 1)
# Adding 32 days always lands in the following month; snapping back to day 1
# gives the first day of that month.
first_of_next_month = (first_of_month + timedelta(days=32)).replace(day=1)
days_in_month = (first_of_next_month - first_of_month).days

for day in range(1, days_in_month + 1):
    start_date = datetime(2022, month, day)
    end_date = start_date + timedelta(days=1)
    output_df = get_active_dataset_datablock_size_df(start_date, end_date)

    # Build the path once so the written file and the progress message agree.
    output_path = output_path_template % (str(month), str(day))
    output_df.write.parquet(output_path)

    print(output_path)

In [1]:
# List the contents of the output HDFS directory.
!hdfs dfs -ls 'hdfs://analytix/cms/users/nphumekh'

22/08/23 02:10:14 WARN ipc.Client: Exception encountered while connecting to the server 
org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error
	at org.apache.hadoop.security.SaslRpcClient.saslConnect(SaslRpcClient.java:376)
	at org.apache.hadoop.ipc.Client$Connection.setupSaslConnection(Client.java:622)
	at org.apache.hadoop.ipc.Client$Connection.access$2300(Client.java:413)
	at org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:822)
	at org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:818)
	at java.base/java.security.AccessController.doPrivileged(Native Method)
	at java.base/javax.security.auth.Subject.doAs(Subject.java:423)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)
	at org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:818)
	at org.apache.hadoop.ipc.Client$Connection.access$3800(Client.

Try reading one of the saved files back to check the result.

In [4]:
# Load the file written for month 1, day 1 of 2022 as a sanity check.
try_df = spark.read.parquet("hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_1_1.parquet")

In [5]:
# Show the column names and types of the saved data.
try_df.printSchema()

root
 |-- CMSPrimaryDataTier: string (nullable = true)
 |-- DESIRED_CMSDataset: string (nullable = true)
 |-- Dataset_Size: double (nullable = true)
 |-- CRAB_DataBlock: string (nullable = true)
 |-- Block_Size: double (nullable = true)
 |-- CRAB_Workflow: string (nullable = true)
 |-- First_Access: long (nullable = true)
 |-- Last_Access: long (nullable = true)



In [6]:
# Preview up to 20 rows of the saved data.
try_df.show(20)

+------------------+--------------------+-------------------+--------------------+-----------------+--------------------+-------------+-------------+
|CMSPrimaryDataTier|  DESIRED_CMSDataset|       Dataset_Size|      CRAB_DataBlock|       Block_Size|       CRAB_Workflow| First_Access|  Last_Access|
+------------------+--------------------+-------------------+--------------------+-----------------+--------------------+-------------+-------------+
|              FEVT|/MinBias_TuneCP5_...|4.05859813189503E14|/MinBias_TuneCP5_...|4.998120004573E12|211230_002539:kbu...|1641012481000|1641056401000|
|               RAW|/SingleMuon/Run20...|3.80660439294442E14|/SingleMuon/Run20...|2.014125582609E12|211209_185647:mkr...|1641037681000|1641037681000|
|           MINIAOD|/ParkingBPH2/Run2...| 1.2872668679638E14|/ParkingBPH2/Run2...|  2.2399017025E11|211210_170625:kta...|1640991602000|1641077281000|
|           MINIAOD|/ParkingBPH5/Run2...|1.28647603163384E14|/ParkingBPH5/Run2...|  4.3339629534E10|