written by Nutchaya Phumekham, Aug 2022

## CRAB active dataset sizes per month for 2022/01 - 2022/06
The dataset sizes are saved in the following files, one per month:
- hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_1.parquet
- hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_2.parquet
- hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_3.parquet
- hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_4.parquet
- hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_5.parquet
- hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_6.parquet

In [1]:
from utils import (
    _to_dict,
    _donut,
    _pie,
    _line_graph,
    _other_fields,
    _exitcode_info
)
from datetime import datetime, date, timedelta
from pyspark.sql.functions import (
    col,
    lit,
    when,
    sum as _sum,
    count as _count,
    first,
    date_format,
    from_unixtime,
    to_date,
    countDistinct
)
import numpy as np
import pandas as pd
from pyspark.sql.types import (
    StructType,
    LongType,
    StringType,
    StructField,
    DoubleType,
    IntegerType,
)
import matplotlib.pyplot as plt

In [None]:
# !git clone https://github.com/dmwm/CMSSpark.git

In [2]:
def _get_schema():
    """Return the Spark schema applied when reading the condor metric JSON.

    Only the nested ``data`` struct is declared, restricted to the four
    attributes this notebook needs; ``RecordTime`` and ``GlobalJobId`` are
    declared non-nullable, the other two nullable.
    """
    data_fields = [
        StructField("RecordTime", LongType(), nullable=False),
        StructField("DESIRED_CMSDataset", StringType(), nullable=True),
        StructField("GlobalJobId", StringType(), nullable=False),
        StructField("CMS_SubmissionTool", StringType(), nullable=True),
    ]
    data_struct = StructType(data_fields)
    return StructType([StructField("data", data_struct)])

In [3]:
def get_candidate_files(start_date, end_date, spark, base):
    """Return the existing per-day HDFS folders covering a date window.

    The window is widened by three days on each side of
    ``[start_date, end_date]``; one path ``<base>/YYYY/MM/DD`` is generated
    for every day from ``start_date - 3 days`` up to, but not including,
    ``end_date + 3 days``. Paths for which HDFS ``globStatus`` reports no
    match are dropped.

    Args:
        start_date: first day of interest (``datetime``/``date``).
        end_date: last day of interest (``datetime``/``date``).
        spark: active SparkSession; its JVM gateway is used to query HDFS.
        base: HDFS folder that contains the ``YYYY/MM/DD`` sub-folders.

    Returns:
        List of path strings, in chronological order, that exist on HDFS.
    """
    # NOTE(review): the 3-day padding presumably catches records stored in a
    # neighbouring day's folder; rows are filtered on RecordTime by the caller.
    first_day = start_date - timedelta(days=3)
    last_day = end_date + timedelta(days=3)
    day_paths = [
        f"{base}/{(first_day + timedelta(days=offset)).strftime('%Y/%m/%d')}"
        for offset in range((last_day - first_day).days)
    ]

    # Reach the Hadoop FileSystem API through the py4j gateway of the context.
    sc = spark.sparkContext
    jvm = sc._gateway.jvm
    hadoop_fs = jvm.org.apache.hadoop.fs
    fs = hadoop_fs.FileSystem.get(
        jvm.java.net.URI("hdfs:///"), sc._jsc.hadoopConfiguration()
    )

    # globStatus yields a falsy result for paths that do not exist.
    return [path for path in day_paths if fs.globStatus(hadoop_fs.Path(path))]


In [4]:
# HDFS root of the raw condor job metrics; day folders live under <root>/YYYY/MM/DD.
_DEFAULT_HDFS_FOLDER = "/project/monitoring/archive/condor/raw/metric"
# DBS dumps (read as CSV below): one row per file / one row per dataset.
HDFS_DBS_FILES = "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/FILES/part-m-00000"
HDFS_DBS_DATASETS = "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/DATASETS/part-m-00000"

In [5]:
from CMSSpark.src.python.CMSSpark import schemas as cms_schemas
# CSV reader shared by both DBS dumps: the literal 'null' is read as NULL and
# any malformed row aborts the load (FAILFAST).
csvreader = (
    spark.read.format('csv')
    .option('nullValue', 'null')
    .option('mode', 'FAILFAST')
)

# DBS FILES table, with the logical file name exposed as 'f_name'.
dbs_files = (
    csvreader.schema(cms_schemas.schema_files())
    .load(HDFS_DBS_FILES)
    .withColumnRenamed('f_logical_file_name', 'f_name')
)

# DBS DATASETS table reduced to the id -> dataset name mapping.
dbs_datasets = (
    csvreader.schema(cms_schemas.schema_datasets())
    .load(HDFS_DBS_DATASETS)
    .select('d_dataset_id', 'd_dataset')
)

# Attach the dataset name to every file (left join keeps files without a match).
df_dbs_f_d = (
    dbs_files.join(
        dbs_datasets,
        dbs_files['f_dataset_id'] == dbs_datasets['d_dataset_id'],
        how='left',
    )
    .withColumnRenamed('f_dataset_id', 'dataset_id')
    .withColumnRenamed('d_dataset', 'dataset')
    .select('f_name', 'dataset', 'f_file_size')
)

# Total size per dataset = sum of the sizes of its files, largest first.
d_size_df = (
    df_dbs_f_d.groupBy('dataset')
    .agg(_sum(col('f_file_size')).alias('sum_file_size'))
    .orderBy(col('sum_file_size').desc())
)

In [7]:
# Export, for each month 2022/01 .. 2022/06, the CRAB jobs (one row per
# GlobalJobId) together with the total size of the dataset they requested.
schema = _get_schema()  # identical for every month, so build it once
for i in range(1, 7):
    month = str(i)
    # Half-open month window [first day of month i, first day of month i+1).
    # The month is the second argument of datetime(); the previous
    # datetime(2022, 1, i) walked over the first days of January instead.
    start_date = datetime(2022, i, 1)
    end_date = datetime(2022, i + 1, 1)
    # RecordTime is compared against epoch milliseconds. The datetimes are
    # naive, so timestamp() interprets them in the driver's local timezone.
    start_ms = int(start_date.timestamp() * 1000)
    end_ms = int(end_date.timestamp() * 1000)
    crab_df = (
        spark.read.option("basePath", _DEFAULT_HDFS_FOLDER)
        .json(
            get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER),
            schema=schema,
        ).select("data.*")
        .filter(
            f"""CMS_SubmissionTool == 'CRAB'
          AND RecordTime >= {start_ms}
          AND RecordTime < {end_ms}
          """
        )
        # Keep a single record per job.
        .drop_duplicates(["GlobalJobId"])
    )
    # Left join so that jobs whose dataset is unknown to DBS are kept with a
    # null size; jobs without any requested dataset are dropped.
    crab_dataset_size = crab_df.join(d_size_df, crab_df.DESIRED_CMSDataset == d_size_df.dataset, how='left')\
        .select(['DESIRED_CMSDataset', 'sum_file_size'])\
        .filter(col('DESIRED_CMSDataset').isNotNull())\
        .orderBy(col('sum_file_size').desc())
    output_path = "hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_%s.parquet" % (month)
    crab_dataset_size.write.parquet(output_path)
    print(output_path)

hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_1.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_2.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_3.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_4.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_5.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_6.parquet


### try reading from "hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_1.parquet"

In [11]:
df = spark.read.parquet("hdfs://analytix/cms/users/nphumekh/crab_dataset_size_2022_1.parquet")

In [15]:
df.show(40, False)

+----------------------------------------------------------------------------------------------------------------------------+-------------------+
|DESIRED_CMSDataset                                                                                                          |sum_file_size      |
+----------------------------------------------------------------------------------------------------------------------------+-------------------+
|/DYToLL_M-50_TuneCP5_14TeV-pythia8/Run3Winter21DRMiniAOD-FlatPU30to80FEVT_112X_mcRun3_2021_realistic_v16-v2/GEN-SIM-DIGI-RAW|5.16713993514068E14|
|/DYToLL_M-50_TuneCP5_14TeV-pythia8/Run3Winter21DRMiniAOD-FlatPU30to80FEVT_112X_mcRun3_2021_realistic_v16-v2/GEN-SIM-DIGI-RAW|5.16713993514068E14|
|/DYToLL_M-50_TuneCP5_14TeV-pythia8/Run3Winter21DRMiniAOD-FlatPU30to80FEVT_112X_mcRun3_2021_realistic_v16-v2/GEN-SIM-DIGI-RAW|5.16713993514068E14|
|/DYToLL_M-50_TuneCP5_14TeV-pythia8/Run3Winter21DRMiniAOD-FlatPU30to80FEVT_112X_mcRun3_2021_realistic_v16-v2/GEN-SIM-D