In [None]:
from datetime import datetime, timedelta, timezone
import os
import time

import pandas as pd
from pyspark import SparkContext, StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    current_user,
    col, collect_list, concat_ws, greatest, lit, lower, regexp_extract, when,
    avg as _avg,
    count as _count,
    hex as _hex,
    max as _max,
    min as _min,
    round as _round,
    split as _split,
    sum as _sum,
)
from pyspark.sql.types import (
    StructType,
    LongType,
    StringType,
    StructField,
    DoubleType,
    IntegerType,
)

In [None]:
# Prefer an `osearch` module that is already importable; if there is none,
# put $PWD/../workdir at the front of the module search path and retry.
try:
    import osearch
except ModuleNotFoundError:
    import sys
    sys.path[:0] = [f'{os.getcwd()}/../workdir']
    import osearch

In [None]:
# Reuse the active SparkSession if there is one, otherwise create it
# under the application name 'crab-taskdb'.
spark = (
    SparkSession.builder
    .appName('crab-taskdb')
    .getOrCreate()
)
spark

In [None]:
# Clear any cached data left over from earlier work in this notebook.
# It is safe to run this every time the cronjob starts.
spark.catalog.clearCache()

In [None]:
# Path to the OpenSearch secret file (overridable through OPENSEARCH_SECRET_PATH);
# fail fast when the file is missing.
secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')
if not os.path.isfile(secretpath):
    # FileNotFoundError is a subclass of Exception, so handlers that catch
    # Exception keep working.
    raise FileNotFoundError(f'OS secrets file {secretpath} does not exist')
# if PROD, the index prefix will be `crab-prod-*`, otherwise `crab-test-*`
PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')
# START_DATE / END_DATE from the environment, in strptime("%Y-%m-%d") format;
# None when the variable is not set.
START = os.environ.get('START_DATE')
END = os.environ.get('END_DATE')

In [None]:
# Defaults for running the notebook manually: edit the start/end date here.
START_DATE = "2020-01-01"
END_DATE = "2024-10-01"
# A cronjob passes both dates through the environment; they replace the
# defaults only when both are provided.
if START and END:
    START_DATE, END_DATE = START, END

In [None]:
# Base index name; it is prefixed with `crab-prod-` for production runs and
# with `crab-test-` for everything else.
index_name = 'taskdb'
index_name = f'crab-prod-{index_name}' if PROD else f'crab-test-{index_name}'

In [None]:
def _utc_midnight(day):
    """Parse a ``%Y-%m-%d`` string into a timezone-aware (UTC) datetime."""
    return datetime.strptime(day, "%Y-%m-%d").replace(tzinfo=timezone.utc)


# Boundaries of the requested period as UTC datetimes
start_datetime = _utc_midnight(START_DATE)
end_datetime = _utc_midnight(END_DATE)
# Refuse an inverted range (equal dates are accepted)
if start_datetime > end_datetime:
    raise Exception(f"end date ({END_DATE}) is less than start date ({START_DATE})")
# Same boundaries as epoch milliseconds
start_epochmilis = int(start_datetime.timestamp()) * 1000
end_epochmilis = int(end_datetime.timestamp()) * 1000
# Epoch seconds of the day before the end date
yesterday_epoch = int((end_datetime - timedelta(days=1)).timestamp())

In [None]:
# debug: show the effective date range and target index, one value per line
print(f'{START_DATE}\n{END_DATE}\n{index_name}')

In [None]:
# This code block and the following one are copied from Panos's script.
# https://gitlab.cern.ch/cmsdmops/cmsdmops/-/blob/8da699db49097d7a58440e6058f022c3f93992e2/monitoring/kubernetes/src/rucio_activity_account_usage.py
# see more in https://github.com/dmwm/CRABServer/issues/7798#issuecomment-2389265249
def get_df_rses(spark):
    """Get Spark dataframe of RSES

    Reads today's Rucio ``rses`` avro dump from HDFS, keeps the rows whose
    ``DELETED_AT`` is null and derives a few helper columns.

    Args:
        spark: SparkSession used to read the avro files.

    Returns:
        DataFrame with the columns ``rse_id`` (lower-cased hex of ``ID``),
        ``RSE``, ``RSE_TYPE``, ``rse_tier`` and ``rse_country`` (first and
        second ``_``-separated token of ``RSE``) and ``rse_kind``
        (``temp``, ``test`` or ``prod``, decided by the suffix of ``RSE``).
    """
    today = datetime.today().strftime('%Y-%m-%d')
    hdfs_rses_path = f'/project/awg/cms/rucio/{today}/rses/part*.avro'

    rse_name = col('RSE')
    # `_split` is pyspark.sql.functions.split and must be imported at the top
    # of the notebook together with the other function aliases.
    rse_name_parts = _split(rse_name, '_')
    # Only these three exact spellings of each suffix are recognised.
    is_temp = rse_name.endswith('Temp') | rse_name.endswith('temp') | rse_name.endswith('TEMP')
    is_test = rse_name.endswith('Test') | rse_name.endswith('test') | rse_name.endswith('TEST')

    df_rses = (
        spark.read.format("avro").load(hdfs_rses_path)
        .filter(col('DELETED_AT').isNull())
        .withColumn('rse_id', lower(_hex(col('ID'))))
        .withColumn('rse_tier', rse_name_parts.getItem(0))
        .withColumn('rse_country', rse_name_parts.getItem(1))
        .withColumn('rse_kind', when(is_temp, 'temp').when(is_test, 'test').otherwise('prod'))
        .select(['rse_id', 'RSE', 'RSE_TYPE', 'rse_tier', 'rse_country', 'rse_kind'])
    )
    return df_rses
def get_df_locks(spark):
    """Get Spark dataframe of Locks

    Reads today's Rucio ``locks`` avro dump and keeps the rows whose ``SCOPE``
    is ``cms`` and whose ``STATE`` is ``O`` or ``R``.

    Returns a dataframe with the columns ``rse_id``, ``f_name``, ``f_size``,
    ``r_id`` and ``account_name``.
    """
    snapshot_day = datetime.today().strftime('%Y-%m-%d')
    raw_locks = spark.read.format('avro').load(f'/project/awg/cms/rucio/{snapshot_day}/locks/part*.avro')
    cms_locks = (
        raw_locks
        .filter(col('SCOPE') == 'cms')
        .filter(col('STATE').isin(['O', 'R']))
    )
    # Hex-encode the binary ids and switch to the short column names used downstream
    renamed_locks = (
        cms_locks
        .withColumn('rse_id', lower(_hex(col('RSE_ID'))))
        .withColumnRenamed('NAME', 'f_name')
        .withColumnRenamed('ACCOUNT', 'account_name')
        .withColumnRenamed('BYTES', 'f_size')
        .withColumn('r_id', lower(_hex(col('RULE_ID'))))
    )
    return renamed_locks.select(['rse_id', 'f_name', 'f_size', 'r_id', 'account_name'])
def get_df_accounts(spark):
    """Get Spark dataframe of Accounts

    Reads today's Rucio ``accounts`` avro dump, keeps the rows whose
    ``DELETED_AT`` is null and returns the columns ``account_name`` and
    ``account_type``.
    """
    snapshot_day = datetime.today().strftime('%Y-%m-%d')
    raw_accounts = spark.read.format("avro").load(f'/project/awg/cms/rucio/{snapshot_day}/accounts/part*.avro')
    live_accounts = raw_accounts.filter(col('DELETED_AT').isNull())
    return (
        live_accounts
        .withColumnRenamed('ACCOUNT', 'account_name')
        .withColumnRenamed('ACCOUNT_TYPE', 'account_type')
        .select(['account_name', 'account_type'])
    )
def get_df_rules(spark):
    """Get Spark dataframe of rules

    Reads today's Rucio ``rules`` avro dump, keeps the rows whose ``SCOPE`` is
    ``cms`` and returns the columns ``r_name``, ``r_id``, ``s_id``,
    ``activity``, ``rule_state`` and ``rse_expression``.
    """
    snapshot_day = datetime.today().strftime('%Y-%m-%d')
    raw_rules = spark.read.format('avro').load(f'/project/awg/cms/rucio/{snapshot_day}/rules/part*.avro')
    cms_rules = raw_rules.filter(col('SCOPE') == 'cms')
    renamed_rules = (
        cms_rules
        .withColumnRenamed('name', 'r_name')
        # rule and subscription ids become lower-cased hex strings
        .withColumn('r_id', lower(_hex(col('ID'))))
        .withColumn('s_id', lower(_hex(col('SUBSCRIPTION_ID'))))
        .withColumnRenamed('ACTIVITY', 'activity')
        .withColumnRenamed('STATE', 'rule_state')
        .withColumnRenamed('RSE_EXPRESSION', 'rse_expression')
    )
    return renamed_rules.select(['r_name','r_id', 's_id', 'activity', 'rule_state', 'rse_expression'])



In [None]:
# Load today's Rucio snapshots: RSEs, locks, accounts and rules
df_rses = get_df_rses(spark)
df_locks = get_df_locks(spark)
df_accounts = get_df_accounts(spark)
df_rules = get_df_rules(spark)
# Divisor used to express byte counts in terabytes (10**12 bytes)
tb_denominator = 1_000_000_000_000
# Attach the RSE attributes to every lock and keep only locks on 'prod' RSEs
locks = (
    df_locks.join(df_rses, on=['rse_id'], how='left')
    .where(col('rse_kind') == 'prod')
    .select(['f_name', 'f_size', 'RSE', 'rse_type', 'account_name', 'r_id'])
)

# Attach the owning rule's activity to every lock and add the `data_tier`
# field: the third path component of a rule name shaped like
# /<a>/<b>/<c>[#<suffix>]. regexp_extract (imported at the top of the notebook
# from pyspark.sql.functions) yields an empty string when the name does not match.
locks_with_activity = (
    locks.join(df_rules, ['r_id'], how='leftouter')
         .withColumn('data_tier', regexp_extract('r_name', r'^\/([\w-]+)\/([\w-]+)\/([\w-]+)(#[\w-]+)?', 3))
         .select(['f_name', 'account_name', 'RSE', 'rse_type', 'f_size', 'activity', 'data_tier'])
)

# Epoch seconds of this run; written to every aggregated row
timestamp = int(time.time())

# A File locked by the user for two activities is accounted to both activities
# A File locked by two users for the same activity is accounted to both Users
group_keys = ['RSE', 'rse_type', 'account_name', 'activity', 'data_tier']
unique_locks = locks_with_activity.select(['f_name', 'f_size', *group_keys]).distinct()
# Sum of file sizes per group, expressed in TB and rounded to 5 decimals
locked_per_group = unique_locks.groupby(group_keys).agg(
    _round(_sum(col('f_size')) / tb_denominator, 5).alias('total_locked')
)
user_aggreagated = (
    locked_per_group
    .join(df_accounts, ['account_name'], how='left')
    .withColumnRenamed('RSE', 'rse_name')
    .withColumn('timestamp', lit(timestamp))
    .select(['total_locked', 'rse_name', 'rse_type', 'account_name', 'account_type', 'activity', 'data_tier', 'timestamp'])
    .cache()
)



In [None]:
# Preview the first 10 aggregated rows without truncating long values
user_aggreagated.show(n=10, truncate=False)

In [None]:
# Number of rows in the aggregated dataframe
user_aggreagated.count()