In [1]:
from datetime import datetime, timedelta, timezone
import os
import time
import pandas as pd

from pyspark import SparkContext, StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    current_user,
    col, collect_list, concat_ws, greatest, lit, lower, when,
    avg as _avg,
    count as _count,
    hex as _hex,
    max as _max,
    min as _min,
    round as _round,
    sum as _sum,
)
from pyspark.sql.types import (
    StructType,
    LongType,
    StringType,
    StructField,
    DoubleType,
    IntegerType,
)

In [2]:
# Attach to the Spark session for this app (an already running session is reused).
spark = (
    SparkSession.builder
    .appName('tape-recall-history')
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

24/10/01 23:31:26 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [36]:
# Drop everything held in this Spark session's in-memory cache, so the `.cache()` calls
# further down start from a clean slate when the notebook is re-run.
spark.catalog.clearCache()

In [37]:
# Runtime arguments, all taken from environment variables.

# Path to the OpenSearch secret file; defaults to ../workdir relative to the current directory.
secretpath = os.environ.get(
    'OPENSEARCH_SECRET_PATH',
    f'{os.getcwd()}/../workdir/secret_opensearch.txt',
)
# Open and immediately close the file so a missing or unreadable secret fails here, early.
open(secretpath, 'r').close()

# PROD selects the index prefix: `crab-*` when truthy, `crab-test-*` otherwise.
PROD = os.environ.get('PROD', 'false').lower() in {'true', '1', 't'}

# Optional date window in "%Y-%m-%d" format (parsed with strptime later); None when unset.
START = os.environ.get('START_DATE')
END = os.environ.get('END_DATE')

In [38]:
# `osearch` is a local helper module: import it from the regular module search path when
# possible, otherwise put $PWD/../workdir at the front of sys.path and retry.
try:
    import osearch
except ModuleNotFoundError:
    import sys

    workdir_fallback = f'{os.getcwd()}/../workdir'
    sys.path.insert(0, workdir_fallback)
    import osearch

In [39]:
# Default date window ("%Y-%m-%d") used when the notebook is run interactively;
# a later cell replaces both values when START/END come from the environment.
START_DATE, END_DATE = '2000-01-01', '2024-10-01'

In [40]:
# Target index name. Always written with the test prefix here; a later cell rewrites it
# to the production pattern when PROD is set.
index_name = "crab-test-tape-recall-history"

In [41]:
# Cron-job runs: take the date window from the environment when BOTH bounds are supplied;
# otherwise keep the notebook defaults.
if START and END:
    START_DATE = START
    END_DATE = END

# Production runs write to `crab-*` instead of `crab-test-*`.
# Only the literal test prefix is stripped, which makes this cell safe to re-run: the former
# split('-')[2:] / join rewrite dropped one more name component on every execution
# (crab-tape-recall-history -> crab-recall-history), silently targeting the wrong index.
_test_prefix = 'crab-test-'
if PROD and index_name.startswith(_test_prefix):
    index_name = f'crab-{index_name[len(_test_prefix):]}'

# Timezone-aware (UTC) datetimes at midnight of the start and end dates.
start_datetime = datetime.strptime(START_DATE, "%Y-%m-%d").replace(tzinfo=timezone.utc)
end_datetime = datetime.strptime(END_DATE, "%Y-%m-%d").replace(tzinfo=timezone.utc)
if end_datetime < start_datetime:
    # ValueError is a subclass of Exception, so existing `except Exception` handlers still match.
    raise ValueError(f"end date ({END_DATE}) is less than start date ({START_DATE})")

In [42]:
# Debug aid: echo the resolved run parameters, one value per line.
print('\n'.join(map(str, (
    START_DATE,
    start_datetime,
    END_DATE,
    end_datetime,
    index_name,
))))

2000-01-01
2000-01-01 00:00:00+00:00
2024-10-01
2024-10-01 00:00:00+00:00
crab-test-tape-recall-history


In [43]:
# Load the two HDFS avro snapshots used by this notebook and register them as the Spark SQL
# temp views `rules_history` and `tasks`.


def _print_source_banner(title, hdfs_path):
    """Print the framed banner naming a data source, its HDFS directory and the work directory."""
    rule = "==============================================="
    print(rule
          , title
          , rule
          , "File Directory:", hdfs_path
          , "Work Directory:", os.getcwd()
          , rule
          , rule, sep='\n')


HDFS_RUCIO_RULES_HISTORY = f'/project/awg/cms/rucio/{END_DATE}/rules_history/'
_print_source_banner("RUCIO : Rules History", HDFS_RUCIO_RULES_HISTORY)

# We are only interested in rules whose state does not change anymore, i.e. rules that
# already expired: EXPIRES_AT must fall inside [start_datetime, end_datetime).
# The bounds are converted from epoch seconds to milliseconds for the comparison.
expires_from_ms = int(start_datetime.timestamp()) * 1000
expires_until_ms = int(end_datetime.timestamp()) * 1000
rucio_rules_history = (
    spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY)
         # Filter while ACTIVITY is still part of the frame; filtering after the projection
         # only worked because Spark re-adds missing columns behind the scenes.
         .filter("ACTIVITY = 'Analysis TapeRecall'")
         .filter(f"EXPIRES_AT >= {expires_from_ms} AND EXPIRES_AT < {expires_until_ms}")
         # Render the rule ID as a lowercase hex string.
         .withColumn('ID', lower(_hex(col('ID'))))
         .select("ID", "ACCOUNT", "NAME", "STATE", "EXPIRES_AT", "UPDATED_AT", "CREATED_AT")
         # Cache only the final frame; the extra cache on the intermediate (date-unfiltered)
         # frame was never reused and only occupied executor memory.
         .cache()
)
rucio_rules_history.createOrReplaceTempView("rules_history")

HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{END_DATE}/'
_print_source_banner("CRAB Table", HDFS_CRAB_part)

# Do not filter the task table by creation time (TM_START_TIME): a rule may belong to a task
# that was created months before the rule expired.
tasks_df = (
    spark.read.format('avro').load(HDFS_CRAB_part)
         # Each column is selected once; the previous select listed three of them twice,
         # producing a frame with duplicate column names.
         .select('TM_TASKNAME', 'TM_START_TIME', 'TM_TASK_STATUS', 'TM_DDM_REQID')
         .cache()
)
tasks_df.createOrReplaceTempView("tasks")

RUCIO : Rules History
File Directory:
/project/awg/cms/rucio/2024-10-01/rules_history/
Work Directory:
/eos/home-i00/t/tseethon/SWAN_projects/CRABServer/src/script/Monitor/crab-spark/notebooks
CRAB Table
File Directory:
/project/awg/cms/crab/tasks/2024-10-01/
Work Directory:
/eos/home-i00/t/tseethon/SWAN_projects/CRABServer/src/script/Monitor/crab-spark/notebooks


In [53]:
# Build one row per Rucio rule ID from the `rules_history` and `tasks` temp views.
#
# CTE walkthrough:
#   rn_t / latestupdate_t : per rule ID, keep only the row with the greatest UPDATED_AT.
#   calc_days_t           : DAYS = ceil((UPDATED_AT - CREATED_AT) / 86400000) when STATE = 'O'
#                           (presumably Rucio's "OK" state -- TODO confirm); otherwise
#                           ceil((EXPIRES_AT - CREATED_AT) / 86400000) when STATE != 'O' and
#                           EXPIRES_AT is before end_datetime; 0 in every other case.
#                           86400000 is the number of milliseconds in one day.
#   join_t                : LEFT JOIN with `tasks` on ID = TM_DDM_REQID, so rules without a
#                           matching task are kept.
#   window_t / uniqueid_t : when several tasks match one rule, keep the row with the greatest
#                           TM_START_TIME.
#   finalize_t            : replaces a NULL TM_START_TIME with 0 and adds `timestamp`
#                           (= CREATED_AT) plus the constant `type` = 'tape_recall_history'.

query = f"""\
WITH rn_t AS (
SELECT ID, ACCOUNT, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,
       row_number() over(partition by ID order by UPDATED_AT desc) as row_num
FROM rules_history
),
latestupdate_t AS (
SELECT * FROM rn_t 
WHERE row_num = 1
),
calc_days_t AS (
SELECT ID, ACCOUNT, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,
       CASE 
           WHEN STATE = 'O' THEN ceil((UPDATED_AT-CREATED_AT)/86400000)  
           WHEN STATE != 'O' AND EXPIRES_AT < {int(end_datetime.timestamp()) * 1000} THEN ceil((EXPIRES_AT-CREATED_AT)/86400000)
           ELSE 0
       END AS DAYS
FROM latestupdate_t
),
join_t AS (
SELECT * FROM calc_days_t
LEFT JOIN tasks ON calc_days_t.ID = tasks.TM_DDM_REQID
),
window_t AS (
SELECT ID, ACCOUNT, NAME, STATE, DAYS, EXPIRES_AT, UPDATED_AT, CREATED_AT, TM_TASKNAME, TM_START_TIME, TM_TASK_STATUS, 
       row_number() OVER (PARTITION BY join_t.ID ORDER BY join_t.TM_START_TIME DESC) AS row_num
FROM join_t 
),
uniqueid_t AS (
SELECT *
FROM window_t
WHERE row_num =1
), 
finalize_t AS (
SELECT ID, ACCOUNT, NAME, STATE, DAYS, EXPIRES_AT, UPDATED_AT, CREATED_AT, TM_TASKNAME, IFNULL(TM_START_TIME, 0) as TM_START_TIME, TM_TASK_STATUS, 
       CREATED_AT AS timestamp,
       'tape_recall_history' AS type
FROM uniqueid_t
)
SELECT * 
FROM finalize_t
"""

# Run the query and print a preview of the result.
tmpdf = spark.sql(query)
tmpdf.show()


+--------------------+--------+--------------------+-----+----+-------------+-------------+-------------+--------------------+-------------+--------------+-------------+-------------------+
|                  ID| ACCOUNT|                NAME|STATE|DAYS|   EXPIRES_AT|   UPDATED_AT|   CREATED_AT|         TM_TASKNAME|TM_START_TIME|TM_TASK_STATUS|    timestamp|               type|
+--------------------+--------+--------------------+-----+----+-------------+-------------+-------------+--------------------+-------------+--------------+-------------+-------------------+
|0006907403fe4e948...|   tihsu|/DoublePhoton_Fla...|    O|   6|1726479827000|1726134227000|1725642766000|240906_171202:tih...|1725635522631|     SUBMITTED|1725642766000|tape_recall_history|
|002657a392fe46799...|  joshin|/Muon1/Run2024C-P...|    O|   6|1726479828000|1726436909000|1725985568000|240910_162509:jos...|1725978309883|        KILLED|1725985568000|tape_recall_history|
|007fa987e8b94ae98...|   tihsu|/SinglePhoton_Pt-..

In [54]:
# Number of result rows (the query keeps one row per rule ID); these rows become the
# documents sent to OpenSearch.
tmpdf.count()

319

In [55]:
# Collect the query result on the driver and turn each row into a {column: value} dict.
docs = tmpdf.toPandas().to_dict(orient='records')


In [58]:
# OpenSearch index definition: index settings plus an explicit mapping for every column
# produced by the query.


def _field_mapping(kind):
    """Return a fresh mapping dict for one field of the given kind."""
    if kind == "keyword":
        return {"ignore_above": 2048, "type": "keyword"}
    if kind == "date":
        return {"format": "epoch_millis", "type": "date"}
    return {"type": kind}


# (field name, kind) pairs, listed in the order they appear in the mapping.
_FIELD_KINDS = (
    ("ID", "keyword"),
    ("ACCOUNT", "keyword"),
    ("NAME", "keyword"),
    ("STATE", "keyword"),
    ("DAYS", "long"),
    ("EXPIRES_AT", "date"),
    ("UPDATED_AT", "date"),
    ("CREATED_AT", "date"),
    ("TM_TASKNAME", "keyword"),
    ("TM_START_TIME", "date"),
    ("TM_TASK_STATUS", "keyword"),
    ("type", "keyword"),
    ("timestamp", "date"),
)

schema = {
    "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}},
    "mappings": {
        "properties": {name: _field_mapping(kind) for name, kind in _FIELD_KINDS},
    },
}


In [60]:
import importlib

# Re-execute the osearch module so edits made to it are picked up without restarting the kernel.
importlib.reload(osearch)

# Epoch seconds of the day before END_DATE. Despite its name this is an int, not a string.
# Per the original note, the osearch library converts it into the index suffix (the note's
# example was 'crab-test-taskdb-2024-09') -- confirm against osearch.
day_before_end = end_datetime - timedelta(days=1)
timestamp_str = int(day_before_end.timestamp())
print(timestamp_str)

1727654400


In [None]:
# Hand the documents to the osearch helper together with the index name, the index
# schema, the secret file path and the epoch-seconds timestamp computed above.
# NOTE(review): presumably this creates the index when missing and uploads the docs --
# confirm against osearch.send_os.
osearch.send_os(docs, index_name, schema, secretpath, timestamp_str)