In [1]:
from datetime import datetime, timedelta, timezone
import os
import time
import pandas as pd

from pyspark import SparkContext, StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    current_user,
    col, collect_list, concat_ws, greatest, lit, lower, when,
    avg as _avg,
    count as _count,
    hex as _hex,
    max as _max,
    min as _min,
    round as _round,
    sum as _sum,
)
from pyspark.sql.types import (
    StructType,
    LongType,
    StringType,
    StructField,
    DoubleType,
    IntegerType,
)

In [2]:
# Get the Spark session for this notebook under the app name 'crab-taskdb'.
# getOrCreate() hands back the already-running session when there is one.
spark = (
    SparkSession.builder
    .appName('crab-taskdb')
    .getOrCreate()
)
spark

24/10/01 23:08:26 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [31]:
# Remove every cached table/DataFrame from this Spark session's in-memory cache.
# NOTE(review): this cell carries execution count 31 although it sits near the top of the
# notebook; presumably it was run by hand before re-loading data — confirm it is still wanted.
spark.catalog.clearCache()

In [3]:
# Runtime arguments, all read from environment variables.

# Location of the OpenSearch secret file; stop right away if it is not an existing file.
secretpath = os.getenv('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')
if not os.path.isfile(secretpath):
    raise Exception(f'OS secrets file {secretpath} does not exists')

# PROD switches the index prefix to `crab-*`; any other value keeps `crab-test-*`.
PROD = os.getenv('PROD', 'false').lower() in ('true', '1', 't')

# Optional date range supplied by the environment, in strptime("%Y-%m-%d") format.
# Both are None when the variables are not set.
START = os.getenv('START_DATE')
END = os.getenv('END_DATE')

In [4]:
# Load the `osearch` helper module: first through the regular import path and,
# if it cannot be found there, from $PWD/../workdir.
try:
    import osearch
except ModuleNotFoundError:
    import sys
    _osearch_dir = f'{os.getcwd()}/../workdir'
    sys.path.insert(0, _osearch_dir)  # put it first so it wins over other entries
    import osearch

In [22]:
# Default date range ("%Y-%m-%d") for runs inside the notebook.
START_DATE = '2020-01-01'
END_DATE = '2024-10-31'

In [23]:
# Base index name. It is always written with the test prefix here.
index_name = "crab-test-taskdb"

In [24]:
# Resolve the effective date range and index name for this run.

# Cronjob mode: when both START_DATE and END_DATE are provided through the environment,
# they replace the notebook constants.
if START and END:
    START_DATE = START
    END_DATE = END

# Production runs use `crab-<name>` instead of `crab-test-<name>`.
# Only the exact test prefix is stripped, so running this cell again leaves an already
# converted name untouched (the former split/join approach turned 'crab-taskdb' into 'crab-'
# on a second execution).
TEST_INDEX_PREFIX = 'crab-test-'
if PROD and index_name.startswith(TEST_INDEX_PREFIX):
    index_name = f'crab-{index_name[len(TEST_INDEX_PREFIX):]}'

# Parse both dates as midnight UTC datetime objects.
start_datetime = datetime.strptime(START_DATE, "%Y-%m-%d").replace(tzinfo=timezone.utc)
end_datetime = datetime.strptime(END_DATE, "%Y-%m-%d").replace(tzinfo=timezone.utc)
if end_datetime < start_datetime:
    raise Exception(f"end date ({END_DATE}) is less than start date ({START_DATE})")

In [32]:
# Debug: print the resolved run parameters, one per line.
resolved_params = (START_DATE, start_datetime, END_DATE, end_datetime, index_name)
print(*resolved_params, sep='\n')

2024-01-01
2024-01-01 00:00:00+00:00
2024-09-18
2024-09-18 00:00:00+00:00
crab-test-taskdb


In [33]:
# Load the CRAB tasks table from HDFS and keep only the rows inside the requested date range.

HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{END_DATE}/' # data each day in hdfs contain whole table
print("==============================================="
      , "CRAB Table"
      , "==============================================="
      , "File Directory:", HDFS_CRAB_part
      , "Work Directory:", os.getcwd()
      , "==============================================="
      , "===============================================", sep='\n')

# Raw table as read from avro. It used to be bound to `tasks_df` while the next statement
# read the undefined name `crab_part`, which raises NameError on a fresh kernel.
crab_part = spark.read.format('avro').load(HDFS_CRAB_part).cache()

# Keep only the columns needed downstream and the rows whose TM_START_TIME falls in
# [start_datetime, end_datetime). The bounds are converted from epoch seconds to
# milliseconds before being compared with TM_START_TIME.
tasks_df = (
    crab_part.select("TM_TASKNAME","TM_START_TIME","TM_TASK_STATUS","TM_SPLIT_ALGO","TM_USERNAME","TM_USER_ROLE","TM_JOB_TYPE","TM_IGNORE_LOCALITY","TM_SCRIPTEXE","TM_USER_CONFIG")
             .filter(f"""\
                  1=1
                  AND TM_START_TIME >= {int(start_datetime.timestamp()) * 1000}
                  AND TM_START_TIME < {int(end_datetime.timestamp()) * 1000}
              """)
             .cache()
)

# Expose the filtered frame to Spark SQL under the name "tasks".
tasks_df.createOrReplaceTempView("tasks")

CRAB Table
File Directory:
/project/awg/cms/crab/tasks/2024-09-18/
Work Directory:
/eos/home-i00/t/tseethon/SWAN_projects/CRABServer/src/script/Monitor/crab-spark/notebooks
+------------------------------------------------------------------------------------------+-------------+--------------+-------------+-----------+------------+-----------+------------------+--------------+------------------------------------------------------------------------------------------------------+
|TM_TASKNAME                                                                               |TM_START_TIME|TM_TASK_STATUS|TM_SPLIT_ALGO|TM_USERNAME|TM_USER_ROLE|TM_JOB_TYPE|TM_IGNORE_LOCALITY|TM_SCRIPTEXE  |TM_USER_CONFIG                                                                                        |
+------------------------------------------------------------------------------------------+-------------+--------------+-------------+-----------+------------+-----------+------------------+--------------+-

In [34]:
# Build the documents to index from the "tasks" view:
#   * reqacc_tb   - pulls `requireaccelerator` out of the TM_USER_CONFIG JSON as
#                   REQUIRE_ACCELERATOR, falling back to 'false' when it is absent;
#   * finalize_tb - adds `timestamp` (a copy of TM_START_TIME) and a constant `type` column.
# The text contains no placeholders, so a plain (non-f) string is used.
query = """\
WITH reqacc_tb AS (         
SELECT TM_TASKNAME, TM_START_TIME, TM_TASK_STATUS, TM_SPLIT_ALGO, TM_USERNAME, TM_USER_ROLE, TM_JOB_TYPE, TM_IGNORE_LOCALITY, TM_SCRIPTEXE,
       coalesce(get_json_object(TM_USER_CONFIG, '$.requireaccelerator'), 'false') AS REQUIRE_ACCELERATOR
FROM tasks
),
finalize_tb AS (
SELECT TM_TASKNAME, TM_START_TIME, TM_TASK_STATUS, TM_SPLIT_ALGO, TM_USERNAME, TM_USER_ROLE, TM_JOB_TYPE, TM_IGNORE_LOCALITY, TM_SCRIPTEXE, REQUIRE_ACCELERATOR,
       TM_START_TIME AS timestamp,
       'taskdb' AS type
FROM reqacc_tb
)
SELECT * FROM finalize_tb
"""

tmpdf = spark.sql(query)

# Preview the first ten rows.
tmpdf.show(n=10)



+--------------------+-------------+--------------+-------------+-----------+------------+-----------+------------------+--------------+-------------------+-------------+------+
|         TM_TASKNAME|TM_START_TIME|TM_TASK_STATUS|TM_SPLIT_ALGO|TM_USERNAME|TM_USER_ROLE|TM_JOB_TYPE|TM_IGNORE_LOCALITY|  TM_SCRIPTEXE|REQUIRE_ACCELERATOR|    timestamp|  type|
+--------------------+-------------+--------------+-------------+-----------+------------+-----------+------------------+--------------+-------------------+-------------+------+
|240103_102550:cdi...|1704273950144|     SUBMITTED|    LumiBased|   cdifraia|        NULL|   Analysis|                 F|crab_script.sh|              false|1704273950144|taskdb|
|240103_130243:mhu...|1704283363607|     SUBMITTED|    LumiBased|   mhuwiler|        NULL|   Analysis|                 F|          NULL|              false|1704283363607|taskdb|
|240104_171945:ans...|1704385185624|        KILLED|    LumiBased|   anstahll|        NULL|   Analysis|        

In [35]:
# Collect the query result as a pandas DataFrame and turn it into a list with one dict per row.
docs = (
    tmpdf
    .toPandas()
    .to_dict(orient='records')
)

In [36]:
# Index definition passed to osearch.send_os: one shard, one replica, and an explicit mapping
# for every document field. The two time fields are epoch-millisecond dates;
# all remaining fields are keywords with ignore_above=2048.
_DATE_FIELDS = ("TM_START_TIME", "timestamp")
_FIELD_ORDER = (
    "TM_TASKNAME",
    "TM_START_TIME",
    "TM_TASK_STATUS",
    "TM_SPLIT_ALGO",
    "TM_USERNAME",
    "TM_USER_ROLE",
    "TM_JOB_TYPE",
    "TM_IGNORE_LOCALITY",
    "TM_SCRIPTEXE",
    "REQUIRE_ACCELERATOR",
    "type",
    "timestamp",
)

schema = {
    "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}},
    "mappings": {
        "properties": {
            # A fresh dict is built for each field so that no two fields share one object.
            field: (
                {"format": "epoch_millis", "type": "date"}
                if field in _DATE_FIELDS
                else {"ignore_above": 2048, "type": "keyword"}
            )
            for field in _FIELD_ORDER
        }
    },
}

In [40]:
import importlib

# Re-import osearch so that edits made to the module after its first import take effect.
importlib.reload(osearch)

# Epoch seconds (an int, despite the name) of the day before END_DATE, i.e. the last day
# inside the [start, end) range. Per the original note, the osearch lib converts it into the
# index suffix, e.g. 'crab-test-taskdb-2024-09'.
last_day = end_datetime - timedelta(days=1)
timestamp_str = int(last_day.timestamp())
print(timestamp_str)

1726531200


In [None]:
# Hand the collected task documents to the osearch helper together with the target index
# name, the index schema, the path of the secret file and the epoch-seconds timestamp.
# NOTE(review): send_os is defined outside this notebook; presumably it creates/selects a
# dated index derived from index_name and timestamp_str and uploads docs — confirm in osearch.
osearch.send_os(docs, index_name, schema, secretpath, timestamp_str)