In [20]:
import pandas as pd

# Serialize SEAL dumps
---

In this step, we list the files that contain the dumps provided by SEAL and describe each one as a `TimeRangeSourceData` record. A record has the following fields:

- `start`: the start time of the dump
- `end`: the end time of the dump
- `file`: the path of the file that contains the dump
- `source`: the source of the dump (SEAL or Rucio). In this case, it is always SEAL.

In [17]:
from infratructure.repository.data_repository import list_files
from datetime import datetime
from core.entity import TimeRangeSourceData
dir = 'data/seal'

# Keep only the part of each listed name that precedes the first dot.
data_files = [entry.split('.')[0] for entry in list_files(dir)]

# Build one TimeRangeSourceData per dump. Each stem is expected to split into
# exactly three '_'-separated parts: a prefix (ignored), the start date and the
# end date, both written as YYYYMMDD.
seal_dumps_time_ranges = []
for file in data_files:
    _, start_token, end_token = file.split('_')
    start_date, end_date = (
        datetime.strptime(token, '%Y%m%d') for token in (start_token, end_token)
    )
    seal_dumps_time_ranges.append(
        TimeRangeSourceData(
            start=start_date,
            end=end_date,
            source='SEAL',
            file=f"{dir}/{file}.csv",
        )
    )

print(seal_dumps_time_ranges)

[TimeRangeSourceData(start=datetime.datetime(2022, 1, 1, 0, 0), end=datetime.datetime(2022, 10, 1, 0, 0), file='data/seal/rucio_20220101_20221001.csv', source='SEAL'), TimeRangeSourceData(start=datetime.datetime(2022, 10, 1, 0, 0), end=datetime.datetime(2022, 12, 1, 0, 0), file='data/seal/rucio_20221001_20221201.csv', source='SEAL'), TimeRangeSourceData(start=datetime.datetime(2022, 12, 1, 0, 0), end=datetime.datetime(2023, 2, 1, 0, 0), file='data/seal/rucio_20221201_20230201.csv', source='SEAL'), TimeRangeSourceData(start=datetime.datetime(2023, 1, 31, 0, 0), end=datetime.datetime(2023, 4, 10, 0, 0), file='data/seal/rucio_20230131_20230410.csv', source='SEAL')]


## Check the continuity of the dumps

For all the files that we have, we will check if the time intervals are chained. This means that the end time of a dump is the same as the start time of the next dump. If this is not the case, we will print an error message.

In [18]:
from datetime import timedelta
# Put the dumps in chronological order of their start time.
seal_dumps_time_ranges = sorted(seal_dumps_time_ranges, key=lambda dump: dump.start)

# Compare each dump with the one that follows it: the ranges are continuous
# only when every dump starts exactly where the previous one ended. Both gaps
# and overlaps are reported.
for previous, current in zip(seal_dumps_time_ranges, seal_dumps_time_ranges[1:]):
    if current.start == previous.end:
        continue
    print(f"Error: {previous.end} != {current.start}")
    print(f"Check files {previous.file} and {current.file}")
        


Error: 2023-02-01 00:00:00 != 2023-01-31 00:00:00
Check files data/seal/rucio_20221201_20230201.csv and data/seal/rucio_20230131_20230410.csv


## Merge the dumps

Load the dumps as pandas dataframes and merge them into a single dataframe.

In [21]:
# Work on a copy of the full list of dumps; narrow the slice to merge a subset.
selected_time_ranges = seal_dumps_time_ranges[0:]

# Read every selected dump, preview its first rows, and keep the frame so the
# dumps can be combined afterwards (previously each frame was discarded as soon
# as the next file was read, so nothing was ever merged).
dump_frames = []
for selected_time_range in selected_time_ranges:
    df = pd.read_csv(selected_time_range.file)
    print(df.head())
    dump_frames.append(df)

# Stack all dumps into one dataframe with a fresh 0..n-1 index.
# pd.concat raises on an empty list, so fall back to an empty frame.
# NOTE(review): rows are not de-duplicated here; if two dumps cover
# overlapping time ranges the same file may appear more than once.
seal_dumps_df = (
    pd.concat(dump_frames, ignore_index=True) if dump_frames else pd.DataFrame()
)

                                name  \
0  HITS.10075481._000432.pool.root.1   
1  HITS.10075481._000433.pool.root.1   
2  HITS.10075481._000434.pool.root.1   
3  HITS.10075481._000435.pool.root.1   
4  HITS.10075481._000444.pool.root.1   

                                           path      size  sha256  \
0  mc15_14TeV/HITS.10075481._000432.pool.root.1  16535161     NaN   
1  mc15_14TeV/HITS.10075481._000433.pool.root.1  16438598     NaN   
2  mc15_14TeV/HITS.10075481._000434.pool.root.1  16186512     NaN   
3  mc15_14TeV/HITS.10075481._000435.pool.root.1  17000891     NaN   
4  mc15_14TeV/HITS.10075481._000444.pool.root.1  17039812     NaN   

                  mtime  
0  2022-05-09T19:54:06Z  
1  2022-05-09T19:53:36Z  
2  2022-05-09T19:53:27Z  
3  2022-05-09T19:55:58Z  
4  2022-05-09T19:53:01Z  
                              name                                       path  \
0  HITS.580376._000432.pool.root.1  mc11_7TeV/HITS.580376._000432.pool.root.1   
1  HITS.743322._010041.poo

In [1]:
from rucio.api.did import list_dids

# Ask Rucio for the DIDs in the 'user.mlassnig' scope.
# NOTE(review): the recorded output of this cell is only a generator object's
# repr, so no DIDs are actually displayed; the generator would have to be
# consumed (e.g. iterated over) to see them — TODO confirm the intent.
list_dids(scope='user.mlassnig')

<generator object stream_session.<locals>.new_funct at 0x11969c6a0>