
# Check globus sync
This notebook analyzes and compares file synchronization between two systems, Anvil and Bridges2, by identifying files to be updated, added, or removed. It provides summary statistics, validation checks, and detailed insights into the synchronization status.

In [1]:
%load_ext autoreload
%autoreload 2

import os
from os.path import join
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# !sbatch /home/x-mho1/git/ltu-cmass-run/jobs/check_globus_sync.sh

In [3]:
wdir = '/anvil/scratch/x-mho1/globus'
os.listdir(wdir)

['quijotez0',
 'logs',
 'quijote',
 'scratch',
 'mtnglike',
 'bridges_files.txt',
 'mtng',
 'quijotelike',
 'rundir_cmass',
 'images',
 'anvil_files.txt',
 'old',
 'cmass_ngc',
 'for_sammy',
 'obs',
 'literature',
 'abacuslike',
 'abacus',
 'quijotelike-fid',
 'abacus1gpch',
 'subdirs.txt',
 'quijote3gpch',
 'shivamlike']

In [4]:
# def load_filelist(filepath):
#     df = pd.read_csv(filepath, sep='|', skipinitialspace=True, skiprows=[1])
#     df = df.map(lambda x: x.strip() if isinstance(x, str) else x)
#     df.columns = df.columns.str.strip()
#     df['Last Modified'] = pd.to_datetime(df['Last Modified'])
#     mask = df['File Type'] == 'file'
#     df = df[mask]
#     return df
import pandas as pd
import datetime

import pandas as pd
import datetime


def parse_ls_output(file_path):
    """Parses an `ls -lR` output file and returns a Pandas DataFrame, excluding directories and symbolic links."""

    with open(file_path, "r") as f:
        lines = f.readlines()

    entries = []
    current_dir = ""
    current_year = datetime.datetime.now().year

    for line in lines:
        line = line.strip()

        if line.endswith(":"):
            current_dir = line[:-1].split("cmass-ili/")[-1]

        elif line and not line.startswith("total"):
            parts = line.split(maxsplit=8)
            if len(parts) == 9:
                permissions, _, owner, group, size, month, day, time_or_year, filename = parts

                # **Skip directories and symbolic links**
                if permissions.startswith(("d", "l")):
                    continue

                # **Skip models/diag/galaxies/sgc_lightcone/ngc_lightcone/simbig_lightcone/mtng_lightcone**
                if any(
                    x in current_dir for x in
                    ["models", "diag", "galaxies", "sgc_lightcone",
                     "ngc_lightcone", "simbig_lightcone", "mtng_lightcone"]
                ):
                    continue

                full_path = f"{current_dir}/{filename}"

                # Parse last modified date
                if ":" in time_or_year:
                    file_datetime = datetime.datetime.strptime(
                        f"{month} {day} {current_year} {time_or_year}", "%b %d %Y %H:%M"
                    )
                else:
                    file_datetime = datetime.datetime.strptime(
                        f"{month} {day} {time_or_year}", "%b %d %Y"
                    )

                entries.append([full_path, permissions, int(
                    size), owner, group, file_datetime])

    return pd.DataFrame(entries, columns=["Filename", "Permissions", "Size", "Owner", "Group", "Last Modified"])


def changed(dfa, dfb):
    # Merge the two dataframes on the 'Filename' column to compare files
    merged_df = pd.merge(dfa, dfb, on='Filename', suffixes=(
        '_anvil', '_bridges2'), how='inner')

    # Identify files that will be changed on Bridges2
    # A file will be changed if its 'Last Modified' timestamp or 'Size' is different
    changed_files = merged_df[
        (merged_df['Last Modified_anvil'] > merged_df['Last Modified_bridges2']) |
        (merged_df['Size_anvil'] != merged_df['Size_bridges2'])
    ]

    # Display the list of files that will be changed
    return changed_files[['Filename', 'Last Modified_anvil',
                          'Last Modified_bridges2', 'Size_anvil', 'Size_bridges2']]


def count_parents(df, depth=1):
    parent_dirs = df['Filename'].apply(
        lambda x: "/".join(x.split("/")[:depth + 1]))
    counts = Counter(parent_dirs)

    for directory, count in counts.items():
        print(f"{directory}: {count}")

    return dict(counts)


def missing(dfa, dfb):
    # Identify new files in dfa that do not exist in dfb
    new_files = dfa[~dfa['Filename'].isin(dfb['Filename'])]
    return new_files[['Filename', 'Last Modified', 'Size']]

In [5]:
# check all

s = pd.DataFrame(columns=['sim', 'to_change', 'to_add', 'to_remove'])
for i, sim in enumerate(os.listdir(wdir)):
    simdir = join(wdir, sim)
    if not os.path.isdir(simdir):
        continue
    try:
        dfa = parse_ls_output(join(simdir, 'anvil_files.txt'))
        dfb = parse_ls_output(join(simdir, 'bridges_files.txt'))
    except Exception as e:
        print(sim, '\t', e)
        continue

    o1 = changed(dfa, dfb)
    o2 = missing(dfa, dfb)
    o3 = missing(dfb, dfa)
    s.loc[i] = [sim, len(o1), len(o2), len(o3)]
s = s.sort_values(by='sim')

In [7]:
print('When transferring from Anvil to Bridges2:')
s

When transferring from Anvil to Bridges2:


Unnamed: 0,sim,to_change,to_add,to_remove
17,abacus,0,0,0
19,abacus1gpch,119,0,0
16,abacuslike,1,0,0
12,cmass_ngc,0,0,0
13,for_sammy,0,0,0
9,images,0,0,0
15,literature,0,0,0
1,logs,0,1527,71
6,mtng,0,0,0
4,mtnglike,0,0,0


In [24]:
dirname = 'quijote'
simdir = join(wdir, dirname)

dfa = parse_ls_output(join(simdir, 'anvil_files.txt'))
dfb = parse_ls_output(join(simdir, 'bridges_files.txt'))

print('NumAnvil:', len(dfa))
print('NumBridges:', len(dfb))

is_subset = set(dfa['Filename']).issubset(set(dfb['Filename']))
print("\nAre all filenames on anvil a subset of bridges's filenames?", is_subset)

is_subset = set(dfb['Filename']).issubset(set(dfa['Filename']))
print("Are all filenames on bridges a subset of anvil's filenames?", is_subset)

NumAnvil: 11925
NumBridges: 9931

Are all filenames on anvil a subset of bridges's filenames? False
Are all filenames on bridges a subset of anvil's filenames? False


In [25]:
print(f"Files which will be changed on Bridges2:")
o = changed(dfa, dfb)

count_parents(o, 2)
o['Filename'].head(100).values

Files which will be changed on Bridges2:
quijote/nbody/L1000-N128: 2000


array(['quijote/nbody/L1000-N128/0/config.yaml',
       'quijote/nbody/L1000-N128/1/config.yaml',
       'quijote/nbody/L1000-N128/10/config.yaml',
       'quijote/nbody/L1000-N128/100/config.yaml',
       'quijote/nbody/L1000-N128/1000/config.yaml',
       'quijote/nbody/L1000-N128/1001/config.yaml',
       'quijote/nbody/L1000-N128/1002/config.yaml',
       'quijote/nbody/L1000-N128/1003/config.yaml',
       'quijote/nbody/L1000-N128/1004/config.yaml',
       'quijote/nbody/L1000-N128/1005/config.yaml',
       'quijote/nbody/L1000-N128/1006/config.yaml',
       'quijote/nbody/L1000-N128/1007/config.yaml',
       'quijote/nbody/L1000-N128/1008/config.yaml',
       'quijote/nbody/L1000-N128/1009/config.yaml',
       'quijote/nbody/L1000-N128/101/config.yaml',
       'quijote/nbody/L1000-N128/1010/config.yaml',
       'quijote/nbody/L1000-N128/1011/config.yaml',
       'quijote/nbody/L1000-N128/1012/config.yaml',
       'quijote/nbody/L1000-N128/1013/config.yaml',
       'quijote/nbody/

In [26]:
print("Files on Anvil that do not exist on Bridges2:")
o = missing(dfa, dfb)

count_parents(o, 2)
o['Filename'].values  # .iloc[::500]  # .head()

Files on Anvil that do not exist on Bridges2:
quijote/nbody_fof/L1000-N128: 4000


array(['quijote/nbody_fof/L1000-N128/0/config.yaml',
       'quijote/nbody_fof/L1000-N128/0/halos.h5',
       'quijote/nbody_fof/L1000-N128/1/config.yaml', ...,
       'quijote/nbody_fof/L1000-N128/998/halos.h5',
       'quijote/nbody_fof/L1000-N128/999/config.yaml',
       'quijote/nbody_fof/L1000-N128/999/halos.h5'], dtype=object)

In [27]:
print("Files on Bridges2 that do not exist on Anvil:")
o = missing(dfb, dfa)

count_parents(o, 2)
o['Filename'].values  # [::10]  # .head()

Files on Bridges2 that do not exist on Anvil:
quijote/meshed/L1000-N128: 2000
quijote/wn/N1152: 1
quijote/wn/N128: 2
quijote/wn/N256: 3


array(['quijote/meshed/L1000-N128/0/config.yaml',
       'quijote/meshed/L1000-N128/1/config.yaml',
       'quijote/meshed/L1000-N128/10/config.yaml', ...,
       'quijote/wn/N256/wn_0.dat', 'quijote/wn/N256/wn_3.dat',
       'quijote/wn/N256/wn_5.dat'], dtype=object)