
# Check Globus sync
This notebook analyzes and compares file synchronization between two systems, Anvil and Bridges2, by identifying files to be updated, added, or removed. It provides summary statistics, validation checks, and detailed insights into the synchronization status.

In [1]:
%load_ext autoreload
%autoreload 2

import os
from os.path import join
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [1]:
# !sbatch /home/x-mho1/git/ltu-cmass-run/jobs/check_globus_sync.sh

In [2]:
# Root directory scanned by the cells below: each subdirectory is expected to
# hold an 'anvil_files.txt' and a 'bridges_files.txt' listing.
wdir = '/anvil/scratch/x-mho1/globus'
# Display the entries so the available directories are visible in the output.
os.listdir(wdir)

['quijotez0',
 'logs',
 'quijote',
 'scratch',
 'mtnglike',
 'mtng',
 'quijotelike',
 'rundir_cmass',
 'images',
 'old',
 'cmass_ngc',
 'obs',
 'literature',
 'abacuslike',
 'abacus',
 'quijotelike-fid',
 'subdirs.txt',
 'quijote3gpch',
 'shivamlike']

In [47]:
def load_filelist(filepath):
    df = pd.read_csv(filepath, sep='|', skipinitialspace=True, skiprows=[1])
    df = df.map(lambda x: x.strip() if isinstance(x, str) else x)
    df.columns = df.columns.str.strip()
    df['Last Modified'] = pd.to_datetime(df['Last Modified'])
    mask = df['File Type'] == 'file'
    df = df[mask]
    return df


def changed(dfa, dfb):
    """Return files present in both listings that differ between them.

    The two frames are inner-joined on 'Filename'. A row is reported when the
    'Last Modified' value from ``dfa`` is strictly later than the one from
    ``dfb``, or when the two 'Size' values are not equal.

    Parameters
    ----------
    dfa, dfb : pd.DataFrame
        Listings with 'Filename', 'Last Modified' and 'Size' columns. Columns
        from ``dfa`` get the '_anvil' suffix, those from ``dfb`` '_bridges2'.

    Returns
    -------
    pd.DataFrame
        Columns 'Filename', 'Last Modified_anvil', 'Last Modified_bridges2',
        'Size_anvil', 'Size_bridges2' for the differing files.
    """
    both = dfa.merge(
        dfb, on='Filename', how='inner', suffixes=('_anvil', '_bridges2'))

    newer_on_anvil = both['Last Modified_anvil'].gt(
        both['Last Modified_bridges2'])
    size_differs = both['Size_anvil'].ne(both['Size_bridges2'])

    report_columns = ['Filename', 'Last Modified_anvil',
                      'Last Modified_bridges2', 'Size_anvil', 'Size_bridges2']
    return both.loc[newer_on_anvil | size_differs, report_columns]


def count_parents(df, label=''):
    """Print how many files fall under each leading path component.

    For every entry in ``df['Filename']`` the text before the first '/' is
    taken as its parent; one line ``"<label>/<parent>: <count>"`` is printed
    per distinct parent, in order of first appearance.
    """
    tally = Counter(name.split('/')[0] for name in df['Filename'])
    for parent, n_files in tally.items():
        print(f"{label}/{parent}: {n_files}")


def missing(dfa, dfb):
    """Return the rows of ``dfa`` whose 'Filename' does not occur in ``dfb``.

    Only the 'Filename', 'Last Modified' and 'Size' columns of ``dfa`` are
    returned.
    """
    absent_from_b = ~dfa['Filename'].isin(dfb['Filename'])
    return dfa.loc[absent_from_b, ['Filename', 'Last Modified', 'Size']]

In [48]:
# check all
#
# Summarise every subdirectory of `wdir`: how many files differ between the
# two listings (to_change), exist only in the Anvil listing (to_add), or exist
# only in the Bridges2 listing (to_remove).
#
# Rows are collected in plain lists and the DataFrame is built once at the
# end, rather than enlarging an empty frame with `s.loc[i] = ...` on every
# iteration (repeated enlargement re-allocates the frame each time and can
# leave the count columns as object dtype).

summary_index = []
summary_rows = []
for i, sim in enumerate(os.listdir(wdir)):
    simdir = join(wdir, sim)
    if not os.path.isdir(simdir):
        # Skip plain files sitting next to the directories.
        continue
    try:
        dfa = load_filelist(join(simdir, 'anvil_files.txt'))
        dfb = load_filelist(join(simdir, 'bridges_files.txt'))
    except Exception as e:
        # Best-effort scan: report directories whose listings cannot be
        # loaded and carry on with the rest.
        print(sim, '\t', e)
        continue

    o1 = changed(dfa, dfb)   # present in both listings but different
    o2 = missing(dfa, dfb)   # only in the Anvil listing
    o3 = missing(dfb, dfa)   # only in the Bridges2 listing

    # Keep the enumerate position as the row label, as before.
    summary_index.append(i)
    summary_rows.append([sim, len(o1), len(o2), len(o3)])

s = pd.DataFrame(summary_rows, index=summary_index,
                 columns=['sim', 'to_change', 'to_add', 'to_remove'])
s = s.sort_values(by='sim')

logs 	 No columns to parse from file
quijote 	 No columns to parse from file
mtnglike 	 No columns to parse from file
quijotelike 	 No columns to parse from file
abacuslike 	 No columns to parse from file
abacus 	 No columns to parse from file
shivamlike 	 No columns to parse from file


In [49]:
print('When transferring from Anvil to Bridges2:')
s

When transferring from Anvil to Bridges2:


Unnamed: 0,sim,to_change,to_add,to_remove
10,cmass_ngc,0,0,0
8,images,0,0,0
12,literature,0,0,0
5,mtng,0,0,0
11,obs,0,0,0
9,old,0,0,0
17,quijote3gpch,0,4,0
15,quijotelike-fid,0,0,0
0,quijotez0,0,0,0
7,rundir_cmass,0,0,0


In [50]:
# Inspect one directory in detail: load both listings and compare the sets
# of filenames in each direction.
dirname = 'mtng'
simdir = join(wdir, dirname)

dfa = load_filelist(join(simdir, 'anvil_files.txt'))
dfb = load_filelist(join(simdir, 'bridges_files.txt'))

print('NumAnvil:', len(dfa))
print('NumBridges:', len(dfb))

# Build each filename set once and reuse it for both subset checks.
anvil_names = set(dfa['Filename'])
bridges_names = set(dfb['Filename'])

is_subset = anvil_names <= bridges_names
print("\nAre all filenames on anvil a subset of bridges's filenames?", is_subset)

is_subset = bridges_names <= anvil_names
print("Are all filenames on bridges a subset of anvil's filenames?", is_subset)

NumAnvil: 17
NumBridges: 17

Are all filenames on anvil a subset of bridges's filenames? True
Are all filenames on bridges a subset of anvil's filenames? True


In [51]:
# Files present in both listings that differ (see `changed`), with a
# per-parent count printed before the table itself.
o = changed(dfa, dfb)
print("Files which will be changed on Bridges2:")

count_parents(o, dirname)
o

Files which will be changed on Bridges2:


Unnamed: 0,Filename,Last Modified_anvil,Last Modified_bridges2,Size_anvil,Size_bridges2


In [52]:
# Rows of the Anvil listing whose filename is absent from the Bridges2
# listing, with a per-parent count printed before the table itself.
o = missing(dfa, dfb)
print("Files on Anvil that do not exist on Bridges2:")

count_parents(o, dirname)
o

Files on Anvil that do not exist on Bridges2:


Unnamed: 0,Filename,Last Modified,Size


In [53]:
# Rows of the Bridges2 listing whose filename is absent from the Anvil
# listing, with a per-parent count printed before the table itself.
o = missing(dfb, dfa)
print("Files on Bridges2 that do not exist on Anvil:")

# NOTE(review): the label here is the literal 'bridges2', whereas the sibling
# cells pass `dirname` -- confirm this difference is intended.
count_parents(o, 'bridges2')
o

Files on Bridges2 that do not exist on Anvil:


Unnamed: 0,Filename,Last Modified,Size
