In [2]:
import os
import pwd
import grp
import time
import xattr
from langchain.chains import RetrievalQA

In [3]:
# ANSI SGR escape sequences (ESC [ <code> m) used to colorize printed output.
# Code 0 (RESET) switches back to the default rendition after a colored span.
CYAN, YELLOW, BLUE, GREEN, RESET = (
    f"\033[{sgr_code}m" for sgr_code in (36, 33, 34, 32, 0)
)

In [4]:
def _owner_name(uid):
    """Return the account name for ``uid``, or the numeric id as text if it has no passwd entry."""
    try:
        return pwd.getpwuid(uid).pw_name
    except KeyError:
        # No local passwd entry for this uid: show the number instead of crashing.
        return str(uid)


def _group_name(gid):
    """Return the group name for ``gid``, or the numeric id as text if it has no group entry."""
    try:
        return grp.getgrgid(gid).gr_name
    except KeyError:
        # No local group entry for this gid: show the number instead of crashing.
        return str(gid)


def list_xattrs_of_file(file_path):
    """
    Print a file's 'ls -l'-style metadata followed by its extended attributes, in color.

    The metadata lines (name, allocated bytes, permission bits, link count,
    owner, group, size, modification time) are printed as ``key: value`` pairs
    with blue keys and green values. Extended attributes follow with cyan keys
    and yellow values; a value that is not valid UTF-8 is shown as hex.

    Parameters:
    file_path (str): The path to the file whose xattrs will be listed.

    Returns:
    None. Everything is written to stdout. If the path does not exist or is
    not a regular file, a one-line message is printed and nothing else.

    Notes:
    - An owner/group id with no matching passwd/group entry is printed
      numerically rather than raising KeyError.
    - If the xattrs cannot be read (OSError), a message is printed and the
      file is reported as having no xattr metadata.
    """
    # Check that the path exists and is a regular file before stat-ing it.
    if not os.path.exists(file_path):
        print(f"The file '{file_path}' does not exist.")
        return
    if not os.path.isfile(file_path):
        print(f"The path '{file_path}' is not a file.")
        return

    stats = os.stat(file_path)

    # st_blocks counts 512-byte units, so this is the allocated size in bytes.
    block_size = stats.st_blocks * 512
    # Lowest three octal digits of the mode: the rwx bits for user/group/other.
    permissions = format(stats.st_mode & 0o777, "03o")
    mtime = time.strftime('%b %d %H:%M', time.localtime(stats.st_mtime))

    metadata = (
        ("file_name", os.path.basename(file_path)),
        ("block_size", block_size),
        ("permission", permissions),
        ("links", stats.st_nlink),
        ("owner", _owner_name(stats.st_uid)),
        ("group", _group_name(stats.st_gid)),
        ("size", stats.st_size),
        ("mod_time", mtime),
    )
    # Print file metadata as key-value pairs with blue keys and green values.
    for label, value in metadata:
        print(f"{BLUE}{label}{RESET}: {GREEN}{value}{RESET}")

    try:
        # Map every xattr key on the file to its raw value.
        xattrs_dict = {
            key: xattr.getxattr(file_path, key)
            for key in xattr.listxattr(file_path)
        }
    except OSError as e:
        # IOError and PermissionError are both OSError in Python 3.
        print(f"Could not read xattrs for '{os.path.basename(file_path)}': {e}")
        xattrs_dict = {}

    # Pretty-print the file's xattrs with color.
    if xattrs_dict:
        print(f"{CYAN}plus xattr metadata{RESET}:")
        for key, value in xattrs_dict.items():
            # Show text values as text; fall back to hex for binary data.
            try:
                value = value.decode('utf-8')
            except UnicodeDecodeError:
                value = value.hex()
            print(f"  {CYAN}{key}{RESET}: {YELLOW}{value}{RESET}")
    else:
        print(f"{CYAN}plus xattr metadata{RESET}: None")
    print()  # Blank line separates consecutive file reports.

In [5]:
# Directory whose files are inspected by the listing cell below.
synthetic_data = '/nfs/enclave_a/llm/synthetic_data'

In [7]:
# Stop early with a clear error when the target directory is absent.
if not os.path.isdir(synthetic_data):
    raise RuntimeError(f"Directory not found: {synthetic_data}")

# Report each regular file in name order; anything that is not a file
# (e.g. a sub-directory) is skipped.
for entry in sorted(os.listdir(synthetic_data)):
    path = os.path.join(synthetic_data, entry)
    if not os.path.isfile(path):
        continue
    list_xattrs_of_file(path)

[34mfile_name[0m: [32mdoc_0_96543947.txt[0m
[34mblock_size[0m: [32m0[0m
[34mpermission[0m: [32m664[0m
[34mlinks[0m: [32m1[0m
[34mowner[0m: [32mjrsmith[0m
[34mgroup[0m: [32mjrsmith[0m
[34msize[0m: [32m48[0m
[34mmod_time[0m: [32mAug 26 15:31[0m
[36mplus xattr metadata[0m:
  [36muser.classification_level[0m: [33m{"doc_id": "doc_0_96543947", "tag": "classification_level", "value": "read"}[0m
  [36muser.originator_id[0m: [33m{"doc_id": "doc_0_96543947", "tag": "originator_id", "value": "treatment"}[0m
  [36muser.data_type[0m: [33m{"doc_id": "doc_0_96543947", "tag": "data_type", "value": "over"}[0m
  [36muser.sensitivity[0m: [33m{"doc_id": "doc_0_96543947", "tag": "sensitivity", "value": "since"}[0m
  [36muser.transfer_control[0m: [33m{"doc_id": "doc_0_96543947", "tag": "transfer_control", "value": "tend"}[0m
  [36muser.data_owner[0m: [33m{"doc_id": "doc_0_96543947", "tag": "data_owner", "value": "whatever"}[0m
  [36muser.data_stewar