In [11]:
import pandas as pd
import zipfile
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm

In [12]:
def parse_top_level_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    return root

top_level_xml_path = './avocado-1.0.2/data/collection.xml'
top_level_root = parse_top_level_xml(top_level_xml_path)

custodians = []
for include in top_level_root.findall('.//{http://www.w3.org/2001/XInclude}include'):
    custodian_file = include.get('href')
    custodians.append(custodian_file)

# Example output
for custodian_file in custodians:
    print(f"Custodian File: {custodian_file}")

Custodian File: custodians/001.xml
Custodian File: custodians/002.xml
Custodian File: custodians/003.xml
Custodian File: custodians/004.xml
Custodian File: custodians/005.xml
Custodian File: custodians/006.xml
Custodian File: custodians/007.xml
Custodian File: custodians/008.xml
Custodian File: custodians/009.xml
Custodian File: custodians/010.xml
Custodian File: custodians/011.xml
Custodian File: custodians/012.xml
Custodian File: custodians/013.xml
Custodian File: custodians/014.xml
Custodian File: custodians/015.xml
Custodian File: custodians/016.xml
Custodian File: custodians/017.xml
Custodian File: custodians/018.xml
Custodian File: custodians/019.xml
Custodian File: custodians/020.xml
Custodian File: custodians/021.xml
Custodian File: custodians/022.xml
Custodian File: custodians/023.xml
Custodian File: custodians/024.xml
Custodian File: custodians/025.xml
Custodian File: custodians/026.xml
Custodian File: custodians/027.xml
Custodian File: custodians/028.xml
Custodian File: cust

In [28]:
import os
import zipfile
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm import tqdm

def parse_custodian_xml(file_path):
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        return root
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except ET.ParseError as e:
        print(f"Error parsing file {file_path}: {e}")
        return None

def extract_metadata(custodian_root):
    emails = []
    if custodian_root is not None:
        items = custodian_root.findall('.//item[@type="email"]')
        for item in items:
            email_data = {field.get('name'): field.text.strip() if field.text else '' for field in item.findall('.//metadata/field')}
            body_file_path = item.find('.//file').get('path', '') if item.find('.//file') is not None else ''
            email_data['body_file_path'] = body_file_path
            emails.append(email_data)
    return emails

def read_body_file_from_zip(zip_file_path, body_file_path):
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            normalized_path = body_file_path.lstrip('text/')
            if normalized_path in zip_ref.namelist():
                with zip_ref.open(normalized_path) as body_file:
                    return body_file.read().decode('utf-8')
            else:
                print(f"File {normalized_path} not found in zip {zip_file_path}")
                return ''
    except FileNotFoundError:
        print(f"Zip file not found: {zip_file_path}")
        return ''
    except Exception as e:
        print(f"Error reading {body_file_path} from {zip_file_path}: {e}")
        return ''

# Base path where XML and zip files are located
base_path = 'avocado-1.0.2/data/'
custodian_folder = 'custodians/'
text_folder = 'text/'

# List all custodian XML files
custodian_path = os.path.join(base_path, custodian_folder)
text_path = os.path.join(base_path, text_folder)
xml_files = [f for f in os.listdir(custodian_path) if f.endswith('.xml')]

# CSV file to write the emails incrementally
csv_file_path = 'emails_new.csv'

# Function to write email data to CSV
def write_emails_to_csv(emails, file_path, mode='a', header=False):
    df = pd.DataFrame(emails)
    df.to_csv(file_path, mode=mode, header=header, index=False)

first_write = True

# Process each XML file with tqdm progress bar
for xml_file in tqdm(xml_files, desc="Processing Custodian XML Files"):
    tqdm.write(f"Currently processing: {xml_file}")
    xml_path = os.path.join(custodian_path, xml_file)
    
    # Parse XML and extract metadata
    custodian_root = parse_custodian_xml(xml_path)
    if custodian_root is None:
        continue
    emails = extract_metadata(custodian_root)
    
    # Determine corresponding zip file
    zip_file_name = xml_file.replace('.xml', '.zip')
    zip_file_path = os.path.join(text_path, zip_file_name)
    
    if os.path.exists(zip_file_path):
        # Add body content to email data with tqdm progress bar
        for email in tqdm(emails, desc=f"Processing Emails in {xml_file}", leave=False):
            body_file_path = email.get('body_file_path')
            if body_file_path:
                email['body_content'] = read_body_file_from_zip(zip_file_path, body_file_path)
            else:
                email['body_content'] = ''
        
        # Write emails to CSV
        write_emails_to_csv(emails, csv_file_path, mode='a', header=first_write)
        if first_write:
            first_write = False
    else:
        print(f"Zip file not found for XML: {zip_file_path}")


Processing Custodian XML Files:   0%|          | 0/279 [00:00<?, ?it/s]

Currently processing: 162.xml


Processing Custodian XML Files:   0%|          | 1/279 [01:34<7:18:28, 94.63s/it]

Currently processing: 176.xml


Processing Custodian XML Files:   0%|          | 1/279 [01:34<7:18:28, 94.63s/it]

Currently processing: 189.xml


Processing Custodian XML Files:   1%|          | 3/279 [02:31<3:29:03, 45.45s/it]

Currently processing: 214.xml


Processing Custodian XML Files:   1%|▏         | 4/279 [02:31<2:16:38, 29.81s/it]

Currently processing: 200.xml


Processing Custodian XML Files:   2%|▏         | 5/279 [03:50<3:30:49, 46.17s/it]

Currently processing: 228.xml


Processing Custodian XML Files:   2%|▏         | 6/279 [03:51<2:23:50, 31.61s/it]

Currently processing: 016.xml


Processing Custodian XML Files:   3%|▎         | 7/279 [06:05<4:49:01, 63.75s/it]

Currently processing: 002.xml


Processing Custodian XML Files:   3%|▎         | 7/279 [06:05<4:49:01, 63.75s/it]

Currently processing: 003.xml


Processing Custodian XML Files:   3%|▎         | 7/279 [06:05<4:49:01, 63.75s/it]

Currently processing: 017.xml


Processing Custodian XML Files:   4%|▎         | 10/279 [06:06<2:03:09, 27.47s/it]

Currently processing: 229.xml


Processing Custodian XML Files:   4%|▍         | 11/279 [06:07<1:36:46, 21.67s/it]

Currently processing: 201.xml


Processing Custodian XML Files:   4%|▍         | 12/279 [06:11<1:18:27, 17.63s/it]

Currently processing: 215.xml


Processing Custodian XML Files:   5%|▍         | 13/279 [07:39<2:36:14, 35.24s/it]

Currently processing: 188.xml


Processing Custodian XML Files:   5%|▌         | 14/279 [09:07<3:37:10, 49.17s/it]

Currently processing: 177.xml


Processing Custodian XML Files:   5%|▌         | 14/279 [09:07<3:37:10, 49.17s/it]

Currently processing: 163.xml


Processing Custodian XML Files:   6%|▌         | 16/279 [22:17<14:27:26, 197.90s/it]

Currently processing: 175.xml


Processing Custodian XML Files:   6%|▌         | 17/279 [23:26<12:11:28, 167.51s/it]

Currently processing: 161.xml


Processing Custodian XML Files:   6%|▋         | 18/279 [31:45<18:11:45, 250.98s/it]

Currently processing: 149.xml


Processing Custodian XML Files:   7%|▋         | 19/279 [31:46<13:20:27, 184.72s/it]

Currently processing: 203.xml


Processing Custodian XML Files:   7%|▋         | 20/279 [32:10<10:07:53, 140.82s/it]

Currently processing: 217.xml


Processing Custodian XML Files:   7%|▋         | 20/279 [32:10<10:07:53, 140.82s/it]

Currently processing: 001.xml


Processing Custodian XML Files:   8%|▊         | 22/279 [33:09<6:34:02, 91.99s/it]  

Currently processing: 015.xml


Processing Custodian XML Files:   8%|▊         | 23/279 [33:48<5:38:12, 79.27s/it]

Currently processing: 029.xml


Processing Custodian XML Files:   9%|▊         | 24/279 [34:10<4:35:27, 64.82s/it]

Currently processing: 028.xml


Processing Custodian XML Files:   9%|▊         | 24/279 [34:11<4:35:27, 64.82s/it]

Currently processing: 014.xml


Processing Custodian XML Files:   9%|▉         | 26/279 [34:34<2:59:03, 42.47s/it]

Currently processing: 216.xml


Processing Custodian XML Files:  10%|▉         | 27/279 [1:03:23<30:15:58, 432.38s/it]

Currently processing: 202.xml


Processing Custodian XML Files:  10%|█         | 28/279 [1:03:24<22:39:59, 325.10s/it]

Currently processing: 148.xml


Processing Custodian XML Files:  10%|█         | 28/279 [1:03:24<22:39:59, 325.10s/it]

Currently processing: 160.xml


Processing Custodian XML Files:  11%|█         | 30/279 [1:04:06<13:46:17, 199.11s/it]

Currently processing: 174.xml


Processing Custodian XML Files:  11%|█         | 31/279 [1:04:07<10:35:25, 153.73s/it]

Currently processing: 158.xml


Processing Custodian XML Files:  11%|█▏        | 32/279 [1:04:18<8:08:00, 118.54s/it] 

Currently processing: 170.xml


Processing Custodian XML Files:  12%|█▏        | 33/279 [1:04:20<6:02:07, 88.32s/it] 

Currently processing: 164.xml


Processing Custodian XML Files:  12%|█▏        | 34/279 [1:04:33<4:36:25, 67.69s/it]

Currently processing: 206.xml


Processing Custodian XML Files:  13%|█▎        | 35/279 [1:11:34<11:16:49, 166.43s/it]

Currently processing: 212.xml


Processing Custodian XML Files:  13%|█▎        | 36/279 [1:11:35<8:02:32, 119.15s/it] 

Currently processing: 038.xml


Processing Custodian XML Files:  13%|█▎        | 37/279 [1:11:36<5:43:30, 85.17s/it] 

Currently processing: 004.xml


Processing Custodian XML Files:  14%|█▎        | 38/279 [1:11:45<4:12:20, 62.83s/it]

Currently processing: 010.xml


Processing Custodian XML Files:  14%|█▎        | 38/279 [1:11:45<4:12:20, 62.83s/it]

Currently processing: 011.xml


Processing Custodian XML Files:  14%|█▍        | 40/279 [1:12:45<3:10:25, 47.81s/it]

Currently processing: 005.xml


Processing Custodian XML Files:  15%|█▍        | 41/279 [1:13:01<2:39:11, 40.13s/it]

Currently processing: 039.xml


Processing Custodian XML Files:  15%|█▌        | 42/279 [1:13:01<1:57:30, 29.75s/it]

Currently processing: 213.xml


Processing Custodian XML Files:  15%|█▌        | 43/279 [1:13:13<1:37:39, 24.83s/it]

Currently processing: 207.xml


Processing Custodian XML Files:  16%|█▌        | 44/279 [1:13:47<1:47:15, 27.38s/it]

Currently processing: 165.xml


Processing Custodian XML Files:  16%|█▌        | 45/279 [1:13:55<1:25:46, 22.00s/it]

Currently processing: 171.xml


Processing Custodian XML Files:  16%|█▌        | 45/279 [1:13:55<1:25:46, 22.00s/it]

Currently processing: 159.xml


Processing Custodian XML Files:  17%|█▋        | 47/279 [1:13:56<47:15, 12.22s/it]  

Currently processing: 167.xml


Processing Custodian XML Files:  17%|█▋        | 48/279 [1:28:53<14:34:12, 227.07s/it]

Currently processing: 173.xml


Processing Custodian XML Files:  18%|█▊        | 49/279 [1:50:39<32:14:32, 504.66s/it]

Currently processing: 198.xml


Processing Custodian XML Files:  18%|█▊        | 50/279 [1:51:01<23:50:56, 374.92s/it]

Currently processing: 239.xml


Processing Custodian XML Files:  18%|█▊        | 51/279 [1:53:20<19:36:11, 309.52s/it]

Currently processing: 211.xml


Processing Custodian XML Files:  19%|█▊        | 52/279 [1:54:24<15:07:10, 239.78s/it]

Currently processing: 205.xml


Processing Custodian XML Files:  19%|█▉        | 53/279 [2:01:44<18:40:53, 297.58s/it]

Currently processing: 013.xml


Processing Custodian XML Files:  19%|█▉        | 54/279 [2:01:44<13:10:35, 210.82s/it]

Currently processing: 007.xml


Processing Custodian XML Files:  20%|█▉        | 55/279 [2:14:54<23:42:36, 381.06s/it]

Currently processing: 006.xml


Processing Custodian XML Files:  20%|██        | 56/279 [2:15:00<16:44:07, 270.17s/it]

Currently processing: 012.xml


Processing Custodian XML Files:  20%|██        | 57/279 [2:15:17<12:00:56, 194.85s/it]

Currently processing: 204.xml


Processing Custodian XML Files:  21%|██        | 58/279 [2:16:14<9:26:03, 153.68s/it] 

Currently processing: 210.xml


Processing Custodian XML Files:  21%|██        | 59/279 [2:17:26<7:54:48, 129.49s/it]

Currently processing: 238.xml


Processing Custodian XML Files:  22%|██▏       | 60/279 [2:17:34<5:39:36, 93.05s/it] 

Currently processing: 199.xml


Processing Custodian XML Files:  22%|██▏       | 61/279 [2:17:34<3:57:01, 65.24s/it]

Currently processing: 172.xml


Processing Custodian XML Files:  22%|██▏       | 62/279 [2:19:21<4:41:30, 77.84s/it]

Currently processing: 166.xml


Processing Custodian XML Files:  23%|██▎       | 63/279 [2:20:06<4:04:55, 68.03s/it]

Currently processing: 101.xml


Processing Custodian XML Files:  23%|██▎       | 64/279 [2:20:08<2:52:47, 48.22s/it]

Currently processing: 115.xml


Processing Custodian XML Files:  23%|██▎       | 65/279 [2:20:33<2:26:40, 41.12s/it]

Currently processing: 129.xml


Processing Custodian XML Files:  24%|██▎       | 66/279 [2:20:34<1:42:57, 29.00s/it]

Currently processing: 277.xml


Processing Custodian XML Files:  24%|██▍       | 67/279 [2:39:46<21:32:48, 365.89s/it]

Currently processing: 263.xml


Processing Custodian XML Files:  24%|██▍       | 68/279 [2:41:21<16:41:15, 284.72s/it]

Currently processing: 075.xml


Processing Custodian XML Files:  24%|██▍       | 68/279 [2:41:21<16:41:15, 284.72s/it]

Currently processing: 061.xml


Processing Custodian XML Files:  25%|██▌       | 70/279 [2:41:49<9:16:23, 159.73s/it] 

Currently processing: 049.xml


Processing Custodian XML Files:  25%|██▌       | 71/279 [2:42:22<7:24:34, 128.24s/it]

Currently processing: 048.xml


Processing Custodian XML Files:  26%|██▌       | 72/279 [2:42:57<5:58:59, 104.06s/it]

Currently processing: 060.xml


Processing Custodian XML Files:  26%|██▌       | 73/279 [2:45:39<6:51:07, 119.75s/it]

Currently processing: 074.xml


Processing Custodian XML Files:  26%|██▌       | 73/279 [2:45:39<6:51:07, 119.75s/it]

Currently processing: 262.xml


Processing Custodian XML Files:  27%|██▋       | 75/279 [2:45:39<3:49:16, 67.43s/it] 

Currently processing: 276.xml


Processing Custodian XML Files:  27%|██▋       | 76/279 [2:45:52<3:03:56, 54.37s/it]

Currently processing: 128.xml


Processing Custodian XML Files:  28%|██▊       | 77/279 [2:46:25<2:44:43, 48.93s/it]

Currently processing: 114.xml


Processing Custodian XML Files:  28%|██▊       | 77/279 [2:46:25<2:44:43, 48.93s/it]

Currently processing: 100.xml


Processing Custodian XML Files:  28%|██▊       | 79/279 [2:46:25<1:34:38, 28.39s/it]

Currently processing: 116.xml


Processing Custodian XML Files:  29%|██▊       | 80/279 [2:46:32<1:17:53, 23.49s/it]

Currently processing: 102.xml


Processing Custodian XML Files:  29%|██▉       | 81/279 [2:46:33<58:38, 17.77s/it]  

Currently processing: 260.xml


Processing Custodian XML Files:  29%|██▉       | 81/279 [2:46:33<58:38, 17.77s/it]

Currently processing: 274.xml


Processing Custodian XML Files:  30%|██▉       | 83/279 [2:46:35<35:20, 10.82s/it]

Currently processing: 248.xml


Processing Custodian XML Files:  30%|██▉       | 83/279 [2:46:35<35:20, 10.82s/it]

Currently processing: 062.xml


Processing Custodian XML Files:  30%|███       | 85/279 [6:41:40<141:46:24, 2630.85s/it]

Currently processing: 076.xml


Processing Custodian XML Files:  31%|███       | 86/279 [6:46:13<114:29:48, 2135.69s/it]

Currently processing: 089.xml


Processing Custodian XML Files:  31%|███       | 87/279 [6:52:56<92:34:42, 1735.85s/it] 

Currently processing: 088.xml


Processing Custodian XML Files:  32%|███▏      | 88/279 [6:52:56<69:16:22, 1305.67s/it]

Currently processing: 077.xml


Processing Custodian XML Files:  32%|███▏      | 89/279 [6:52:57<50:54:01, 964.43s/it] 

Currently processing: 063.xml


Processing Custodian XML Files:  32%|███▏      | 90/279 [7:25:24<64:39:42, 1231.66s/it]

Currently processing: 249.xml


Processing Custodian XML Files:  33%|███▎      | 91/279 [7:25:24<46:19:29, 887.07s/it] 

Currently processing: 275.xml


Processing Custodian XML Files:  33%|███▎      | 92/279 [7:25:40<33:09:13, 638.26s/it]

Currently processing: 261.xml


Processing Custodian XML Files:  33%|███▎      | 93/279 [7:26:26<24:06:44, 466.69s/it]

Currently processing: 103.xml


Processing Custodian XML Files:  34%|███▎      | 94/279 [7:26:47<17:16:12, 336.06s/it]

Currently processing: 117.xml


Processing Custodian XML Files:  34%|███▍      | 95/279 [7:31:51<16:41:30, 326.58s/it]

Currently processing: 107.xml


Processing Custodian XML Files:  34%|███▍      | 96/279 [7:43:15<21:59:46, 432.71s/it]

Currently processing: 259.xml


Processing Custodian XML Files:  35%|███▍      | 97/279 [7:43:30<15:35:07, 308.28s/it]

Currently processing: 265.xml


Processing Custodian XML Files:  35%|███▌      | 98/279 [7:43:30<10:52:43, 216.37s/it]

Currently processing: 271.xml


Processing Custodian XML Files:  35%|███▌      | 99/279 [7:43:33<7:37:51, 152.62s/it] 

Currently processing: 067.xml


Processing Custodian XML Files:  35%|███▌      | 99/279 [7:43:33<7:37:51, 152.62s/it]

Currently processing: 073.xml


Processing Custodian XML Files:  35%|███▌      | 99/279 [7:43:33<7:37:51, 152.62s/it]

Currently processing: 098.xml


Processing Custodian XML Files:  37%|███▋      | 102/279 [7:43:36<3:19:25, 67.60s/it]

Currently processing: 099.xml


Processing Custodian XML Files:  37%|███▋      | 102/279 [7:43:36<3:19:25, 67.60s/it]

Currently processing: 072.xml


Processing Custodian XML Files:  37%|███▋      | 104/279 [7:43:37<2:08:33, 44.08s/it]

Currently processing: 066.xml


Processing Custodian XML Files:  37%|███▋      | 104/279 [7:43:37<2:08:33, 44.08s/it]

Currently processing: 270.xml


Processing Custodian XML Files:  38%|███▊      | 106/279 [7:46:00<2:33:36, 53.27s/it]

Currently processing: 264.xml


Processing Custodian XML Files:  38%|███▊      | 107/279 [7:46:00<2:03:31, 43.09s/it]

Currently processing: 258.xml


Processing Custodian XML Files:  39%|███▊      | 108/279 [7:46:44<2:03:04, 43.18s/it]

Currently processing: 106.xml


Processing Custodian XML Files:  39%|███▉      | 109/279 [7:46:50<1:37:27, 34.40s/it]

Currently processing: 112.xml


Processing Custodian XML Files:  39%|███▉      | 110/279 [7:48:55<2:41:34, 57.36s/it]

Currently processing: 138.xml


Processing Custodian XML Files:  39%|███▉      | 110/279 [7:48:55<2:41:34, 57.36s/it]

Currently processing: 104.xml


Processing Custodian XML Files:  40%|████      | 112/279 [7:48:57<1:33:45, 33.69s/it]

Currently processing: 110.xml


Processing Custodian XML Files:  40%|████      | 112/279 [7:48:57<1:33:45, 33.69s/it]

Currently processing: 272.xml


Processing Custodian XML Files:  41%|████      | 114/279 [7:49:00<59:38, 21.69s/it]  

Currently processing: 266.xml


Processing Custodian XML Files:  41%|████      | 115/279 [7:49:00<46:57, 17.18s/it]

Currently processing: 058.xml


Processing Custodian XML Files:  42%|████▏     | 116/279 [7:50:36<1:36:16, 35.44s/it]

Currently processing: 070.xml


Processing Custodian XML Files:  42%|████▏     | 117/279 [7:50:39<1:13:24, 27.19s/it]

Currently processing: 064.xml
Zip file not found for XML: avocado-1.0.2/data/text/064.zip
Currently processing: 065.xml


Processing Custodian XML Files:  43%|████▎     | 119/279 [7:50:45<45:48, 17.18s/it]  

Currently processing: 071.xml


Processing Custodian XML Files:  43%|████▎     | 120/279 [7:50:50<38:12, 14.42s/it]

Currently processing: 059.xml


Processing Custodian XML Files:  43%|████▎     | 121/279 [7:55:24<3:26:42, 78.50s/it]

Currently processing: 267.xml


Processing Custodian XML Files:  44%|████▎     | 122/279 [7:58:56<4:55:57, 113.11s/it]

Currently processing: 273.xml


Processing Custodian XML Files:  44%|████▍     | 123/279 [7:59:09<3:43:32, 85.97s/it] 

Currently processing: 111.xml


Processing Custodian XML Files:  44%|████▍     | 124/279 [7:59:13<2:42:53, 63.06s/it]

Currently processing: 105.xml


Processing Custodian XML Files:  45%|████▍     | 125/279 [7:59:34<2:11:21, 51.18s/it]

Currently processing: 139.xml


Processing Custodian XML Files:  45%|████▌     | 126/279 [7:59:37<1:34:38, 37.11s/it]

Currently processing: 120.xml


Processing Custodian XML Files:  46%|████▌     | 127/279 [7:59:40<1:09:05, 27.27s/it]

Currently processing: 134.xml


Processing Custodian XML Files:  46%|████▌     | 128/279 [7:59:42<49:47, 19.78s/it]  

Currently processing: 108.xml


Processing Custodian XML Files:  46%|████▌     | 129/279 [8:00:09<54:46, 21.91s/it]

Currently processing: 256.xml


Processing Custodian XML Files:  47%|████▋     | 130/279 [8:00:10<38:42, 15.59s/it]

Currently processing: 242.xml


Processing Custodian XML Files:  47%|████▋     | 131/279 [8:00:48<55:28, 22.49s/it]

Currently processing: 281.xml


Processing Custodian XML Files:  47%|████▋     | 132/279 [8:07:28<5:31:05, 135.14s/it]

Currently processing: 054.xml


Processing Custodian XML Files:  48%|████▊     | 133/279 [8:07:28<3:50:35, 94.77s/it] 

Currently processing: 040.xml


Processing Custodian XML Files:  48%|████▊     | 134/279 [8:09:31<4:09:11, 103.12s/it]

Currently processing: 068.xml


Processing Custodian XML Files:  48%|████▊     | 135/279 [8:12:58<5:21:56, 134.14s/it]

Currently processing: 097.xml


Processing Custodian XML Files:  49%|████▊     | 136/279 [8:13:06<3:50:09, 96.57s/it] 

Currently processing: 083.xml


Processing Custodian XML Files:  49%|████▉     | 137/279 [8:13:09<2:41:40, 68.31s/it]

Currently processing: 082.xml


Processing Custodian XML Files:  49%|████▉     | 138/279 [8:13:25<2:04:11, 52.85s/it]

Currently processing: 096.xml


Processing Custodian XML Files:  50%|████▉     | 139/279 [8:18:09<4:45:08, 122.20s/it]

Currently processing: 069.xml


Processing Custodian XML Files:  50%|█████     | 140/279 [8:18:22<3:26:58, 89.34s/it] 

Currently processing: 041.xml


Processing Custodian XML Files:  50%|█████     | 140/279 [8:18:22<3:26:58, 89.34s/it]

Currently processing: 055.xml


Processing Custodian XML Files:  51%|█████     | 142/279 [9:36:25<42:56:53, 1128.57s/it]

Currently processing: 280.xml


Processing Custodian XML Files:  51%|█████▏    | 143/279 [9:36:27<32:05:26, 849.46s/it] 

Currently processing: 243.xml


Processing Custodian XML Files:  52%|█████▏    | 144/279 [9:37:52<24:21:07, 649.39s/it]

Currently processing: 257.xml


Processing Custodian XML Files:  52%|█████▏    | 145/279 [9:41:09<19:35:37, 526.40s/it]

Currently processing: 109.xml


Processing Custodian XML Files:  52%|█████▏    | 145/279 [9:41:09<19:35:37, 526.40s/it]

Currently processing: 135.xml


Processing Custodian XML Files:  53%|█████▎    | 147/279 [9:42:32<11:31:36, 314.37s/it]

Currently processing: 121.xml


Processing Custodian XML Files:  53%|█████▎    | 148/279 [9:42:43<8:48:24, 242.02s/it] 

Currently processing: 137.xml


Processing Custodian XML Files:  53%|█████▎    | 149/279 [9:43:16<6:49:57, 189.21s/it]

Currently processing: 123.xml


Processing Custodian XML Files:  54%|█████▍    | 150/279 [9:43:22<5:01:30, 140.24s/it]

Currently processing: 241.xml


Processing Custodian XML Files:  54%|█████▍    | 151/279 [9:43:22<3:36:55, 101.69s/it]

Currently processing: 255.xml


Processing Custodian XML Files:  54%|█████▍    | 152/279 [9:43:23<2:34:51, 73.16s/it] 

Currently processing: 269.xml


Processing Custodian XML Files:  55%|█████▍    | 153/279 [9:43:26<1:51:52, 53.27s/it]

Currently processing: 043.xml


Processing Custodian XML Files:  55%|█████▍    | 153/279 [9:43:26<1:51:52, 53.27s/it]

Currently processing: 057.xml


Processing Custodian XML Files:  56%|█████▌    | 155/279 [9:55:42<6:42:57, 194.98s/it]

Currently processing: 080.xml


Processing Custodian XML Files:  56%|█████▌    | 156/279 [10:03:34<8:58:19, 262.59s/it]

Currently processing: 094.xml


Processing Custodian XML Files:  56%|█████▋    | 157/279 [10:03:43<6:40:34, 197.00s/it]

Currently processing: 095.xml


Processing Custodian XML Files:  57%|█████▋    | 158/279 [10:04:02<5:00:57, 149.23s/it]

Currently processing: 081.xml


Processing Custodian XML Files:  57%|█████▋    | 159/279 [10:04:03<3:35:42, 107.86s/it]

Currently processing: 056.xml


Processing Custodian XML Files:  57%|█████▋    | 160/279 [10:04:25<2:45:43, 83.56s/it] 

Currently processing: 042.xml


Processing Custodian XML Files:  57%|█████▋    | 160/279 [10:04:25<2:45:43, 83.56s/it]

Currently processing: 283.xml
Zip file not found for XML: avocado-1.0.2/data/text/283.zip
Currently processing: 268.xml


Processing Custodian XML Files:  57%|█████▋    | 160/279 [10:04:25<2:45:43, 83.56s/it]

Currently processing: 254.xml


Processing Custodian XML Files:  59%|█████▉    | 164/279 [10:04:25<1:01:05, 31.88s/it]

Currently processing: 240.xml


Processing Custodian XML Files:  59%|█████▉    | 165/279 [10:04:28<50:25, 26.54s/it]  

Currently processing: 122.xml


Processing Custodian XML Files:  59%|█████▉    | 166/279 [10:04:30<40:30, 21.51s/it]

Currently processing: 136.xml


Processing Custodian XML Files:  60%|█████▉    | 167/279 [10:04:30<31:05, 16.65s/it]

Currently processing: 132.xml


Processing Custodian XML Files:  60%|██████    | 168/279 [10:04:36<26:01, 14.07s/it]

Currently processing: 126.xml


Processing Custodian XML Files:  60%|██████    | 168/279 [10:04:36<26:01, 14.07s/it]

Currently processing: 278.xml


Processing Custodian XML Files:  61%|██████    | 170/279 [10:04:37<15:16,  8.40s/it]

Currently processing: 244.xml


Processing Custodian XML Files:  61%|██████▏   | 171/279 [10:05:11<25:46, 14.32s/it]

Currently processing: 250.xml


Processing Custodian XML Files:  62%|██████▏   | 172/279 [10:19:25<6:32:27, 220.07s/it]

Currently processing: 046.xml


Processing Custodian XML Files:  62%|██████▏   | 173/279 [10:21:15<5:38:27, 191.58s/it]

Currently processing: 052.xml


Processing Custodian XML Files:  62%|██████▏   | 174/279 [10:21:19<4:06:25, 140.81s/it]

Currently processing: 085.xml


Processing Custodian XML Files:  63%|██████▎   | 175/279 [10:21:49<3:10:35, 109.96s/it]

Currently processing: 091.xml


Processing Custodian XML Files:  63%|██████▎   | 176/279 [10:24:12<3:25:07, 119.49s/it]

Currently processing: 090.xml


Processing Custodian XML Files:  63%|██████▎   | 176/279 [10:24:12<3:25:07, 119.49s/it]

Currently processing: 084.xml


Processing Custodian XML Files:  64%|██████▍   | 178/279 [10:41:37<8:25:11, 300.11s/it]

Currently processing: 053.xml


Processing Custodian XML Files:  64%|██████▍   | 179/279 [10:44:24<7:26:16, 267.77s/it]

Currently processing: 047.xml


Processing Custodian XML Files:  65%|██████▍   | 180/279 [10:44:24<5:28:26, 199.06s/it]

Currently processing: 251.xml


Processing Custodian XML Files:  65%|██████▍   | 181/279 [10:44:30<4:00:11, 147.06s/it]

Currently processing: 245.xml


Processing Custodian XML Files:  65%|██████▌   | 182/279 [10:54:23<7:17:33, 270.65s/it]

Currently processing: 279.xml


Processing Custodian XML Files:  66%|██████▌   | 183/279 [10:56:57<6:20:07, 237.58s/it]

Currently processing: 127.xml


Processing Custodian XML Files:  66%|██████▌   | 184/279 [10:57:13<4:34:59, 173.67s/it]

Currently processing: 133.xml


Processing Custodian XML Files:  66%|██████▌   | 184/279 [10:57:13<4:34:59, 173.67s/it]

Currently processing: 119.xml


Processing Custodian XML Files:  67%|██████▋   | 186/279 [10:57:13<2:27:37, 95.24s/it] 

Currently processing: 125.xml


Processing Custodian XML Files:  67%|██████▋   | 186/279 [10:57:13<2:27:37, 95.24s/it]

Currently processing: 131.xml


Processing Custodian XML Files:  67%|██████▋   | 188/279 [10:57:19<1:29:38, 59.11s/it]

Currently processing: 253.xml


Processing Custodian XML Files:  68%|██████▊   | 189/279 [10:57:20<1:09:23, 46.26s/it]

Currently processing: 247.xml


Processing Custodian XML Files:  68%|██████▊   | 189/279 [10:57:20<1:09:23, 46.26s/it]

Currently processing: 079.xml


Processing Custodian XML Files:  68%|██████▊   | 189/279 [10:57:20<1:09:23, 46.26s/it]

Currently processing: 051.xml


Processing Custodian XML Files:  69%|██████▉   | 192/279 [10:59:14<1:01:21, 42.32s/it]

Currently processing: 045.xml


Processing Custodian XML Files:  69%|██████▉   | 193/279 [10:59:16<49:45, 34.71s/it]  

Currently processing: 092.xml


Processing Custodian XML Files:  69%|██████▉   | 193/279 [10:59:16<49:45, 34.71s/it]

Currently processing: 086.xml


Processing Custodian XML Files:  70%|██████▉   | 195/279 [10:59:17<32:02, 22.88s/it]

Currently processing: 087.xml


Processing Custodian XML Files:  70%|██████▉   | 195/279 [10:59:17<32:02, 22.88s/it]

Currently processing: 093.xml


Processing Custodian XML Files:  71%|███████   | 197/279 [11:02:00<57:47, 42.29s/it]

Currently processing: 044.xml


Processing Custodian XML Files:  71%|███████   | 197/279 [11:02:00<57:47, 42.29s/it]

Currently processing: 050.xml


Processing Custodian XML Files:  71%|███████▏  | 199/279 [11:02:00<38:17, 28.72s/it]

Currently processing: 078.xml


Processing Custodian XML Files:  71%|███████▏  | 199/279 [11:02:00<38:17, 28.72s/it]

Currently processing: 246.xml


Processing Custodian XML Files:  72%|███████▏  | 201/279 [11:02:00<25:35, 19.69s/it]

Currently processing: 252.xml


Processing Custodian XML Files:  72%|███████▏  | 201/279 [11:02:00<25:35, 19.69s/it]

Currently processing: 130.xml


Processing Custodian XML Files:  72%|███████▏  | 201/279 [11:02:00<25:35, 19.69s/it]

Currently processing: 124.xml


Processing Custodian XML Files:  73%|███████▎  | 204/279 [11:02:01<14:45, 11.80s/it]

Currently processing: 118.xml


Processing Custodian XML Files:  73%|███████▎  | 205/279 [11:02:05<13:05, 10.62s/it]

Currently processing: 143.xml


Processing Custodian XML Files:  74%|███████▍  | 206/279 [11:02:12<12:03,  9.91s/it]

Currently processing: 157.xml


Processing Custodian XML Files:  74%|███████▍  | 207/279 [11:03:26<28:15, 23.54s/it]

Currently processing: 180.xml


Processing Custodian XML Files:  75%|███████▍  | 208/279 [11:03:27<21:32, 18.21s/it]

Currently processing: 235.xml


Processing Custodian XML Files:  75%|███████▍  | 209/279 [12:41:14<28:36:59, 1471.70s/it]

Currently processing: 221.xml


Processing Custodian XML Files:  75%|███████▌  | 210/279 [12:41:14<20:49:07, 1086.19s/it]

Currently processing: 209.xml


Processing Custodian XML Files:  76%|███████▌  | 211/279 [12:41:37<15:02:49, 796.61s/it] 

Currently processing: 037.xml


Processing Custodian XML Files:  76%|███████▌  | 212/279 [12:44:00<11:25:09, 613.57s/it]

Currently processing: 023.xml


Processing Custodian XML Files:  76%|███████▌  | 212/279 [12:44:01<11:25:09, 613.57s/it]

Currently processing: 022.xml


Processing Custodian XML Files:  77%|███████▋  | 214/279 [12:44:20<6:14:01, 345.25s/it] 

Currently processing: 036.xml


Processing Custodian XML Files:  77%|███████▋  | 215/279 [12:44:22<4:40:02, 262.54s/it]

Currently processing: 208.xml


Processing Custodian XML Files:  77%|███████▋  | 216/279 [12:44:49<3:32:06, 202.00s/it]

Currently processing: 220.xml


Processing Custodian XML Files:  78%|███████▊  | 217/279 [12:44:50<2:33:15, 148.31s/it]

Currently processing: 234.xml


Processing Custodian XML Files:  78%|███████▊  | 218/279 [12:45:15<1:55:55, 114.02s/it]

Currently processing: 195.xml


Processing Custodian XML Files:  78%|███████▊  | 218/279 [12:45:15<1:55:55, 114.02s/it]

Currently processing: 181.xml


Processing Custodian XML Files:  79%|███████▉  | 220/279 [12:45:15<1:02:39, 63.72s/it] 

Currently processing: 156.xml


Processing Custodian XML Files:  79%|███████▉  | 221/279 [12:45:25<49:18, 51.01s/it]  

Currently processing: 142.xml


Processing Custodian XML Files:  80%|███████▉  | 222/279 [12:59:07<3:55:12, 247.58s/it]

Currently processing: 154.xml


Processing Custodian XML Files:  80%|███████▉  | 223/279 [13:11:10<5:49:36, 374.58s/it]

Currently processing: 140.xml


Processing Custodian XML Files:  80%|███████▉  | 223/279 [13:11:10<5:49:36, 374.58s/it]

Currently processing: 168.xml


Processing Custodian XML Files:  81%|████████  | 225/279 [13:11:10<3:11:13, 212.48s/it]

Currently processing: 197.xml


Processing Custodian XML Files:  81%|████████  | 226/279 [13:11:12<2:23:39, 162.63s/it]

Currently processing: 183.xml


Processing Custodian XML Files:  81%|████████  | 226/279 [13:11:12<2:23:39, 162.63s/it]

Currently processing: 222.xml


Processing Custodian XML Files:  81%|████████  | 226/279 [13:11:12<2:23:39, 162.63s/it]

Currently processing: 236.xml


Processing Custodian XML Files:  82%|████████▏ | 229/279 [13:11:25<1:09:18, 83.17s/it] 

Currently processing: 020.xml


Processing Custodian XML Files:  82%|████████▏ | 230/279 [13:19:00<2:06:32, 154.96s/it]

Currently processing: 034.xml


Processing Custodian XML Files:  83%|████████▎ | 231/279 [13:19:01<1:37:15, 121.58s/it]

Currently processing: 008.xml


Processing Custodian XML Files:  83%|████████▎ | 232/279 [13:29:26<3:08:22, 240.48s/it]

Currently processing: 009.xml


Processing Custodian XML Files:  84%|████████▎ | 233/279 [13:29:28<2:18:10, 180.23s/it]

Currently processing: 035.xml


Processing Custodian XML Files:  84%|████████▍ | 234/279 [13:29:28<1:39:23, 132.53s/it]

Currently processing: 021.xml


Processing Custodian XML Files:  84%|████████▍ | 234/279 [13:29:28<1:39:23, 132.53s/it]

Currently processing: 237.xml


Processing Custodian XML Files:  85%|████████▍ | 236/279 [15:33:52<20:06:17, 1683.20s/it]

Currently processing: 223.xml


Processing Custodian XML Files:  85%|████████▍ | 237/279 [15:33:53<15:01:03, 1287.22s/it]

Currently processing: 196.xml


Processing Custodian XML Files:  85%|████████▌ | 238/279 [15:52:03<14:05:50, 1237.82s/it]

Currently processing: 169.xml


Processing Custodian XML Files:  86%|████████▌ | 239/279 [15:52:04<10:07:04, 910.60s/it] 

Currently processing: 141.xml


Processing Custodian XML Files:  86%|████████▌ | 240/279 [15:52:29<7:14:14, 668.06s/it] 

Currently processing: 155.xml


Processing Custodian XML Files:  86%|████████▋ | 241/279 [15:52:37<5:05:27, 482.30s/it]

Currently processing: 179.xml


Processing Custodian XML Files:  87%|████████▋ | 242/279 [15:52:38<3:32:15, 344.19s/it]

Currently processing: 151.xml


Processing Custodian XML Files:  87%|████████▋ | 243/279 [15:52:58<2:30:01, 250.03s/it]

Currently processing: 145.xml


Processing Custodian XML Files:  87%|████████▋ | 243/279 [15:52:58<2:30:01, 250.03s/it]

Currently processing: 192.xml


Processing Custodian XML Files:  87%|████████▋ | 243/279 [15:52:58<2:30:01, 250.03s/it]

Currently processing: 186.xml


Processing Custodian XML Files:  88%|████████▊ | 246/279 [15:54:15<1:09:04, 125.59s/it]

Currently processing: 227.xml


Processing Custodian XML Files:  89%|████████▊ | 247/279 [15:54:41<55:53, 104.81s/it]  

Currently processing: 233.xml


Processing Custodian XML Files:  89%|████████▉ | 248/279 [16:04:10<1:49:15, 211.46s/it]

Currently processing: 019.xml


Processing Custodian XML Files:  89%|████████▉ | 249/279 [16:04:15<1:20:11, 160.39s/it]

Currently processing: 025.xml


Processing Custodian XML Files:  89%|████████▉ | 249/279 [16:04:15<1:20:11, 160.39s/it]

Currently processing: 031.xml


Processing Custodian XML Files:  90%|████████▉ | 251/279 [16:04:26<44:54, 96.22s/it]   

Currently processing: 030.xml


Processing Custodian XML Files:  90%|█████████ | 252/279 [16:04:26<33:26, 74.32s/it]

Currently processing: 024.xml


Processing Custodian XML Files:  91%|█████████ | 253/279 [16:07:42<45:09, 104.19s/it]

Currently processing: 018.xml


Processing Custodian XML Files:  91%|█████████ | 254/279 [16:09:35<44:23, 106.54s/it]

Currently processing: 232.xml


Processing Custodian XML Files:  91%|█████████▏| 255/279 [16:22:43<1:56:26, 291.10s/it]

Currently processing: 226.xml


Processing Custodian XML Files:  91%|█████████▏| 255/279 [16:22:43<1:56:26, 291.10s/it]

Currently processing: 187.xml


Processing Custodian XML Files:  92%|█████████▏| 257/279 [16:22:44<1:00:12, 164.22s/it]

Currently processing: 193.xml


Processing Custodian XML Files:  92%|█████████▏| 258/279 [16:23:02<45:23, 129.67s/it]  

Currently processing: 144.xml


Processing Custodian XML Files:  93%|█████████▎| 259/279 [16:43:23<2:15:23, 406.19s/it]

Currently processing: 150.xml


Processing Custodian XML Files:  93%|█████████▎| 260/279 [16:45:02<1:42:44, 324.46s/it]

Currently processing: 178.xml


Processing Custodian XML Files:  94%|█████████▎| 261/279 [17:08:53<3:08:42, 629.03s/it]

Currently processing: 146.xml


Processing Custodian XML Files:  94%|█████████▍| 262/279 [17:09:27<2:10:35, 460.91s/it]

Currently processing: 152.xml


Processing Custodian XML Files:  94%|█████████▍| 263/279 [17:09:42<1:28:48, 333.01s/it]

Currently processing: 185.xml


Processing Custodian XML Files:  95%|█████████▍| 264/279 [17:09:45<59:11, 236.74s/it]  

Currently processing: 191.xml


Processing Custodian XML Files:  95%|█████████▍| 264/279 [17:09:45<59:11, 236.74s/it]

Currently processing: 218.xml


Processing Custodian XML Files:  95%|█████████▌| 266/279 [17:11:40<33:40, 155.43s/it]

Currently processing: 230.xml


Processing Custodian XML Files:  96%|█████████▌| 267/279 [17:11:42<23:33, 117.81s/it]

Currently processing: 224.xml


Processing Custodian XML Files:  96%|█████████▌| 268/279 [17:11:42<16:00, 87.31s/it] 

Currently processing: 032.xml


Processing Custodian XML Files:  96%|█████████▋| 269/279 [17:14:34<18:23, 110.35s/it]

Currently processing: 026.xml


Processing Custodian XML Files:  97%|█████████▋| 270/279 [17:14:39<12:07, 80.80s/it] 

Currently processing: 027.xml


Processing Custodian XML Files:  97%|█████████▋| 271/279 [17:14:39<07:42, 57.85s/it]

Currently processing: 033.xml


Processing Custodian XML Files:  97%|█████████▋| 272/279 [17:14:39<04:48, 41.18s/it]

Currently processing: 225.xml


Processing Custodian XML Files:  97%|█████████▋| 272/279 [17:14:39<04:48, 41.18s/it]

Currently processing: 231.xml


Processing Custodian XML Files:  98%|█████████▊| 274/279 [17:14:42<01:56, 23.27s/it]

Currently processing: 219.xml


Processing Custodian XML Files:  99%|█████████▊| 275/279 [17:17:30<03:54, 58.72s/it]

Currently processing: 190.xml


Processing Custodian XML Files:  99%|█████████▉| 276/279 [17:18:21<02:49, 56.53s/it]

Currently processing: 184.xml


Processing Custodian XML Files:  99%|█████████▉| 277/279 [17:18:21<01:22, 41.32s/it]

Currently processing: 153.xml


Processing Custodian XML Files: 100%|█████████▉| 278/279 [17:18:21<00:29, 29.88s/it]

Currently processing: 147.xml


Processing Custodian XML Files: 100%|██████████| 279/279 [17:19:31<00:00, 223.55s/it]
