# Convert Avocado Data into a Dataframe

In [1]:
import pandas as pd
import zipfile
import os
import xml.etree.ElementTree as ET

In [2]:
def parse_top_level_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    return root

top_level_xml_path = './avocado-1.0.2/data/collection.xml'
top_level_root = parse_top_level_xml(top_level_xml_path)

custodians = []
for include in top_level_root.findall('.//{http://www.w3.org/2001/XInclude}include'):
    custodian_file = include.get('href')
    custodians.append(custodian_file)

# Example output
for custodian_file in custodians:
    print(f"Custodian File: {custodian_file}")

FileNotFoundError: [Errno 2] No such file or directory: './avocado-1.0.2/data/collection.xml'

In [24]:
def parse_custodian_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    return root

def extract_metadata(custodian_root):
    emails = []
    items = custodian_root.findall('.//item')  # Find all items
    
    for item in items:
        if item.get('type') == 'email':
            email_data = {}
            metadata = item.find('metadata')
            
            if metadata is not None:
                for field in metadata.findall('field'):
                    name = field.get('name')
                    text = field.text.strip() if field.text is not None else ''
                    email_data[name] = text
            
            body_file_path = item.find('.//file').get('path') if item.find('.//file') is not None else ''
            email_data['body_file_path'] = body_file_path
            emails.append(email_data)
            
    return emails

def read_body_file_from_zip(zip_file_path, body_file_path):
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            with zip_ref.open(body_file_path) as body_file:
                return body_file.read().decode('utf-8')
    except Exception as e:
        print(f"Error reading {body_file_path} from {zip_file_path}: {e}")
        return ''




In [30]:
base_path = 'avocado-1.0.2/data/'
custodian_folder = 'custodians/'
text_folder = 'text/'

# List all custodian XML files and corresponding zip files
xml_files = [f for f in os.listdir(os.path.join(base_path, custodian_folder)) if f.endswith('.xml')]
zip_files = [f for f in os.listdir(os.path.join(base_path, text_folder)) if f.endswith('.zip')]

all_emails = []

# Process each XML file
for xml_file in xml_files:
    xml_path = os.path.join(base_path, xml_file)
    
    # Parse XML and extract metadata
    custodian_root = parse_custodian_xml(xml_path)
    emails = extract_metadata(custodian_root)
    
    # Determine corresponding zip file
    zip_file_name = xml_file.replace('.xml', '.zip')
    zip_file_path = os.path.join(base_path, zip_file_name)
    
    # Add body content to email data
    for email in emails:
        body_file_path = email.get('body_file_path')
        if body_file_path:
            email['body_content'] = read_body_file_from_zip(zip_file_path, body_file_path)
        else:
            email['body_content'] = ''
    
    # Append emails to the list
    all_emails.extend(emails)

# Convert to DataFrame and save as CSV
df = pd.DataFrame(all_emails)
df.to_csv('emails.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'avocado-1.0.2/data/162.xml'

In [None]:
import xml.etree.ElementTree as ET
import os
import zipfile
import pandas as pd
from tqdm import tqdm

def parse_custodian_xml(file_path):
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        return root
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except ET.ParseError as e:
        print(f"Error parsing file {file_path}: {e}")
        return None

def extract_metadata(custodian_root):
    emails = []
    if custodian_root is not None:
        items = custodian_root.findall('.//item')  # Find all items

        for item in items:
            if item.get('type') == 'email':
                email_data = {}
                metadata = item.find('metadata')

                if metadata is not None:
                    for field in metadata.findall('field'):
                        name = field.get('name')
                        text = field.text.strip() if field.text is not None else ''
                        email_data[name] = text

                body_file_path = item.find('.//file').get('path') if item.find('.//file') is not None else ''
                email_data['body_file_path'] = body_file_path
                emails.append(email_data)

    return emails

def read_body_file_from_zip(zip_file_path, body_file_path):
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            # Normalize path (remove leading 'text/' if present)
            normalized_path = body_file_path.lstrip('text/')
            if normalized_path in zip_ref.namelist():
                with zip_ref.open(normalized_path) as body_file:
                    return body_file.read().decode('utf-8')
            else:
                print(f"File {normalized_path} not found in zip {zip_file_path}")
                return ''
    except FileNotFoundError:
        print(f"Zip file not found: {zip_file_path}")
        return ''
    except Exception as e:
        print(f"Error reading {body_file_path} from {zip_file_path}: {e}")
        return ''

# Base path where XML and zip files are located
base_path = 'avocado-1.0.2/data/'
custodian_folder = 'custodians/'
text_folder = 'text/'

# List all custodian XML files and corresponding zip files
custodian_path = os.path.join(base_path, custodian_folder)
text_path = os.path.join(base_path, text_folder)

xml_files = [f for f in os.listdir(custodian_path) if f.endswith('.xml')]
zip_files = [f for f in os.listdir(text_path) if f.endswith('.zip')]

all_emails = []

# Process each XML file with tqdm progress bar
for xml_file in tqdm(xml_files, desc="Processing Custodian XML Files"):
    xml_path = os.path.join(custodian_path, xml_file)
    
    # Parse XML and extract metadata
    custodian_root = parse_custodian_xml(xml_path)
    emails = extract_metadata(custodian_root)
    
    # Determine corresponding zip file
    zip_file_name = xml_file.replace('.xml', '.zip')
    zip_file_path = os.path.join(text_path, zip_file_name)
    
    if os.path.exists(zip_file_path):
        # Add body content to email data with tqdm progress bar
        for email in tqdm(emails, desc=f"Processing Emails in {xml_file}", leave=False):
            body_file_path = email.get('body_file_path')
            if body_file_path:
                email['body_content'] = read_body_file_from_zip(zip_file_path, body_file_path)
            else:
                email['body_content'] = ''
    
        # Append emails to the list
        all_emails.extend(emails)
    else:
        print(f"Zip file not found for XML: {zip_file_path}")

# Convert to DataFrame and save as CSV
df = pd.DataFrame(all_emails)
df.to_csv('emails.csv', index=False)

print("Processing complete. Check 'emails.csv' for the results.")


In [2]:
import xml.etree.ElementTree as ET
import os
import zipfile
import pandas as pd
from tqdm import tqdm

def parse_custodian_xml(file_path):
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        return root
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except ET.ParseError as e:
        print(f"Error parsing file {file_path}: {e}")
        return None

def extract_metadata(custodian_root):
    emails = []
    if custodian_root is not None:
        items = custodian_root.findall('.//item[@type="email"]')
        for item in items:
            email_data = {field.get('name'): field.text.strip() if field.text else '' for field in item.findall('.//metadata/field')}
            body_file_path = item.find('.//file').get('path', '') if item.find('.//file') is not None else ''
            email_data['body_file_path'] = body_file_path
            emails.append(email_data)
    return emails

def read_body_file_from_zip(zip_file_path, body_file_path):
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            normalized_path = body_file_path.lstrip('text/')
            if normalized_path in zip_ref.namelist():
                with zip_ref.open(normalized_path) as body_file:
                    return body_file.read().decode('utf-8')
            else:
                print(f"File {normalized_path} not found in zip {zip_file_path}")
                return ''
    except FileNotFoundError:
        print(f"Zip file not found: {zip_file_path}")
        return ''
    except Exception as e:
        print(f"Error reading {body_file_path} from {zip_file_path}: {e}")
        return ''

# Base path where XML and zip files are located
base_path = 'avocado-1.0.2/data/'
custodian_folder = 'custodians/'
text_folder = 'text/'

# List all custodian XML files
custodian_path = os.path.join(base_path, custodian_folder)
text_path = os.path.join(base_path, text_folder)
xml_files = [f for f in os.listdir(custodian_path) if f.endswith('.xml')]

all_emails = []

# Process each XML file with tqdm progress bar
for xml_file in tqdm(xml_files, desc="Processing Custodian XML Files"):
    xml_path = os.path.join(custodian_path, xml_file)
    
    # Parse XML and extract metadata
    custodian_root = parse_custodian_xml(xml_path)
    if custodian_root is None:
        continue
    emails = extract_metadata(custodian_root)
    
    # Determine corresponding zip file
    zip_file_name = xml_file.replace('.xml', '.zip')
    zip_file_path = os.path.join(text_path, zip_file_name)
    
    if os.path.exists(zip_file_path):
        # Add body content to email data with tqdm progress bar
        for email in tqdm(emails, desc=f"Processing Emails in {xml_file}", leave=False):
            body_file_path = email.get('body_file_path')
            if body_file_path:
                email['body_content'] = read_body_file_from_zip(zip_file_path, body_file_path)
            else:
                email['body_content'] = ''
    
        # Append emails to the list
        all_emails.extend(emails)
    else:
        print(f"Zip file not found for XML: {zip_file_path}")


Processing Custodian XML Files:  42%|████▏     | 117/279 [7:54:42<1:16:53, 28.48s/it]   

Zip file not found for XML: avocado-1.0.2/data/text/064.zip


Processing Custodian XML Files:  57%|█████▋    | 160/279 [10:23:34<3:08:13, 94.90s/it]  

Zip file not found for XML: avocado-1.0.2/data/text/283.zip


Processing Custodian XML Files: 100%|██████████| 279/279 [18:49:34<00:00, 242.92s/it]    


In [3]:
df = pd.DataFrame(all_emails)

# Define the chunk size
chunk_size = 1000  # Adjust as needed

# Initialize the progress bar
with tqdm(total=len(df)) as pbar:
    # Open the CSV file in write mode
    with open('emails.csv', 'w', newline='') as f:
        # Write the header
        df.iloc[:0].to_csv(f, index=False)
        
        # Write the data in chunks
        for i in range(0, len(df), chunk_size):
            df.iloc[i:i+chunk_size].to_csv(f, index=False, header=False)
            pbar.update(chunk_size)

939000it [00:51, 18065.34it/s]                            


In [None]:
# Convert to DataFrame and save as CSV
df = pd.DataFrame(all_emails)
df.to_csv('emails.csv', index=False)

print("Processing complete. Check 'emails.csv' for the results.")