# In [5]:
import csv
import os
import re
import xml.etree.ElementTree as ET

def extract_domains_from_csv(csv_path):
    """Collect the distinct domains listed in the second column of a CSV file.

    The file is read as UTF-8 with ``;`` as the delimiter. The first row is
    treated as a header and skipped. Values are stripped of surrounding
    whitespace. Rows with fewer than two columns, or whose second column is
    blank, are ignored.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A set of non-empty domain strings. If the file cannot be opened,
        decoded or parsed, an error message is printed and the domains
        collected up to that point (possibly none) are returned.
    """
    domains = set()
    try:
        with open(csv_path, newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile, delimiter=';')
            next(reader, None)  # Skip header
            for row in reader:
                if len(row) < 2:
                    continue
                domain = row[1].strip()
                # A blank cell must not become a "domain": an empty string in
                # a regex alternation built from this set matches every URL.
                if domain:
                    domains.add(domain)
    except (OSError, ValueError, csv.Error) as e:
        # ValueError covers UnicodeDecodeError raised while decoding the file.
        print(f"Error reading CSV file {csv_path}: {e}")
    return domains


def search_domains_in_xml(xml_folder, domains, output_csv_path):
    """Find URLs mentioning the given domains in XML files and save them as CSV.

    Every file in ``xml_folder`` whose name ends with ``.xml`` is read as
    UTF-8 text (it is not parsed as XML). Line breaks are removed before
    matching, then all http/https URLs are extracted, except the TEI
    namespace URL ``http(s)://www.tei-c.org/ns/1.0``. A URL is kept when any
    of ``domains`` occurs in it as a whole word, anywhere in the URL.

    The result is written to ``output_csv_path`` as a ``;``-delimited CSV
    with the header ``url;filename``, where ``filename`` is the XML file
    name without its ``.xml`` extension. Files are processed in sorted name
    order so the output is reproducible.

    Args:
        xml_folder: Directory containing the XML files to scan.
        domains: Iterable of domain strings to look for. Blank entries are
            ignored; if none remain, no URL is reported.
        output_csv_path: Destination CSV path. Its parent directory is
            created when missing.

    Returns:
        None. Progress and error messages are printed. A file that cannot
        be read is reported and skipped.
    """
    url_pattern = re.compile(
        r'https?://(?!www\.tei-c\.org/ns/1\.0\b)(?:www\.)?[-\w@:%._\+~#=]{1,256}\.[a-zA-Z]{2,6}\b(?:[-\w@:%_\+.~#?&/=]*)'
    )
    # Blank entries are dropped: an empty alternative would turn the pattern
    # into one that matches at any word boundary, i.e. in every URL.
    wanted = sorted(domain for domain in set(domains) if domain.strip())
    if wanted:
        domain_pattern = re.compile(
            r'\b(' + '|'.join(re.escape(domain) for domain in wanted) + r')\b'
        )
    else:
        # Empty negative lookahead: a pattern that can never match.
        domain_pattern = re.compile(r'(?!)')
    matches = []

    # dirname is '' for a bare file name, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for filename in sorted(os.listdir(xml_folder)):
        if not filename.endswith(".xml"):
            continue
        file_path = os.path.join(xml_folder, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()
        except (OSError, ValueError) as e:
            # ValueError covers UnicodeDecodeError for non-UTF-8 files.
            print(f"Error processing {file_path}: {e}")
            continue

        # Join lines so that a URL wrapped across lines is matched as one.
        text = text.replace('\r', '').replace('\n', '')
        stem = filename[:-4]  # Remove .xml extension
        matches.extend(
            [url, stem]
            for url in url_pattern.findall(text)
            if domain_pattern.search(url)
        )

    # Save results to CSV
    try:
        with open(output_csv_path, "w", encoding="utf-8", newline='') as output_file:
            writer = csv.writer(output_file, delimiter=';')
            writer.writerow(["url", "filename"])  # Header
            writer.writerows(matches)
        print(f"Results saved in {output_csv_path}")
    except (OSError, ValueError, csv.Error) as e:
        print(f"Error writing CSV file {output_csv_path}: {e}")


# Input and output locations, relative to the current working directory
csv_path = os.path.join("..", "data", "SH", "SH_forge.csv")
xml_folder = os.path.join("..", "data", "xml")
output_path = os.path.join("..", "result", "xml_result_daniel.csv")

# Load the domain list, then scan the XML folder and write the matches
domains = extract_domains_from_csv(csv_path)
search_domains_in_xml(
    xml_folder=xml_folder,
    domains=domains,
    output_csv_path=output_path,
)


# Output: Results saved in ..\result\xml_result_daniel.csv
