In [11]:
import csv
import os
import re

def extract_domains_from_csv(csv_path):
    """Collect the distinct domain names found in the second column of a CSV.

    The first row is treated as a header and skipped. Rows with fewer than
    two columns, and rows whose second column is blank once stripped, are
    ignored.

    Args:
        csv_path: Path to a UTF-8 encoded, comma-delimited CSV file.

    Returns:
        A set of the stripped, non-empty second-column values. The set is
        empty when the file has no data rows or is completely empty.
    """
    domains = set()
    with open(csv_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # The None default keeps a completely empty file from raising
        # StopIteration when there is no header row to skip.
        next(reader, None)
        for row in reader:
            if len(row) > 1:
                domain = row[1].strip()
                # An empty string is not a domain; if kept, it would turn into
                # an empty alternative in any regex built from this set and
                # make that regex match nearly everything.
                if domain:
                    domains.add(domain)
    return domains

def search_domains_in_texts(pdf_text_folder, domains, output_csv_path):
    """Scan ``.txt`` files for lines mentioning a known domain and export their URLs.

    Every file in ``pdf_text_folder`` whose name ends with ``.txt`` is read
    line by line. When a line contains at least one of ``domains`` as a whole
    word, every ``http://`` or ``https://`` URL on that line is recorded
    together with the file name stripped of its ``.txt`` suffix.

    NOTE(review): all URLs on a matching line are recorded, including URLs
    that do not themselves contain the matched domain. This mirrors the
    long-standing behaviour; confirm it is intended.

    The result is written to ``output_csv_path`` as a semicolon-delimited
    UTF-8 CSV with the header ``url;filename``. Files are processed in sorted
    name order so the output is reproducible from run to run.

    Args:
        pdf_text_folder: Directory containing the ``.txt`` files to scan
            (sub-directories are not traversed).
        domains: Iterable of domain strings to look for. Empty strings are
            ignored; if nothing usable remains, only the header is written.
        output_csv_path: Destination CSV path. Its parent directory is
            created when missing; a bare file name is written to the current
            working directory.
    """
    txt_suffix = ".txt"
    url_pattern = re.compile(r'https?://\S+')  # A URL runs up to the next whitespace.

    # Drop empty entries: an empty alternative would let the pattern match at
    # any word boundary and flag practically every line. Sorting makes the
    # compiled pattern deterministic regardless of set ordering.
    wanted = sorted(domain for domain in domains if domain)
    domain_pattern = None
    if wanted:
        domain_pattern = re.compile(
            r'\b(' + '|'.join(re.escape(domain) for domain in wanted) + r')\b'
        )

    matches = []  # Rows of [url, file stem] destined for the CSV.

    # os.makedirs('') raises FileNotFoundError, so only create the folder when
    # the output path actually has a directory component.
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if domain_pattern is not None:
        # os.listdir returns entries in arbitrary order; sort for stable output.
        for filename in sorted(os.listdir(pdf_text_folder)):
            if not filename.endswith(txt_suffix):
                continue
            file_path = os.path.join(pdf_text_folder, filename)
            with open(file_path, "r", encoding="utf-8") as txt_file:
                lines = txt_file.read().splitlines()
            stem = filename[:-len(txt_suffix)]
            for line in lines:
                # Only presence matters here, so search() is enough.
                if domain_pattern.search(line):
                    matches.extend([url, stem] for url in url_pattern.findall(line))

    # Write the CSV: header first, then one row per URL found.
    with open(output_csv_path, "w", encoding="utf-8", newline='') as output_file:
        writer = csv.writer(output_file, delimiter=';')
        writer.writerow(["url", "filename"])
        writer.writerows(matches)

    print(f"Correspondances enregistrées dans {output_csv_path}")

# Input and output locations, all one level above the working directory.
csv_path = os.path.join("..", "data", "SH", "SH_forge.csv")  # CSV listing the domains
txt_folder = os.path.join("..", "data", "txt")  # folder of .txt files to scan
output_path = os.path.join("..", "result", "txt_result_daniel.csv")  # CSV report to write

# Run: load the domain list, then scan the text files and write the report.
domains = extract_domains_from_csv(csv_path)
search_domains_in_texts(
    pdf_text_folder=txt_folder,
    domains=domains,
    output_csv_path=output_path,
)

Correspondances enregistrées dans ..\result\txt_result_daniel.csv
