In [1]:
pip install PyMuPDF arabic_reshaper python-bidi

Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting arabic_reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl.metadata (12 kB)
Collecting python-bidi
  Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (292 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.9/292.9 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-bidi, arabic_reshaper, PyMuPDF
Successfully installed PyMuPDF-1.25.5 arabic_reshaper-3.0.0 python-bidi-0.6.

In [None]:
import os
import re
import json
import fitz  # PyMuPDF

# --- Configuration ---
# Word-count thresholds used by the paragraph extraction / fusion pipeline.
MIN_PARAGRAPH_WORD_COUNT_INITIAL = 8  # Minimum words for a block to be kept after the initial filtering
MIN_FUSED_PARAGRAPH_WORD_COUNT = 15     # Final minimum words for a paragraph after fusion
# A paragraph is only extended by fusion while it has fewer words than this.
FUSE_CURRENT_PARA_MAX_WORDS = 35
# A following paragraph is only appended by fusion if it has fewer words than this.
FUSE_NEXT_PARA_MAX_WORDS = 25

# Directory scanned for input PDF files (absolute path; presumably a Colab runtime path — adjust to your environment).
PDF_DIRECTORY = "/content/pdfs_arabes_test"
# JSON Lines file that receives one extracted context per line.
OUTPUT_CONTEXT_FILE = "contextes_manuels_scolaires_v7_refined.jsonl"

# --- Cleaning and normalisation functions ---
def normalize_arabic_text(text):
    """Return a cleaned, letter-normalised copy of ``text``.

    Non-string input yields an empty string. Otherwise the following
    substitutions are applied in order:

    1. ASCII control characters are removed (tab, line feed and carriage
       return are kept).
    2. The alef variants with hamza, madda or wasla become a bare alef.
    3. Alef maqsura becomes ya.
    4. Tatweel (kashida) is removed.
    5. Runs of spaces collapse to a single space.

    Newlines are left untouched so callers can still use them to detect
    paragraph structure. The result is stripped of surrounding whitespace.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ''),
        (r"[إأآٱ]", "ا"),
        (r"ى", "ي"),
        (r"ـ", ''),
        (r' +', ' '),
    )

    normalized = text
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

def clean_internal_newlines_and_final_spaces(text_block):
    """Flatten a multi-line block into a single line of text.

    Every line feed is replaced by a space, runs of spaces are collapsed to
    one, and surrounding whitespace is stripped. Non-string input yields an
    empty string.
    """
    if isinstance(text_block, str):
        single_line = " ".join(text_block.split("\n"))
        return re.sub(r' +', ' ', single_line).strip()
    return ""

# Fragments that identify page footers.
_NARRATIVE_FOOTER_MARKERS = ("برنامج التدريس", "نسخة تجريبية")

# Section headings and story titles; a candidate is rejected only when it is
# exactly equal to one of these.
_NARRATIVE_KNOWN_TITLES = frozenset([
    "قراءة نصوص قصيرة", "قراءة الفقرات البسيطة", "اللبنة", "التحدي", "توليف",
    "قراءتي الأولى", "قراءتي الثانية", "قِراءَةُ نُصوصٍ قَصِيرَةٍ",
    "قِراءَةُ الْفِقْراتِ الْبَسِيطَةِ", "اللّبِنَة",
    "الْقِطَّةُ التَّائِهَةُ", "فَضْلُ الْبِذارِ", "صَبِيٌّ مُزْعِج", "رِسالَةُ عُصْفُورٍ",
    "الصَّدِيقُ وَقْتَ الضَّيقِ", "الْقِطَّةُ الْفَنَّانَةُ", "دَرْسُ فِي الصَّبْرِ",
    "بَيْتُ الْكَلْبِ", "أُمّي", "السُّلَحْفَاةُ الثَّرْثَارَةُ", "الْجَدْوَلُ الصَّغِيرُ",
    "عَيْنُ الطَّائِرِ", "ثَمَنُ الشَّواءِ", "نصيحة غالية", "لِنُحَافِظ عَلَى الْأَزْهَارِ",
    "الطَّفْلُ الْمَطَاطِيُّ", "مُكْرَه أَخاكَ ، لا بَطَل", "تُفَاحَةُ نيوتن", "لَيْسَ فِي كُلِّ مَرَّةٍ",
    "حلق عالياً"
])

# Numbered headings; anchored at both ends so only whole-text matches count.
_NARRATIVE_TITLE_PATTERNS = (
    r"^\s*اللبنة\s*\d+\s*$", r"^\s*توليف\s*\d+\s*$", r"^\s*التحدي\s*\d*\s*$"
)

# Words/phrases that open an exercise instruction.
# NOTE(review): callers in this file pass text that went through
# normalize_arabic_text, which rewrites hamza-bearing alefs to bare alef, so
# the entries containing such alefs presumably only match un-normalised input.
_NARRATIVE_INSTRUCTION_STARTERS = (
    "اَقْرَا", "اُلاحِظُ", "اُكْمِلُ", "اُجِيبُ", "اَكْتُبُ", "اَصِلُ", "اَبْحَثُ",
    "اَسْتَخْرِجُ", "صِلْ", "ضَعْ", "رَكِبْ", "رَكِّبْ", "اُعيدُ", "اُنْطِلاقاً", "أَقْرَأُ",
    "أُلاحِظُ", "أُكْمِلُ", "أُجِيبُ", "أَكْتُبُ", "أَصِلُ", "أَبْحَثُ", "أَسْتَخْرِجُ",
    "أُعيدُ قِراءَةَ النَّصِ", "اِسْتَخْرِجْ مِنَ النَّصِ", "أَقْرَأُ الْكَلِمَاتِ", "أَقْرَأُ النَّصَّ",
    "أَقْرَأُ الْفَقَراتِ"
)

# A leading number or single letter, an optional separator, then the rest of
# the line (captured). The leading token is mandatory, so on un-bulleted text
# this pattern swallows the first letter of the first word.
_NARRATIVE_BULLET_PREFIX = r"^\s*(?:\d+|[\u0621-\u064A\u0660-\u0669a-zA-Z])\s*[-–—.)]?\s*(.*)"

# A bullet marker with nothing after it, such as "أ -".
_NARRATIVE_LONE_BULLET = r"^\s*[\u0621-\u064A\u0660-\u0669a-zA-Z]\s*[-–—.)]?\s*$"

# Instruction fragments that may appear anywhere inside a short block.
_NARRATIVE_GENERAL_INSTRUCTIONS = (
    "عَلي دَفْتَرِكَ", "في دَفْتَري", "ذاتِ الْمَعْ ني", "مِنَ الْكَلِم اتِ التّالِيَةِ",
    "الْفِقْرَتَيْنِ وَأُجِيبُ عَنِ الْأَسْئِلَةِ", "أَسْئِلَةِ عَلَى دَفْتَرِي", "أَكْبَرَ عَدَد مِنَ الْجُمَلِ"
)

# Words typical of vocabulary tables. Entries are compared against
# whitespace-split words, so entries that contain a space can never match.
_NARRATIVE_TABLE_KEYWORDS = (
    "مَتي", "ذَ لِك", "ُحَيْث", "َكَيْف", "َّلَكِن", "ُاَقْرَا", "ِهَذِ ه", "حَت ي",
    "َمَع", "طِ فْل", "َمَدْ رَس ة", "َتَحْت", "ْهَل", "لِماذا", "اَلَّتي", "اُخْت",
    "اِمّْلء", "اِسْم", "اَلَّذي", "َفَوْق", "ْمَن", "في", "هَذا", "َاَيْن",
    "اَعْتَني", "مُشاكِس", "ُيَغْ ضَب", "مُنافَسَة", "ِاِجْت هاد", "عَزيمَة"
)

# Endings that mark a table-like line as a real sentence after all.
_NARRATIVE_SENTENCE_ENDINGS = ('.', '!', '؟', ':')


def is_narrative_paragraph(text_paragraph_candidate):
    """Heuristically decide whether a text block is narrative prose.

    The candidate is rejected (``False``) when it looks like one of:

    1. a page footer or a bare page number;
    2. a known section heading / story title (exact match) or a numbered
       heading;
    3. an exercise instruction or short question, with or without a leading
       bullet such as ``1-`` or a single letter;
    4. a vocabulary-table row or a dash-separated word list;
    5. a short fill-in-the-blank line containing a run of dots.

    Args:
        text_paragraph_candidate: The block to classify. Surrounding
            whitespace is ignored.

    Returns:
        bool: ``True`` if no rejection rule fires, ``False`` otherwise.
        Non-string and blank input return ``False``.
    """
    # Same tolerance for non-string input as the sibling cleaning helpers.
    if not isinstance(text_paragraph_candidate, str):
        return False
    text_stripped = text_paragraph_candidate.strip()
    if not text_stripped:
        return False

    # 1. Footers and page numbers.
    if any(marker in text_stripped for marker in _NARRATIVE_FOOTER_MARKERS) \
            or re.fullmatch(r"\s*\d+\s*", text_stripped):
        return False

    # 2. Titles: exact matches, or numbered headings.
    if text_stripped in _NARRATIVE_KNOWN_TITLES:
        return False
    if any(re.match(pattern, text_stripped, re.IGNORECASE)
           for pattern in _NARRATIVE_TITLE_PATTERNS):
        return False

    # 3. Exercise instructions and questions.
    # Un-bulleted instruction: test the text itself. The bullet regex below
    # cannot catch this case because it always consumes the first character.
    if text_stripped.startswith(_NARRATIVE_INSTRUCTION_STARTERS):
        return False

    bullet_match = re.match(_NARRATIVE_BULLET_PREFIX, text_stripped)
    if bullet_match:
        # "." does not match a line feed, so this is the remainder of one line.
        text_after_bullet = bullet_match.group(1).strip()
        if text_after_bullet.startswith(_NARRATIVE_INSTRUCTION_STARTERS):
            return False
        # Short line ending like a question or a prompt.
        if len(text_after_bullet.split()) < 8 and text_after_bullet.endswith(("؟", "?", ":")):
            return False
        # A bullet marker alone, e.g. "أ -".
        if re.fullmatch(_NARRATIVE_LONE_BULLET, text_stripped):
            return False
        # Nothing left after the bullet.
        if not text_after_bullet:
            return False

    # Reject when an instruction phrase makes up most of a short block
    # (fewer than five words beyond the phrase itself).
    total_word_budget = len(text_stripped.split())
    for instr_phrase in _NARRATIVE_GENERAL_INSTRUCTIONS:
        if instr_phrase in text_stripped and total_word_budget < (len(instr_phrase.split()) + 5):
            return False

    # 4. Vocabulary tables / word lists (only lines of at most 15 words).
    words_in_line = text_stripped.split()
    word_count_in_stripped = len(words_in_line)

    if 0 < word_count_in_stripped <= 15:
        # Dash-separated word list: at least two dashes, more words than dashes.
        dash_count = text_stripped.count('-')
        if dash_count >= 2 and word_count_in_stripped > dash_count:
            return False

        # Number of distinct table keywords present as whole words.
        table_keyword_count = sum(1 for kw in _NARRATIVE_TABLE_KEYWORDS if kw in words_in_line)
        mostly_table_words = table_keyword_count / word_count_in_stripped >= 0.6
        short_with_table_words = word_count_in_stripped <= 5 and table_keyword_count >= 2
        if (mostly_table_words or short_with_table_words) \
                and not text_stripped.endswith(_NARRATIVE_SENTENCE_ENDINGS):
            return False

    # 5. Fill-in-the-blank exercises.
    if ".........." in text_stripped and word_count_in_stripped < 10:
        return False

    return True

def extract_text_blocks_from_pdf(pdf_path):
    """Return the non-empty text blocks of a PDF, in page order.

    Each page is read with ``page.get_text("blocks", sort=True)``. Only
    entries whose element at index 6 equals 0 are kept (the text-block type
    in PyMuPDF's block tuples), and their text (index 4) is stripped.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        list[str]: Stripped block texts. Extraction is best-effort: on any
        error a message is printed and the blocks collected so far (possibly
        none) are returned.
    """
    page_blocks_text = []
    try:
        # The context manager closes the document even when a page fails
        # part-way through extraction.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                for block_info in page.get_text("blocks", sort=True):
                    if block_info[6] != 0:
                        continue  # not a text block
                    block_text = block_info[4].strip()
                    if block_text:
                        page_blocks_text.append(block_text)
    except Exception as e:
        # Deliberate best-effort: one unreadable PDF must not stop the batch.
        print(f"Erreur extraction blocs PDF {pdf_path}: {e}")
    return page_blocks_text

def _collect_initial_paragraphs(pdf_dir, pdf_files):
    """Extract, normalise and filter candidate paragraphs from each PDF."""
    initial_paragraphs = []
    for filename in pdf_files:
        pdf_path = os.path.join(pdf_dir, filename)
        print(f"\n--- Traitement du fichier : {filename} ---")

        file_paragraph_id_counter = 0
        for block_text_raw in extract_text_blocks_from_pdf(pdf_path):
            normalized_block_text = normalize_arabic_text(block_text_raw)

            # A block may itself contain blank-line separated paragraphs.
            # Single line feeds inside a paragraph are kept for the fusion step.
            for para_candidate in re.split(r'\n\s*\n+', normalized_block_text):
                para_stripped = para_candidate.strip()
                if not para_stripped:
                    continue
                if len(para_stripped.split()) < MIN_PARAGRAPH_WORD_COUNT_INITIAL:
                    continue
                if not is_narrative_paragraph(para_stripped):
                    continue

                file_paragraph_id_counter += 1
                initial_paragraphs.append({
                    "source_pdf": filename,
                    "original_id_in_file": file_paragraph_id_counter,
                    "contexte_raw_lines": para_stripped
                })
        print(f"Nombre initial de paragraphes candidats pour {filename}: {file_paragraph_id_counter}")
    return initial_paragraphs


def _fuse_paragraphs(initial_paragraphs):
    """Merge short, unterminated paragraphs with their successor and number the results."""
    fused_paragraphs_output = []

    def emit(item):
        # Flatten the text and keep it only if it reaches the final threshold.
        final_context_text = clean_internal_newlines_and_final_spaces(item["contexte"])
        if len(final_context_text.split()) >= MIN_FUSED_PARAGRAPH_WORD_COUNT:
            item["global_id"] = len(fused_paragraphs_output) + 1
            item["contexte"] = final_context_text
            fused_paragraphs_output.append(item)

    current_fused_item = None
    for next_para_item in initial_paragraphs:
        next_context_raw_lines = next_para_item["contexte_raw_lines"]

        # Every entry already passed is_narrative_paragraph on this same
        # string when it was collected, so that check is not repeated here.
        can_fuse = (
            current_fused_item is not None and
            current_fused_item["source_pdf"] == next_para_item["source_pdf"] and
            # Do not extend a paragraph that already ends like a sentence.
            not current_fused_item["contexte"].strip().endswith(('.', '!', '؟', ':')) and
            len(current_fused_item["contexte"].split()) < FUSE_CURRENT_PARA_MAX_WORDS and
            len(next_context_raw_lines.split()) < FUSE_NEXT_PARA_MAX_WORDS
        )

        if can_fuse:
            current_fused_item["contexte"] += "\n" + next_context_raw_lines
            continue

        if current_fused_item is not None:
            emit(current_fused_item)

        # Start a new accumulator; "contexte" replaces "contexte_raw_lines".
        current_fused_item = dict(next_para_item)
        current_fused_item["contexte"] = current_fused_item.pop("contexte_raw_lines")

    if current_fused_item is not None:
        emit(current_fused_item)
    return fused_paragraphs_output


def process_all_pdfs_for_contexts(pdf_dir, output_file):
    """Extract narrative contexts from every PDF in a directory into a JSONL file.

    Steps:

    1. List the ``.pdf`` files of ``pdf_dir`` (case-insensitive extension), in
       sorted order so paragraph numbering is reproducible between runs.
    2. For each file, extract the text blocks, normalise them, split them on
       blank lines and keep the candidates that have at least
       ``MIN_PARAGRAPH_WORD_COUNT_INITIAL`` words and pass
       ``is_narrative_paragraph``.
    3. Fuse consecutive candidates of the same file while the current one
       does not end with sentence punctuation and both stay under the
       ``FUSE_*_MAX_WORDS`` limits; keep results having at least
       ``MIN_FUSED_PARAGRAPH_WORD_COUNT`` words.
    4. Write one JSON object per line to ``output_file`` with the keys
       ``source_pdf``, ``original_id_in_file``, ``contexte`` and ``global_id``.

    Progress and diagnostics are printed. ``output_file`` is only written when
    at least one context survives.

    Args:
        pdf_dir: Directory containing the PDF files.
        output_file: Path of the JSON Lines file to create.

    Returns:
        None.
    """
    # isdir rather than exists: a regular file here would make listdir raise.
    if not os.path.isdir(pdf_dir):
        print(f"Le répertoire PDF '{pdf_dir}' n'existe pas.")
        return
    # os.listdir order is arbitrary; sorting makes ids and fusion deterministic.
    pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf"))
    if not pdf_files:
        print(f"Aucun fichier PDF dans '{pdf_dir}'.")
        return

    all_initial_paragraphs_data = _collect_initial_paragraphs(pdf_dir, pdf_files)
    if not all_initial_paragraphs_data:
        print("Aucun contexte narratif initial n'a été extrait.")
        return

    fused_paragraphs_output = _fuse_paragraphs(all_initial_paragraphs_data)
    if not fused_paragraphs_output:
        print("Aucun contexte narratif après fusion et filtrage final.")
        return

    with open(output_file, 'w', encoding='utf-8') as f:
        for item in fused_paragraphs_output:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"\n{len(fused_paragraphs_output)} contextes narratifs (après fusion) extraits et sauvegardés dans {output_file}")

# --- Execution ---
if __name__ == "__main__":
    # Create the input directory on first run so the user knows where to
    # put the PDF files.
    if not os.path.exists(PDF_DIRECTORY):
        try:
            os.makedirs(PDF_DIRECTORY, exist_ok=True)
            print(f"Dossier '{PDF_DIRECTORY}' créé. Veuillez y placer vos fichiers PDF arabes.")
        except OSError as e:
            print(f"Erreur création dossier '{PDF_DIRECTORY}': {e}")
            # SystemExit instead of exit(): exit() is the interactive `site`
            # helper and, inside an IPython kernel, asks the kernel to shut down.
            raise SystemExit(1)

    process_all_pdfs_for_contexts(PDF_DIRECTORY, OUTPUT_CONTEXT_FILE)

    # Preview the first extracted contexts, if an output file is present.
    MAX_PREVIEW_CONTEXTS = 20
    if os.path.exists(OUTPUT_CONTEXT_FILE):
        print(f"\n--- Premiers contextes extraits de {OUTPUT_CONTEXT_FILE} ---")
        shown_count = 0
        with open(OUTPUT_CONTEXT_FILE, 'r', encoding='utf-8') as f:
            for line in f:
                if shown_count >= MAX_PREVIEW_CONTEXTS:
                    break
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    # Malformed lines are reported and do not count as shown.
                    print(f"Ligne malformée: {line.strip()}")
                    continue
                print(f"\nSource: {data.get('source_pdf')}, Global_ID: {data.get('global_id')}")
                print(f"Contexte: {data.get('contexte')}")
                print("-" * 30)
                shown_count += 1