# Easy Method

In [None]:
import difflib
import nltk
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from pdfminer.high_level import extract_text
import time
import fitz
import pytesseract
from PIL import Image
import io
import re
import pickle

In [None]:
# Fetch NLTK's 'punkt' resource; segment_text() calls nltk.sent_tokenize,
# which presumably relies on it (TODO confirm for the installed NLTK version).
nltk.download('punkt')
# Shared sentence-embedding model used by align_texts_semantically() and
# semantic_similarity().
model = SentenceTransformer('all-mpnet-base-v2')  # Lightweight yet accurate

In [None]:
# Module-level placeholders, initialised to None. Nothing in the visible code
# reads them; presumably meant to hold the segmentation of each book and its
# paragraph-to-page mapping (TODO confirm or remove).
decoup1, decoup_page1 = None, None
decoup2, decoup_page2 = None, None

# === 1.1 Load text from a PDF ===
def load_book(pdf_path):
    """Return the text that pdfminer's ``extract_text`` produces for ``pdf_path``."""
    return extract_text(pdf_path)

# === 1.2 Load text from a PDF using OCR ===
def split_paragraphs_from_ocr_text(ocr_text, min_length=100):
    """Group OCR output lines into paragraphs.

    Non-blank lines are stripped and accumulated (joined by single spaces)
    until a line ends with sentence-final punctuation or a closing quote;
    that closes the current paragraph.

    Args:
        ocr_text: Raw text returned by the OCR engine.
        min_length: Minimum number of characters a paragraph must have to be
            kept; shorter ones are discarded.

    Returns:
        List of paragraph strings, in reading order.
    """
    closes_paragraph = re.compile(r'[.!?…]$|["»]$')
    kept = []
    pending = []

    for raw_line in ocr_text.splitlines():
        stripped = raw_line.strip()
        if stripped == "":
            continue

        pending.append(stripped)
        if closes_paragraph.search(stripped) is None:
            continue

        # The line ends a paragraph: keep it if long enough, then start over.
        candidate = " ".join(pending).strip()
        if len(candidate) >= min_length:
            kept.append(candidate)
        pending = []

    # Trailing text that never reached a closing punctuation mark.
    leftover = " ".join(pending).strip()
    if leftover and len(leftover) >= min_length:
        kept.append(leftover)

    return kept

def extract_text_ocr(pdf_path, lang="fra", dpi=300):
    """Run OCR over every page of a PDF and split the result into paragraphs.

    Each page is rendered to a PNG image with PyMuPDF, passed to Tesseract,
    and the recognised text is split with ``split_paragraphs_from_ocr_text``.

    Args:
        pdf_path: Path of the PDF to read.
        lang: Tesseract language code (default ``"fra"``; e.g. ``"eng"`` for
            English documents).
        dpi: Resolution used to render each page before OCR.

    Returns:
        A ``(paragraphs, paragraph_to_page)`` tuple of two parallel lists:
        the paragraph strings and the 1-based page number each one came from.
    """
    paragraphs = []
    paragraph_to_page = []

    # The context manager closes the document even if OCR fails part-way,
    # so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(dpi=dpi)
            img = Image.open(io.BytesIO(pix.tobytes("png")))

            text = pytesseract.image_to_string(img, lang=lang)

            for paragraph in split_paragraphs_from_ocr_text(text):
                paragraphs.append(paragraph)
                paragraph_to_page.append(page_num + 1)  # pages are reported 1-based

    return paragraphs, paragraph_to_page

# === 2. Segment the text (by paragraph or by sentence) ===
def segment_text(text, level='paragraph'):
    """Split ``text`` into segments.

    Args:
        text: The full text to split.
        level: ``'paragraph'`` splits on blank lines and keeps only chunks
            longer than 50 characters after stripping; ``'sentence'``
            delegates to ``nltk.sent_tokenize``.

    Returns:
        List of segment strings.

    Raises:
        ValueError: If ``level`` is neither ``'paragraph'`` nor ``'sentence'``.
    """
    if level == 'sentence':
        return nltk.sent_tokenize(text)
    if level != 'paragraph':
        raise ValueError("level must be 'paragraph' or 'sentence'")

    kept = []
    for chunk in text.split('\n\n'):
        trimmed = chunk.strip()
        if len(trimmed) > 50:
            kept.append(trimmed)
    return kept

# === 3.1 Align segments in order (naive approach) ===
def align_texts(texts1, texts2):
    """Pair segments position by position, truncating to the shorter input.

    Returns:
        List of ``(segment_from_texts1, segment_from_texts2)`` tuples.
    """
    common = min(len(texts1), len(texts2))
    head1, head2 = texts1[:common], texts2[:common]

    pairs = []
    for left, right in zip(head1, head2):
        pairs.append((left, right))
    return pairs

# === 3.2 Align segments in order (semantic approach) ===
def align_texts_semantically(v1_segments, v2_segments, threshold=0.90):
    """Match each v1 segment to its most similar v2 segment by embedding.

    Every v1 segment is compared (cosine similarity of sentence embeddings
    from the module-level ``model``) against all v2 segments. The single
    best-scoring v2 segment is accepted when its score reaches ``threshold``
    and it has not already been claimed by an earlier v1 segment.

    NOTE: the matching is greedy. If the best candidate was already used,
    the v1 segment is reported as unmatched; lower-ranked candidates are
    not considered.

    Args:
        v1_segments: Segments of the first text.
        v2_segments: Segments of the second text.
        threshold: Minimum cosine similarity for a match to be accepted.

    Returns:
        One tuple per v1 segment, in order:
        ``(v1_index, v2_index_or_None, v1_text, v2_text_or_"", best_score)``.
        With no v2 candidates at all, every entry is unmatched with score 0.0.
    """
    if len(v1_segments) == 0:
        return []
    if len(v2_segments) == 0:
        # Nothing to compare against: computing similarities on an empty
        # embedding set would fail, so report everything as unmatched.
        return [(i, None, segment, "", 0.0) for i, segment in enumerate(v1_segments)]

    emb1 = model.encode(v1_segments, convert_to_tensor=True)
    emb2 = model.encode(v2_segments, convert_to_tensor=True)

    alignment = []
    used_v2 = set()
    print("len embedding: ", len(emb1))

    for i, vec1 in enumerate(emb1):
        if i % 1000 == 0:
            print(f'On est à l\'étape {i}')
        # Similarities are computed row by row rather than as one full matrix.
        sims = util.cos_sim(vec1, emb2)[0]
        best_idx = sims.argmax().item()
        best_score = sims[best_idx].item()

        if best_score >= threshold and best_idx not in used_v2:
            alignment.append((i, best_idx, v1_segments[i], v2_segments[best_idx], best_score))
            used_v2.add(best_idx)
        else:
            # Segment without a match (score too low or candidate already taken).
            alignment.append((i, None, v1_segments[i], "", best_score))

    return alignment

# === 4. Textual comparison ===
def diff_ratio(a, b):
    """Return difflib's similarity ratio (0.0 to 1.0) between ``a`` and ``b``."""
    matcher = difflib.SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

# === 5. Semantic comparison ===
def semantic_similarity(a, b):
    """Return the cosine similarity of the embeddings of ``a`` and ``b`` as a float.

    Both inputs are encoded with the module-level ``model``.
    """
    vec_a, vec_b = (model.encode(item, convert_to_tensor=True) for item in (a, b))
    score = util.cos_sim(vec_a, vec_b)
    return float(score)

# === 6. Full comparison and report (the caller writes it to CSV) ===
def _load_or_extract_segments(pdf_path, cache_file):
    """Return ``(paragraphs, pages)`` from ``cache_file``, running OCR on a cache miss."""
    try:
        # SECURITY NOTE: the cache is a pickle; only load files produced by
        # this script, since unpickling untrusted data can execute code.
        return load_ocr_output(cache_file)
    except FileNotFoundError:
        segments, pages = extract_text_ocr(pdf_path)
        save_ocr_output(segments, pages, cache_file)
        return segments, pages


def compare_books(book1_path, book2_path, init_time, sim_threshold=0.90,
                  diff_threshold=0.95, cache_v1='ocr_nouveau.pkl',
                  cache_v2='ocr_ancien.pkl'):
    """Compare two books paragraph by paragraph and report the differences.

    The OCR result of each book is read from its pickle cache; when the cache
    file does not exist the PDF is OCR-ed and the result is cached. The cache
    is not keyed on the PDF path: delete the cache file to force a fresh OCR.

    Segments of book 1 are aligned semantically with segments of book 2. A
    row is reported for every book-1 segment whose textual similarity is
    below ``diff_threshold`` or whose semantic similarity is below
    ``sim_threshold`` (unmatched segments get a textual similarity of 0.0).

    Args:
        book1_path: Path of the first PDF (used only on a cache miss).
        book2_path: Path of the second PDF (used only on a cache miss).
        init_time: ``time.time()`` reference used for elapsed-time logging.
        sim_threshold: Semantic similarity threshold, also passed to the aligner.
        diff_threshold: Textual (difflib) similarity threshold.
        cache_v1: Pickle cache file for the first book's OCR output.
        cache_v2: Pickle cache file for the second book's OCR output.

    Returns:
        pandas.DataFrame with one row per reported segment. The columns are
        always present, even when nothing is reported.
    """
    print('DEBUT DE L\'ALGO')

    v1_segments, v1_pages = _load_or_extract_segments(book1_path, cache_v1)
    v2_segments, v2_pages = _load_or_extract_segments(book2_path, cache_v2)

    aligned = align_texts_semantically(v1_segments, v2_segments, threshold=sim_threshold)
    print('ALIGN TERMINE')
    print("Temps: ", time.time() - init_time)
    print('len align: ', len(aligned))

    columns = [
        'index_v1',
        'index_v2',
        'page_v1',
        'page_v2',
        'text_v1',
        'text_v2',
        'textual_similarity',
        'semantic_similarity',
    ]
    report = []

    for i, j, seg1, seg2, sim in aligned:
        # An unmatched segment has an empty counterpart: treat it as fully different.
        diff = diff_ratio(seg1, seg2) if seg2 else 0.0

        if diff < diff_threshold or sim < sim_threshold:
            matched = j is not None
            report.append({
                'index_v1': i,
                'index_v2': j if matched else 'NO_MATCH',
                'page_v1': v1_pages[i],
                'page_v2': v2_pages[j] if matched else 'NO_MATCH',
                'text_v1': seg1,
                'text_v2': seg2,
                'textual_similarity': round(diff, 3),
                'semantic_similarity': round(sim, 3),
            })

    # Explicit columns keep the schema stable when the report is empty.
    return pd.DataFrame(report, columns=columns)

# Utilities
def save_ocr_output(paragraphs, pages, filename):
    """Pickle ``(paragraphs, pages)`` as a single tuple into ``filename``."""
    bundle = (paragraphs, pages)
    with open(filename, mode='wb') as sink:
        pickle.dump(bundle, sink)

def load_ocr_output(filename):
    """Return the object unpickled from ``filename``.

    For files written by ``save_ocr_output`` this is a ``(paragraphs, pages)``
    tuple. Only load trusted files: unpickling can execute arbitrary code.
    """
    with open(filename, mode='rb') as source:
        restored = pickle.load(source)
    return restored

In [None]:
# === 7. Usage ===
if __name__ == "__main__":
    # Compare the new edition against the old one and export the differences.
    df_diff = compare_books(
        'documents/nouveau.pdf',
        'documents/ancien.pdf',
        init_time=time.time(),
    )
    df_diff.to_csv('differences3.csv', index=False)
    print(df_diff.head())

# Color part

In [17]:
import fitz  # PyMuPDF
from collections import defaultdict

def _block_lines(block):
    """Return ``(text, y_top, colors)`` for every non-empty line of a text block."""
    lines = []
    for line in block["lines"]:
        spans = line["spans"]
        # Join the span texts and collapse every run of whitespace to one space.
        text = " ".join(" ".join(span["text"] for span in spans).split())
        if text:
            lines.append((text, line["bbox"][1], {span["color"] for span in spans}))
    return lines


def _store_paragraph(paragraphs, parts, colors, page_number):
    """Record the paragraph made of ``parts`` (if any) with its page and colours."""
    if parts:
        # Sorted so the stored tuple does not depend on set iteration order.
        paragraphs[" ".join(parts)].append((page_number, tuple(sorted(colors))))


def extract_paragraphs(pdf_path, line_spacing_threshold=10):
    """Extract paragraphs and their text colours from a PDF.

    Within each text block, consecutive lines are merged into one paragraph
    until the vertical distance between the tops of two successive lines
    exceeds ``line_spacing_threshold``.

    Args:
        pdf_path: Path of the PDF to read.
        line_spacing_threshold: Maximum difference between the top
            coordinates of two consecutive lines of the same paragraph.

    Returns:
        ``defaultdict(list)`` mapping each paragraph text to a list of
        ``(page_number, colors)`` occurrences, where ``page_number`` is
        1-based and ``colors`` is a tuple of the span colour values found in
        the paragraph.
    """
    paragraphs = defaultdict(list)

    # The context manager closes the document once extraction is done.
    with fitz.open(pdf_path) as doc:
        for page_index in range(len(doc)):
            page_number = page_index + 1
            blocks = doc.load_page(page_index).get_text("dict")["blocks"]

            for block in blocks:
                if "lines" not in block:
                    continue  # no text lines in this block

                # Group nearby lines into paragraphs.
                parts = []
                colors = set()
                last_y = None

                for text, y_top, line_colors in _block_lines(block):
                    if last_y is not None and abs(y_top - last_y) > line_spacing_threshold:
                        # Paragraph break.
                        _store_paragraph(paragraphs, parts, colors, page_number)
                        parts = []
                        colors = set()

                    parts.append(text)
                    colors.update(line_colors)
                    last_y = y_top

                # Last paragraph of the block.
                _store_paragraph(paragraphs, parts, colors, page_number)

    return paragraphs

def int_to_rgb(color_int):
    """Convert an integer colour (0xRRGGBB layout) into an ``(R, G, B)`` tuple."""
    return tuple((color_int >> shift) & 0xFF for shift in (16, 8, 0))

def is_grayscale(rgb, tolerance=15):
    """Return True when the colour is close to a grey (R, G and B nearly equal).

    All three channels must lie within ``tolerance`` (exclusive) of each other.
    """
    r, g, b = rgb
    spread = max(r, g, b) - min(r, g, b)
    return spread < tolerance

def is_nearly_black(rgb, threshold=90):
    """Return True when the channel values sum to less than ``threshold`` (close to (0, 0, 0))."""
    brightness = sum(rgb)
    return brightness < threshold

def is_nearly_white(rgb, threshold=90):
    """Return True when the colour is close to (255, 255, 255).

    The per-channel shortfalls from 255 must sum to less than ``threshold``.
    """
    shortfall = 0
    for channel in rgb:
        shortfall += 255 - channel
    return shortfall < threshold

def is_significant_color(rgb):
    """Return True when the colour is neither nearly black nor nearly white."""
    return not is_nearly_black(rgb) and not is_nearly_white(rgb)

def color_distance(c1, c2):
    """Return the Euclidean distance between two integer colours in RGB space."""
    first = int_to_rgb(c1)
    second = int_to_rgb(c2)

    squared = 0
    for a, b in zip(first, second):
        squared += (b - a) ** 2
    return squared ** 0.5

def colors_are_significantly_different(set1, set2, threshold=380):
    """Return True if some colour of ``set1`` visibly differs from one of ``set2``.

    Every pair ``(c1, c2)`` is examined. Pairs in which both colours are
    nearly black or nearly white are ignored; any other pair whose RGB
    distance reaches ``threshold`` makes the result True.
    """
    def pair_differs(first, second):
        # A pair counts only if at least one of its colours is significant.
        has_significant = (
            is_significant_color(int_to_rgb(first))
            or is_significant_color(int_to_rgb(second))
        )
        return has_significant and color_distance(first, second) >= threshold

    return any(pair_differs(c1, c2) for c1 in set1 for c2 in set2)


def compare_paragraphs(map1, map2):
    """List the paragraphs present in both maps whose colours differ significantly.

    Args:
        map1: Mapping of paragraph text to a list of ``(page, colors)`` occurrences.
        map2: Same structure for the second document.

    Returns:
        List of dicts with keys ``text``, ``colors_v1``, ``colors_v2``,
        ``pages_v1`` and ``pages_v2`` (the colour and page values are sets
        aggregated over all occurrences of the paragraph).
    """
    def summarize(occurrences):
        # Merge all occurrences of one paragraph into a colour set and a page set.
        colors, pages = set(), set()
        for page, color_set in occurrences:
            colors.update(color_set)
            pages.add(page)
        return colors, pages

    changed = []
    for para in map1:
        if para not in map2:
            continue

        colors1, pages1 = summarize(map1[para])
        colors2, pages2 = summarize(map2[para])

        if not colors_are_significantly_different(colors1, colors2):
            continue

        changed.append({
            "text": para,
            "colors_v1": colors1,
            "colors_v2": colors2,
            "pages_v1": pages1,
            "pages_v2": pages2
        })

    return changed


if __name__ == "__main__":
    # Script entry point: report paragraphs whose text colour changed between
    # the old and the new PDF, then the distinct page pairs involved.
    file_v1 = "documents/ancien_clean.pdf"
    file_v2 = "documents/nouveau.pdf"

    print("🔍 Extraction de paragraphes de l'ancien PDF...")
    paras_v1 = extract_paragraphs(file_v1)

    print("🔍 Extraction de paragraphes du nouveau PDF...")
    paras_v2 = extract_paragraphs(file_v2)

    print("🔎 Comparaison...")
    changed = compare_paragraphs(paras_v1, paras_v2)

    print(f"\n🟠 Paragraphes avec changement de couleur : {len(changed)}\n")
    for idx, entry in enumerate(changed, 1):
        # Show at most the first 150 characters of the paragraph.
        short = entry['text'][:150].replace("\n", " ") + ("..." if len(entry['text']) > 150 else "")
        print(f"{idx}. 📄 Texte : {short}")
        print(f"   🧾 Pages ancien PDF : {sorted(entry['pages_v1'])}")
        print(f"   🎨 Couleurs ancien PDF : {entry['colors_v1']}")
        print(f"   🧾 Pages nouveau PDF : {sorted(entry['pages_v2'])}")
        print(f"   🎨 Couleurs nouveau PDF : {entry['colors_v2']}")
        print("-" * 80)

    # Collect the unique page pairs with a colour change. The page sets are
    # sorted before being turned into tuples: set iteration order is
    # arbitrary, so equal sets could otherwise yield different tuples and be
    # counted as distinct pairs.
    page_pairs = {
        (tuple(sorted(entry['pages_v1'])), tuple(sorted(entry['pages_v2'])))
        for entry in changed
    }

    # Final summary.
    print(f"\n📘 Couples de pages avec changement de couleur détecté ({len(page_pairs)} couples uniques) :\n")
    for p1, p2 in sorted(page_pairs):
        print(f"Ancien PDF : page {p1}  ⇄  Nouveau PDF : page {p2}")


🔍 Extraction de paragraphes de l'ancien PDF...
🔍 Extraction de paragraphes du nouveau PDF...
🔎 Comparaison...

🟠 Paragraphes avec changement de couleur : 244

1. 📄 Texte : X
   🧾 Pages ancien PDF : [12, 689, 690, 691, 692, 693, 694, 695, 696, 726, 727, 728, 729, 730, 731, 732, 733, 734, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 756, 758, 759, 760, 761, 762, 763, 764, 765, 767, 768, 769, 770]
   🎨 Couleurs ancien PDF : {2301728, 16777215}
   🧾 Pages nouveau PDF : [11, 666, 667, 668, 669, 671, 673, 674, 676, 677, 678, 679, 680, 682, 684, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 710, 713]
   🎨 Couleurs nouveau PDF : {2894120, 16777215}
--------------------------------------------------------------------------------
2. 📄 Texte : A Connaissances fondamentales que tout étudiant doit connaître en fin de deuxième cycle.
   🧾 Pages ancien PDF : [27]
   🎨 Couleurs ancien PDF 