# In [None]:
import re
import unicodedata

import fitz  # PyMuPDF
from langdetect import detect

# Absolute path of the CV (PDF) read by the script section at the end of this file.
PDF_PATH = "C:/Users/OUMAIMA/Desktop/cv_oumeymaFinal.pdf"

def extract_text_from_pdf(pdf_path):
    """Extract the text of every non-blank page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The text of each page whose text is not only whitespace, each
        followed by a newline, concatenated in page order. An empty string
        when no page yields text.
    """
    # The context manager closes the document even when a page fails to
    # extract, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text() for page in doc]
    return "".join(
        f"{page_text}\n" for page_text in page_texts if page_text.strip()
    )

def clean_text(text):
    """Strip accents and punctuation, keeping word characters and whitespace.

    Accented letters are reduced to their base letter in either case
    (``é`` -> ``e``, ``É`` -> ``E``, ``ç`` -> ``c``) by decomposing every
    character and dropping the combining marks. Every character that is
    neither a word character (letter, digit, underscore) nor whitespace is
    then removed, so spaces and newlines are preserved.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    decomposed = unicodedata.normalize("NFD", text)
    without_marks = "".join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    # Recompose whatever was split apart for reasons other than an accent,
    # so only the diacritics are lost.
    folded = unicodedata.normalize("NFC", without_marks)
    # "\s" already covers "\n", so line breaks survive this filter.
    return re.sub(r"[^\w\s]", "", folded)

def detect_language(text):
    """Classify text as French, English or another language.

    Only the first 1000 characters, lower-cased, are inspected. Typical
    French CV section headers (formation, éducation, compétences, projets
    académiques) are checked first and win over the statistical detector,
    so a failure of the detector cannot hide an obviously French document.

    Args:
        text: The text to classify.

    Returns:
        ``"French"``, ``"English"``, ``"Other (<code>)"`` with the language
        code reported by ``langdetect``, or ``"Unknown"`` when the input is
        not a string, the sample is blank, or detection fails.
    """
    if not isinstance(text, str):
        return "Unknown"

    sample = text[:1000].lower()
    if not sample.strip():
        return "Unknown"

    if re.search(r'\b(formation|éducation|compétences|projets académiques)\b', sample):
        return "French"

    try:
        lang = detect(sample)
    except Exception:
        # Best effort: the detector raises when the sample has no usable
        # features. KeyboardInterrupt/SystemExit are deliberately not caught.
        return "Unknown"

    if lang == "fr":
        return "French"
    if lang == "en":
        return "English"
    return f"Other ({lang})"

# Experience-related phrases, written without accents; the text is folded the
# same way before matching. Compiled once because the scorer is called often.
_FRENCH_EXPERIENCE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r"j'ai travaille", r"stage chez", r"responsable de", r"projet de developpement",
    r"experience professionnelle", r"mission principale", r"participation.*projet",
    r"analyse et conception", r"developpement d'une application", r"mise en place de",
    r"gestion d'une equipe", r"travail en collaboration", r"conception technique",
))
_ENGLISH_EXPERIENCE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r"worked at", r"internship at", r"responsible for", r"developed an application",
    r"professional experience", r"main tasks included", r"participated in.*project",
    r"designed and implemented", r"managed a team", r"collaborated with",
    r"technical design", r"led the development of", r"project management",
))


def _fold_for_matching(text):
    """Lower-case text, strip accents and straighten typographic apostrophes."""
    decomposed = unicodedata.normalize("NFD", text.lower())
    unaccented = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return unaccented.replace("\u2019", "'")


def simple_experience_score(block, language):
    """Count how many experience-related phrases occur in a block of text.

    Matching ignores case and accents, and treats the typographic
    apostrophe (U+2019) like ``'``, so raw text such as
    "Expérience professionnelle" matches the accent-free phrase list.

    Args:
        block: The text to score.
        language: ``"French"`` selects the French phrase list; any other
            value selects the English one.

    Returns:
        The number of distinct phrases found at least once in ``block``
        (each phrase counts once, however often it occurs).
    """
    if language == "French":
        patterns = _FRENCH_EXPERIENCE_PATTERNS
    else:
        patterns = _ENGLISH_EXPERIENCE_PATTERNS
    folded = _fold_for_matching(block)
    return sum(1 for pattern in patterns if pattern.search(folded))

def find_best_experience_block(text, language, initial_window=5):
    """Find the block of lines that best matches a work-experience section.

    The search runs in two passes over the lines of ``text``:

    1. Slide a window of ``initial_window`` lines from the top and stop at
       the first window whose score is positive; this skips leading lines
       such as personal details.
    2. From that window, extend the block one line at a time to the end of
       the text, remembering the shortest extension that reaches the
       highest score.

    Scores come from ``simple_experience_score``. The end index always
    advances, so the scan terminates after at most one pass over the lines.

    Args:
        text: The full text to search.
        language: Language label forwarded to ``simple_experience_score``.
        initial_window: Number of lines in the sliding window; values below
            1 are treated as 1.

    Returns:
        The selected lines joined with newlines and stripped, or
        ``"No text extracted from PDF."`` when ``text`` has no lines, or
        ``"No valid experience block found."`` when no window scores above
        zero.
    """
    lines = text.splitlines()
    if not lines:
        return "No text extracted from PDF."

    window = max(1, initial_window)
    total = len(lines)

    def score_span(start, end):
        return simple_experience_score("\n".join(lines[start:end]), language)

    # Pass 1: first window with at least one hit. The range includes the
    # last full window so keywords near the end of the text are not missed.
    last_window_start = max(total - window, 0)
    block_start = next(
        (
            start
            for start in range(last_window_start + 1)
            if score_span(start, start + window) > 0
        ),
        None,
    )
    if block_start is None:
        return "No valid experience block found."

    best_end = min(block_start + window, total)
    max_score = score_span(block_start, best_end)

    # Pass 2: grow the block. Only a strict improvement moves the recorded
    # end, so trailing lines that add no new phrase are left out.
    for candidate_end in range(best_end + 1, total + 1):
        candidate_score = score_span(block_start, candidate_end)
        if candidate_score > max_score:
            max_score = candidate_score
            best_end = candidate_end

    experience_block = "\n".join(lines[block_start:best_end]).strip()
    line_count = len(experience_block.splitlines())
    print(f"Work experience block spans {line_count} lines.")
    return experience_block if experience_block else "No valid experience block found."

# MAIN
# Pipeline: read the CV text, identify its language, then locate and display
# the block that best matches a work-experience section.
raw_text = extract_text_from_pdf(pdf_path=PDF_PATH)
language = detect_language(text=raw_text)
print(f"\n📄 Langue détectée : {language}")

best_exp = find_best_experience_block(text=raw_text, language=language)

# One call emits the banner and the block on separate lines.
print("\n=== SECTION D'EXPÉRIENCE DÉTECTÉE ===\n", best_exp, sep="\n")