In [None]:

!pip -q install pymupdf python-pptx pillow transformers sentence-transformers scikit-learn tqdm

In [None]:
import os, io, json, csv
from tqdm import tqdm
import fitz  # PyMuPDF
from pptx import Presentation
from PIL import Image

from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Summarizer: multilingual mT5 (XLSum).
# The slides may be German or English, and the later "load if missing" cells
# only create a model when the name is still undefined. The model chosen here
# is therefore the one used for the per-slide summaries.
summarizer = pipeline("summarization", model="csebuetnlp/mT5_multilingual_XLSum", device_map="auto")

# Embeddings: multilingual MiniLM, so German queries can match German slide text.
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


In [None]:
# OCR engine + German language pack
!sudo apt-get -qq update && sudo apt-get -qq install -y tesseract-ocr tesseract-ocr-deu
!pip -q install pytesseract


In [None]:
import io, re
import pytesseract
from PIL import Image
import numpy as np

# Tesseract settings shared by every OCR call in this notebook.
OCR_LANG = "eng+deu"  # English + German language models
OCR_PSM = "6"         # page segmentation mode: treat the image as one block of text
OCR_OEM = "3"         # OCR engine mode 3 (Tesseract's default engine selection)
# Command-line style flags handed to pytesseract via `config=`.
OCR_CONFIG = " ".join(("--oem", OCR_OEM, "--psm", OCR_PSM))

def _clean_text(s: str) -> str:
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()

def _ocr_pil(img: Image.Image) -> str:
    """Run Tesseract on a PIL image and return the whitespace-normalized text.

    Uses the notebook-level OCR_LANG and OCR_CONFIG settings.
    """
    recognized = pytesseract.image_to_string(img, lang=OCR_LANG, config=OCR_CONFIG)
    return _clean_text(recognized)

def _pdf_page_to_image(page, dpi=300) -> Image.Image:
    """Render a PDF page to an RGB PIL image.

    The page is rasterized at `dpi`, serialized to PNG bytes in memory, and
    re-opened with PIL.
    """
    png_bytes = page.get_pixmap(dpi=dpi).tobytes("png")
    rendered = Image.open(io.BytesIO(png_bytes))
    return rendered.convert("RGB")

def extract_pdf_text(path, ocr=True, min_chars_no_ocr=25, dpi=300):
    """Extract per-page text from a PDF, falling back to OCR for sparse pages.

    Args:
        path: Path of the PDF file to open.
        ocr: When True, pages whose embedded text is empty or shorter than
            `min_chars_no_ocr` characters are rendered and passed through OCR.
        min_chars_no_ocr: Minimum embedded-text length for OCR to be skipped.
        dpi: Rendering resolution for pages that are OCR'd.

    Returns:
        A list with one dict per page, with keys "index", "text", "source"
        (always "pdf"), "ocr_used" and "image_only". Both flags are True for
        every page on which OCR was run.

    Also prints how many pages went through OCR.
    """
    import fitz
    slides, ocr_count = [], 0
    # The context manager closes the document even if a page fails to render.
    with fitz.open(path) as doc:
        for i, page in enumerate(doc):
            raw = page.get_text().strip()
            used_ocr = False
            image_only = False
            if (not raw or len(raw) < min_chars_no_ocr) and ocr:
                ocr_text = _ocr_pil(_pdf_page_to_image(page, dpi=dpi))
                # Keep the short embedded text when OCR finds nothing, rather
                # than replacing it with an empty string.
                raw = ocr_text or raw
                used_ocr = True
                image_only = True
                ocr_count += 1
            slides.append({
                "index": i,
                "text": _clean_text(raw),
                "source": "pdf",
                "ocr_used": used_ocr,
                "image_only": image_only
            })
    print(f"OCR used on {ocr_count}/{len(slides)} PDF pages.")
    return slides

def extract_pptx_text(path, ocr=True, min_chars_no_ocr=25):
    """Extract per-slide text from a PPTX, OCR-ing pictures on sparse slides.

    Args:
        path: Path of the PPTX file to open.
        ocr: When True, slides whose shape text is empty or shorter than
            `min_chars_no_ocr` characters and that contain at least one
            picture have those pictures passed through OCR.
        min_chars_no_ocr: Minimum shape-text length for OCR to be skipped
            (mirrors the parameter of `extract_pdf_text`).

    Returns:
        A list with one dict per slide, with keys "index", "text", "source"
        (always "pptx"), "ocr_used" and "image_only". "ocr_used" is True only
        when OCR produced text; "image_only" is True when OCR produced text
        and the slide's own text was shorter than 5 characters.

    Also prints how many slides used OCR and, if any, how many pictures could
    not be read.
    """
    from pptx import Presentation
    from pptx.enum.shapes import MSO_SHAPE_TYPE

    prs = Presentation(path)
    slides, ocr_count, unreadable_pictures = [], 0, 0
    for i, s in enumerate(prs.slides):
        texts, pictures = [], []
        for shape in s.shapes:
            if hasattr(shape, "text") and shape.text:
                texts.append(shape.text)
            if getattr(shape, "shape_type", None) == MSO_SHAPE_TYPE.PICTURE:
                pictures.append(shape)

        native_text = _clean_text("\n".join(texts))
        raw = native_text
        used_ocr = False
        image_only = False

        # OCR is slow, so pictures are only decoded and recognized on slides
        # that actually need it; the result was unused on all other slides.
        if ocr and (not raw or len(raw) < min_chars_no_ocr) and pictures:
            images_ocr = []
            for picture in pictures:
                try:
                    img = Image.open(io.BytesIO(picture.image.blob)).convert("RGB")
                    images_ocr.append(_ocr_pil(img))
                except Exception:
                    # Best effort: skip pictures that cannot be loaded or
                    # recognized, but count them so the failure is visible.
                    unreadable_pictures += 1
            ocr_txt = _clean_text("\n".join(t for t in images_ocr if t))
            if ocr_txt:
                raw = (raw + "\n" + ocr_txt).strip() if raw else ocr_txt
                used_ocr = True
                image_only = len(native_text) < 5
                ocr_count += 1

        slides.append({
            "index": i,
            "text": raw,
            "source": "pptx",
            "ocr_used": used_ocr,
            "image_only": image_only
        })
    print(f"OCR used on {ocr_count}/{len(slides)} PPTX slides.")
    if unreadable_pictures:
        print(f"Skipped {unreadable_pictures} picture(s) that could not be read for OCR.")
    return slides


In [None]:
from google.colab import files
up = files.upload()  # choose your PDF or PPTX
# Fail with a clear message when nothing was selected; indexing an empty
# result would otherwise raise a bare IndexError.
if not up:
    raise RuntimeError("No file was uploaded; re-run this cell and choose a PDF or PPTX.")
# Only the first uploaded file is processed by the rest of the notebook.
file_path = list(up.keys())[0]
print(" Uploaded:", file_path)


In [None]:
# Choose the extractor from the file name: ".pdf" goes to the PDF extractor,
# everything else is handed to the PPTX extractor.
if file_path.lower().endswith(".pdf"):
    ext = extract_pdf_text
else:
    ext = extract_pptx_text
slides = ext(file_path, ocr=True)
# Displayed as: (number of slides, slides that used OCR, slides flagged image-only)
(
    len(slides),
    sum(s["ocr_used"] for s in slides),
    sum(s["image_only"] for s in slides),
)


In [None]:
# Summarizer (multilingual: DE/EN).
# Created only when no `summarizer` has been defined by an earlier cell.
from transformers import pipeline
if "summarizer" not in globals():
    summarizer = pipeline(
        task="summarization",
        model="csebuetnlp/mT5_multilingual_XLSum",
        device_map="auto",
    )

# Embeddings (multilingual MiniLM).
# Created only when no `embedder` has been defined by an earlier cell.
from sentence_transformers import SentenceTransformer
if "embedder" not in globals():
    embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


In [None]:
from tqdm import tqdm

def summarize_text(txt, max_length=160, min_length=60):
    """Summarize one slide's text with the notebook-level `summarizer`.

    Args:
        txt: Slide text; None is treated as an empty string.
        max_length: Upper bound on the generated summary length (tokens).
        min_length: Lower bound on the generated summary length (tokens).

    Returns:
        The stripped input unchanged when it has fewer than 25 words,
        otherwise the stripped summary text.
    """
    txt = (txt or "").strip()
    n_words = len(txt.split())
    if n_words < 25:
        return txt  # keep as-is for very short slides

    # Bound the generation length by the input size so that a short slide is
    # not forced into a summary longer than the slide itself. For inputs of
    # `max_length` words or more this leaves the limits exactly as passed in.
    max_len = max(8, min(max_length, n_words))
    min_len = min(min_length, max_len // 2)

    # truncation=True makes the pipeline cut inputs that exceed the model's
    # maximum input length instead of passing them through unmodified.
    out = summarizer(
        txt,
        max_length=max_len,
        min_length=min_len,
        do_sample=False,
        truncation=True,
    )[0]["summary_text"]
    return out.strip()

# Summarize every slide and carry the extraction flags along with each record.
slide_summaries = []
for s in tqdm(slides):
    summary = summarize_text(s["text"])
    record = {"index": s["index"], "text": s["text"], "summary": summary}
    # Flags fall back to defaults when an extractor did not set them.
    record["ocr_used"] = s.get("ocr_used", False)
    record["image_only"] = s.get("image_only", False)
    record["source"] = s.get("source", "pdf")
    slide_summaries.append(record)

# quick peek at the first record
slide_summaries[0]


In [None]:
from transformers import pipeline

# CPU summarizer (stable): replaces any previously defined `summarizer`.
summarizer = pipeline(task="summarization", model="csebuetnlp/mT5_multilingual_XLSum", device=-1)  # -1 == CPU

def chunk_by_tokens(text, tokenizer, max_tokens=480):
    """Split `text` into pieces of at most `max_tokens` tokenizer ids each.

    The text is tokenized once without truncation, the id sequence is cut
    into consecutive windows of `max_tokens`, and every window is decoded
    back to a string with special tokens skipped.

    Returns:
        The decoded pieces in order; an empty list when there are no ids.
    """
    token_ids = tokenizer(text, return_attention_mask=False, truncation=False)["input_ids"]
    window_starts = range(0, len(token_ids), max_tokens)
    return [
        tokenizer.decode(token_ids[start:start + max_tokens], skip_special_tokens=True)
        for start in window_starts
    ]

def summarize_long_text(text, target_max=260, target_min=160, max_input_tokens=480):
    """Summarize text of arbitrary length by chunking and merging.

    The text is split into chunks of at most `max_input_tokens` tokens, each
    chunk is summarized (60-160 tokens), and the partial summaries are joined
    with newlines. While the joined text is still longer than
    `max_input_tokens`, it is chunked and summarized again (at most four extra
    rounds). A final pass produces a summary of `target_min`-`target_max`
    tokens.

    Args:
        text: Text to summarize; None or whitespace-only text yields "".
        target_max: Maximum length of the final summary (tokens).
        target_min: Minimum length of the final summary (tokens).
        max_input_tokens: Chunk size, also used as the budget for the merged
            text fed to the final pass.

    Returns:
        The stripped final summary, or "" for empty input.
    """
    text = text or ""
    # Check for empty input before touching the model.
    if not text.strip():
        return ""
    tok = summarizer.tokenizer

    def _summarize_pieces(source):
        """Summarize each token chunk of `source` and return the partial summaries."""
        return [
            summarizer(p, max_length=160, min_length=60, do_sample=False, truncation=False)[0]["summary_text"]
            for p in chunk_by_tokens(source, tok, max_tokens=max_input_tokens)
        ]

    merged = "\n".join(_summarize_pieces(text))

    # With many chunks the joined partial summaries can exceed the input
    # budget by a wide margin, so condense again before the final pass. The
    # number of rounds is bounded so the loop always terminates.
    for _ in range(4):
        merged_len = len(tok(merged, return_attention_mask=False, truncation=False)["input_ids"])
        if merged_len <= max_input_tokens:
            break
        merged = "\n".join(_summarize_pieces(merged))

    # truncation=True is a safety net in case the merged text is still long.
    final = summarizer(
        merged,
        max_length=target_max,
        min_length=target_min,
        do_sample=False,
        truncation=True,
    )[0]["summary_text"]
    return final.strip()

# Build the deck summary from the text of every non-empty slide.
full_text = "\n\n".join(filter(None, (slide["text"] for slide in slides)))
deck_summary = summarize_long_text(
    full_text,
    target_max=260,
    target_min=160,
    max_input_tokens=480,
)

# Persist the summary under outputs/ (the directory is created if missing).
import os
os.makedirs("outputs", exist_ok=True)
with open("outputs/deck_summary.txt", "w", encoding="utf-8") as f:
    f.write(deck_summary)

# Show the first 400 characters as a preview.
print(deck_summary[:400])


In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the multilingual embedder only when no `embedder` exists yet.
if "embedder" not in globals():
    embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# One searchable string per slide: the summary when there is one, else the raw text.
corpus = [
    "Slide {}: {}".format(entry["index"], entry["summary"] or entry["text"])
    for entry in slide_summaries
]
# Normalized vectors, one per corpus entry.
embs = embedder.encode(corpus, convert_to_numpy=True, normalize_embeddings=True)

def search(query, top_k=5):
    """Return the `top_k` slides most similar to `query`.

    The query is embedded with the same normalized encoding as `embs`, scored
    against every slide by dot product, and the best-scoring slides are
    returned in descending score order.

    Returns:
        A list of dicts with keys "slide_index", "score" (float) and
        "preview" (first 240 characters of the summary, or of the slide text
        when the summary is empty).
    """
    query_vec = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    sims = (embs @ query_vec.T).ravel()
    best_first = sims.argsort()[::-1][:top_k]

    hits = []
    for i in best_first:
        entry = slide_summaries[i]
        shown = entry["summary"] or entry["text"]
        hits.append({
            "slide_index": entry["index"],
            "score": float(sims[i]),
            "preview": shown[:240],
        })
    return hits

# Example query (German for "Markov property"); the returned hits are shown as the cell output.
search("Markow-Eigenschaft")  # or any query


In [None]:
import pandas as pd
# One row per slide: OCR flags plus a single-line preview of the first 120 characters.
df = pd.DataFrame(
    [
        dict(
            index=slide["index"],
            ocr=slide["ocr_used"],
            image_only=slide["image_only"],
            preview=slide["text"][:120].replace("\n", " "),
        )
        for slide in slides
    ]
)
# Up to 10 slides that used OCR or were flagged image-only.
df.loc[df["ocr"] | df["image_only"]].head(10)


In [None]:
import re, os
from collections import Counter, defaultdict

def top_keywords(text, k=3):
    """Return the `k` most frequent non-stop-words of `text`, lowercased.

    Words are runs of at least three characters that start with a letter
    (ASCII or German umlaut/ß) and may contain hyphens after the first
    character. German and English stop words are dropped. None is treated as
    an empty string.
    """
    stop = set("""
der die das ein eine und oder für von mit ohne zu auf an im in aus ist sind war waren sein auch sowie
the a an and or for from with without into in on of to is are was were be been being this that these those it its by as at if then else when while not
""".split())
    tally = Counter()
    for match in re.finditer(r"[A-Za-zÄÖÜäöüß][A-Za-zÄÖÜäöüß\-]{2,}", text or ""):
        word = match.group(0).lower()
        if word in stop:
            continue
        tally[word] += 1
    return [word for word, _count in tally.most_common(k)]

# Map each keyword to the slides whose summary (or text) lists it among its top 3.
kw_to_slides = defaultdict(list)
for s in slide_summaries:
    base = s["summary"] or s["text"]
    for kw in top_keywords(base, k=3):
        kw_to_slides[kw].append(s["index"])

# Keep the 12 keywords that occur on the most slides. The previous code
# counted dictionary keys (every count is 1) and then took the first 12
# inserted keys, so nothing was actually ranked. sorted() is stable, so
# keywords with equal slide counts keep their first-seen order.
top_kws = sorted(kw_to_slides, key=lambda kw: len(kw_to_slides[kw]), reverse=True)[:12]

# Mermaid mindmap: root = file name, one branch per keyword, up to 6 slides per branch.
title = os.path.basename(file_path)
lines = ["mindmap", f"  root(({title}))"]
for kw in top_kws:
    lines.append(f"    {kw}({kw})")
    for idx in sorted(kw_to_slides[kw])[:6]:
        lines.append(f"      s{idx}([Slide {idx}])")

os.makedirs("outputs", exist_ok=True)
with open("outputs/mindmap.mmd", "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print(" mindmap.mmd written to outputs/. Paste it into https://mermaid.live to render.")


In [None]:
import csv, re

def split_points(text):
    """Break a slide's text into a few short answer candidates.

    When the text contains bullet lines (starting with "-", "•" or "*"), the
    first four bullets longer than 3 characters are returned. Otherwise the
    text is split into sentences and the first three sentences with at least
    six words are returned. None is treated as an empty string.
    """
    source = text or ""
    bullet_items = re.findall(r"(?:^|\n)[\-\•\*]\s*(.+)", source)
    if not bullet_items:
        sentences = re.split(r"(?<=[\.\!\?])\s+", source)
        return [sent for sent in sentences if len(sent.split()) >= 6][:3]
    trimmed = (item.strip() for item in bullet_items)
    return [item for item in trimmed if len(item) > 3][:4]

def detect_de(text: str) -> bool:
    """Guess whether `text` is German (True) or English (False).

    Any umlaut or "ß" decides for German immediately. Otherwise the text is
    split into whole words and the number of distinct German stop words found
    is compared with the number of distinct English ones. Ties, including
    texts with no stop word at all, count as German.

    Stop words are matched as whole words. Substring matching would count
    "a" in almost any text and "der" inside "modern" or "under".
    """
    t = (text or "").lower()
    # The text is already lowercased, so only lowercase umlauts can occur.
    if any(c in t for c in "äöüß"):
        return True
    # Runs of letters only (no digits, underscores or punctuation).
    words = set(re.findall(r"[^\W\d_]+", t))
    de_hits = sum(w in words for w in ["und","der","die","das","ist","nicht","ein","eine","oder","auch"])
    en_hits = sum(w in words for w in ["and","the","is","are","not","or","also","a","an"])
    return de_hits >= en_hits

# Build flashcard rows: [question, answer, tags], several per slide when it splits into points.
rows = []
for s in slide_summaries:
    base = s["summary"] or s["text"]
    if not base:
        continue  # nothing to ask about on this slide
    de = detect_de(base)
    tags = f"ClassMind;{'DE' if de else 'EN'}"
    items = split_points(base)

    if not items:
        # No usable bullets/sentences: one card with the whole text as the answer.
        if de:
            q = f"Was ist die Kernaussage von Folie {s['index']}?"
        else:
            q = f"What is the main idea of slide {s['index']}?"
        rows.append([q, base, tags])
        continue

    # One card per extracted point, numbered from 1.
    for j, it in enumerate(items, 1):
        if de:
            q = f"Nenne einen Kerngedanken (Folie {s['index']}, Punkt {j}):"
        else:
            q = f"Name one key idea (slide {s['index']}, point {j}):"
        rows.append([q, it, tags])

os.makedirs("outputs", exist_ok=True)
with open("outputs/flashcards.csv", "w", encoding="utf-8", newline="") as f:
    w = csv.writer(f)
    w.writerow(["Question","Answer","Tags"])
    w.writerows(rows)

print(f"flashcards.csv written to outputs/ — total cards: {len(rows)}")


In [None]:
from transformers import pipeline

# Load the CPU summarizer only when no `summarizer` exists yet.
if "summarizer" not in globals():
    summarizer = pipeline(task="summarization", model="csebuetnlp/mT5_multilingual_XLSum", device=-1)

def chunk_by_tokens(text, tokenizer, max_tokens=480):
    """Split `text` into pieces of at most `max_tokens` tokenizer ids each.

    Tokenizes once without truncation, then decodes consecutive windows of
    `max_tokens` ids back to strings (special tokens skipped). Returns the
    pieces in order; an empty list when there are no ids.
    """
    encoded = tokenizer(text, return_attention_mask=False, truncation=False)
    token_ids = encoded["input_ids"]
    pieces = []
    start = 0
    while start < len(token_ids):
        window = token_ids[start:start + max_tokens]
        pieces.append(tokenizer.decode(window, skip_special_tokens=True))
        start += max_tokens
    return pieces

def summarize_long_text(text, target_max=260, target_min=160, max_input_tokens=480):
    """Summarize text of arbitrary length by chunking and merging.

    The text is split into chunks of at most `max_input_tokens` tokens, each
    chunk is summarized (60-160 tokens), and the partial summaries are joined
    with newlines. While the joined text is still longer than
    `max_input_tokens`, it is chunked and summarized again (at most four extra
    rounds). A final pass produces a summary of `target_min`-`target_max`
    tokens.

    Args:
        text: Text to summarize; None or whitespace-only text yields "".
        target_max: Maximum length of the final summary (tokens).
        target_min: Minimum length of the final summary (tokens).
        max_input_tokens: Chunk size, also used as the budget for the merged
            text fed to the final pass.

    Returns:
        The stripped final summary, or "" for empty input.
    """
    text = text or ""
    # Check for empty input before touching the model.
    if not text.strip():
        return ""
    tok = summarizer.tokenizer

    def _summarize_parts(source):
        """Summarize each token chunk of `source` and return the partial summaries."""
        parts = chunk_by_tokens(source, tok, max_tokens=max_input_tokens)
        return [
            summarizer(p, max_length=160, min_length=60, do_sample=False, truncation=False)[0]["summary_text"]
            for p in parts
        ]

    merged = "\n".join(_summarize_parts(text))

    # With many chunks the joined partial summaries can exceed the input
    # budget by a wide margin, so condense again before the final pass. The
    # number of rounds is bounded so the loop always terminates.
    for _ in range(4):
        merged_len = len(tok(merged, return_attention_mask=False, truncation=False)["input_ids"])
        if merged_len <= max_input_tokens:
            break
        merged = "\n".join(_summarize_parts(merged))

    # truncation=True is a safety net in case the merged text is still long.
    final = summarizer(
        merged,
        max_length=target_max,
        min_length=target_min,
        do_sample=False,
        truncation=True,
    )[0]["summary_text"]
    return final.strip()

# Rebuild the deck summary from all non-empty slide texts (default summary lengths).
non_empty_texts = [slide["text"] for slide in slides if slide["text"]]
full_text = "\n\n".join(non_empty_texts)
deck_summary = summarize_long_text(full_text)

# Overwrites the summary file; the outputs/ directory must already exist.
with open("outputs/deck_summary.txt", "w", encoding="utf-8") as f:
    f.write(deck_summary)

print("✅ deck_summary.txt written to outputs/")
