In [6]:
import os
import chardet
import math
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- Step 1: Load any file smartly ---
def smart_load_text(file_path):
    with open(file_path, "rb") as f:
        raw_data = f.read()
    result = chardet.detect(raw_data)
    encoding = result['encoding'] or "utf-8"
    text = raw_data.decode(encoding, errors="replace")
    return text

# --- Step 2: Extract text from HTML ---
def extract_text_from_html(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    for unwanted in soup(["script", "style", "head", "footer", "nav"]):
        unwanted.decompose()
    text = soup.get_text(separator="\n")
    lines = [line.strip() for line in text.splitlines()]
    return "\n".join(line for line in lines if line)

# --- Step 3: Clean Hebrew text ---
import re

def clean_hebrew_text(text):
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"(?<=[^\.\!\?:])\n(?=[^\n\Wא-תa-zA-Z])", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    text = re.sub(r"(\S)[ ]{3,}(\S)", r"\1 | \2", text)
    text = re.sub(r"^[•·●▪️✓✔▶►❖-]+[ \t]+", "- ", text, flags=re.MULTILINE)
    text = re.sub(r"[^\x00-\x7F\u0590-\u05FF\d\.\,\-\:\;\|\!\?\(\)\"\'\n ]", " ", text)
    text = re.sub(r"[ \t]+\n", "\n", text)
    return text.strip()

# --- Step 4: Count tokens ---
def count_tokens(text, tokenizer):
    return len(tokenizer.encode(text, add_special_tokens=False))

# --- Step 5: Split text into chunks ---
def split_into_chunks(text, tokenizer, max_tokens=2000):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk = tokens[i:i+max_tokens]
        chunk_text = tokenizer.decode(chunk)
        chunks.append(chunk_text)
    return chunks

# --- Step 6: Summarize one chunk ---
def summarize_one_chunk(text, template, model, tokenizer, max_new_tokens=800, temperature=0.3):
    prompt = f"""
הטקסט הבא הוא הדוח שצריך לסכם:

{text}

אנא סכם את הדוח לפי התבנית הבאה:

{template}
"""
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
    input_ids = input_ids.to(model.device)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    torch.cuda.empty_cache()  # Free memory immediately after generation

    return tokenizer.decode(output[0], skip_special_tokens=True)

# --- Step 7: Master pipeline ---
def summarize_file(file_path, template_path, model, tokenizer):
    # Step A: Load file
    raw_text = smart_load_text(file_path)

    # Step B: If HTML, extract text
    if file_path.endswith((".html", ".htm")):
        raw_text = extract_text_from_html(raw_text)

    # Step C: Clean text
    clean_text = clean_hebrew_text(raw_text)

    # Step D: Load template
    with open(template_path, "r", encoding="utf-8") as f:
        template = f.read()

    # Step E: Check if need to split
    total_tokens = count_tokens(clean_text, tokenizer)

    if total_tokens <= 8000:
        chunks = [clean_text]
    else:
        chunks = split_into_chunks(clean_text, tokenizer)

    # Step F: Summarize each chunk
    summaries = []
    for idx, chunk in enumerate(chunks):
        print(f"Summarizing chunk {idx+1}/{len(chunks)}...")
        summary = summarize_one_chunk(chunk, template, model, tokenizer)
        summaries.append(summary)

    # Step G: Combine summaries
    final_summary = "\n\n".join(summaries)
    
    return final_summary


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_path = "G:/tase_project/models/dictalm2.0"

# 1. Load DictaLM model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
device = "cuda" 
model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True).to(device)
print(f"Model loaded on {device}")

# 1. Load DictaLM model and tokenizer
pdf_path = r"../inputs/אבגד מקוצר מאוד.pdf"
template_path = r"../inputs/שינוי החזקות בע נ.משרה.txt"

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded on cuda


In [7]:


# 2. Call the pipeline
final_summary = summarize_file(
    file_path= pdf_path,  # Your input file (HTML, PDF text, or TXT)
    template_path= template_path,        # Your template file
    model=model,
    tokenizer=tokenizer
)

# 3. Save or print
with open("final_summary.txt", "w", encoding="utf-8") as f:
    f.write(final_summary)

print("✅ Summarization complete!")


Summarizing chunk 1/235...


KeyboardInterrupt: 