In [None]:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_path = "G:/tase_project/models/dictalm2.0"

# Prefer the GPU, but fall back to the CPU so this cell does not crash on a
# machine (or kernel) where CUDA is unavailable.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load both the tokenizer and the model strictly from the local directory;
# local_files_only=True keeps either call from reaching out to the Hub.
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True).to(device)
print(f"Model loaded on {device}")


In [None]:
def generate_text(prompt, max_length=200, temperature=0.9, top_p=0.9):
    """
    Sample a continuation for `prompt` from the globally loaded model.

    Parameters:
    - prompt: text to continue; surrounding whitespace is stripped and a
      colon plus newline is appended before tokenization.
    - max_length: forwarded to `model.generate` as `max_length`.
    - temperature: sampling temperature forwarded to `model.generate`.
    - top_p: nucleus-sampling threshold forwarded to `model.generate`.

    Returns:
    - The first generated sequence decoded with special tokens skipped (str).

    Relies on the module-level `tokenizer`, `model` and `device`.
    """
    # The trailing ":\n" nudges the model to continue after the prompt.
    seeded_prompt = f"{prompt.strip()}:\n"
    encoded = tokenizer(seeded_prompt, return_tensors="pt").to(device)

    sampling_options = dict(
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling_options)

    return tokenizer.decode(sequences[0], skip_special_tokens=True)


In [25]:
import fitz
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def extract_text_from_pdf(pdf_path):
    """
    Return the concatenated text of every page in a document.

    Parameters:
    - pdf_path: path handed to `fitz.open`.

    Returns:
    - The result of `page.get_text()` for each page, joined in page order
      with no separator (str).
    """
    # The context manager closes the document even if a page fails to
    # extract, so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the string once per page.
        return "".join(page.get_text() for page in doc)

def smart_split_text_by_tokens(text, tokenizer, max_tokens=300):
    """
    Split `text` into consecutive pieces of at most `max_tokens` tokens each.

    Parameters:
    - text: the string to split.
    - tokenizer: object exposing `encode(text, add_special_tokens=...)` and
      `decode(ids, skip_special_tokens=...)`.
    - max_tokens: maximum number of token ids per chunk; must be >= 1.

    Returns:
    - A list of decoded chunk strings, in order. Empty when `text` encodes
      to no tokens.

    Raises:
    - ValueError: if `max_tokens` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    # Encode without BOS/EOS: the chunks are re-embedded into a prompt later,
    # so special tokens must not leak into the chunk text or be counted
    # against the chunk budget.
    token_ids = tokenizer.encode(text, add_special_tokens=False)

    # Fixed-size windows over the id list; the last window may be shorter.
    return [
        tokenizer.decode(token_ids[start:start + max_tokens], skip_special_tokens=True)
        for start in range(0, len(token_ids), max_tokens)
    ]

def summarize_pdf_to_text(pdf_path, model=model, tokenizer=tokenizer, device=device, chunk_tokens=300, summary_tokens=200):
    """
    Summarize a document chunk by chunk and join the partial summaries.

    Parameters:
    - pdf_path: path passed to `extract_text_from_pdf`.
    - model: causal LM exposing `generate`; defaults to the `model` that
      existed when this function was defined.
    - tokenizer: matching tokenizer; defaults to the `tokenizer` that existed
      when this function was defined.
    - device: device the tokenized inputs are moved to.
    - chunk_tokens: maximum tokens per chunk given to the splitter.
    - summary_tokens: `max_new_tokens` for each generation call.

    Returns:
    - The per-chunk summaries joined with blank lines (str).
    """
    # 1. Extract text from the PDF
    full_text = extract_text_from_pdf(pdf_path)

    # 2. Split text into smaller chunks
    chunks = smart_split_text_by_tokens(full_text, tokenizer, max_tokens=chunk_tokens)

    # 3. Summarize each chunk
    summaries = []
    for chunk_text in chunks:
        prompt = f"הנה קטע טקסט. סכם אותו בקצרה בעברית:\n\n{chunk_text}\n\nסיכום:"
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
        prompt_length = inputs["input_ids"].shape[-1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=summary_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1
            )

        # generate() returns prompt + continuation for a causal LM; keep only
        # the newly generated tokens so the summary does not repeat the
        # instruction and the whole source chunk.
        generated_ids = outputs[0][prompt_length:]
        summaries.append(tokenizer.decode(generated_ids, skip_special_tokens=True).strip())

    # 4. Combine all summaries into one text
    return "\n\n".join(summaries)

def summarize_text_to_text(full_text, model, tokenizer, device, chunk_tokens=300, summary_tokens=200):
    """
    Summarize already-extracted text chunk by chunk, showing progress.

    Parameters:
    - full_text: the text to summarize.
    - model: causal LM exposing `generate`.
    - tokenizer: matching tokenizer (callable, with `decode`).
    - device: device the tokenized inputs are moved to.
    - chunk_tokens: maximum tokens per chunk given to the splitter.
    - summary_tokens: `max_new_tokens` for each generation call.

    Returns:
    - The per-chunk summaries joined with blank lines (str).
    """
    # tqdm is not imported anywhere else in this notebook, so import it here;
    # the progress bar is cosmetic, so degrade to a plain loop if it is missing.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        tqdm = None

    # 1. Split text into smaller chunks
    chunks = smart_split_text_by_tokens(full_text, tokenizer, max_tokens=chunk_tokens)
    if tqdm is not None:
        progress = tqdm(chunks, desc="Summarizing chunks", unit="chunk")
    else:
        progress = chunks

    # 2. Summarize each chunk
    summaries = []
    for chunk_text in progress:
        prompt = f"הנה קטע טקסט. סכם אותו בקצרה בעברית:\n\n{chunk_text}\n\nסיכום:"
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
        prompt_length = inputs["input_ids"].shape[-1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=summary_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1
            )

        # generate() returns prompt + continuation for a causal LM; keep only
        # the newly generated tokens so the summary does not repeat the
        # instruction and the whole source chunk.
        generated_ids = outputs[0][prompt_length:]
        summaries.append(tokenizer.decode(generated_ids, skip_special_tokens=True).strip())

    # 3. Combine all summaries into one text
    return "\n\n".join(summaries)

In [None]:
# Driver cell: summarize one input file with the chunked summarizer.
# Requires `model`, `tokenizer` and `device` from the model-loading cell and
# `summarize_pdf_to_text` from the cell that defines it.
# NOTE(review): the variable is called `pdf_path` but the file has an .htm
# extension and is passed to the PDF extractor — confirm this is intended.
pdf_path = r"../inputs/1660447.htm"

# Explicit keyword arguments mirror the function's own defaults for
# chunk_tokens and summary_tokens.
summary = summarize_pdf_to_text(
    pdf_path,
    model=model,
    tokenizer=tokenizer,
    device=device,
    chunk_tokens=300,
    summary_tokens=200
)

# Print the summary
print(summary)


In [None]:
import re

def clean_extracted_text(text):
    """
    Normalize whitespace and strip unsupported characters from extracted text.

    Line breaks are kept only directly after '.', '!' or '?'; every other
    line break becomes a single space. Characters outside ASCII and the
    Hebrew block (U+0590-U+05FF) are replaced by a space.

    Parameters:
    - text: raw extracted text (str).

    Returns:
    - The cleaned text with no runs of spaces and no leading/trailing
      whitespace (str).
    """
    # 1. Collapse runs of spaces/tabs, then drop the space hugging a newline.
    #    Without the second step a trailing space hides the '.', '!' or '?'
    #    from the lookbehind below and sentence-final breaks get merged.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" ?\n ?", "\n", text)

    # 2. Collapse runs of two or more newlines into a single newline
    text = re.sub(r"\n{2,}", "\n", text)

    # 3. Remove newlines inside paragraphs
    # (Keep newlines only after sentence ends . ! ?)
    text = re.sub(r"(?<![\.\!\?])\n(?!\n)", " ", text)

    # 4. Replace anything that is neither ASCII nor Hebrew with a space
    text = re.sub(r"[^\x00-\x7F\u0590-\u05FF]+", " ", text)

    # 5. Step 4 can introduce new runs of spaces (e.g. around a replaced
    #    dash), so normalize whitespace once more.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" ?\n ?", "\n", text)

    # 6. Strip leading/trailing whitespace
    return text.strip()

def clean_hebrew_pdf_text(text):
    """
    Clean text extracted from a Hebrew/English PDF for use in a prompt.

    Parameters:
    - text: raw extracted text (str).

    Returns:
    - The cleaned text (str): line endings normalized, column gaps marked
      with " | ", hard-wrapped lines re-joined, bullets normalized to "- ",
      unsupported characters replaced by spaces, surrounding whitespace
      trimmed.
    """
    # 1. Normalize line endings
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # 2. Mark table column gaps (3+ spaces between non-space characters).
    #    This has to run before whitespace is collapsed, otherwise no run of
    #    3+ spaces is left to detect. Lookarounds (instead of consuming the
    #    neighbours) let adjacent gaps around a one-character cell all match.
    text = re.sub(r"(?<=\S) {3,}(?=\S)", " | ", text)

    # 3. Collapse remaining runs of tabs/spaces and drop trailing spaces, so
    #    the lookbehind in step 4 sees the real last character of each line.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" +\n", "\n", text)

    # 4. Merge broken lines inside paragraphs:
    #    if a line does not end with . ? ! or : and the next line starts with
    #    a lowercase Latin letter, a Hebrew letter or a digit, join them.
    #    A newline that follows another newline (blank line) is never merged,
    #    so paragraph breaks survive.
    text = re.sub(r"(?<=[^\.\!\?:\n])\n(?=[א-תa-z0-9])", " ", text)

    # 5. Collapse multiple newlines into a single one (for clean paragraph breaks)
    text = re.sub(r"\n{2,}", "\n", text)

    # 6. Join a line that starts with "<cell> | <cell>" onto the previous line
    text = re.sub(r"\n(?=\S+ [|] \S+)", " ", text)

    # 7. Normalize bullets: replace leading bullet glyphs with a standard dash
    text = re.sub(r"^[•·●▪️✓✔▶►❖-]+[ \t]+", "- ", text, flags=re.MULTILINE)

    # 8. Replace characters outside ASCII / Hebrew / digits with a space
    text = re.sub(r"[^\x00-\x7F\u0590-\u05FF\d\.\,\-\:\;\|\!\?\(\)\"\'\n ]", " ", text)

    # 9. Step 8 may leave runs of spaces or trailing spaces behind
    text = re.sub(r"[ \t]{2,}", " ", text)
    text = re.sub(r"[ \t]+\n", "\n", text)

    # 10. Final trim
    return text.strip()


In [23]:
from bs4 import BeautifulSoup
import chardet

def smart_load_html(file_path):
    """
    Read a file as text, guessing its encoding with chardet.

    Parameters:
    - file_path: path of the file to read.

    Returns:
    - The decoded file content (str). When chardet reports no encoding, or
      the reported encoding is unknown to Python or fails to decode the
      bytes, the content is decoded as windows-1255 with undecodable bytes
      replaced.
    """
    # Step 1: Read the raw bytes first
    with open(file_path, "rb") as f:
        raw_data = f.read()

    # Step 2: Detect encoding (chardet may report None, e.g. for empty input)
    detected_encoding = chardet.detect(raw_data).get("encoding")

    # Step 3: Use the detected encoding when there is a usable one
    if detected_encoding:
        try:
            return raw_data.decode(detected_encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess or a codec name Python does not know: fall through.
            pass

    # Fallback: force decode as windows-1255
    return raw_data.decode("windows-1255", errors="replace")

def extract_text_from_html(html_content):
    """
    Return the visible text of an HTML document, one non-empty line per row.

    Parameters:
    - html_content: HTML markup (str) parsed with the "html.parser" backend.

    Returns:
    - Text of the document with script, style, head, footer and nav elements
      removed, each line stripped and blank lines dropped (str).
    """
    parsed = BeautifulSoup(html_content, "html.parser")

    # Drop elements whose content is not part of the readable body.
    for tag in parsed.find_all(["script", "style", "head", "footer", "nav"]):
        tag.decompose()

    # A newline separator keeps text from neighbouring tags on separate lines.
    raw = parsed.get_text(separator="\n")

    # Strip every line and keep only the ones that still have content.
    stripped = (piece.strip() for piece in raw.splitlines())
    return "\n".join(piece for piece in stripped if piece)
import re

def clean_hebrew_html_text(text):
    """
    Clean text extracted from a Hebrew/English HTML page for use in a prompt.

    Parameters:
    - text: raw extracted text (str).

    Returns:
    - The cleaned text (str): line endings normalized, column gaps marked
      with " | ", broken lines re-joined, bullets normalized to "- ",
      unsupported characters replaced by spaces, surrounding whitespace
      trimmed.
    """
    # 1. Normalize line endings
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # 2. Mark table column gaps (3+ spaces between non-space characters).
    #    This has to run before whitespace is collapsed, otherwise no run of
    #    3+ spaces is left to detect. Lookarounds (instead of consuming the
    #    neighbours) let adjacent gaps around a one-character cell all match.
    text = re.sub(r"(?<=\S) {3,}(?=\S)", " | ", text)

    # 3. Collapse remaining runs of spaces/tabs and drop trailing spaces, so
    #    the lookbehind in step 4 sees the real last character of each line.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" +\n", "\n", text)

    # 4. Merge broken lines inside paragraphs:
    #    if a line does not end with . ? ! or : and the next line starts with
    #    a lowercase Latin letter, a Hebrew letter or a digit, join them.
    #    A newline that follows another newline (blank line) is never merged.
    text = re.sub(r"(?<=[^\.\!\?:\n])\n(?=[א-תa-z0-9])", " ", text)

    # 5. Collapse multiple newlines into one
    text = re.sub(r"\n{2,}", "\n", text)

    # 6. Normalize bullets
    text = re.sub(r"^[•·●▪️✓✔▶►❖-]+[ \t]+", "- ", text, flags=re.MULTILINE)

    # 7. Replace characters outside ASCII / Hebrew / digits with a space
    text = re.sub(r"[^\x00-\x7F\u0590-\u05FF\d\.\,\-\:\;\|\!\?\(\)\"\'\n ]", " ", text)

    # 8. Step 7 may leave runs of spaces or trailing spaces behind
    text = re.sub(r"[ \t]{2,}", " ", text)
    text = re.sub(r"[ \t]+\n", "\n", text)

    # 9. Final trim
    return text.strip()


In [None]:
# Driver cell: load one HTML report, turn it into clean text and summarize it.
# Requires `model`, `tokenizer` and `device` from the model-loading cell and
# the helper functions defined in the cells that precede this one.
html_path = r"../inputs/1660447.htm"

# Decode the file with encoding detection (see smart_load_html)
html_content = smart_load_html(html_path)

# Step 1: Extract text
raw_text = extract_text_from_html(html_content)

# Step 2: Clean text
clean_text = clean_hebrew_html_text(raw_text)

# Step 3: Summarize the cleaned text chunk by chunk.
# Explicit keyword arguments mirror the function's own defaults for
# chunk_tokens and summary_tokens.
summary = summarize_text_to_text(
    clean_text,
    model=model,
    tokenizer=tokenizer,
    device=device,
    chunk_tokens=300,
    summary_tokens=200
)

# Print the summary
print(summary)


In [31]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def summarize_with_template(
    input_text_or_path,
    template_path,
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=800,
    temperature=0.3
):
    """
    Summarizes input text according to a template.

    Parameters:
    - input_text_or_path: str or Path. A path-like object, or a str ending in
      ".txt", ".htm" or ".html", is read from disk as UTF-8; any other str is
      used as the text itself.
    - template_path: str or Path to template.txt (read as UTF-8)
    - model: Hugging Face model object; defaults to the `model` that existed
      when this function was defined
    - tokenizer: Hugging Face tokenizer object; defaults to the `tokenizer`
      that existed when this function was defined
    - max_new_tokens: number of tokens to generate
    - temperature: creativity control (lower = more strict); a value of 0 or
      None switches to greedy decoding

    Returns:
    - The generated summary only, without the prompt (str)
    """
    import os

    # Step 1: Load input text if given a path
    looks_like_path = isinstance(input_text_or_path, os.PathLike) or (
        isinstance(input_text_or_path, str)
        and input_text_or_path.endswith((".txt", ".htm", ".html"))
    )
    if looks_like_path:
        with open(input_text_or_path, "r", encoding="utf-8") as f:
            input_text = f.read()
    else:
        # Otherwise assume it's already clean text (str)
        input_text = input_text_or_path

    # Step 2: Load template
    with open(template_path, "r", encoding="utf-8") as f:
        template = f.read()

    # Step 3: Build prompt
    prompt = f"""
הטקסט הבא הוא הדוח שצריך לסכם בעברית:

{input_text}

אנא סכם את הדוח לפי התבנית הבאה:

{template}
"""

    # Step 4: Tokenize and move the tensors to wherever the model lives, so
    # generation does not fail with a device mismatch. Keeping the whole
    # encoding (not just input_ids) also passes the attention mask along.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Step 5: Generate. Temperature only has an effect when sampling is
    # enabled, so turn sampling on whenever a positive temperature is given.
    generation_kwargs = {"max_new_tokens": max_new_tokens}
    if temperature and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature)
    else:
        generation_kwargs["do_sample"] = False
    output = model.generate(**inputs, **generation_kwargs)

    # Step 6: Decode only the newly generated tokens; the returned sequence
    # starts with the prompt, which is not part of the summary.
    summary = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    return summary.strip()


In [None]:
# Driver cell: summarize a PDF report according to a template file.
# Requires `extract_text_from_pdf`, `clean_hebrew_pdf_text` and
# `summarize_with_template` from the cells that define them.
pdf_path = r"../inputs/אבגד מקוצר מאוד.pdf"
template_path = r"../inputs/שינוי החזקות בע נ.משרהץ.txt"
# Extract the raw text from the PDF
raw_text = extract_text_from_pdf(pdf_path)

# Clean the text, then summarize it with the function's default model,
# tokenizer, max_new_tokens and temperature.
# NOTE(review): `summary` is assigned but never printed or displayed in this cell.
clean_text = clean_hebrew_pdf_text(raw_text)
summary = summarize_with_template(clean_text,template_path)