In [1]:
import re
def normalize_latex_math(text):
    """
    Preprocesses LaTeX text:
    1. Converts LaTeX inline math \( ... \) to $ ... $.
    2. Removes LaTeX comments (% to end of line), respecting \%.
    3. Removes preamble/end tags if \begin{document} is found.
    4. Adds space after opening curly braces ({).
    5. Adds space after lowercase LaTeX commands (\cmd) if not already present.
    6. Adds space after specific uppercase Greek commands (\Cmd) if not present.
    7. Cleans up extra blank lines and trims whitespace.
    """
    if not isinstance(text, str):
        print("Warning: Input to normalize_latex_math was not a string.")
        return text

    processed_text = text
    try:
        # 1. Normalize math \(...\) to $...$
        processed_text = re.sub(
            r'\\\(\s*(.*?)\s*\\\)',
            lambda match: f"${match.group(1).strip()}$",
            processed_text
        )

        # 2. Remove LaTeX comment lines (respects \%)
        processed_text = re.sub(r'(?<!\\)%.*$', '', processed_text, flags=re.MULTILINE)

        # 3. Remove preamble IF \begin{document} exists
        begin_doc_marker = r'\begin{document}'
        begin_doc_index = processed_text.find(begin_doc_marker)
        if begin_doc_index != -1:
            processed_text = processed_text[begin_doc_index + len(begin_doc_marker):]
        # 3b. Remove \end{document} if present near the end
        end_doc_marker = r'\end{document}'
        end_doc_index = processed_text.rfind(end_doc_marker)
        if end_doc_index != -1 and len(processed_text) - end_doc_index < 30:
            processed_text = processed_text[:end_doc_index]

        # --- Spacing Adjustments ---
        # 4. Add space after {
        processed_text = re.sub(r'\{', r'{ ', processed_text)

        # 5. Add space after lowercase commands (\cmd) if not followed by space
        processed_text = re.sub(r'(\\[a-z]+)(?!\s)', r'\1 ', processed_text)

        # --- NEW STEP 6 ---
        # 6. Add space after specific uppercase Greek commands (\Cmd) if not followed by space
        upper_greek_cmds = [
            'Gamma', 'Delta', 'Theta', 'Lambda', 'Xi', 'Pi',
            'Sigma', 'Upsilon', 'Phi', 'Psi', 'Omega'
            ]
        # Create pattern part like: Gamma|Delta|Theta...
        pattern_part = '|'.join(upper_greek_cmds)
        # Regex captures (\ + one of the commands), checks no following whitespace
        pattern_upper = rf'(\\ (?:{pattern_part}))(?!\s)'
        # Replacement adds back captured command (group 1) + space
        processed_text = re.sub(pattern_upper, r'\1 ', processed_text)
        # --- End NEW STEP 6 ---

        # 7. Clean up potential excessive blank lines
        processed_text = re.sub(r'(\n\s*){2,}', '\n', processed_text)
        # Remove leading/trailing whitespace from the whole result
        processed_text = processed_text.strip()

        # Optional: Collapse multiple spaces (might affect deliberate spacing)
        # processed_text = re.sub(r'[ \t]+', ' ', processed_text)

        return processed_text

    except Exception as e:
        error_message = f"Error during LaTeX text preprocessing: {e}"
        try:
            import streamlit as st
            st.error(error_message)
        except ImportError:
            print(error_message)
        return text # Return original text on error    


In [3]:
normalize_latex_math(r"$\alpha Z$")


'$\\alph a Z$'

In [4]:
import pdb

In [5]:
pdb.runcall(normalize_latex_math, r"$\alpha Z$")

> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(13)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     11 [0;31m    [0;36m7.[0m [0mCleans[0m [0mup[0m [0mextra[0m [0mblank[0m [0mlines[0m [0;32mand[0m [0mtrims[0m [0mwhitespace[0m[0;34m.[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     12 [0;31m    """
[0m[0;32m---> 13 [0;31m    [0;32mif[0m [0;32mnot[0m [0misinstance[0m[0;34m([0m[0mtext[0m[0;34m,[0m [0mstr[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     15 [0;31m        [0;32mreturn[0m [0mtext[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  text


'$\\alpha Z$'


ipdb>  


'$\\alpha Z$'


ipdb>  


'$\\alpha Z$'


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(17)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     15 [0;31m        [0;32mreturn[0m [0mtext[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     16 [0;31m[0;34m[0m[0m
[0m[0;32m---> 17 [0;31m    [0mprocessed_text[0m [0;34m=[0m [0mtext[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     18 [0;31m    [0;32mtry[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     19 [0;31m        [0;31m# 1. Normalize math \(...\) to $...$[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(18)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     16 [0;31m[0;34m[0m[0m
[0m[0;32m     17 [0;31m    [0mprocessed_text[0m [0;34m=[0m [0mtext[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 18 [0;31m    [0;32mtry[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     19 [0;31m        [0;31m# 1. Normalize math \(...\) to $...$[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     20 [0;31m        processed_text = re.sub(
[0m


ipdb>  


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(20)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     18 [0;31m    [0;32mtry[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     19 [0;31m        [0;31m# 1. Normalize math \(...\) to $...$[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 20 [0;31m        processed_text = re.sub(
[0m[0;32m     21 [0;31m            [0;34mr'\\\(\s*(.*?)\s*\\\)'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     22 [0;31m            [0;32mlambda[0m [0mmatch[0m[0;34m:[0m [0;34mf"${match.group(1).strip()}$"[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(21)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     19 [0;31m        [0;31m# 1. Normalize math \(...\) to $...$[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     20 [0;31m        processed_text = re.sub(
[0m[0;32m---> 21 [0;31m            [0;34mr'\\\(\s*(.*?)\s*\\\)'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     22 [0;31m            [0;32mlambda[0m [0mmatch[0m[0;34m:[0m [0;34mf"${match.group(1).strip()}$"[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     23 [0;31m            [0mprocessed_text[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(22)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     20 [0;31m        processed_text = re.sub(
[0m[0;32m     21 [0;31m            [0;34mr'\\\(\s*(.*?)\s*\\\)'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 22 [0;31m            [0;32mlambda[0m [0mmatch[0m[0;34m:[0m [0;34mf"${match.group(1).strip()}$"[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     23 [0;31m            [0mprocessed_text[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     24 [0;31m        )
[0m


ipdb>  


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(23)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     21 [0;31m            [0;34mr'\\\(\s*(.*?)\s*\\\)'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     22 [0;31m            [0;32mlambda[0m [0mmatch[0m[0;34m:[0m [0;34mf"${match.group(1).strip()}$"[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 23 [0;31m            [0mprocessed_text[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     24 [0;31m        )
[0m[0;32m     25 [0;31m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(20)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     18 [0;31m    [0;32mtry[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     19 [0;31m        [0;31m# 1. Normalize math \(...\) to $...$[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 20 [0;31m        processed_text = re.sub(
[0m[0;32m     21 [0;31m            [0;34mr'\\\(\s*(.*?)\s*\\\)'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     22 [0;31m            [0;32mlambda[0m [0mmatch[0m[0;34m:[0m [0;34mf"${match.group(1).strip()}$"[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  processed_text


'$\\alpha Z$'


ipdb>  


'$\\alpha Z$'


ipdb>  


'$\\alpha Z$'


ipdb>  


'$\\alpha Z$'


ipdb>  


'$\\alpha Z$'


ipdb>  


'$\\alpha Z$'


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(27)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     25 [0;31m[0;34m[0m[0m
[0m[0;32m     26 [0;31m        [0;31m# 2. Remove LaTeX comment lines (respects \%)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 27 [0;31m        [0mprocessed_text[0m [0;34m=[0m [0mre[0m[0;34m.[0m[0msub[0m[0;34m([0m[0;34mr'(?<!\\)%.*$'[0m[0;34m,[0m [0;34m''[0m[0;34m,[0m [0mprocessed_text[0m[0;34m,[0m [0mflags[0m[0;34m=[0m[0mre[0m[0;34m.[0m[0mMULTILINE[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     28 [0;31m[0;34m[0m[0m
[0m[0;32m     29 [0;31m        [0;31m# 3. Remove preamble IF \begin{document} exists[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  process_text


*** NameError: name 'process_text' is not defined


ipdb>  processed_text


'$\\alpha Z$'


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(30)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     28 [0;31m[0;34m[0m[0m
[0m[0;32m     29 [0;31m        [0;31m# 3. Remove preamble IF \begin{document} exists[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 30 [0;31m        [0mbegin_doc_marker[0m [0;34m=[0m [0;34mr'\begin{document}'[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     31 [0;31m        [0mbegin_doc_index[0m [0;34m=[0m [0mprocessed_text[0m[0;34m.[0m[0mfind[0m[0;34m([0m[0mbegin_doc_marker[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     32 [0;31m        [0;32mif[0m [0mbegin_doc_index[0m [0;34m!=[0m [0;34m-[0m[0;36m1[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  processed_text


'$\\alpha Z$'


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(31)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     29 [0;31m        [0;31m# 3. Remove preamble IF \begin{document} exists[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     30 [0;31m        [0mbegin_doc_marker[0m [0;34m=[0m [0;34mr'\begin{document}'[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 31 [0;31m        [0mbegin_doc_index[0m [0;34m=[0m [0mprocessed_text[0m[0;34m.[0m[0mfind[0m[0;34m([0m[0mbegin_doc_marker[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     32 [0;31m        [0;32mif[0m [0mbegin_doc_index[0m [0;34m!=[0m [0;34m-[0m[0;36m1[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     33 [0;31m            [0mprocessed_text[0m [0;34m=[0m [0mprocessed_text[0m[0;34m[[0m[0mbegin_doc_index[0m [0;34m+[0m [0mlen[0m[0;34m([0m[0mbegin_doc_marker[0m[0;34m)[0m[0;34m:[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(32)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     30 [0;31m        [0mbegin_doc_marker[0m [0;34m=[0m [0;34mr'\begin{document}'[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     31 [0;31m        [0mbegin_doc_index[0m [0;34m=[0m [0mprocessed_text[0m[0;34m.[0m[0mfind[0m[0;34m([0m[0mbegin_doc_marker[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 32 [0;31m        [0;32mif[0m [0mbegin_doc_index[0m [0;34m!=[0m [0;34m-[0m[0;36m1[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     33 [0;31m            [0mprocessed_text[0m [0;34m=[0m [0mprocessed_text[0m[0;34m[[0m[0mbegin_doc_index[0m [0;34m+[0m [0mlen[0m[0;34m([0m[0mbegin_doc_marker[0m[0;34m)[0m[0;34m:[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     34 [0;31m        [0;31m# 3b. Remove \end{document} if present near the end[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(35)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     33 [0;31m            [0mprocessed_text[0m [0;34m=[0m [0mprocessed_text[0m[0;34m[[0m[0mbegin_doc_index[0m [0;34m+[0m [0mlen[0m[0;34m([0m[0mbegin_doc_marker[0m[0;34m)[0m[0;34m:[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     34 [0;31m        [0;31m# 3b. Remove \end{document} if present near the end[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 35 [0;31m        [0mend_doc_marker[0m [0;34m=[0m [0;34mr'\end{document}'[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     36 [0;31m        [0mend_doc_index[0m [0;34m=[0m [0mprocessed_text[0m[0;34m.[0m[0mrfind[0m[0;34m([0m[0mend_doc_marker[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     37 [0;31m        [0;32mif[0m [0mend_doc_index[0m [0;34m!=[0m [0;34m-[0m[0;36m1[0m [0;32mand[0m [0mlen[0m[0;34m([0m[0mprocessed_text[0m[0;34m

ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(36)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     34 [0;31m        [0;31m# 3b. Remove \end{document} if present near the end[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     35 [0;31m        [0mend_doc_marker[0m [0;34m=[0m [0;34mr'\end{document}'[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 36 [0;31m        [0mend_doc_index[0m [0;34m=[0m [0mprocessed_text[0m[0;34m.[0m[0mrfind[0m[0;34m([0m[0mend_doc_marker[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     37 [0;31m        [0;32mif[0m [0mend_doc_index[0m [0;34m!=[0m [0;34m-[0m[0;36m1[0m [0;32mand[0m [0mlen[0m[0;34m([0m[0mprocessed_text[0m[0;34m)[0m [0;34m-[0m [0mend_doc_index[0m [0;34m<[0m [0;36m30[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     38 [0;31m            [0mprocessed_text[0m [0;34m=[0m [0mprocessed_text[0m[0;34m[[0m[0;34m:[0m[0mend_doc_index[0m[0;34m]

ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(37)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     35 [0;31m        [0mend_doc_marker[0m [0;34m=[0m [0;34mr'\end{document}'[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     36 [0;31m        [0mend_doc_index[0m [0;34m=[0m [0mprocessed_text[0m[0;34m.[0m[0mrfind[0m[0;34m([0m[0mend_doc_marker[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 37 [0;31m        [0;32mif[0m [0mend_doc_index[0m [0;34m!=[0m [0;34m-[0m[0;36m1[0m [0;32mand[0m [0mlen[0m[0;34m([0m[0mprocessed_text[0m[0;34m)[0m [0;34m-[0m [0mend_doc_index[0m [0;34m<[0m [0;36m30[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     38 [0;31m            [0mprocessed_text[0m [0;34m=[0m [0mprocessed_text[0m[0;34m[[0m[0;34m:[0m[0mend_doc_index[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     39 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(42)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     40 [0;31m        [0;31m# --- Spacing Adjustments ---[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     41 [0;31m        [0;31m# 4. Add space after {[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 42 [0;31m        [0mprocessed_text[0m [0;34m=[0m [0mre[0m[0;34m.[0m[0msub[0m[0;34m([0m[0;34mr'\{'[0m[0;34m,[0m [0;34mr'{ '[0m[0;34m,[0m [0mprocessed_text[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     43 [0;31m[0;34m[0m[0m
[0m[0;32m     44 [0;31m        [0;31m# 5. Add space after lowercase commands (\cmd) if not followed by space[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(45)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     43 [0;31m[0;34m[0m[0m
[0m[0;32m     44 [0;31m        [0;31m# 5. Add space after lowercase commands (\cmd) if not followed by space[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 45 [0;31m        [0mprocessed_text[0m [0;34m=[0m [0mre[0m[0;34m.[0m[0msub[0m[0;34m([0m[0;34mr'(\\[a-z]+)(?!\s)'[0m[0;34m,[0m [0;34mr'\1 '[0m[0;34m,[0m [0mprocessed_text[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     46 [0;31m[0;34m[0m[0m
[0m[0;32m     47 [0;31m        [0;31m# --- NEW STEP 6 ---[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  processed_text


'$\\alpha Z$'


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(49)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     47 [0;31m        [0;31m# --- NEW STEP 6 ---[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     48 [0;31m        [0;31m# 6. Add space after specific uppercase Greek commands (\Cmd) if not followed by space[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 49 [0;31m        upper_greek_cmds = [
[0m[0;32m     50 [0;31m            [0;34m'Gamma'[0m[0;34m,[0m [0;34m'Delta'[0m[0;34m,[0m [0;34m'Theta'[0m[0;34m,[0m [0;34m'Lambda'[0m[0;34m,[0m [0;34m'Xi'[0m[0;34m,[0m [0;34m'Pi'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     51 [0;31m            [0;34m'Sigma'[0m[0;34m,[0m [0;34m'Upsilon'[0m[0;34m,[0m [0;34m'Phi'[0m[0;34m,[0m [0;34m'Psi'[0m[0;34m,[0m [0;34m'Omega'[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(54)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     52 [0;31m            ]
[0m[0;32m     53 [0;31m        [0;31m# Create pattern part like: Gamma|Delta|Theta...[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 54 [0;31m        [0mpattern_part[0m [0;34m=[0m [0;34m'|'[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0mupper_greek_cmds[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     55 [0;31m        [0;31m# Regex captures (\ + one of the commands), checks no following whitespace[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     56 [0;31m        [0mpattern_upper[0m [0;34m=[0m [0;34mrf'(\\ (?:{pattern_part}))(?!\s)'[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(56)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     54 [0;31m        [0mpattern_part[0m [0;34m=[0m [0;34m'|'[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0mupper_greek_cmds[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     55 [0;31m        [0;31m# Regex captures (\ + one of the commands), checks no following whitespace[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 56 [0;31m        [0mpattern_upper[0m [0;34m=[0m [0;34mrf'(\\ (?:{pattern_part}))(?!\s)'[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     57 [0;31m        [0;31m# Replacement adds back captured command (group 1) + space[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     58 [0;31m        [0mprocessed_text[0m [0;34m=[0m [0mre[0m[0;34m.[0m[0msub[0m[0;34m([0m[0mpattern_upper[0m[0;34m,[0m [0;34mr'\1 '[0m[0;34m,[0m [0mprocessed_text[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(58)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     56 [0;31m        [0mpattern_upper[0m [0;34m=[0m [0;34mrf'(\\ (?:{pattern_part}))(?!\s)'[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     57 [0;31m        [0;31m# Replacement adds back captured command (group 1) + space[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 58 [0;31m        [0mprocessed_text[0m [0;34m=[0m [0mre[0m[0;34m.[0m[0msub[0m[0;34m([0m[0mpattern_upper[0m[0;34m,[0m [0;34mr'\1 '[0m[0;34m,[0m [0mprocessed_text[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     59 [0;31m        [0;31m# --- End NEW STEP 6 ---[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     60 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(62)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     60 [0;31m[0;34m[0m[0m
[0m[0;32m     61 [0;31m        [0;31m# 7. Clean up potential excessive blank lines[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 62 [0;31m        [0mprocessed_text[0m [0;34m=[0m [0mre[0m[0;34m.[0m[0msub[0m[0;34m([0m[0;34mr'(\n\s*){2,}'[0m[0;34m,[0m [0;34m'\n'[0m[0;34m,[0m [0mprocessed_text[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     63 [0;31m        [0;31m# Remove leading/trailing whitespace from the whole result[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     64 [0;31m        [0mprocessed_text[0m [0;34m=[0m [0mprocessed_text[0m[0;34m.[0m[0mstrip[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(64)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     62 [0;31m        [0mprocessed_text[0m [0;34m=[0m [0mre[0m[0;34m.[0m[0msub[0m[0;34m([0m[0;34mr'(\n\s*){2,}'[0m[0;34m,[0m [0;34m'\n'[0m[0;34m,[0m [0mprocessed_text[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     63 [0;31m        [0;31m# Remove leading/trailing whitespace from the whole result[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 64 [0;31m        [0mprocessed_text[0m [0;34m=[0m [0mprocessed_text[0m[0;34m.[0m[0mstrip[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     65 [0;31m[0;34m[0m[0m
[0m[0;32m     66 [0;31m        [0;31m# Optional: Collapse multiple spaces (might affect deliberate spacing)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(69)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     67 [0;31m        [0;31m# processed_text = re.sub(r'[ \t]+', ' ', processed_text)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     68 [0;31m[0;34m[0m[0m
[0m[0;32m---> 69 [0;31m        [0;32mreturn[0m [0mprocessed_text[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     70 [0;31m[0;34m[0m[0m
[0m[0;32m     71 [0;31m    [0;32mexcept[0m [0mException[0m [0;32mas[0m [0me[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


--Return--
'$\\alph a Z$'
> [0;32m/var/folders/4b/lxf1fvsj6hs_rs1t1kcpm2m0jfgnr5/T/ipykernel_81762/1729876278.py[0m(69)[0;36mnormalize_latex_math[0;34m()[0m
[0;32m     67 [0;31m        [0;31m# processed_text = re.sub(r'[ \t]+', ' ', processed_text)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     68 [0;31m[0;34m[0m[0m
[0m[0;32m---> 69 [0;31m        [0;32mreturn[0m [0mprocessed_text[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     70 [0;31m[0;34m[0m[0m
[0m[0;32m     71 [0;31m    [0;32mexcept[0m [0mException[0m [0;32mas[0m [0me[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


'$\\alph a Z$'

In [6]:
def normalize_latex_math(text, debug=False): # Added debug flag, default is False
    """
    Preprocesses LaTeX text with optional print debugging after each stage.
    1. Converts LaTeX inline math \( ... \) to $ ... $.
    2. Removes LaTeX comments (% to end of line), respecting \%.
    3. Removes preamble/end tags if \begin{document} is found.
    4. Adds space after opening curly braces ({).
    5. Adds space after lowercase LaTeX commands (\cmd) if not already present.
    6. Adds space after specific uppercase Greek commands (\Cmd) if not present.
    7. Cleans up extra blank lines and trims whitespace.
    """
    if debug:
        print("\n--- Entering normalize_latex_math ---")
        # Print initial text snippet using repr() to show special chars like \n
        print(f"Input Text (snippet):\n'''{repr(text[:200])}...'''\n------")

    if not isinstance(text, str):
        print("Warning: Input was not a string.")
        return text

    processed_text = text
    original_text_for_compare = text # Keep original for error return
    try:
        # --- Step 1: Normalize math \(...\) to $...$ ---
        processed_text = re.sub(r'\\\(\s*(.*?)\s*\\\)', lambda m: f"${m.group(1).strip()}$", processed_text)
        if debug: print(f"--- After Step 1 (Math Norm): ---\n{processed_text}\n------")

        # --- Step 2: Remove LaTeX comment lines ---
        processed_text = re.sub(r'(?<!\\)%.*$', '', processed_text, flags=re.MULTILINE)
        if debug: print(f"--- After Step 2 (Comment Removal): ---\n{processed_text}\n------")

        # --- Step 3: Remove preamble/end tags ---
        begin_doc_marker = r'\begin{document}'
        begin_doc_index = processed_text.find(begin_doc_marker)
        if begin_doc_index != -1:
            processed_text = processed_text[begin_doc_index + len(begin_doc_marker):]
        end_doc_marker = r'\end{document}'
        end_doc_index = processed_text.rfind(end_doc_marker)
        if end_doc_index != -1 and len(processed_text) - end_doc_index < 30:
            processed_text = processed_text[:end_doc_index]
        if debug: print(f"--- After Step 3 (Preamble/End Removal): ---\n{processed_text}\n------")

        # --- Step 4: Add space after { ---
        processed_text = re.sub(r'\{', r'{ ', processed_text)
        if debug: print(f"--- After Step 4 (Space after {{): ---\n{processed_text}\n------") # Escaped { in f-string label

        # --- Step 5: Add space after \lowercasecmd ---
        
        processed_text = re.sub(r'(\\[a-z]+)(?!\s)', r'\1 ', processed_text)
        if debug: print(f"--- After Step 5 (Space after \\cmd): ---\n{processed_text}\n------")

        # --- Step 6: Add space after \UppercaseGreek ---
        upper_greek_cmds = ['Gamma', 'Delta', 'Theta', 'Lambda', 'Xi', 'Pi', 'Sigma', 'Upsilon', 'Phi', 'Psi', 'Omega']
        pattern_part = '|'.join(upper_greek_cmds)
        pattern_upper = rf'(\\ (?:{pattern_part}))(?!\s)'
        processed_text = re.sub(pattern_upper, r'\1 ', processed_text)
        if debug: print(f"--- After Step 6 (Space after \\Cmd): ---\n{processed_text}\n------")

        # --- Step 7: Clean up blank lines/strip ---
        processed_text = re.sub(r'(\n\s*){2,}', '\n', processed_text)
        processed_text = processed_text.strip()
        if debug: print(f"--- After Step 7 (Final Cleanup): ---\n{processed_text}\n------")

        return processed_text

    except Exception as e:
        error_message = f"Error during LaTeX text preprocessing: {e}"
        if st: st.error(error_message) # Use conditional st
        else: print(error_message)
        if debug: print("--- ERROR occurred in normalize_latex_math ---")
        return original_text_for_compare # Return original text on error


In [7]:
normalize_latex_math(r"$\alpha Z$",debug=True    )


--- Entering normalize_latex_math ---
Input Text (snippet):
''''$\\alpha Z$'...'''
------
--- After Step 1 (Math Norm): ---
$\alpha Z$
------
--- After Step 2 (Comment Removal): ---
$\alpha Z$
------
--- After Step 3 (Preamble/End Removal): ---
$\alpha Z$
------
--- After Step 4 (Space after {): ---
$\alpha Z$
------
--- After Step 5 (Space after \cmd): ---
$\alph a Z$
------
--- After Step 6 (Space after \Cmd): ---
$\alph a Z$
------
--- After Step 7 (Final Cleanup): ---
$\alph a Z$
------


'$\\alph a Z$'

In [9]:
import re

test_string = r"Some text $\gamma Z$R and $\alpha Z$R here."
pattern = r'(\\[a-z]+)(?!\s)'
replacement = r'\1 ' # The crucial part - ensure NO 'a' here

result = re.sub(pattern, replacement, test_string)

print(f"Original: {repr(test_string)}")
print(f"Result:   {repr(result)}")

# Expected Output should be:
# Original: 'Some text $\\gamma Z$R and $\\alphaZ$R here.'
# Result:   'Some text $\\gamma Z$R and $\\alpha Z$R here.'
# (Note the space added ONLY after \gamma, because Z followed it,
#  and NO space added after \alpha, because Z followed it. And definitely NO 'a')
# CORRECTION TO EXPECTED OUTPUT: Both \gamma and \alpha should match the pattern because Z is not whitespace.
# Expected Output:
# Original: 'Some text $\\gamma Z$R and $\\alphaZ$R here.'
# Result:   'Some text $\\gamma  Z$R and $\\alpha Z$R here.' (Space added after both)



Original: 'Some text $\\gamma Z$R and $\x07lpha Z$R here.'
Result:   'Some text $\\gamm a Z$R and $\x07lpha Z$R here.'
