In [5]:
#%%writefile app.py

import re

greek_map = {
    'alpha': 'a', 'beta': 'b', 'gamma': 'g', 'delta': 'd', 'epsilon': 'e',
    'zeta': 'z', 'eta': 'e', 'theta': 't', 'iota': 'i', 'kappa': 'k',
    'lambda': 'l', 'mu': 'm', 'nu': 'n', 'xi': 'x', 'omicron': 'o',
    'pi': 'p', 'rho': 'r', 'sigma': 's', 'tau': 't', 'upsilon': 'u',
    'phi': 'p', 'chi': 'c', 'psi': 'p', 'omega': 'o',
    # Add common Uppercase Greek commands if needed, mapping to lowercase
    'Gamma': 'g', 'Delta': 'd', 'Theta': 't', 'Lambda': 'l', 'Xi': 'x',
    'Pi': 'p', 'Sigma': 's', 'Upsilon': 'u', 'Phi': 'p', 'Psi': 'p', 'Omega': 'o'
    # Add others if required
}

# 2. Parse Abbreviation String
def get_abbr_repr_letters(abbr_string):
    """
    Parses an abbreviation string containing potential LaTeX Greek letters
    and uppercase letters. Returns a list of representative lowercase chars.
    e.g., "$\alpha$-SP" -> ['a', 's', 'p']
    """
    representative_letters = []
    # Pattern finds \command OR single Uppercase letter
    # It captures the command name (e.g., 'alpha') in group 1 OR the uppercase letter in group 2
    findings = re.findall(r'\\([a-zA-Z]+)|([A-Z])', abbr_string)

    for greek_cmd, upper_letter in findings:
        if greek_cmd: # Matched \command (e.g., greek_cmd == 'alpha')
            if greek_cmd in greek_map:
                representative_letters.append(greek_map[greek_cmd]) # Add lowercase representation
        elif upper_letter: # Matched single uppercase letter (e.g., upper_letter == 'S')
             representative_letters.append(upper_letter.lower()) # Add lowercase representation

    return representative_letters

def extract_abbreviations(text, require_first_last_match=True):
    """
    Best-effort abbreviation extraction supporting LaTeX Greek letters.

    Extracts inclusive slice covering first/last matched words based on
    representative characters (Greek mapped to first letter, others lowercase).
    """


    # 3. Modify Regex slightly for $, \ characters
    # Allows $, \ within words and abbreviation. Group 2 captures full abbr string.
    pattern = re.compile(r'((?:[\w\-\$\\]+\s+){1,10})\(([A-Za-z\-\$\\]{2,})\)')
    matches = pattern.findall(text)

    abbreviation_dict = {}

    for match in matches:
        words_before_abbr_text = match[0].strip()
        # Original split seems okay: '$\alpha$-Synclein Protein' -> ['$\\alpha$-Synclein', 'Protein']
        words_ahead = [word for word in re.split(r'\s+|(?<=-)(?=[A-Za-z])', words_before_abbr_text) if word]

        abbr_string = match[1] # The raw abbreviation string, e.g., "$\alpha$-SP"
        # Get representative letters, e.g., ['a', 's', 'p']
        abbr_letters = get_abbr_repr_letters(abbr_string)
        num_abbr_letters = len(abbr_letters)

        if not abbr_letters or not words_ahead or num_abbr_letters == 0:
            continue

        # Stores 0-based index
        match_indices = [-1] * num_abbr_letters
        unmatched_abbr_indices = set(range(num_abbr_letters))

        # Iterate through words backwards
        for i, word in enumerate(reversed(words_ahead)):
            original_idx = len(words_ahead) - 1 - i
            if not unmatched_abbr_indices:
                break

            # 4. Determine "Effective First Character" (lowercase)
            effective_char = None
            # Check for $\greek... pattern
            m_dollar = re.match(r'\$\\([a-zA-Z]+)', word)
            if m_dollar and m_dollar.group(1) in greek_map:
                effective_char = greek_map[m_dollar.group(1)]
            else:
                 # Check for \greek... pattern (less common start)
                 m_slash = re.match(r'\\([a-zA-Z]+)', word)
                 if m_slash and m_slash.group(1) in greek_map:
                     effective_char = greek_map[m_slash.group(1)]
                 else:
                     # Standard word handling: find the first ASCII letter
                     # Remove leading/trailing hyphens for this check? No, search within original word.
                     m_first_letter = re.search(r'[a-zA-Z]', word)
                     if m_first_letter:
                         effective_char = m_first_letter.group(0).lower()
                     # If no letter found (e.g., word is just "$"), effective_char remains None

            # 5. Match effective char with abbreviation letters
            if effective_char is not None:
                best_match_abbr_idx = -1
                # Check remaining letters right-to-left
                for abbr_idx in sorted(list(unmatched_abbr_indices), reverse=True):
                    # Compare lowercase effective char with lowercase representative abbr letter
                    if effective_char == abbr_letters[abbr_idx]:
                         best_match_abbr_idx = abbr_idx
                         break

                if best_match_abbr_idx != -1:
                    match_indices[best_match_abbr_idx] = original_idx
                    unmatched_abbr_indices.remove(best_match_abbr_idx)

        # --- Validation & Reconstruction (Keep from best_effort) ---
        successful_match_indices = [idx for idx in match_indices if idx != -1]

        if not successful_match_indices:
            continue

        valid_match = True
        if require_first_last_match:
            # Check if first (idx 0) and last (idx num_abbr_letters - 1) letters were matched
            if match_indices[0] == -1 or match_indices[num_abbr_letters - 1] == -1:
                valid_match = False

        if valid_match:
            min_idx_py = min(successful_match_indices) # 0-based index of earliest word
            max_idx_py = max(successful_match_indices) # 0-based index of latest word

            if min_idx_py <= max_idx_py:
                # Extract the inclusive slice covering the span of matched words
                full_phrase_words_slice = words_ahead[min_idx_py : max_idx_py + 1]

                # Join the slice
                full_name = ''.join(word if i == 0 else (' ' + word if not full_phrase_words_slice[i - 1].endswith('-') else word)
                                    for i, word in enumerate(full_phrase_words_slice))

                # Use the original abbreviation string as the key
                abbreviation_dict[abbr_string] = full_name

    return abbreviation_dict



In [6]:
# @title A Test with An Example of Input Text
text_input = r"""
In this paper, we propose utilizing $\alpha$-residuals-method ($\alpha$-RM) to diagnose Cox PH models. The recent studies by Li et al. 2021 \cite{LiLonghai2021Mdfc} and Wu et al. 2024 \cite{WuTingxuan2024Zdtf} introduced the concept of randomized survival probabilities (RSP) to define $Z$-residuals for diagnosing model assumptions in accelerated failure time (AFT) and shared frailty models. The RSP approach involves replacing the survival probability of a censored failure time with a uniform random number between 0 and the survival probability of the censored time \cite{WuTingxuan2024Zdtf}.

Also consider the effect on Tumor Necrosis Factor-$\alpha$ (TNF-$\alpha$).

Compare $\Sigma$-Algebra ($\Sigma$-A) with others.

The decision was made by the United States of America (USA) representative.

We need to check the Central Processing Unit (CPU) temperature.

This is a state-of-the-art (SOTA) system.

Consult the National Aeronautics and Space Administration (NASA) guidelines.

This Department of Health and Human Services (HHS) policy is new.

Missing words example for North Atlantic Organization (NATO).
"""

def format_abbreviations(abbreviations, format_type):
    if format_type == "nomenclature":
        latex_output = "\\usepackage{nomencl}\n"
        for abbr, full_name in abbreviations.items():
            latex_output += f"\\nomenclature{{{abbr}}}{{{full_name}}}\n"
        return latex_output
    elif format_type == "tabular":
        latex_output = "\\begin{tabular}{ll}\n"
        for abbr, full_name in abbreviations.items():
            latex_output += f"{abbr} & {full_name} \\\\\n"
        latex_output += "\\end{tabular}\n"
        return latex_output
    else: # Default plain text list
        output = ""
        for abbr, full_name in abbreviations.items():
            output += f"{abbr}: {full_name}; "
        return output


print(format_abbreviations(extract_abbreviations(text_input), format_type="tabular"))


\begin{tabular}{ll}
$\alpha$-RM & $\alpha$-residuals-method \\
RSP & randomized survival probabilities \\
AFT & accelerated failure time \\
TNF-$\alpha$ & Also consider the effect on Tumor Necrosis Factor-$\alpha$ \\
$\Sigma$-A & $\Sigma$-Algebra \\
USA & United States of America \\
CPU & Central Processing Unit \\
SOTA & state-of-the-art \\
NASA & National Aeronautics and Space Administration \\
HHS & Health and Human Services \\
NATO & North Atlantic Organization \\
\end{tabular}



In [8]:
# @title Click the ▶️ (play) to Show Interactive Widgets.

import re
import requests
from bs4 import BeautifulSoup
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import HTML

def get_text_from_url(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except requests.exceptions.RequestException as e:
        return f"Error fetching URL: {e}"




def process_input(input_text, format_type):
    if input_text.startswith('http'):
        text = get_text_from_url(input_text)
    else:
        text = input_text

    abbreviations = extract_abbreviations(text)
    formatted_output = format_abbreviations(abbreviations, format_type)

    output_text_box.value = formatted_output
    num_lines = formatted_output.count('\n') + 2
    output_text_box.layout.height = f'{min(num_lines * 20, 400)}px'

def clear_output_area(b):
    output_text_box.value = ''
    output_text_box.layout.height = '100px'

def clear_text_input(b):
    text_box.value = ''

def rerun_format(change):
    global text_box, output_format_dropdown
    process_input(text_box.value, change.new)

text_label = HTML(value='<b>Enter text (Latex Allowed) or URL:</b>')
text_box = widgets.Textarea(
    value=r"""
    In this paper, we propose utilizing $\alpha$-residuals-method ($\alpha$-RM) to diagnose Cox PH models. The recent studies by Li et al. 2021 \cite{LiLonghai2021Mdfc} and Wu et al. 2024 \cite{WuTingxuan2024Zdtf} introduced the concept of randomized survival probabilities (RSP) to define $Z$-residuals for diagnosing model assumptions in accelerated failure time (AFT) and shared frailty models. The RSP approach involves replacing the survival probability of a censored failure time with a uniform random number between 0 and the survival probability of the censored time \cite{WuTingxuan2024Zdtf}.

    Also consider the effect on Tumor Necrosis Factor-$\alpha$ (TNF-$\alpha$).

    Compare $\Sigma$-Algebra ($\Sigma$-A) with others.

    The decision was made by the United States of America (USA) representative.

    We need to check the Central Processing Unit (CPU) temperature.

    This is a state-of-the-art (SOTA) system.

    Consult the National Aeronautics and Space Administration (NASA) guidelines.

    This Department of Health and Human Services (HHS) policy is new.

    Missing words example for North Atlantic Organization (NATO).

    """,
    placeholder='Enter text or URL',
    disabled=False,
    layout=widgets.Layout(width='100%', height='100px')
)

output_label = HTML(value='<b>List of Abbreviations</b>')
output_format_dropdown = widgets.Dropdown(
    options=['plain','nomenclature', 'tabular'],
    value='tabular',
    description='Format:',
)

output_box = widgets.HBox([output_label, output_format_dropdown])

output_text_box = widgets.Textarea(
    value='',
    placeholder='List of Generated Abbreviations',
    disabled=False,
    layout=widgets.Layout(width='100%', height='100px')
)

submit_button = widgets.Button(
    description='Generate Abbreviations',
    disabled=False,
    button_style='primary',
    tooltip='Click to generate abbreviations',
    icon='magic'
)

clear_output_button = widgets.Button(
    description='Clear Output',
    disabled=False,
    button_style='',
    tooltip='Click to clear output',
    icon='times'
)

clear_text_button = widgets.Button(
    description='Clear Input',
    disabled=False,
    button_style='',
    tooltip='Click to clear input',
    icon='times'
)

submit_button.on_click(lambda b: process_input(text_box.value, output_format_dropdown.value))
clear_output_button.on_click(clear_output_area)
clear_text_button.on_click(clear_text_input)

output_format_dropdown.observe(rerun_format, names='value')

input_box_with_clear = widgets.VBox([text_label, widgets.HBox([text_box, clear_text_button])])
output_box_with_clear = widgets.VBox([output_box, widgets.HBox([output_text_box, clear_output_button])])

display(input_box_with_clear, submit_button, output_box_with_clear)



VBox(children=(HTML(value='<b>Enter text (Latex Allowed) or URL:</b>'), HBox(children=(Textarea(value='\n    I…

Button(button_style='primary', description='Generate Abbreviations', icon='magic', style=ButtonStyle(), toolti…

VBox(children=(HBox(children=(HTML(value='<b>List of Abbreviations</b>'), Dropdown(description='Format:', inde…

In [7]:
# @title Working Abbreviation Extracting Functions and Examples

import re
from datetime import datetime

# 1. Greek Letter Mapping (common commands to lowercase first letter)
greek_map = {
    'alpha': 'a', 'beta': 'b', 'gamma': 'g', 'delta': 'd', 'epsilon': 'e',
    'zeta': 'z', 'eta': 'e', 'theta': 't', 'iota': 'i', 'kappa': 'k',
    'lambda': 'l', 'mu': 'm', 'nu': 'n', 'xi': 'x', 'omicron': 'o',
    'pi': 'p', 'rho': 'r', 'sigma': 's', 'tau': 't', 'upsilon': 'u',
    'phi': 'p', 'chi': 'c', 'psi': 'p', 'omega': 'o',
    # Add common Uppercase Greek commands if needed, mapping to lowercase
    'Gamma': 'g', 'Delta': 'd', 'Theta': 't', 'Lambda': 'l', 'Xi': 'x',
    'Pi': 'p', 'Sigma': 's', 'Upsilon': 'u', 'Phi': 'p', 'Psi': 'p', 'Omega': 'o'
    # Add others if required
}

# 2. Parse Abbreviation String
def get_abbr_repr_letters(abbr_string):
    """
    Parses an abbreviation string containing potential LaTeX Greek letters
    and uppercase letters. Returns a list of representative lowercase chars.
    e.g., "$\alpha$-SP" -> ['a', 's', 'p']
    """
    representative_letters = []
    # Pattern finds \command OR single Uppercase letter
    # It captures the command name (e.g., 'alpha') in group 1 OR the uppercase letter in group 2
    findings = re.findall(r'\\([a-zA-Z]+)|([A-Z])', abbr_string)

    for greek_cmd, upper_letter in findings:
        if greek_cmd: # Matched \command (e.g., greek_cmd == 'alpha')
            if greek_cmd in greek_map:
                representative_letters.append(greek_map[greek_cmd]) # Add lowercase representation
        elif upper_letter: # Matched single uppercase letter (e.g., upper_letter == 'S')
             representative_letters.append(upper_letter.lower()) # Add lowercase representation

    return representative_letters

def extract_abbreviations_best_effort_greek(text, require_first_last_match=True):
    """
    Best-effort abbreviation extraction supporting LaTeX Greek letters.

    Extracts inclusive slice covering first/last matched words based on
    representative characters (Greek mapped to first letter, others lowercase).
    """
    # 3. Modify Regex slightly for $, \ characters
    # Allows $, \ within words and abbreviation. Group 2 captures full abbr string.
    pattern = re.compile(r'((?:[\w\-\$\\]+\s+){1,10})\(([A-Za-z\-\$\\]{2,})\)')
    matches = pattern.findall(text)

    abbreviation_dict = {}
    current_date_str = datetime.now().strftime("%Y-%m-%d")

    for match in matches:
        words_before_abbr_text = match[0].strip()
        # Original split seems okay: '$\alpha$-Synclein Protein' -> ['$\\alpha$-Synclein', 'Protein']
        words_ahead = [word for word in re.split(r'\s+|(?<=-)(?=[A-Za-z])', words_before_abbr_text) if word]

        abbr_string = match[1] # The raw abbreviation string, e.g., "$\alpha$-SP"
        # Get representative letters, e.g., ['a', 's', 'p']
        abbr_letters = get_abbr_repr_letters(abbr_string)
        num_abbr_letters = len(abbr_letters)

        if not abbr_letters or not words_ahead or num_abbr_letters == 0:
            continue

        # Stores 0-based index
        match_indices = [-1] * num_abbr_letters
        unmatched_abbr_indices = set(range(num_abbr_letters))

        # Iterate through words backwards
        for i, word in enumerate(reversed(words_ahead)):
            original_idx = len(words_ahead) - 1 - i
            if not unmatched_abbr_indices:
                break

            # 4. Determine "Effective First Character" (lowercase)
            effective_char = None
            # Check for $\greek... pattern
            m_dollar = re.match(r'\$\\([a-zA-Z]+)', word)
            if m_dollar and m_dollar.group(1) in greek_map:
                effective_char = greek_map[m_dollar.group(1)]
            else:
                 # Check for \greek... pattern (less common start)
                 m_slash = re.match(r'\\([a-zA-Z]+)', word)
                 if m_slash and m_slash.group(1) in greek_map:
                     effective_char = greek_map[m_slash.group(1)]
                 else:
                     # Standard word handling: find the first ASCII letter
                     # Remove leading/trailing hyphens for this check? No, search within original word.
                     m_first_letter = re.search(r'[a-zA-Z]', word)
                     if m_first_letter:
                         effective_char = m_first_letter.group(0).lower()
                     # If no letter found (e.g., word is just "$"), effective_char remains None

            # 5. Match effective char with abbreviation letters
            if effective_char is not None:
                best_match_abbr_idx = -1
                # Check remaining letters right-to-left
                for abbr_idx in sorted(list(unmatched_abbr_indices), reverse=True):
                    # Compare lowercase effective char with lowercase representative abbr letter
                    if effective_char == abbr_letters[abbr_idx]:
                         best_match_abbr_idx = abbr_idx
                         break

                if best_match_abbr_idx != -1:
                    match_indices[best_match_abbr_idx] = original_idx
                    unmatched_abbr_indices.remove(best_match_abbr_idx)

        # --- Validation & Reconstruction (Keep from best_effort) ---
        successful_match_indices = [idx for idx in match_indices if idx != -1]

        if not successful_match_indices:
            continue

        valid_match = True
        if require_first_last_match:
            # Check if first (idx 0) and last (idx num_abbr_letters - 1) letters were matched
            if match_indices[0] == -1 or match_indices[num_abbr_letters - 1] == -1:
                valid_match = False

        if valid_match:
            min_idx_py = min(successful_match_indices) # 0-based index of earliest word
            max_idx_py = max(successful_match_indices) # 0-based index of latest word

            if min_idx_py <= max_idx_py:
                # Extract the inclusive slice covering the span of matched words
                full_phrase_words_slice = words_ahead[min_idx_py : max_idx_py + 1]

                # Join the slice
                full_name = ''.join(word if i == 0 else (' ' + word if not full_phrase_words_slice[i - 1].endswith('-') else word)
                                    for i, word in enumerate(full_phrase_words_slice))

                # Use the original abbreviation string as the key
                abbreviation_dict[abbr_string] = full_name

    return abbreviation_dict

# --- Example Usage ---
text1 = r"In this paper, we propose utilizing $\alpha$-residuals-method ($\alpha$-RM) to diagnose Cox PH models. The recent studies by Li et al. 2021 \cite{LiLonghai2021Mdfc} and Wu et al. 2024 \cite{WuTingxuan2024Zdtf} introduced the concept of randomized survival probabilities (RSP) to define $Z$-residuals for diagnosing model assumptions in accelerated failure time (AFT) and shared frailty models. The RSP approach involves replacing the survival probability of a censored failure time with a uniform random number between 0 and the survival probability of the censored time \cite{WuTingxuan2024Zdtf}. The RSPs for $t_{i}$ in the Cox PH model are"
text2 = r"Also consider the effect on Tumor Necrosis Factor-$\alpha$ (TNF-$\alpha$)."
text3 = r"Compare $\Sigma$-Algebra ($\Sigma$-A) with others." # Example with Uppercase Greek

text1 = "The decision was made by the United States of America (USA) representative."
text2 = "We need to check the Central Processing Unit (CPU) temperature."
text3 = "This is a state-of-the-art (SOTA) system."
text4 = "Consult the National Aeronautics and Space Administration (NASA) guidelines."
text5 = "This Department of Health and Human Services (HHS) policy is new."
# Text 6 now has a missing word "Treaty" compared to the abbreviation "NATO"
text6 = "Missing words example for North Atlantic Organization (NATO)."


current_date_str = datetime.now().strftime("%Y-%m-%d")

print(f"Date: {current_date_str}")
abbreviations1 = extract_abbreviations_best_effort_greek(text1)
abbreviations2 = extract_abbreviations_best_effort_greek(text2)
abbreviations3 = extract_abbreviations_best_effort_greek(text3)

print(f"'{text1}' ->\n")
print(format_abbreviations(abbreviations1, format_type="plain"))
# Expected: {'$\\alpha$-SP': '$\\alpha$-Synclein Protein', '$\\beta$-APP': '$\\beta$-Amyloid Precursor Protein'}
print(f"'{text2}' ->\n {abbreviations2}")
# Expected: {'TNF-$\alpha$': 'Tumor Necrosis Factor-$\alpha$'}
print(f"'{text3}' ->\n {abbreviations3}")
# Expected: {'$\\Sigma$-A': '$\\Sigma$-Algebra'}

Date: 2025-03-27
'The decision was made by the United States of America (USA) representative.' ->

USA: United States of America; 
'We need to check the Central Processing Unit (CPU) temperature.' ->
 {'CPU': 'Central Processing Unit'}
'This is a state-of-the-art (SOTA) system.' ->
 {'SOTA': 'state-of-the-art'}
