In [3]:
import os
os.getcwd()
# os.chdir('./Elsevier/')

'e:\\0 Main Codes\\Refs\\Elsevier'

# Order cite by section from latex file

In [17]:
import re

def split_bib_by_section(tex_path: str, bib_path: str, out_bib_path: str):
    r"""
    Reads a LaTeX file (tex_path) and a BibTeX file (bib_path),
    then writes a new BibTeX file (out_bib_path) containing only
    those entries first cited in each \section{…} of the .tex,
    grouped and commented by section, **with no empty lines inside entries**.
    """
    # 1. Read files
    tex = open(tex_path,  encoding='utf-8').read()
    bib = open(bib_path, encoding='utf-8').read()

    # 2. Split into sections
    sec_re = re.compile(r'\\section\{([^}]+)\}')
    matches = list(sec_re.finditer(tex))
    sections = []
    if matches and matches[0].start() > 0:
        sections.append(('Preamble', tex[:matches[0].start()]))
    for i, m in enumerate(matches):
        title = m.group(1).strip()
        start = m.end()
        end   = matches[i+1].start() if i+1 < len(matches) else len(tex)
        sections.append((title, tex[start:end]))
    if not matches:
        sections = [('Document', tex)]

    # 3. Gather first-time citations per section
    cited = set()
    section_order = []
    cite_re = re.compile(r'\\cite\w*\{([^}]+)\}')
    for title, content in sections:
        keys_in_sec = []
        for g in cite_re.findall(content):
            for key in g.split(','):
                key = key.strip()
                if key and key not in cited:
                    cited.add(key)
                    keys_in_sec.append(key)
        if keys_in_sec:
            section_order.append((title, keys_in_sec))

    # 4. Load all Bib entries into a dict
    bib_entries = {}
    for entry in re.split(r'(?=@)', bib):
        m = re.match(r'@\w+\{([^,]+),', entry)
        if m:
            bib_entries[m.group(1).strip()] = entry.strip()

    # 5. Write filtered .bib (cleaning out blank lines within entries)
    with open(out_bib_path, 'w', encoding='utf-8') as out:
        for title, keys in section_order:
            out.write(f"% Section: {title}\n")
            for k in keys:
                raw_entry = bib_entries.get(k)
                if raw_entry:
                    # remove any entirely blank lines
                    lines = raw_entry.splitlines()
                    clean = [ln for ln in lines if ln.strip()]
                    out.write("\n".join(clean) + "\n\n")
                else:
                    out.write(f"% WARNING: no entry for '{k}'\n\n")

    print(f"Done! {len(cited)} unique citations written to {out_bib_path}")

split_bib_by_section(
    tex_path='main.tex',
    bib_path='Refs.bib',
    out_bib_path='cited_by_section.bib'
)


Done! 35 unique citations written to cited_by_section.bib


# Update by CrossREF

In [20]:
import re
import time
import random
import requests
import string

def normalize_title(t: str) -> str:
    """
    Lowercase, strip out everything except letters, digits and spaces,
    collapse multiple spaces into one.
    """
    allowed = set(string.ascii_lowercase + string.digits + " ")
    t = t.lower()
    t = "".join(ch for ch in t if ch in allowed)
    return " ".join(t.split())

def update_bib_with_crossref_strict(input_bib_path: str, output_bib_path: str):
    text = open(input_bib_path, encoding='utf-8').read()
    # split by your Section headers
    blocks = re.split(r'(?=%\s*Section:)', text)
    cache = {}
    out = []

    for block in blocks:
        if not block.strip():
            continue
        # keep anything before the first "% Section:"
        if not block.lstrip().startswith('% Section:'):
            out.append(block)
            continue

        # Separate header line from the entries
        header, *rest = block.splitlines()
        header += "\n"
        content = "\n".join(rest)
        raw_entries = re.split(r'(?=@\w+\{)', content)
        updated_entries = []

        for raw in raw_entries:
            entry = raw.strip()
            if not entry or not entry.startswith('@'):
                continue

            # pull out entry type and your original key
            m = re.match(r'@(\w+)\{([^,]+),', entry)
            if not m:
                updated_entries.append(entry)
                continue
            ent_type, orig_key = m.group(1), m.group(2)

            # pull title / authors / year
            t_m = re.search(r'title\s*=\s*\{([^}]+)\}', entry, re.I)
            a_m = re.search(r'author\s*=\s*\{([^}]+)\}', entry, re.I)
            y_m = re.search(r'year\s*=\s*\{?(\d{4})\}?', entry, re.I)

            if not t_m:
                print(f"[✗] No title for key '{orig_key}', keeping original.")
                updated_entries.append(entry)
                continue

            raw_title = t_m.group(1).strip()
            norm_title = normalize_title(raw_title)

            # if we've done this title before, reuse whatever we got
            if norm_title in cache:
                updated_entries.append(cache[norm_title])
                continue

            doi = None
            # 1) title‐based search
            try:
                r1 = requests.get(
                    'https://api.crossref.org/works',
                    params={'query.title': raw_title, 'rows': 1}
                )
                if r1.ok:
                    items = r1.json().get('message', {}).get('items', [])
                    if items:
                        cand = items[0]
                        cand_title = cand.get('title', [''])[0]
                        if normalize_title(cand_title) == norm_title:
                            doi = cand.get('DOI')
            except Exception as e:
                print(f"[✗] Title‐search error for '{raw_title}': {e}")

            # 2) fallback author+year, but **still** require title match
            if not doi and a_m and y_m:
                first_author = a_m.group(1).split(' and ')[0].split()[-1]
                year = y_m.group(1)
                try:
                    r2 = requests.get(
                        'https://api.crossref.org/works',
                        params={
                            'query.author': first_author,
                            'filter': f'from-pub-date:{year},until-pub-date:{year}',
                            'rows': 1
                        }
                    )
                    if r2.ok:
                        items2 = r2.json().get('message', {}).get('items', [])
                        if items2:
                            cand = items2[0]
                            cand_title2 = cand.get('title', [''])[0]
                            if normalize_title(cand_title2) == norm_title:
                                doi = cand.get('DOI')
                except Exception as e:
                    print(f"[✗] Fallback error for '{raw_title}': {e}")

            # 3) if we got a DOI, fetch fresh BibTeX and force your key
            if doi:
                try:
                    r3 = requests.get(
                        f'https://doi.org/{doi}',
                        headers={'Accept': 'application/x-bibtex; charset=utf-8'}
                    )
                    if r3.ok:
                        fresh = r3.text
                        # swap in your original key
                        fresh = re.sub(
                            r'(@\w+\{)[^,]+,',
                            fr'\1{orig_key},',
                            fresh, count=1
                        )
                        new_entry = fresh.strip()
                    else:
                        print(f"[✗] Couldn’t fetch BibTeX for DOI {doi}, keeping original.")
                        new_entry = entry
                except Exception as e:
                    print(f"[✗] Fetch error for DOI {doi}: {e}")
                    new_entry = entry
            else:
                # no valid DOI/title‐match
                print(f"[✗] No exact match for “{raw_title}”, keeping original.")
                new_entry = entry

            cache[norm_title] = new_entry
            updated_entries.append(new_entry)
            time.sleep(random.uniform(10, 30))  # be polite

        # reassemble this section
        out.append(header)
        for e in updated_entries:
            out.append(e + "\n\n")

    # write out
    with open(output_bib_path, 'w', encoding='utf-8') as f:
        f.write("".join(out))

    print(f"\n✅ Updated .bib written to: {output_bib_path}")


update_bib_with_crossref_strict(
    input_bib_path='cited_by_section.bib',
    output_bib_path='crossRef_cited_by_section.bib'
)


[✗] No exact match for “Big data and health”, keeping original.
[✗] No exact match for “Publicly available clinical BERT embeddings”, keeping original.
[✗] No exact match for “Effect of a Predictive Model on Planned Surgical Duration Accuracy, Patient Wait Time, and Use of Presurgical Resources: A Randomized Clinical Trial”, keeping original.

✅ Updated .bib written to: crossRef_cited_by_section.bib


# Abbreviation

In [None]:
import re
import pandas as pd

def load_ltwa_mapping(csv_path: str) -> dict:
    """
    Load LTWA mappings from a TSV file ('WORD','ABBREVIATION','LANGUAGES'),
    filter for English entries, and return {word_lower: abbreviation}.
    """
    df = pd.read_csv(csv_path, sep='\t', engine='python', dtype=str)
    df = df.dropna(subset=['ABBREVIATION'])
    df_en = df[df['LANGUAGES'].str.contains('English', case=False, na=False)]
    return {row['WORD'].lower(): row['ABBREVIATION'] for _, row in df_en.iterrows()}

def find_abbreviation(word: str, mapping: dict) -> str:
    """
    Try to find an abbreviation for `word` in `mapping`:
    1. Exact match
    2. With trailing '-'
    3. Iteratively truncate last char + '-'
    If none found, return the original word.
    """
    original = word
    key = word.lower()
    # 1) Exact
    if key in mapping and mapping[key]:
        return mapping[key]
    # 2) Trailing dash
    dash_key = f"{key}-"
    if dash_key in mapping and mapping[dash_key]:
        return mapping[dash_key]
    # 3) Truncate + dash
    truncated = key
    while len(truncated) > 1:
        truncated = truncated[:-1]
        tr_key = f"{truncated}-"
        if tr_key in mapping and mapping[tr_key]:
            return mapping[tr_key]
    # Fallback
    return original

def abbreviate_journal_title(title: str, mapping: dict) -> str:
    """
    Abbreviate a full journal title by applying `find_abbreviation` to each word,
    preserving punctuation.
    """
    tokens = re.split(r'(\W+)', title)
    return ''.join(
        find_abbreviation(tok, mapping) if re.match(r'\w+', tok) else tok
        for tok in tokens
    )

def abbreviate_journals_in_bib(input_bib_path: str, output_bib_path: str, ltwa_csv_path: str):
    """
    Read a .bib file, abbreviate all `journal = {...}` fields using LTWA rules,
    print each change (old vs new), and write out a new .bib file.
    """
    # Load the mapping
    mapping = load_ltwa_mapping(ltwa_csv_path)

    # Read .bib content
    with open(input_bib_path, 'r', encoding='utf-8') as f:
        bib_text = f.read()

    # Replace journal titles and print changes
    journal_re = re.compile(r'(journal\s*=\s*\{)([^}]+)(\})', flags=re.IGNORECASE)

    def repl(m):
        prefix, full_title, suffix = m.groups()
        abbr = abbreviate_journal_title(full_title, mapping)
        if abbr != full_title:
            print(f"Replaced journal title:\n  Old: {full_title}\n  New: {abbr}\n")
        return f"{prefix}{abbr}{suffix}"

    updated = journal_re.sub(repl, bib_text)

    # Write the result
    with open(output_bib_path, 'w', encoding='utf-8') as f:
        f.write(updated)

    print(f"✅ Abbreviated journals written to '{output_bib_path}'")

# Example usage:
abbreviate_journals_in_bib(
    input_bib_path='crossRef_cited_by_section.bib',
    output_bib_path='abbrev_crossRef_cited_by_section.bib',
    ltwa_csv_path='ltwa_current.csv'
)

def fix_encoding_artifacts(input_bib: str, output_bib: str):
    """
    Reads input_bib, replaces:
      • 'â€“' → '-'
      • 'â€™' → "'"
    and writes the cleaned text to output_bib.
    """
    with open(input_bib, 'r', encoding='utf-8') as fin:
        text = fin.read()

    # Define all your replacement pairs here
    replacements = {
        'â€“': '-',
        'â€™': "'",
    }

    # Apply each replacement
    for old, new in replacements.items():
        text = text.replace(old, new)

    with open(output_bib, 'w', encoding='utf-8') as fout:
        fout.write(text)

    print(f"Cleaned file written to: {output_bib}")


# adjust filenames as needed:
fix_encoding_artifacts(
    input_bib='abbrev_crossRef_cited_by_section.bib',
    output_bib='abbrev_crossRef_cited_by_section.bib')



Replaced journal title:
  Old: JAMA Surgery
  New: JAMA surg.

Replaced journal title:
  Old: Mayo Clinic Proceedings: Innovations, Quality &amp; Outcomes
  New: Mayo clin. proc.: Innovations, qual. &amp; Outcomes

Replaced journal title:
  Old: European Journal of Operational Research
  New: European j. of Operational res.

Replaced journal title:
  Old: Computer Methods and Programs in Biomedicine
  New: comput. Methods and Programs in biomed.

Replaced journal title:
  Old: Journal of Combinatorial Optimization
  New: j. of comb. optim.

Replaced journal title:
  Old: The Lancet Digital Health
  New: The Lancet Digital heal.

Replaced journal title:
  Old: Journal of Medical Systems
  New: j. of med. syst.

Replaced journal title:
  Old: Surgical Innovation
  New: surg. Innovation

Replaced journal title:
  Old: Nature Communications
  New: nat. commun.

Replaced journal title:
  Old: BMC Medical Research Methodology
  New: BMC med. res. methodol.

Replaced journal title:
  Old: arX