In [4]:
import re

# Matches a line that opens a BibTeX field: optional leading whitespace, a
# field name, optional whitespace, then "=". Hyphens are accepted in the name
# so that fields such as "date-added" or "bdsk-url-1" are recognised as field
# starts instead of being mistaken for continuation lines of the previous value.
FIELD_START_RE = re.compile(r'^\s*[\w-]+\s*=')

def fix_bib_commas(input_path, output_path):
    """
    Copy a BibTeX file, normalising the field-separator commas of each entry.

    An entry starts at a line whose stripped text begins with "@" and ends at
    the first following line whose stripped text is exactly "}". The lines in
    between are handed to process_entry(), which decides where commas belong.
    Lines outside any entry are copied unchanged.

    Parameters:
        input_path:  path of the .bib file to read (UTF-8).
        output_path: path of the corrected file to write (UTF-8, overwritten).

    Limitation: an entry whose closing brace shares a line with other text
    (e.g. a one-line "@string{...}") is never recognised as closed, so every
    following line is treated as part of that entry's body.
    """
    corrected_lines = []
    entry_header = None  # None while we are outside an entry
    entry_body = []

    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()

            if entry_header is None:
                if stripped.startswith("@"):
                    # Start of an entry: begin collecting its body.
                    entry_header = line
                    entry_body = []
                else:
                    # Outside entries, copy lines as-is.
                    corrected_lines.append(line)
                continue

            if stripped == "}":
                # End of the entry: fix its commas and emit it.
                corrected_lines.extend(
                    process_entry(entry_header, entry_body, line)
                )
                entry_header = None
                entry_body = []
            else:
                entry_body.append(line)

    # The file ended while an entry was still open (no closing "}" line).
    # Emit what was collected unchanged rather than silently dropping it.
    if entry_header is not None:
        corrected_lines.append(entry_header)
        corrected_lines.extend(entry_body)

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(corrected_lines)


def process_entry(header_line, body_lines, closing_line):
    """
    Normalise the trailing commas of one entry and return its lines.

    body_lines holds every line between the "@type{key," header and the
    closing "}" line. Lines matching FIELD_START_RE mark where a field begins;
    a field's block runs up to the line before the next field start (or to the
    end of the body for the last field).

      - Every block except the last: its final non-blank line is made to end
        with a comma.
      - The last block: a trailing comma on its final non-blank line is removed.

    If no field start is found the entry is returned untouched.

    Note: body_lines is modified in place; the returned list is
    [header_line] + body_lines + [closing_line].
    """
    starts = [
        pos for pos, text in enumerate(body_lines)
        if FIELD_START_RE.match(text.strip())
    ]
    if not starts:
        return [header_line] + body_lines + [closing_line]

    def _last_content_line(first, last):
        # Walk backwards over [first, last] to the nearest non-blank line.
        for pos in range(last, first - 1, -1):
            if body_lines[pos].strip() != "":
                return pos
        return None

    # Every field that is followed by another field must end with a comma.
    for first, following in zip(starts, starts[1:]):
        tail = _last_content_line(first, following - 1)
        if tail is None:
            continue
        text = body_lines[tail].rstrip("\n")
        trimmed = text.rstrip()
        if trimmed.endswith(","):
            # Comma already present: keep the line (and its spacing) as it was.
            body_lines[tail] = text + "\n"
        else:
            body_lines[tail] = trimmed + "," + "\n"

    # The final field must not end with a comma.
    tail = _last_content_line(starts[-1], len(body_lines) - 1)
    if tail is not None:
        trimmed = body_lines[tail].rstrip()
        if trimmed.endswith(","):
            trimmed = trimmed[:-1]
        body_lines[tail] = trimmed + "\n"

    return [header_line] + body_lines + [closing_line]


In [5]:
# Run the comma fixer on the downloaded bibliography; the corrected copy is
# written next to it and the input file itself is not modified.
fix_bib_commas(
    input_path="/home/haque/Downloads/papers.bib",
    output_path="/home/haque/Downloads/output.bib",
)

In [7]:
import csv
import re
import unicodedata
from pathlib import Path
from typing import List, Dict, Any


# ----------------------------------------------------
# Image filename and slug helpers
# ----------------------------------------------------

def generate_image_filename(name: str) -> str:
    """
    Build the portrait filename for a member.

    Rule: first letter of the first name followed by the last name (the last
    whitespace-separated token), lower-cased, with everything except a-z and
    0-9 removed, plus ".jpg".
    Example: 'Oliver Turnbull' -> 'oturnbull.jpg'

    Accented letters are reduced to their base letter ('José Álvarez' ->
    'jalvarez.jpg') instead of being dropped. Letters with no Unicode
    decomposition (e.g. 'ø') are still removed.

    Returns "default.jpg" when the name is empty or contains no usable
    characters.
    """
    parts = name.split()
    if not parts:
        return "default.jpg"

    raw = (parts[0][0] + parts[-1]).lower()

    # NFKD splits an accented letter into base letter + combining mark; the
    # mark is then stripped by the character filter, leaving the base letter.
    decomposed = unicodedata.normalize("NFKD", raw)
    stem = re.sub(r"[^a-z0-9]", "", decomposed)

    if not stem:
        # Nothing survived the filter (e.g. punctuation-only name); avoid
        # producing a bare ".jpg".
        return "default.jpg"

    return f"{stem}.jpg"


def slugify_name(name: str) -> str:
    """
    Convert a name to a slug for use in about_<slug>.md filenames.

    The name is lower-cased, accented letters are reduced to their base letter,
    every run of characters other than a-z / 0-9 becomes a single "-", and
    leading/trailing dashes are removed.
    Example: "Mary-Jane O'Neil" -> "mary-jane-o-neil"

    Letters with no Unicode decomposition (e.g. 'ø') still act as separators.
    An input with no usable characters yields an empty string.
    """
    decomposed = unicodedata.normalize("NFKD", name.strip().lower())
    # Drop combining marks before slugging; otherwise each accent would be
    # turned into a dash in the middle of the word.
    base = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # "+" already collapses consecutive separators into one dash.
    slug = re.sub(r"[^a-z0-9]+", "-", base)
    return slug.strip("-")


# ----------------------------------------------------
# Parsing functions
# ----------------------------------------------------

def parse_members_file(file_path: Path) -> List[Dict[str, Any]]:
    """
    Read a comma-separated members file and return its current members.

    The first row is treated as a header and discarded. The first six columns
    of each remaining row are read as name, position, research, link, arrived
    and left; short rows are padded with empty strings. Rows with an empty
    first column are skipped, as are former members (a "left" value other
    than "" or "-").

    Each returned dict carries the stripped name, position, research, link and
    arrived values plus two derived fields: "image" (portrait filename) and
    "slug" (used for the about_<slug>.md filename).
    """
    current = []
    with Path(file_path).open(newline="", encoding="utf-8") as handle:
        rows = csv.reader(handle)
        next(rows, None)  # discard the header row

        for raw in rows:
            if not raw or not raw[0].strip():
                continue

            # Pad short rows so that six columns can always be unpacked.
            padded = raw + [""] * (6 - len(raw))
            name, position, research, link, arrived, left = (
                cell.strip() for cell in padded[:6]
            )

            # Anything other than a blank or "-" departure marks a former member.
            if left not in ("", "-"):
                continue

            current.append({
                "name": name,
                "position": position,
                "research": research,
                "link": link,
                "arrived": arrived,
                "image": generate_image_filename(name),
                "slug": slugify_name(name),
            })

    return current


def merge_members(file_paths: List[str]) -> List[Dict[str, Any]]:
    """
    Parse every file in file_paths and combine the members into one list.

    Two records are considered the same member when their name, position and
    arrival value all match; only the first occurrence is kept, and the
    order of first appearance is preserved.
    """
    unique = {}
    for path in file_paths:
        for member in parse_members_file(path):
            identity = (member["name"], member["position"], member["arrived"])
            # setdefault keeps the record that was seen first for this identity.
            unique.setdefault(identity, member)

    return list(unique.values())


# ----------------------------------------------------
# Markdown Generation
# ----------------------------------------------------

def generate_profiles_markdown(members: List[Dict[str, Any]]) -> str:
    """
    Return the text of profiles.md: a front-matter block listing every member.

    Members are ordered by arrival year (ascending) and then by name; an
    arrival value that cannot be read as an integer sorts after all real
    years. Profiles alternate between left and right alignment.

    Each member dict must provide the keys "name", "position", "research",
    "link", "arrived", "image" and "slug". The research and link paragraphs
    are omitted when the value is empty or "#"; the "Member since" paragraph
    is omitted when "arrived" is empty.
    """

    def sort_key(m):
        # Only conversion failures fall back to the sentinel year; anything
        # else (e.g. KeyboardInterrupt) is allowed to propagate.
        try:
            yr = int(m["arrived"])
        except (TypeError, ValueError):
            yr = 9999
        return (yr, m["name"])

    lines = [
        "---\n",
        "layout: profiles\n",
        "permalink: /people/\n",
        "title: people\n",
        "description: members of the lab or group\n",
        "nav: true\n",
        "nav_order: 7\n\n",
        "profiles:\n",
    ]

    for i, m in enumerate(sorted(members, key=sort_key)):
        align = "left" if i % 2 == 0 else "right"

        lines.append(f"  - align: {align}\n")
        lines.append(f"    image: {m['image']}\n")
        lines.append(f"    content: about_{m['slug']}.md\n")
        lines.append("    image_circular: true\n")
        lines.append("    more_info: >\n")
        lines.append(f"      <p>{m['position']}</p>\n")

        # "#" is treated as a placeholder meaning "no value".
        if m["research"] and m["research"] != "#":
            lines.append(f"      <p>Research area: {m['research']}</p>\n")

        if m["link"] and m["link"] != "#":
            lines.append(f"      <p><a href=\"{m['link']}\">{m['link']}</a></p>\n")

        if m["arrived"]:
            lines.append(f"      <p>Member since {m['arrived']}</p>\n")

    lines.append("---\n")
    return "".join(lines)


def write_profiles_md(output_path: str, markdown_text: str):
    """Write markdown_text to output_path as UTF-8, replacing any existing file."""
    destination = Path(output_path)
    destination.write_text(markdown_text, encoding="utf-8")


In [None]:
from pathlib import Path

# Member files to merge, in priority order: when the same member (same name,
# position and arrival value) appears in several files, the record from the
# earliest file in this list is the one that is kept.
files = [
    "/home/haque/Downloads/members_all.txt",
    "/home/haque/Downloads/members_visitor.txt",
    # "members_extra.txt",
]

# Parse every file, keep current members only, and drop duplicates.
members = merge_members(files)

# Render the front-matter text for the people page.
md_text = generate_profiles_markdown(members)

# Relative path: the file is written to the notebook's current working directory.
write_profiles_md("profiles.md", md_text)

md_text  # display in notebook
