In [40]:
import re
import html
import os

def extract_genres_from_wikitext(wikitext_content):
    """Return the genre names found in the ``| genre =`` field of an infobox.

    The field value is taken from the first ``| genre =`` occurrence up to the
    next line that starts a new parameter (``|``) or closes the template
    (``}}``).  Markup is then removed in an order that keeps reference text and
    link targets out of the result:

    1. HTML entities are decoded (so ``&lt;ref&gt;`` becomes ``<ref>``).
    2. HTML comments and ``<ref>`` elements are removed *with their content*.
    3. ``{{cite ...}}`` templates are removed; list wrapper templates
       (``flatlist``, ``hlist``, ``nowrap``) lose only their opening marker.
    4. Remaining HTML tags are dropped (their inner text is kept).
    5. Wiki links are reduced to their display text (``[[A|B]]`` -> ``B``).
    6. Leftover citation metadata, stray brackets and bullets are removed.

    Args:
        wikitext_content: Raw wikitext of a page (str).

    Returns:
        A list of unique genre strings in order of first appearance, or an
        empty list when no genre field is found.
    """
    field = re.search(
        r'\|\s*genre\s*=\s*(.*?)(?=\n\s*\||\n}})',
        wikitext_content,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if not field:
        return []

    # Decode entities first so escaped markup is handled by the same rules.
    raw_genres = html.unescape(field.group(1))

    # Comments and references go before the generic tag strip; otherwise only
    # the tags would disappear and the reference text would be kept as a genre.
    raw_genres = re.sub(r'<!--.*?-->', '', raw_genres, flags=re.DOTALL)
    # Self-closing refs first, so a paired match cannot start at <ref .../> and
    # swallow the genres standing between it and a later </ref>.
    raw_genres = re.sub(r'<ref[^>]*/>', '', raw_genres, flags=re.IGNORECASE)
    raw_genres = re.sub(r'<ref(?:(?!/>)[^>])*?>.*?</ref>', '', raw_genres,
                        flags=re.DOTALL | re.IGNORECASE)

    # Citation templates are dropped entirely; wrappers keep their content.
    raw_genres = re.sub(r'\{\{cite.*?\}\}', '', raw_genres,
                        flags=re.DOTALL | re.IGNORECASE)
    raw_genres = re.sub(r'\{\{(?:flatlist|hlist|nowrap)\|?', '', raw_genres,
                        flags=re.IGNORECASE)

    # Remaining HTML tags like <small>, <i>, <br />.
    raw_genres = re.sub(r'<[^>]+>', '', raw_genres)

    # Resolve links while the brackets and the pipe are still intact.
    raw_genres = re.sub(r'\[\[(?:[^|\]]+\|)?([^\]]+)\]\]', r'\1', raw_genres)

    # Remove lines that look like citation metadata
    raw_genres = re.sub(
        r'\b(title|url|access-date|publisher|quote|last|first|website|date|archive-url|archive-date|language)\b.*',
        '',
        raw_genres,
    )

    # Remove trailing explanations like "The following sources refer to..."
    raw_genres = re.sub(r'(The following.*)', '', raw_genres)

    # Remove stray brackets and list bullets
    for token in ('[[', ']]', '{{', '}}', '*'):
        raw_genres = raw_genres.replace(token, '')

    # Split by common delimiters and drop empty fragments
    genres = [part.strip() for part in re.split(r'\||\n|,', raw_genres)]
    genres = [genre for genre in genres if genre]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is reproducible between runs (a set is not).
    return list(dict.fromkeys(genres))

In [42]:
import re, html

def extract_genres_from_wikitext(wikitext_content):
    """Extract a de-duplicated list of genres from an infobox ``genre`` field.

    The value of the first ``| genre =`` parameter is isolated, entity-decoded,
    stripped of comments, references, citation templates, wrapper templates,
    HTML tags and link markup, and then split on newlines, commas, semicolons,
    bullets and pipes.  Fragments that look like citation metadata are
    discarded.  Duplicates are removed case-insensitively; the first spelling
    encountered is the one returned.

    Returns an empty list when the page has no matching genre field.
    """
    field = re.search(r'\|\s*genre\s*=\s*(.*?)(?=\n\s*\||\n}})',
                      wikitext_content, flags=re.DOTALL | re.IGNORECASE)
    if field is None:
        return []

    # Decode entities so escaped markup (&lt;ref&gt;) is treated like real markup.
    text = html.unescape(field.group(1))

    # Ordered (pattern, flags) table; every match is deleted.  The order is
    # significant: self-closing refs must go before paired refs.
    ci = re.IGNORECASE
    ci_multiline = re.DOTALL | re.IGNORECASE
    deletions = (
        (r'<!--.*?-->', re.DOTALL),                                   # HTML comments
        (r'<ref[^>]*/>', ci_multiline),                               # <ref .../>
        (r'<ref(?:(?!/>)[^>])*?>.*?</ref>', ci_multiline),            # <ref ...>...</ref>
        (r'\{\{\s*cite[^{}]*\}\}', ci),                               # {{cite web|...}}
        (r'\{\{\s*harv-?nb?[^{}]*\}\}', ci),                          # {{harvnb}}, {{harv-nb}}
        (r'\{\{\s*(sfn|sfnb|efn|refn)[^{}]*\}\}', ci),                # {{sfn}}, {{refn}}, ...
        (r'\{\{\s*(citation needed|cn)[^{}]*\}\}', ci),               # {{citation needed}}, {{cn}}
        # Wrapper templates: only the opening marker goes, content stays.
        (r'\{\{\s*(flatlist|hlist|nowrap|plainlist|ubl|unbulleted list)\s*\|?', ci),
    )
    for pattern, flags in deletions:
        text = re.sub(pattern, '', text, flags=flags)

    # Leftover braces, then generic HTML tags (<small>, <i>, ...).
    text = text.replace('{{', '').replace('}}', '')
    text = re.sub(r'<[^>]+>', '', text)

    # [[A|B]] -> B ; [[A]] -> A
    text = re.sub(r'\[\[(?:[^|\]]+\|)?([^\]]+)\]\]', r'\1', text)

    # Non-breaking spaces and list bullets become plain spaces.
    text = text.replace('\xa0', ' ').replace('*', ' ')

    metadata_markers = (
        'title=', 'url=', 'access-date', 'publisher=', 'website=',
        'archive-url', 'archive-date', 'language=', 'first=', 'last=',
        'work=', 'date=',
    )

    # Keyed by lowercase label; setdefault keeps the first spelling and dict
    # ordering keeps first-seen order.
    first_spelling = {}
    for fragment in re.split(r'[\n,;•·]+|\s*\|\s*', text):
        label = fragment.strip()
        if not label or any(marker in label.lower() for marker in metadata_markers):
            continue
        # Cut a stranded "citation ..." tail off the label.
        label = re.sub(r'\bcitation\b.*', '', label, flags=re.IGNORECASE).strip()
        if label:
            first_spelling.setdefault(label.lower(), label)

    return list(first_spelling.values())

In [46]:
import re, html

def _extract_infobox_genre_value(wikitext: str) -> str | None:
    """
    Find the 'genre =' value from the first 'Infobox musical artist' (or similar) block,
    scanning forward and respecting simple nesting of {{ }}, [[ ]], and <ref>...</ref>.
    Returns the raw text of the genre value (without the leading 'genre ='), or None.
    """
    # Find the first occurrence of '| genre ='
    m = re.search(r'\|\s*genre\s*=\s*', wikitext, flags=re.IGNORECASE)
    if not m:
        return None

    i = m.end()            # start scanning after 'genre ='
    n = len(wikitext)

    # Nesting counters
    tpl_depth = 0          # for {{ ... }}
    link_depth = 0         # for [[ ... ]]
    in_ref = False         # for <ref ...> ... </ref>
    # We'll treat self-closing <ref .../> immediately.

    out_chars = []
    while i < n:
        ch = wikitext[i]

        # --- Handle <ref .../> and <ref>...</ref> ---
        if not in_ref and wikitext.startswith('<ref', i):
            # Self-closing?
            j = wikitext.find('>', i)
            if j == -1:
                # malformed, just break
                break
            # Determine if self-closing
            # scan forward to the '>' we just found and see if immediately before it is '/'
            if wikitext[i:j+1].rstrip().endswith('/>'):
                # consume the whole <ref .../>
                i = j + 1
                continue
            else:
                # enter paired ref; consume the opening tag and set in_ref
                i = j + 1
                in_ref = True
                continue
        if in_ref:
            # consume until '</ref>'
            close_idx = wikitext.lower().find('</ref>', i)
            if close_idx == -1:
                # malformed; drop the rest
                return ''.join(out_chars)
            i = close_idx + len('</ref>')
            in_ref = False
            continue

        # --- Handle {{ ... }} template nesting ---
        if wikitext.startswith('{{', i):
            tpl_depth += 1
            out_chars.append('{{')
            i += 2
            continue
        if wikitext.startswith('}}', i) and tpl_depth > 0:
            tpl_depth -= 1
            out_chars.append('}}')
            i += 2
            continue

        # --- Handle [[ ... ]] link nesting ---
        if wikitext.startswith('[[', i):
            link_depth += 1
            out_chars.append('[[')
            i += 2
            continue
        if wikitext.startswith(']]', i) and link_depth > 0:
            link_depth -= 1
            out_chars.append(']]')
            i += 2
            continue

        # --- End condition: a new parameter line or end of infobox, but only when not nested ---
        if ch == '\n' and tpl_depth == 0 and link_depth == 0 and not in_ref:
            # Look ahead to next non-space character
            j = i + 1
            while j < n and wikitext[j] in ' \t':
                j += 1
            if j < n and wikitext[j] in ['|', '}']:
                # Stop BEFORE this newline — end of 'genre' value
                break

        # Regular character
        out_chars.append(ch)
        i += 1

    return ''.join(out_chars).strip()


def extract_genres_from_wikitext(wikitext_content: str) -> list[str]:
    """Extract a cleaned list of genres from the infobox ``genre`` parameter.

    The raw parameter value comes from ``_extract_infobox_genre_value``.  It is
    entity-decoded, stripped of comments, references, citation templates,
    wrapper templates, HTML tags and link markup, and then split on newlines,
    commas, semicolons, bullets and pipes.  For each fragment, citation
    metadata leftovers are discarded, a trailing parenthetical qualifier
    (``rock (early)`` -> ``rock``) is removed, and non-word characters are
    trimmed from both ends.

    Returns:
        Genres in order of first appearance, de-duplicated case-insensitively
        (first spelling wins); an empty list when no genre value is found.
    """
    # 1) Get a robust raw block for the genre value
    raw = _extract_infobox_genre_value(wikitext_content)
    if not raw:
        return []

    # 2) Unescape once so &lt;ref&gt; becomes <ref> etc.
    g = html.unescape(raw)

    # 3) Remove HTML comments
    g = re.sub(r'<!--.*?-->', '', g, flags=re.DOTALL)

    # 4) Remove references: self-closing BEFORE paired
    g = re.sub(r'<ref[^>]*/>', '', g, flags=re.DOTALL | re.IGNORECASE)                    # <ref .../>
    g = re.sub(r'<ref(?:(?!/>)[^>])*?>.*?</ref>', '', g, flags=re.DOTALL | re.IGNORECASE) # <ref ...>...</ref>

    # 5) Remove citation/footnote templates entirely
    g = re.sub(r'\{\{\s*cite[^{}]*\}\}', '', g, flags=re.IGNORECASE)                      # {{cite web|...}}
    g = re.sub(r'\{\{\s*harv-?nb?[^{}]*\}\}', '', g, flags=re.IGNORECASE)                 # {{harvnb}}, {{harv-nb}}
    g = re.sub(r'\{\{\s*(sfn|sfnb|efn|refn)[^{}]*\}\}', '', g, flags=re.IGNORECASE)       # {{sfn}}, {{refn}}, etc.
    g = re.sub(r'\{\{\s*(citation needed|cn)[^{}]*\}\}', '', g, flags=re.IGNORECASE)      # {{citation needed}}, {{cn}}

    # 6) Remove wrapper templates but keep inner content
    g = re.sub(r'\{\{\s*(flatlist|hlist|nowrap|plainlist|ubl|unbulleted list)\s*\|?', '', g, flags=re.IGNORECASE)

    # 7) Remove remaining stray '{{' / '}}' and HTML tags like <small>, <i>, <span>
    g = g.replace('{{', '').replace('}}', '')
    g = re.sub(r'<[^>]+>', '', g)

    # 8) Replace wiki links [[A|B]] -> B ; [[A]] -> A
    g = re.sub(r'\[\[(?:[^|\]]+?\|)?([^\]]+)\]\]', r'\1', g, flags=re.DOTALL)

    # 9) Normalize whitespace (incl. NBSP) and bullets
    g = g.replace('\xa0', ' ')
    g = g.replace('*', ' ')

    # 10) Split on common separators (commas/newlines/semicolons/pipes/dots)
    parts = re.split(r'[\n,;•·]+|\s*\|\s*', g)

    metadata_markers = (
        'title=', 'url=', 'access-date', 'publisher=', 'website=',
        'archive-url', 'archive-date', 'language=', 'first=', 'last=',
        'work=', 'date=', 'quote=',
    )

    # 11) Clean and filter
    cleaned = []
    for p in parts:
        t = p.strip()
        if not t:
            continue
        # Drop obvious citation metadata leftovers
        low = t.lower()
        if any(k in low for k in metadata_markers):
            continue
        # Remove stranded 'citation ...' tail if any
        t = re.sub(r'\bcitation\b.*', '', t, flags=re.IGNORECASE).strip()

        # Strip the trailing parenthetical BEFORE trimming the edges: trimming
        # first would eat the closing ')' and leave "rock (early" behind.
        t = re.sub(r'\s*\([^)]*\)\s*$', '', t)
        # A qualifier cut in half by the comma split ("rock (early, later)")
        # leaves an unmatched "(" tail; drop that as well.
        t = re.sub(r'\s*\([^)]*$', '', t)

        # Trim non-word chars at ends
        t = re.sub(r'^[\W_]+|[\W_]+$', '', t)

        if t:
            cleaned.append(t)

    # 12) Dedupe case-insensitively, preserve order
    out, seen = [], set()
    for x in cleaned:
        xl = x.lower()
        if xl not in seen:
            seen.add(xl)
            out.append(x)
    return out


In [48]:
import re, html

def extract_genres_from_wikitext(wikitext_content):
    """Extract genres from the infobox ``genre`` field, keeping parentheticals.

    The value of the first ``| genre =`` parameter is entity-decoded and
    stripped of comments, references, citation templates, wrapper templates,
    HTML tags and link markup, then split on newlines, commas, semicolons,
    bullets and pipes.  Each fragment has surrounding whitespace, dashes,
    commas and colons trimmed; fragments that look like citation metadata are
    dropped.  A label with more ``(`` than ``)`` gets one ``)`` appended.

    Returns the labels in first-seen order, de-duplicated case-insensitively,
    or an empty list when no genre field is found.
    """
    hit = re.search(r'\|\s*genre\s*=\s*(.*?)(?=\n\s*\||\n}})',
                    wikitext_content, flags=re.DOTALL | re.IGNORECASE)
    if hit is None:
        return []
    text = html.unescape(hit.group(1))

    # Markup to delete, in order (self-closing refs before paired refs so a
    # paired match cannot span from one reference into the next).
    ci = re.IGNORECASE
    ci_multiline = re.DOTALL | re.IGNORECASE
    strip_rules = (
        (r'<!--.*?-->', re.DOTALL),
        (r'<ref[^>]*/>', ci_multiline),
        (r'<ref(?:(?!/>)[^>])*?>.*?</ref>', ci_multiline),
        (r'\{\{\s*cite[^{}]*\}\}', ci),
        (r'\{\{\s*harv-?nb?[^{}]*\}\}', ci),
        (r'\{\{\s*(sfn|sfnb|efn|refn)[^{}]*\}\}', ci),
        (r'\{\{\s*(citation needed|cn)[^{}]*\}\}', ci),
        # Wrapper templates lose only their opening marker.
        (r'\{\{\s*(flatlist|hlist|nowrap|plainlist|ubl|unbulleted list)\s*\|?', ci),
    )
    for pattern, flags in strip_rules:
        text = re.sub(pattern, '', text, flags=flags)

    # Remaining braces and HTML tags
    text = text.replace('{{', '').replace('}}', '')
    text = re.sub(r'<[^>]+>', '', text)

    # [[A|B]] -> B ; [[A]] -> A
    text = re.sub(r'\[\[(?:[^|\]]+\|)?([^\]]+)\]\]', r'\1', text)

    # NBSP and bullets become spaces
    text = text.replace('\xa0', ' ').replace('*', ' ')

    noise_keys = (
        'title=', 'url=', 'access-date', 'publisher=', 'website=',
        'archive-url', 'archive-date', 'language=', 'first=', 'last=',
        'work=', 'date=',
    )

    def tidy(fragment):
        """Return the cleaned label for one split fragment, or None to skip it."""
        label = fragment.strip()
        if not label:
            return None
        # Trim only whitespace and common delimiters (parentheses are kept).
        label = re.sub(r'^[\s\-–—,:]+|[\s\-–—,:]+$', '', label)
        lowered = label.lower()
        if any(key in lowered for key in noise_keys):
            return None
        label = re.sub(r'\bcitation\b.*', '', label, flags=re.IGNORECASE).strip()
        if not label:
            return None
        # Close a parenthesis left open by the comma split.
        if label.count('(') > label.count(')'):
            label += ')'
        return label

    # Lowercase key -> first spelling; dict order is first-seen order.
    first_seen = {}
    for fragment in re.split(r'[\n,;•·]+|\s*\|\s*', text):
        label = tidy(fragment)
        if label is not None:
            first_seen.setdefault(label.lower(), label)
    return list(first_seen.values())

In [50]:
import re, html

def extract_genres_from_wikitext(wikitext_content):
    """Extract a lowercase, normalized list of genres from infobox wikitext.

    The value of the first ``| genre =`` parameter is entity-decoded and
    stripped of comments, references, citation templates, wrapper templates,
    HTML tags and link markup, then split on newlines, commas, semicolons,
    bullets and pipes.  Each fragment is normalized: parenthetical qualifiers
    are removed (``rock (early)`` -> ``rock``), surrounding punctuation is
    trimmed, citation metadata leftovers are dropped, the label is lowercased
    and rock-and-roll spellings are unified to ``rock and roll``.

    Returns:
        Lowercase genre labels in order of first appearance without
        duplicates; an empty list when no genre field is found.
    """
    # 1) Isolate the 'genre =' field in the infobox
    m = re.search(r'\|\s*genre\s*=\s*(.*?)(?=\n\s*\||\n}})',
                  wikitext_content, flags=re.DOTALL | re.IGNORECASE)
    if not m:
        return []
    g = m.group(1)

    # 2) Decode HTML entities so &lt;ref&gt; becomes <ref>, etc.
    g = html.unescape(g)

    # 3) Remove HTML comments
    g = re.sub(r'<!--.*?-->', '', g, flags=re.DOTALL)

    # 4) Remove references: self-closing BEFORE paired to avoid cross-line capture
    g = re.sub(r'<ref[^>]*/>', '', g, flags=re.DOTALL | re.IGNORECASE)                      # <ref .../>
    g = re.sub(r'<ref(?:(?!/>)[^>])*?>.*?</ref>', '', g, flags=re.DOTALL | re.IGNORECASE)   # <ref ...>...</ref>

    # 5) Remove citation/footnote templates entirely
    g = re.sub(r'\{\{\s*cite[^{}]*\}\}', '', g, flags=re.IGNORECASE)                        # {{cite web|...}}
    g = re.sub(r'\{\{\s*harv-?nb?[^{}]*\}\}', '', g, flags=re.IGNORECASE)                   # {{harvnb}}, etc.
    g = re.sub(r'\{\{\s*(sfn|sfnb|efn|refn)[^{}]*\}\}', '', g, flags=re.IGNORECASE)         # {{sfn}}, {{refn}}, etc.
    g = re.sub(r'\{\{\s*(citation needed|cn)[^{}]*\}\}', '', g, flags=re.IGNORECASE)        # {{citation needed}}, {{cn}}

    # 6) Drop wrapper templates but keep their inner content
    g = re.sub(r'\{\{\s*(flatlist|hlist|nowrap|plainlist|ubl|unbulleted list)\s*\|?', '',
               g, flags=re.IGNORECASE)

    # 7) Remove remaining braces and HTML tags like <small>, <i>, etc.
    g = g.replace('{{', '').replace('}}', '')
    g = re.sub(r'<[^>]+>', '', g)

    # 8) Replace wiki links [[A|B]] -> B; [[A]] -> A
    g = re.sub(r'\[\[(?:[^|\]]+\|)?([^\]]+)\]\]', r'\1', g)

    # 9) Normalize whitespace (incl. NBSP) and bullets
    g = g.replace('\xa0', ' ')
    g = g.replace('*', ' ')

    # 10) Split on common delimiters
    parts = re.split(r'[\n,;•·]+|\s*\|\s*', g)

    metadata_markers = (
        'title=', 'url=', 'access-date', 'publisher=', 'website=',
        'archive-url', 'archive-date', 'language=', 'first=', 'last=',
        'work=', 'date=',
    )

    def normalize_label(t: str) -> str:
        """Return the normalized label for one fragment, or '' to discard it."""
        t = t.strip()

        # Strip parenthetical qualifiers FIRST (e.g., "rock (early)" -> "rock").
        # Trimming edge punctuation first would remove the closing ')' and the
        # qualifier would no longer be recognised.
        t = re.sub(r'\s*\([^)]*\)', '', t)
        # A qualifier cut in half by the comma split leaves an unmatched "("
        # tail (e.g. "funk metal (early"); drop it too.
        t = re.sub(r'\s*\([^)]*$', '', t)

        # Trim surrounding punctuation
        t = re.sub(r'^[\W_]+|[\W_]+$', '', t)
        if not t:
            return ''

        # Drop obvious citation metadata leftovers
        lo = t.lower()
        if any(k in lo for k in metadata_markers):
            return ''

        # Remove any stranded "citation ..." tails
        t = re.sub(r'\bcitation\b.*', '', t, flags=re.IGNORECASE).strip()

        # Lowercase everything
        t = t.lower()

        # rock & roll / rock and roll / rock 'n' roll / rock ’n’ roll → "rock and roll"
        t = re.sub(r"rock\s*(?:&|and|['`´’]\s*n['`´’])\s*roll", "rock and roll", t)

        # Collapse repeated whitespace
        t = re.sub(r'\s{2,}', ' ', t)

        return t

    cleaned = [normalize_label(p) for p in parts]
    cleaned = [c for c in cleaned if c]

    # 11) Deduplicate (already lowercase) while preserving order
    out, seen = [], set()
    for x in cleaned:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out

In [52]:
import re
import html

def extract_genres_from_wikitext(wikitext_content):
    """
    Extract a cleaned, normalized list of genres from Wikipedia-like wikitext.
    - Removes refs, citation templates, wrapper templates, HTML, and wiki links.
    - Strips all parenthetical qualifiers (e.g., '(early)', '(late)') and broken tails.
    - Lowercases all genres.
    - Normalizes common variants (rock & roll, r&b, hip-hop, post punk, synth pop, etc.).
      Variants are rewritten in place, so qualified genres stay distinct
      ('contemporary r&b' -> 'contemporary rhythm and blues').
    - Drops obvious template/config noise.
    - Deduplicates while preserving order.

    Returns a list of lowercase genre strings; empty when no ``| genre =``
    field is found.
    """
    # 1) Isolate the 'genre =' field in the infobox
    m = re.search(
        r'\|\s*genre\s*=\s*(.*?)(?=\n\s*\||\n}})',
        wikitext_content,
        flags=re.DOTALL | re.IGNORECASE
    )
    if not m:
        return []
    g = m.group(1)

    # 2) Decode HTML entities (&lt;ref&gt; -> <ref>, &amp; -> &, etc.)
    g = html.unescape(g)

    # 3) Remove HTML comments
    g = re.sub(r'<!--.*?-->', '', g, flags=re.DOTALL)

    # 4) Remove references: self-closing BEFORE paired to avoid cross-line capture
    g = re.sub(r'<ref[^>]*/>', '', g, flags=re.DOTALL | re.IGNORECASE)                      # <ref .../>
    g = re.sub(r'<ref(?:(?!/>)[^>])*?>.*?</ref>', '', g, flags=re.DOTALL | re.IGNORECASE)   # <ref ...>...</ref>

    # 5) Remove citation/footnote templates entirely
    g = re.sub(r'\{\{\s*cite[^{}]*\}\}', '', g, flags=re.IGNORECASE)                        # {{cite web|...}}
    g = re.sub(r'\{\{\s*harv-?nb?[^{}]*\}\}', '', g, flags=re.IGNORECASE)                   # {{harvnb}}, {{harv-nb}}
    g = re.sub(r'\{\{\s*(sfn|sfnb|efn|refn)[^{}]*\}\}', '', g, flags=re.IGNORECASE)         # {{sfn}}, {{refn}}, etc.
    g = re.sub(r'\{\{\s*(citation needed|cn)[^{}]*\}\}', '', g, flags=re.IGNORECASE)        # {{citation needed}}, {{cn}}

    # 6) Drop wrapper templates but keep their inner content
    g = re.sub(r'\{\{\s*(flatlist|hlist|nowrap|plainlist|ubl|unbulleted list)\s*\|?', '',
               g, flags=re.IGNORECASE)

    # 7) Remove remaining braces and HTML tags like <small>, <i>, etc.
    g = g.replace('{{', '').replace('}}', '')
    g = re.sub(r'<[^>]+>', '', g)

    # 8) Replace wiki links [[A|B]] -> B; [[A]] -> A
    g = re.sub(r'\[\[(?:[^|\]]+\|)?([^\]]+)\]\]', r'\1', g)

    # 9) Normalize whitespace (incl. NBSP) and bullets
    g = g.replace('\xa0', ' ')
    g = g.replace('*', ' ')

    # 10) Split on common delimiters
    parts = re.split(r'[\n,;•·]+|\s*\|\s*', g)

    # --- Normalization helpers ---

    # Canonical mapping for near-duplicates and synonyms (all keys/values lowercase).
    # The rock-and-roll, r&b and hip-hop spellings are already unified by the
    # regexes below before this map is consulted; their entries are kept as a
    # safety net only.
    CANON_MAP = {
        # rock and roll family
        "rock & roll": "rock and roll",
        "rock and roll": "rock and roll",
        "rock 'n' roll": "rock and roll",
        "rock ’n’ roll": "rock and roll",
        "rock `n` roll": "rock and roll",

        # r&b / rhythm & blues
        "r&b": "rhythm and blues",
        "r & b": "rhythm and blues",
        "rhythm & blues": "rhythm and blues",

        # hip hop
        "hip-hop": "hip hop",
        "hip–hop": "hip hop",

        # hyphen/space variants
        "post punk": "post-punk",
        "synth pop": "synth-pop",
        "dance rock": "dance-rock",
        "blue eyed soul": "blue-eyed soul",

        # AOR
        "aor": "album-oriented rock",
    }

    # Regexes for pattern families
    ROCKNROLL_RX = re.compile(r"rock\s*(?:&|and|['`´’]\s*n['`´’])\s*roll")
    HIPHOP_RX    = re.compile(r"hip\s*[-–]?\s*hop")
    RAND_B_RX    = re.compile(r"(?:^|\b)r\s*&\s*b(?:\b|$)|rhythm\s*&\s*blues")

    # Token acceptability: letters (incl. accents), numbers, space, hyphen, apostrophes/quotes, ampersand, dots
    ALLOWED_RX   = re.compile(r"^[0-9A-Za-zÀ-ÖØ-öø-ÿ\s\-\&'`´’\.]+$")

    # Substrings indicating template/config noise to drop
    DROP_SUBSTRS = [
        'citation', 'cite', 'access-date', 'archive-url', 'archive-date',
        'publisher=', 'website=', 'title=', 'url=', 'first=', 'last=',
        'class=', 'nowrap', 'italic', 'lang', 'script=', 'format='
    ]

    def normalize_label(t: str) -> str:
        """Return the canonical lowercase label for one fragment, or '' to drop it."""
        t = t.strip()

        # Remove parenthetical parts first, even if multiple (e.g., "(early)",
        # "(later)").  Doing this before the trim and the noise check keeps the
        # genre itself when only its qualifier is noisy or leads the label
        # ("(early) rock", "rock (citation needed)").
        t = re.sub(r'\s*\([^)]*\)', '', t).strip()
        # If an unmatched '(' tail remains, drop that too (e.g., "funk metal (early")
        t = re.sub(r'\s*\([^)]*$', '', t).strip()

        # Trim punctuation
        t = re.sub(r'^[\W_]+|[\W_]+$', '', t)
        if not t:
            return ''

        # Drop obvious noise
        lo = t.lower()
        if any(s in lo for s in DROP_SUBSTRS):
            return ''

        # Lowercase
        t = lo

        # Rewrite each spelling family in place so any qualifier around it
        # survives (sub() is a no-op when the pattern is absent).
        t = ROCKNROLL_RX.sub('rock and roll', t)
        t = HIPHOP_RX.sub('hip hop', t)
        # Previously the whole label was overwritten here, which collapsed
        # e.g. "contemporary r&b" into plain "rhythm and blues".
        t = RAND_B_RX.sub('rhythm and blues', t)

        # Apply canonical map (covers aor, synth-pop, post-punk, blue-eyed soul, dance-rock, etc.)
        t = CANON_MAP.get(t, t)

        # Reject invalid tokens (after normalization)
        if not ALLOWED_RX.match(t):
            return ''

        # Collapse repeated spaces and strip stray punctuation
        t = re.sub(r'\s{2,}', ' ', t).strip('. ').strip()

        return t

    cleaned = [normalize_label(p) for p in parts]
    cleaned = [c for c in cleaned if c]

    # Deduplicate while preserving order (already lowercase)
    out, seen = [], set()
    for x in cleaned:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out

In [53]:
# Define the directory where your band data files are located
data_directory = "Bands"
band_genres = {}

# Iterate over the files in the directory; sorted() makes the processing
# order (and therefore any error output) deterministic between runs.
for filename in sorted(os.listdir(data_directory)):
    file_path = os.path.join(data_directory, filename)

    # Subdirectories and hidden entries (.DS_Store, .ipynb_checkpoints, ...)
    # are not band pages; skip them instead of reporting them as errors.
    if filename.startswith('.') or not os.path.isfile(file_path):
        continue

    band_name = os.path.splitext(filename)[0]  # Get band name from filename

    # Only the read can fail for reasons tied to a single file; keeping the
    # parse outside the try block means a parser bug is not silently reduced
    # to a printed line.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error processing file {filename}: {e}")
        continue

    wikitext_content = content
    genres = extract_genres_from_wikitext(wikitext_content)
    band_genres[band_name] = genres

# order the band list alphabetically
band_genres = dict(sorted(band_genres.items()))

# print number of bands that have infoboxes
# (strictly: bands for which at least one genre was extracted)
num_bands_with_infoboxes = sum(1 for genres in band_genres.values() if genres)
print(f"Number of bands with infoboxes: {num_bands_with_infoboxes} \n")

# Print the extracted genres for each band
for band, genres in band_genres.items():
    print(f"{band}: {genres}")

Number of bands with infoboxes: 472 

10_Years__band_: ['alternative metal', 'progressive metal', 'post-grunge', 'nu metal']
10cc: ['art rock', 'art pop', 'progressive pop', 'soft rock', 'pop rock']
311__band_: ['alternative rock', 'rap rock', 'reggae rock', 'funk rock', 'funk metal']
38_Special__band_: ['hard rock', 'southern rock', 'boogie rock', 'blues rock']
3_Doors_Down: ['post-grunge', 'hard rock', 'alternative rock']
ABBA: ['pop', 'disco', 'pop rock', 'europop']
AC_DC: ['hard rock', 'blues rock', 'rock and roll', 'heavy metal']
AFI__band_: ['punk rock', 'gothic rock', 'horror punk', 'post-hardcore', 'emo', 'hardcore punk']
A_Perfect_Circle: ['alternative rock', 'alternative metal', 'hard rock', 'art rock']
Accept__band_: ['heavy metal']
Adam_Ant: ['new wave', 'post-punk', 'alternative rock', 'dance-rock']
Aerosmith: ['hard rock', 'blues rock', 'heavy metal']
Air_Supply: ['soft rock', 'pop rock', 'pop', 'adult contemporary', 'middle-of-the-road']
Alanis_Morissette: ['alternative 