In [8]:
import re
import os

def extract_genres_from_wikitext(wikitext_content):
    """
    Extracts genres from a Wikipedia infobox wikitext.
    """

    # It looks for 'genre =' followed by content, handling flatlist/list items,
    # and captures everything until the next field or end of infobox.
    genre_pattern = re.compile(
        r'\|\s*genre\s*=\s*(?:{{flatlist\||\s*\n\s*\*\s*)?(.*?)(?=\n\s*\||\n}})',
        re.DOTALL | re.IGNORECASE
    )

    # Regex to clean up the extracted genre string
    # - remove [[ and ]] for internal links, but keep the displayed text
    # - remove <ref> tags and their content
    # - remove {{flatlist| and *
    # - trim whitespace
    clean_genre_pattern = re.compile(r'\[\[(?:[^|\]]+\|)?([^\]]+)\]\]|<ref[^>]*>.*?<\/ref>|\{\{flatlist\||\*', re.DOTALL)


    match = genre_pattern.search(wikitext_content)
    if match:
        raw_genres_string = match.group(1).strip() # Access the first capturing group

        # Clean the string
        # This part will remove the wiki-markup for links and references.
        # Example: "[[Post-grunge]]<ref name=allmusic-bio/>" becomes "Post-grunge"
        cleaned_genres_string = clean_genre_pattern.sub(lambda m: m.group(1) if m.group(1) else '', raw_genres_string)

        # Split into individual genres, handle multiple lines/commas, and clean each
        # This regex will split by a newline followed by an asterisk (*), or just a comma, or just a newline.
        # This helps in splitting list-formatted genres.
        genres = [
            genre.strip()
            for genre in re.split(r'\n\*|\n|,', cleaned_genres_string)
            if genre.strip() # Remove any empty strings that might result from splitting
        ]
        return list(set(genres)) # Use set to get unique genres, then convert back to list
    return []

In [11]:
# Define the directory where your band data files are located
data_directory = "Bands"
band_genres = {}

# Iterate over all files in the specified directory
for filename in os.listdir(data_directory):
    file_path = os.path.join(data_directory, filename)
    band_name = os.path.splitext(filename)[0] # Get band name from filename

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

            wikitext_content = content

            genres = extract_genres_from_wikitext(wikitext_content)
            band_genres[band_name] = genres
    except Exception as e:
        print(f"Error processing file {filename}: {e}")

# print number of bands that have infoboxes:
num_bands_with_infoboxes = sum(1 for genres in band_genres.values() if genres)
print(f"Number of bands with infoboxes: {num_bands_with_infoboxes} \n")

# Print the extracted genres for each band
for band, genres in band_genres.items():
    print(f"{band}: {genres}")

Number of bands with infoboxes: 475 

Funkadelic: ['{{hlist|Funk rock|psychedelic funk|acid rock|psychedelic rock}}']
Slayer: ['<!--Do not change genre without discussing on the talk page first.-->Thrash metal']
Ted_Nugent: ['Hard rock']
T__Rex__band_: ['Glam rock', 'pop', '{{nowrap|psychedelic folk (early)}}']
Great_White: ['Hard rock', 'glam metal']
Days_of_the_New: ['Post-grunge', 'alternative metal', 'alternative rock']
The_Dave_Clark_Five: ['{{Hlist|Rock and roll|beat|pop}}']
Anthrax__American_band_: ['heavy metal', '{{nowrap|alternative metal}}', '<!--Do not change these genres without discussing on the talk page first.-->', 'Thrash metal', 'groove metal']
Jimmy_Eat_World: ['power pop', 'Alternative rock', 'pop-punk', 'pop rock', 'emo']
Flogging_Molly: ['folk punk', 'Celtic punk']
Simple_Plan: ['power pop', 'Pop-punk', '<!-- All of these genres are sourced in the musical style section. Please cite a source for or discuss any additions -->', 'alternative rock', 'pop rock', 'emo']
