In [5]:
import pandas as pd
import requests

# Load the train / validation splits of the irishman dataset directly from
# their URLs (pandas performs the download itself).
train_df = pd.read_json("https://huggingface.co/datasets/sander-wood/irishman/resolve/main/train.json")
valid_df = pd.read_json("https://huggingface.co/datasets/sander-wood/irishman/resolve/main/validation.json")

# Load the leadsheet IDs, which are listed separately for each split.
leadsheet_ids_url = "https://huggingface.co/datasets/sander-wood/irishman/resolve/main/leadsheet_ids.json"
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error as such instead of letting an error
# page reach .json() and fail with an unrelated decode error.
leadsheet_ids_response = requests.get(leadsheet_ids_url, timeout=60)
leadsheet_ids_response.raise_for_status()
leadsheet_ids_json = leadsheet_ids_response.json()

# The IDs these sets are matched against are parsed out of "X:" header lines
# and are therefore always strings; normalise to str so membership tests do
# not depend on how the JSON file happens to encode the IDs.
train_leadsheet_ids = {str(song_id) for song_id in leadsheet_ids_json["train"]}
valid_leadsheet_ids = {str(song_id) for song_id in leadsheet_ids_json["validation"]}

# Extract ID from abc text
def extract_id(abc_text):
    """Return the tune ID from the first ``X:`` line of an ABC tune.

    Args:
        abc_text: The ABC notation of one tune. Non-string values (for
            example ``None`` or NaN coming from a missing DataFrame cell)
            are accepted and treated as having no ID.

    Returns:
        The text following ``X:`` on the first line that starts with it,
        stripped of surrounding whitespace, or ``None`` when there is no
        such line or *abc_text* is not a string.
    """
    # Missing cells show up as None/NaN when this is applied over a column;
    # report "no ID" for them instead of raising AttributeError.
    if not isinstance(abc_text, str):
        return None
    for line in abc_text.splitlines():
        if line.startswith("X:"):
            return line[2:].strip()
    return None

# Tag every tune in both splits with the ID parsed from its "X:" header
for split_df in (train_df, valid_df):
    split_df["song_id"] = split_df["abc notation"].apply(extract_id)

# Within each split, keep only the tunes whose ID is listed as a leadsheet
# for that same split
train_leadsheets = train_df.loc[train_df["song_id"].isin(train_leadsheet_ids)]
valid_leadsheets = valid_df.loc[valid_df["song_id"].isin(valid_leadsheet_ids)]

# Stack the two filtered splits (train first) under a fresh 0..n-1 index
all_leadsheets = pd.concat(
    [train_leadsheets, valid_leadsheets],
    ignore_index=True,
)

# Tunes are written out separated by one blank line
abc_text = "\n\n".join(all_leadsheets["abc notation"].tolist())

with open("leadsheets.abc", mode="w", encoding="utf-8") as out_file:
    out_file.write(abc_text)

print(f"Wrote {len(all_leadsheets)} leadsheet songs to leadsheets.abc")


Wrote 34211 leadsheet songs to leadsheets.abc
