In [20]:
import pandas as pd
import requests
from tqdm import tqdm
import json
import os

def get_place_work_relationships(place_id, headers, relationship_types=None):
    """
    Fetch all place-work relationships for a given place ID.

    Args:
        place_id (str): The MusicBrainz ID of the place.
        headers (dict): Headers to be used in the API request.
        relationship_types (list, optional): List of relationship types to filter by.

    Returns:
        list: A list of filtered relationships for the given place ID.
    """
    if not place_id or len(place_id) != 36:  # Validate UUID format
        print(f"Invalid place ID: {place_id}")
        return []

    BASE_URL = "https://musicbrainz.org/ws/2"
    endpoint = f"{BASE_URL}/place/{place_id}"
    params = {"inc": "work-rels", "fmt": "json"}  # Changed to 'work-rels'

    try:
        response = requests.get(endpoint, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()

        if not data or "relations" not in data:
            print(f"No relations found for place ID: {place_id}")
            return []

        # Fetch all relationships
        relationships = data.get("relations", [])

        # Filter relationships if relationship_types is provided
        if relationship_types:
            relationships = [
                rel for rel in relationships if rel.get("type") in relationship_types
            ]

        # Return all matching relationships
        return relationships if relationships else []

    except Exception as e:
        print(f"Error fetching place-work relationships for {place_id}: {e}")
        return []

# 1. Read your list of places from a CSV
places_file = 'places.csv'
places_df = pd.read_csv(places_file)
place_ids = places_df['id'].astype(str).tolist()

# 2. Define headers as a dictionary
headers = {
    'User-Agent': 'jihbr@umich.edu'  
}

# 3. Define the relationship types to include (Place-Work relationship types)
relationship_types = [
    "premiere",
    "written at",
    "composed at",
    "lyrics written at",
    "libretto written at",
    "revised at",
    "translated at",
    "arranged at",
    "commissioned",
    "dedication"
]

# Initialize variables for checkpointing
BATCH_SIZE = 20
checkpoint_file = 'checkpoint_place_work.parquet'
output_file = 'place_work_relationships.parquet'
output_data = []
count_since_checkpoint = 0

# Load checkpoint if exists
if os.path.exists(checkpoint_file):
    checkpoint_df = pd.read_parquet(checkpoint_file)
    processed_place_ids = set(checkpoint_df['place_id'].tolist())
    output_data = checkpoint_df.to_dict('records')
else:
    processed_place_ids = set()

# Process the place IDs in batches
for i in tqdm(range(0, len(place_ids), BATCH_SIZE), desc="Processing batches", initial=len(processed_place_ids) // BATCH_SIZE, total=len(place_ids) // BATCH_SIZE):
    batch_place_ids = place_ids[i:i + BATCH_SIZE]
    batch_output_data = []

    for place_id in batch_place_ids:
        if place_id in processed_place_ids:
            continue

        relationships = get_place_work_relationships(place_id, headers, relationship_types)
        batch_output_data.append({'place_id': place_id, 'relationships': relationships})
        processed_place_ids.add(place_id)

    # Append batch data to output data
    output_data.extend(batch_output_data)
    count_since_checkpoint += len(batch_output_data)

    # Save checkpoint if batch size is reached
    if count_since_checkpoint >= BATCH_SIZE:
        temp_df = pd.DataFrame(output_data)
        
        # Convert the 'relationships' column to JSON strings to make it compatible with Parquet
        if 'relationships' in temp_df.columns:
            temp_df['relationships'] = temp_df['relationships'].apply(json.dumps)
        
        temp_df.to_parquet(checkpoint_file, compression='snappy')
        count_since_checkpoint = 0

# Save final output
final_output_df = pd.DataFrame(output_data)

# Convert the 'relationships' column to JSON strings for the final save
if 'relationships' in final_output_df.columns:
    final_output_df['relationships'] = final_output_df['relationships'].apply(json.dumps)

final_output_df.to_parquet(output_file, compression='snappy')


Processing batches: 6627it [00:01, 2058.15it/s]                
