In [5]:
import pandas as pd
import requests
from tqdm import tqdm
import json  # <-- Needed for converting dict/list objects to JSON strings

def get_place_recording_relationships(place_id, headers, relationship_types=None):
    """
    Fetch all place-recording relationships for a given place ID.

    Args:
        place_id (str): The MusicBrainz ID of the place.
        headers (dict): Headers to be used in the API request.
        relationship_types (list, optional): List of relationship types to filter by.

    Returns:
        list: A list of filtered relationships for the given place ID.
    """
    if not place_id or len(place_id) != 36:  # Validate UUID format
        print(f"Invalid place ID: {place_id}")
        return []

    BASE_URL = "https://musicbrainz.org/ws/2"
    endpoint = f"{BASE_URL}/place/{place_id}"
    params = {"inc": "recording-rels", "fmt": "json"}

    try:
        response = requests.get(endpoint, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()

        if not data or "relations" not in data:
            print(f"No relations found for place ID: {place_id}")
            return []

        # Fetch all relationships
        relationships = data.get("relations", [])

        # Filter relationships if relationship_types is provided
        if relationship_types:
            relationships = [
                rel for rel in relationships if rel.get("type") in relationship_types
            ]

        # Keep only the first matching relationship, if any
        return relationships[:1] if relationships else []

    except Exception as e:
        print(f"Error fetching place-recording relationships for {place_id}: {e}")
        return []



# 1. Read your list of places from a CSV or other source
places_df = pd.read_csv('places.csv')
place_ids = places_df['id'].tolist()

# 2. Define headers as a dictionary
headers = {
    'User-Agent': 'jihbr@umich.edu'
}

# 3. Define the relationship types to include
relationship_types = [
    "arranged at",
    "engineered at",
    "recorded at",
    "mixed at",
    "edited at",
    "remixed at",
    "produced at",
    "video shot at"
]

# 4. Prepare for checkpointing in Parquet
checkpoint_file = "checkpoint.parquet"
output_data = []

# Try loading an existing checkpoint to skip processed IDs
try:
    checkpoint_df = pd.read_parquet(checkpoint_file)
    processed_place_ids = set(checkpoint_df['place_id'].tolist())
    output_data = checkpoint_df.to_dict('records')  # Continue from checkpoint
    print(f"Loaded {len(output_data)} records from existing checkpoint.")
except (FileNotFoundError, pd.errors.EmptyDataError):
    processed_place_ids = set()

BATCH_SIZE = 500
count_since_checkpoint = 0

for place_id in tqdm(place_ids, desc="Fetching relationships"):
    if place_id in processed_place_ids:
        continue  # Skip already processed place IDs

    relationships = get_place_recording_relationships(place_id, headers, relationship_types)

    # Filter out relationships that do not match the defined types
    filtered_relationships = [
        {
            "type": rel["type"],
            "recording": rel.get("recording", {})
        }
        for rel in relationships
    ]

    output_data.append({'place_id': place_id, 'relationships': filtered_relationships})
    processed_place_ids.add(place_id)
    count_since_checkpoint += 1

    # Check if we've hit our batch threshold for checkpointing
    if count_since_checkpoint >= BATCH_SIZE:
        # Save entire accumulated data so far
        temp_df = pd.DataFrame(output_data)

        # Convert relationships column to JSON strings
        temp_df['relationships'] = temp_df['relationships'].apply(json.dumps)

        temp_df.to_parquet(checkpoint_file, index=False, compression='snappy')
        print(f"Checkpoint saved with {len(output_data)} total records.")
        
        # Reset our counter for the next batch
        count_since_checkpoint = 0

# Finally, save the entire output to a final Parquet file
final_output_df = pd.DataFrame(output_data)

# Convert relationships column to JSON strings for the final write
final_output_df['relationships'] = final_output_df['relationships'].apply(json.dumps)

final_output_df.to_parquet('place_recording_relationships.parquet', 
                           index=False, 
                           compression='snappy')

print(f"Final output saved with {len(output_data)} total records.")


Loaded 62500 records from existing checkpoint.


Fetching relationships:  99%|█████████▉| 69297/69663 [00:00<00:00, 96540.42it/s]

Error fetching place-recording relationships for 36650a57-3158-47e7-8784-ee7c0ca9667d: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/36650a57-3158-47e7-8784-ee7c0ca9667d?inc=recording-rels&fmt=json
Error fetching place-recording relationships for 16be1ef1-25c0-4ba0-9d48-ca42b838bd21: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/16be1ef1-25c0-4ba0-9d48-ca42b838bd21?inc=recording-rels&fmt=json


Fetching relationships: 100%|█████████▉| 69346/69663 [00:34<00:00, 871.10it/s]  

Error fetching place-recording relationships for d2683ce5-c358-4194-b266-5fbf187ca044: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/d2683ce5-c358-4194-b266-5fbf187ca044?inc=recording-rels&fmt=json
Error fetching place-recording relationships for 2b3f3905-b285-44fd-9de8-2b44853bfc29: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/2b3f3905-b285-44fd-9de8-2b44853bfc29?inc=recording-rels&fmt=json
Error fetching place-recording relationships for 39fccd49-63dd-41ab-8d87-707d5548b6ad: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/39fccd49-63dd-41ab-8d87-707d5548b6ad?inc=recording-rels&fmt=json
Error fetching place-recording relationships for be09071f-6558-4b25-b282-84eba5c59d62: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/be09071f-6558-4b25-b282-84eba5c59d62?inc=recording-rels&fmt=json


Fetching relationships: 100%|█████████▉| 69406/69663 [01:14<00:01, 156.07it/s]

Error fetching place-recording relationships for a5e71d2d-a47d-407d-b46b-0c30a422b576: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/a5e71d2d-a47d-407d-b46b-0c30a422b576?inc=recording-rels&fmt=json
Error fetching place-recording relationships for 93a630a8-b258-4a70-b379-c96f32174b77: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/93a630a8-b258-4a70-b379-c96f32174b77?inc=recording-rels&fmt=json


Fetching relationships: 100%|█████████▉| 69463/69663 [01:55<00:05, 36.93it/s] 

Error fetching place-recording relationships for fd7fd4e0-044e-4003-946b-3566fec326e3: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/fd7fd4e0-044e-4003-946b-3566fec326e3?inc=recording-rels&fmt=json
Error fetching place-recording relationships for 0f9c1441-e618-490c-8971-04cb1a98ed1f: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/0f9c1441-e618-490c-8971-04cb1a98ed1f?inc=recording-rels&fmt=json


Fetching relationships: 100%|█████████▉| 69525/69663 [02:34<00:14,  9.84it/s]

Error fetching place-recording relationships for 840df2d5-a777-4be6-b1ea-468db33f72ef: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/840df2d5-a777-4be6-b1ea-468db33f72ef?inc=recording-rels&fmt=json
Error fetching place-recording relationships for 5a5e2074-62e5-4324-b7fb-0c99187fecbc: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/5a5e2074-62e5-4324-b7fb-0c99187fecbc?inc=recording-rels&fmt=json
Error fetching place-recording relationships for a9d98bc7-2e91-4def-8405-8d883e8bc9e4: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/a9d98bc7-2e91-4def-8405-8d883e8bc9e4?inc=recording-rels&fmt=json
Error fetching place-recording relationships for 0abd66e8-fed0-460a-844a-b5a86fdc1ce0: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/0abd66e8-fed0-460a-844a-b5a86fdc1ce0?inc=recording-rels&fmt=json
Error fetching p

Fetching relationships: 100%|█████████▉| 69556/69663 [02:55<00:19,  5.54it/s]

Error fetching place-recording relationships for 4b144143-0e84-47bd-ac1a-5b37119da580: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/4b144143-0e84-47bd-ac1a-5b37119da580?inc=recording-rels&fmt=json


Fetching relationships: 100%|█████████▉| 69583/69663 [03:14<00:23,  3.44it/s]

Error fetching place-recording relationships for 78d24b19-e7d9-4338-be39-a401168a1bb7: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/78d24b19-e7d9-4338-be39-a401168a1bb7?inc=recording-rels&fmt=json
Error fetching place-recording relationships for a94d884f-d7e6-4b81-ba2c-76492ba0c290: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/a94d884f-d7e6-4b81-ba2c-76492ba0c290?inc=recording-rels&fmt=json
Error fetching place-recording relationships for 33cc95a5-796b-418c-8796-09e0c5e0f3bd: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/33cc95a5-796b-418c-8796-09e0c5e0f3bd?inc=recording-rels&fmt=json
Error fetching place-recording relationships for 262ed11b-b86e-4106-8ffb-e69ea79d1453: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/262ed11b-b86e-4106-8ffb-e69ea79d1453?inc=recording-rels&fmt=json


Fetching relationships: 100%|█████████▉| 69615/69663 [03:34<00:19,  2.51it/s]

Error fetching place-recording relationships for b2ebefd7-235f-4e5d-bf05-a76850932e28: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/b2ebefd7-235f-4e5d-bf05-a76850932e28?inc=recording-rels&fmt=json


Fetching relationships: 100%|█████████▉| 69646/69663 [03:54<00:08,  2.02it/s]

Error fetching place-recording relationships for ba096a11-9e20-4883-b5ff-fb93cbae558c: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/ba096a11-9e20-4883-b5ff-fb93cbae558c?inc=recording-rels&fmt=json
Error fetching place-recording relationships for 6ab2dcaf-66cd-4f61-8d30-29e8e0be9df3: 503 Server Error: Service Temporarily Unavailable for url: https://musicbrainz.org/ws/2/place/6ab2dcaf-66cd-4f61-8d30-29e8e0be9df3?inc=recording-rels&fmt=json


Fetching relationships: 100%|██████████| 69663/69663 [04:04<00:00, 284.81it/s]

Final output saved with 62864 total records.



