In [11]:
import pandas as pd
import requests
from tqdm import tqdm
import json  # <-- Needed for converting dict/list objects to JSON strings

def get_place_recording_relationships(place_id, headers, relationship_types=None):
    """
    Fetch all place-recording relationships for a given place ID.

    Args:
        place_id (str): The MusicBrainz ID of the place.
        headers (dict): Headers to be used in the API request.
        relationship_types (list, optional): List of relationship types to filter by.

    Returns:
        list: A list of filtered relationships for the given place ID.
    """
    if not place_id or len(place_id) != 36:  # Validate UUID format
        print(f"Invalid place ID: {place_id}")
        return []

    BASE_URL = "https://musicbrainz.org/ws/2"
    endpoint = f"{BASE_URL}/place/{place_id}"
    params = {"inc": "recording-rels", "fmt": "json"}

    try:
        response = requests.get(endpoint, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()

        if not data or "relations" not in data:
            print(f"No relations found for place ID: {place_id}")
            return []

        # Fetch all relationships
        relationships = data.get("relations", [])

        # Filter relationships if relationship_types is provided
        if relationship_types:
            relationships = [
                rel for rel in relationships if rel.get("type") in relationship_types
            ]

        # Keep only the first matching relationship, if any
        return relationships[:1] if relationships else []

    except Exception as e:
        print(f"Error fetching place-recording relationships for {place_id}: {e}")
        return []

# --- Example usage: ---

# 1. Read your list of places from a CSV or other source
places_df = pd.read_csv('places.csv')
place_ids = places_df['id'].tolist()

# 2. Define headers as a dictionary
headers = {
    'User-Agent': 'jihbr@umich.edu'
}

# 3. Define the relationship types to include
relationship_types = [
    "arranged at",
    "engineered at",
    "recorded at",
    "mixed at",
    "edited at",
    "remixed at",
    "produced at",
    "video shot at"
]

# 4. Prepare for checkpointing in Parquet
checkpoint_file = "checkpoint.parquet"
output_data = []

# Try loading an existing checkpoint to skip processed IDs
try:
    checkpoint_df = pd.read_parquet(checkpoint_file)
    processed_place_ids = set(checkpoint_df['place_id'].tolist())
    output_data = checkpoint_df.to_dict('records')  # Continue from checkpoint
    print(f"Loaded {len(output_data)} records from existing checkpoint.")
except (FileNotFoundError, pd.errors.EmptyDataError):
    processed_place_ids = set()

BATCH_SIZE = 1
count_since_checkpoint = 0

for place_id in tqdm(place_ids, desc="Fetching relationships"):
    if place_id in processed_place_ids:
        continue  # Skip already processed place IDs

    relationships = get_place_recording_relationships(place_id, headers, relationship_types)

    # Filter out relationships that do not match the defined types
    filtered_relationships = [
        {
            "type": rel["type"],
            "recording": rel.get("recording", {})
        }
        for rel in relationships
    ]

    output_data.append({'place_id': place_id, 'relationships': filtered_relationships})
    processed_place_ids.add(place_id)
    count_since_checkpoint += 1

    # Check if we've hit our batch threshold for checkpointing
    if count_since_checkpoint >= BATCH_SIZE:
        # Save entire accumulated data so far
        temp_df = pd.DataFrame(output_data)

        # Convert relationships column to JSON strings
        temp_df['relationships'] = temp_df['relationships'].apply(json.dumps)

        temp_df.to_parquet(checkpoint_file, index=False, compression='snappy')
        print(f"Checkpoint saved with {len(output_data)} total records.")
        
        # Reset our counter for the next batch
        count_since_checkpoint = 0

# Finally, save the entire output to a final Parquet file
final_output_df = pd.DataFrame(output_data)

# Convert relationships column to JSON strings for the final write
final_output_df['relationships'] = final_output_df['relationships'].apply(json.dumps)

final_output_df.to_parquet('place_recording_relationships.parquet', 
                           index=False, 
                           compression='snappy')

print(f"Final output saved with {len(output_data)} total records.")


Fetching relationships:   0%|          | 1/69663 [00:01<36:13:54,  1.87s/it]

Checkpoint saved with 1 total records.


Fetching relationships:   0%|          | 2/69663 [00:02<27:27:10,  1.42s/it]

Checkpoint saved with 2 total records.


Fetching relationships:   0%|          | 3/69663 [00:03<21:58:26,  1.14s/it]

Checkpoint saved with 3 total records.


Fetching relationships:   0%|          | 4/69663 [00:04<18:40:25,  1.04it/s]

Checkpoint saved with 4 total records.


Fetching relationships:   0%|          | 5/69663 [00:05<16:43:54,  1.16it/s]

Checkpoint saved with 5 total records.


Fetching relationships:   0%|          | 6/69663 [00:06<16:46:11,  1.15it/s]

Checkpoint saved with 6 total records.


Fetching relationships:   0%|          | 7/69663 [00:06<14:39:22,  1.32it/s]

Checkpoint saved with 7 total records.


Fetching relationships:   0%|          | 8/69663 [00:07<14:15:10,  1.36it/s]

Checkpoint saved with 8 total records.


Fetching relationships:   0%|          | 9/69663 [00:07<14:04:42,  1.37it/s]

Checkpoint saved with 9 total records.


Fetching relationships:   0%|          | 10/69663 [00:08<13:14:02,  1.46it/s]

Checkpoint saved with 10 total records.


Fetching relationships:   0%|          | 11/69663 [00:10<20:09:35,  1.04s/it]

Checkpoint saved with 11 total records.


Fetching relationships:   0%|          | 12/69663 [00:10<17:25:41,  1.11it/s]

Checkpoint saved with 12 total records.


Fetching relationships:   0%|          | 13/69663 [00:11<16:01:31,  1.21it/s]

Checkpoint saved with 13 total records.


Fetching relationships:   0%|          | 14/69663 [00:12<15:15:11,  1.27it/s]

Checkpoint saved with 14 total records.


Fetching relationships:   0%|          | 15/69663 [00:12<13:37:08,  1.42it/s]

Checkpoint saved with 15 total records.


Fetching relationships:   0%|          | 16/69663 [00:13<12:49:07,  1.51it/s]

Checkpoint saved with 16 total records.


Fetching relationships:   0%|          | 17/69663 [00:14<13:09:09,  1.47it/s]

Checkpoint saved with 17 total records.


Fetching relationships:   0%|          | 18/69663 [00:14<12:57:04,  1.49it/s]

Checkpoint saved with 18 total records.


Fetching relationships:   0%|          | 19/69663 [00:15<13:13:00,  1.46it/s]

Checkpoint saved with 19 total records.


Fetching relationships:   0%|          | 20/69663 [00:16<12:17:53,  1.57it/s]

Checkpoint saved with 20 total records.


Fetching relationships:   0%|          | 21/69663 [00:16<11:50:41,  1.63it/s]

Checkpoint saved with 21 total records.


Fetching relationships:   0%|          | 22/69663 [00:17<13:15:35,  1.46it/s]

Checkpoint saved with 22 total records.


Fetching relationships:   0%|          | 23/69663 [00:18<16:22:01,  1.18it/s]

Checkpoint saved with 23 total records.


Fetching relationships:   0%|          | 24/69663 [00:19<18:31:24,  1.04it/s]

Checkpoint saved with 24 total records.


Fetching relationships:   0%|          | 25/69663 [00:20<17:09:13,  1.13it/s]

Checkpoint saved with 25 total records.


Fetching relationships:   0%|          | 26/69663 [00:21<16:22:55,  1.18it/s]

Checkpoint saved with 26 total records.


Fetching relationships:   0%|          | 27/69663 [00:22<15:58:51,  1.21it/s]

Checkpoint saved with 27 total records.


Fetching relationships:   0%|          | 28/69663 [00:22<14:31:40,  1.33it/s]

Checkpoint saved with 28 total records.


Fetching relationships:   0%|          | 29/69663 [00:23<14:17:00,  1.35it/s]

Checkpoint saved with 29 total records.


Fetching relationships:   0%|          | 30/69663 [00:24<14:46:06,  1.31it/s]

Checkpoint saved with 30 total records.


Fetching relationships:   0%|          | 31/69663 [00:24<13:44:02,  1.41it/s]

Checkpoint saved with 31 total records.


Fetching relationships:   0%|          | 32/69663 [00:25<13:26:38,  1.44it/s]

Checkpoint saved with 32 total records.


Fetching relationships:   0%|          | 33/69663 [00:26<12:35:37,  1.54it/s]

Checkpoint saved with 33 total records.


Fetching relationships:   0%|          | 34/69663 [00:26<12:52:28,  1.50it/s]

Checkpoint saved with 34 total records.


Fetching relationships:   0%|          | 35/69663 [00:27<12:46:19,  1.51it/s]

Checkpoint saved with 35 total records.


Fetching relationships:   0%|          | 36/69663 [00:28<16:11:51,  1.19it/s]

Checkpoint saved with 36 total records.


Fetching relationships:   0%|          | 37/69663 [00:29<14:30:59,  1.33it/s]

Checkpoint saved with 37 total records.


Fetching relationships:   0%|          | 38/69663 [00:29<14:10:23,  1.36it/s]

Checkpoint saved with 38 total records.


Fetching relationships:   0%|          | 39/69663 [00:30<13:45:43,  1.41it/s]

Checkpoint saved with 39 total records.


Fetching relationships:   0%|          | 40/69663 [00:31<13:03:37,  1.48it/s]

Checkpoint saved with 40 total records.


Fetching relationships:   0%|          | 41/69663 [00:31<12:58:31,  1.49it/s]

Checkpoint saved with 41 total records.


Fetching relationships:   0%|          | 41/69663 [00:32<15:09:09,  1.28it/s]


KeyboardInterrupt: 