In [7]:
import json
import os
import urllib.error
import urllib.parse
import urllib.request

from IPython.display import display, HTML
from tqdm import tqdm

def search(*args, API_KEY=None, reusability=None, media_type=None, file_format=None, cursor_based=True, max_runs=None):
    """Query the Europeana Search API page by page and collect the returned items.

    Parameters
    ----------
    *args : str
        Search terms; they are joined with '+' to form the ``query`` value.
        URL-unsafe characters are percent-encoded. '+' and '%' are left alone
        so that terms which are already encoded are not encoded a second time.
    API_KEY : str
        Sent as the ``wskey`` parameter. Required.
    reusability : str or list of str, optional
        Joined with ',' into a single ``reusability`` parameter.
    media_type : str or list of str, optional
        OR-ed together into one ``qf=TYPE:"..."`` filter.
    file_format : str or list of str, optional
        OR-ed together into one ``qf=MIME_TYPE:...`` filter. A bare subtype
        such as 'pdf' is expanded to 'application/pdf'; a value that already
        contains '/' is used as the complete MIME type.
    cursor_based : bool, default True
        True follows the ``nextCursor`` value of each response. False pages
        with ``start`` offsets (1, 101, 201, ...).
        NOTE(review): Europeana documents offset paging as limited to the
        first 1000 results -- confirm; prefer cursor mode for deep harvests.
    max_runs : int, optional
        Maximum number of pages to request. None means "until exhausted".

    Returns
    -------
    list
        The ``items`` of every page fetched. If a request fails, the error is
        printed and whatever was collected up to that point is returned.

    Raises
    ------
    ValueError
        If ``API_KEY`` is missing or empty.

    Side effects
    ------------
    Displays a link for every request, drives a tqdm progress bar, and
    rewrites 'search.json' in the working directory after every page so a
    partial harvest survives an interruption.
    """
    if not API_KEY:
        raise ValueError("API_KEY is required")

    rows = 100  # page size requested from the API

    # A lone string would otherwise be iterated character by character.
    reusability, media_type, file_format = (
        [value] if isinstance(value, str) else value
        for value in (reusability, media_type, file_format)
    )

    # Join the search terms with a '+' separator, encoding anything URL-unsafe
    search_string = '+'.join(
        urllib.parse.quote_plus(str(term), safe='+%') for term in args
    )

    # Construct the base URL
    base_url = "https://www.europeana.eu/api/v2/search.json?"
    filters = []

    # Apply the reusability filter if provided
    if reusability:
        filters.append('reusability=' + ','.join(reusability))

    # Apply the media type filter if provided
    if media_type:
        filters.append('qf=TYPE%3A%22' + '%22%20OR%20TYPE%3A%22'.join(media_type) + '%22')

    # Apply the file format filter if provided
    if file_format:
        mime_types = [fmt if '/' in fmt else f'application/{fmt}' for fmt in file_format]
        filters.append(
            'qf=MIME_TYPE%3A'
            + '%20OR%20MIME_TYPE%3A'.join(urllib.parse.quote_plus(mime) for mime in mime_types)
        )

    cursor = '*'
    all_data = []
    run_count = 0

    # Set up progress bar using tqdm (open-ended when no page limit is given)
    progress_bar = tqdm() if max_runs is None else tqdm(total=max_runs)

    try:
        while cursor and (max_runs is None or run_count < max_runs):
            run_count += 1

            # Cursor paging resumes from the last cursor; offset paging
            # resumes right after the items collected so far (1-based).
            if cursor_based:
                paging = f"cursor={cursor}"
            else:
                paging = f"start={len(all_data) + 1}"

            # Joining a list avoids a dangling '&' when there are no filters.
            params = [
                f"query={search_string}",
                f"rows={rows}",
                paging,
                "text_fulltext=true",
                f"wskey={API_KEY}",
            ] + filters
            url = base_url + '&'.join(params)

            # Display the URL for the current iteration
            text = "Europeana.eu Page " + str(run_count)
            display(HTML(f'Requesting URL: <a href="{url}" target="_blank">{text}</a>'))

            try:
                # Open the URL and read the response
                with urllib.request.urlopen(url) as response:
                    data = json.load(response)

                # A page without results may carry no 'items' key at all.
                items = data.get('items') or []
                all_data.extend(items)

                # Save the intermediate data to a file
                with open('search.json', 'w') as outfile:
                    json.dump(all_data, outfile)

                if cursor_based:
                    # The cursor is re-encoded because it goes back into a URL.
                    cursor = urllib.parse.quote_plus(data.get('nextCursor', ''))
                    finished = not cursor
                else:
                    # Offset responses have no cursor: stop on an empty page
                    # or once the reported total has been collected.
                    total = data.get('totalResults')
                    finished = not items or (total is not None and len(all_data) >= total)

            except urllib.error.HTTPError as e:
                print(f"HTTP Error: {e.code} - {e.reason}")
                break
            except urllib.error.URLError as e:
                print(f"URL Error: {e.reason}")
                break
            except Exception as e:
                # Deliberate best effort: report and return what we have.
                print(f"Unexpected error: {e}")
                break

            # Count every page that was fetched, including the last one
            progress_bar.update(1)

            if finished:
                print("Reached the end of the result set.")
                break
    finally:
        # Close the progress bar even if the run is interrupted
        progress_bar.close()

    return all_data

# Example usage
# The key is read from the EUROPEANA_API_KEY environment variable so that it
# does not have to be committed with the notebook. The literal is only the
# fallback value this notebook originally shipped with.
API_KEY = os.environ.get('EUROPEANA_API_KEY', 'tinvoyfulty')  # Replace with your actual API key
results = search(
    'newspapers',
    API_KEY=API_KEY,
    reusability=['open', 'permission', 'restricted'],
    media_type=['TEXT'],
    file_format=['pdf'],
    cursor_based=True,
    max_runs=5  # Limit the number of cursor-based runs for testing
)


  0%|          | 0/5 [00:00<?, ?it/s]

 20%|██        | 1/5 [00:00<00:03,  1.26it/s]

 40%|████      | 2/5 [00:01<00:02,  1.44it/s]

 60%|██████    | 3/5 [00:02<00:01,  1.44it/s]

 80%|████████  | 4/5 [00:02<00:00,  1.42it/s]

100%|██████████| 5/5 [00:03<00:00,  1.40it/s]


In [11]:
# Number of items collected by the search above; with max_runs=5 and 100 rows
# per request, five full pages give 500.
len(results)

500

In [13]:
# Sanity check on the harvest: every item in 'results' is expected to be a
# dictionary carrying an 'id'. Walk the list once, remembering the ids already
# met; any item whose id shows up again is collected as a duplicate.
seen = set()
duplicates = []
for record in results:
    record_id = record['id']
    if record_id in seen:
        duplicates.append(record)
    else:
        seen.add(record_id)

if not duplicates:
    print("No duplicates found.")
else:
    print("Duplicates found:", duplicates)

No duplicates found.
