# Embedding of the Collection

In this notebook we aim to epmbedd the entire collection to retrieve topics and words that can describe in different ways the collection. The final goal of the embedding is to define the objects that will be used for the final bucket of curiosity.

## 1) Data extraction: working with images from the Wellcome collection

In [25]:
import sys
import os
sys.path.append(os.path.abspath("../"))
from scripts.wellcome_utils import fetch_images, fetch_specific_image, fetch_specific_work, fetch_works

from pathlib import Path
from tqdm.auto import tqdm
import gzip
import json
import requests
import pandas as pd

In [26]:
# Download the Names of the Works in the Wellcome Collection
# https://developers.wellcomecollection.org/docs/examples/extracting-features-from-text
import shutil

snapshot_url = "https://data.wellcomecollection.org/catalogue/v2/images.json.gz"

data_dir = Path("../../data").resolve()
data_dir.mkdir(exist_ok=True)

file_name = Path(snapshot_url).parts[-1]
zipped_path = data_dir / file_name
unzipped_path = zipped_path.with_suffix("")


if not unzipped_path.exists():
    if not zipped_path.exists():
        r = requests.get(snapshot_url, stream=True)
        download_progress_bar = tqdm(
            unit="B",
            total=int(r.headers["Content-Length"]),
            desc=f"downloading {file_name}",
        )
        with open(zipped_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    download_progress_bar.update(len(chunk))

    with gzip.open(zipped_path, "rb") as f_in:
        with open(unzipped_path, "wb") as f_out:
            unzip_progress_bar = tqdm(
                unit="B",
                total=zipped_path.stat().st_size,
                desc=f"unzipping {file_name}",
            )
            while True:
                chunk = f_in.read(1024)
                if not chunk:
                    break
                f_out.write(chunk)
                unzip_progress_bar.update(len(chunk))


In [27]:
# Opening the entire archive won't fit into memory: read 1 line at a time

total_metadata_images = []
image_ids = []
image_titles = []

with open(unzipped_path, "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        total_metadata_images.append(record)
        image_ids.append(record["source"]['id'])
        image_titles.append(record["source"]['title'])

images_id_title = pd.DataFrame({
    "ids": image_ids,
    "titles": image_titles
})

print(f"Total images: {len(total_metadata_images)}")
print(f"Total image ids and titles: {len(images_id_title)}")

Total images: 128885
Total image ids and titles: 128885


In [28]:
# Removing duplicates
images_id_title = images_id_title.drop_duplicates(subset=["ids"])
images_id_title

Unnamed: 0,ids,titles
0,nf7cjmsg,"Hypertensive, vascular disease"
1,ycj8m76a,Thomas Linacre. Photogravure after Q. Metsys.
2,uwjy4qtv,A man in a monk's habit is buried alive with a...
3,g8ed63dg,"Goddess Isis feeding Horus, wall relief"
4,tfm3zjpu,Gin Lane by William Hogarth.
...,...,...
128861,u2cscemq,LASS : Leicestershire AIDS Support Services.
128863,w5unt6ak,How to help and care when a friend has AIDS / ...
128870,n8ppt3cf,Sheffield Centre for HIV and Sexual Health : 2...
128882,fzfs54aq,A large red ribbon decorated with hearts in th...


In [30]:
# Runs for approx 10 minutes 
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def fetch_and_process(img_id):
    try:
        work = fetch_specific_work(img_id)
        description = work.get('description', None)
    except Exception as e:
        print(f"Error fetching image {img_id}: {e}")
        return None, None, img_id
    return work, description, img_id

corresponding_works = []
image_descrs = []

with ThreadPoolExecutor(max_workers=20) as executor:
    futures = {executor.submit(fetch_and_process, img_id): img_id for img_id in images_id_title.ids}
    
    for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching images"):
        work, description, id = future.result()
        if work:
            corresponding_works.append({"id": id, "total_work": work})
            image_descrs.append({"id": id, "descr": description})


Fetching images:  18%|█▊        | 17542/98448 [01:27<06:50, 196.89it/s]

Error fetching image c5mtsy8s: HTTPSConnectionPool(host='api.wellcomecollection.org', port=443): Read timed out. (read timeout=10)


Fetching images:  79%|███████▉  | 77856/98448 [06:45<04:30, 76.07it/s] 

Error fetching image u8pszxv3: HTTPSConnectionPool(host='api.wellcomecollection.org', port=443): Read timed out. (read timeout=10)


Fetching images:  97%|█████████▋| 95041/98448 [08:22<00:15, 221.53it/s]

Error fetching image rqshezhn: 410 Client Error: Gone for url: https://api.wellcomecollection.org/catalogue/v2/works/rqshezhn


Fetching images: 100%|██████████| 98448/98448 [08:38<00:00, 189.75it/s]


In [31]:
failed_images_ids = ["c5mtsy8s", "u8pszxv3", "rqshezhn"] 
# remove line where failed
images_id_title = images_id_title[~images_id_title.ids.isin(failed_images_ids)]
images_id_title.shape[0], len(image_descrs)

(98445, 98445)

In [34]:
image_descrs = pd.DataFrame(image_descrs)
image_descrs

Unnamed: 0,id,descr
0,m58nmkc3,"<p>A tree with exposed roots, a native contemp..."
1,bz48rjxm,"<p>Whole-length, standing, holding staff. A ha..."
2,ncczkpq7,
3,ycj8m76a,
4,vszzcjgp,Apparatus formerly used by Luigi Galvani - sma...
...,...,...
98440,w5unt6ak,<p>Leaflet giving information about supporting...
98441,n8ppt3cf,<p>Letterhead (1992) for Sheffield Centre for ...
98442,fneppy6s,<p>One of a series of leaflets giving informat...
98443,f7xbyna4,


In [None]:
# merge on ids
textual_data = pd.merge(
    images_id_title,
    image_descrs,
    left_on="ids",
    right_on="id",
    how="left"
)
textual_data = textual_data.drop(columns=["id"])
textual_data = textual_data.rename(columns={"descr": "work_descriptions"})

In [42]:
textual_data

Unnamed: 0,ids,titles,work_descriptions
0,nf7cjmsg,"Hypertensive, vascular disease","Scan - axial, brain"
1,ycj8m76a,Thomas Linacre. Photogravure after Q. Metsys.,
2,uwjy4qtv,A man in a monk's habit is buried alive with a...,
3,g8ed63dg,"Goddess Isis feeding Horus, wall relief","Goddess Isis feeding Horus, here depicted as a..."
4,tfm3zjpu,Gin Lane by William Hogarth.,"Gin Lane. Anonimous after William Hogarth, 1751."
...,...,...,...
98440,u2cscemq,LASS : Leicestershire AIDS Support Services.,<p>Leaflet giving information about the work o...
98441,w5unt6ak,How to help and care when a friend has AIDS / ...,<p>Leaflet giving information about supporting...
98442,n8ppt3cf,Sheffield Centre for HIV and Sexual Health : 2...,<p>Letterhead (1992) for Sheffield Centre for ...
98443,fzfs54aq,A large red ribbon decorated with hearts in th...,<p>Images include from top left: a bunch of bl...


In [43]:
print(f"Only {textual_data.work_descriptions.isna().sum() / textual_data.shape[0] * 100:.1f} % of the images have a corresponding work descriptions")

Only 57.8 % of the images have a corresponding work descriptions


In [20]:
BASE_URL = "https://api.wellcomecollection.org/catalogue/v2/"

response = requests.get(BASE_URL + "/images", params={"query": "horse"}).json()
for i, result in enumerate(response["results"]):
    print(f"{i+1}. {result['source']['title']}")
    print(f"https://wellcomecollection.org/works/{result['id']}")
    print()

1. Horse restrained in horse-box for injection
https://wellcomecollection.org/works/php3h72c

2. Above, an arabian horse, a race horse, a draft horse and an ass; below, two zebras. Engraving.
https://wellcomecollection.org/works/nqthe7vj

3. Above, a carriage horse baulks and kicks the horse behind; centre, a baulking race horse; below, two race horses with jockeys. Chromolithograph.
https://wellcomecollection.org/works/q4t4ue9p

4. Arterial nodules, horse
https://wellcomecollection.org/works/tfem429t

5. Horse doctor giving medicine to a horse, German, 18th century
https://wellcomecollection.org/works/mgjfzfcg

6. Backing a horse correctly
https://wellcomecollection.org/works/cny59bkp

7. Navicular disease: horse's hoof
https://wellcomecollection.org/works/fus5ckpk

8. Examing horse's spinal reflexes
https://wellcomecollection.org/works/kaejvuhx

9. Examing horse's spinal reflexes
https://wellcomecollection.org/works/v43z7na8

10. Examining horse's foot balance
https://wellcomecollect

In [19]:
response = requests.get(BASE_URL + "/works", params={"query": "horse"}).json()
for i, result in enumerate(response["results"]):
    print(f"{i+1}. {result['title']}")
    print(f"https://wellcomecollection.org/works/{result['id']}")
    print()

1. Horses and roads, or, How to keep a horse sound on his legs / by Free-lance.
https://wellcomecollection.org/works/bba45wys

2. Horsemen leading a large number of horses in a ring for sale at a horse fair. Coloured lithograph, 1871, by W. H. Simmons after R. Bonheur.
https://wellcomecollection.org/works/jwnsubmq

3. Horse foetuses: five figures showing the foetus of a horse during the gestation period, with dissections of its abdomen and stomach demonstrating the foetal circulation system. Engraving by T. Cowan after B. Herring, ca. 1860.
https://wellcomecollection.org/works/c3xsp8nx

4. Horse Measurement Data Analysis
https://wellcomecollection.org/works/qhj4t4zy

5. Horse-blister ointment / prepared by Richard Hine.
https://wellcomecollection.org/works/bpsdauz5

6. Horse Cough Powders : a table-spoonful to be given in a bran mash / Fredk. L. Gooch.
https://wellcomecollection.org/works/ppe4pch9

7. Horse care products : caring for horses since 1830 : high feed costs? Laminitis? Lack

In [23]:
textual_data[textual_data.ids == "v4t9q43f"]

Unnamed: 0,ids,titles,work_descriptions
20859,v4t9q43f,Horse restrained in horse-box for injection,Vol 2 of 'Vues des Cordilleres et monumens des...


In [24]:
fetch_specific_work("v4t9q43f")

{'description': 'A horse restrained in a horse-box for an\nintramuscular injection.\nRoyal Veterinary College.',
 'workType': {'id': 'q', 'label': 'Digital Images', 'type': 'Format'},
 'thumbnail': {'url': 'https://iiif.wellcomecollection.org/image/A0000654/full/300,/0/default.jpg',
  'license': {'id': 'cc-by-nc',
   'label': 'Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)',
   'url': 'https://creativecommons.org/licenses/by-nc/4.0/',
   'type': 'License'},
  'accessConditions': [],
  'locationType': {'id': 'thumbnail-image',
   'label': 'Thumbnail image',
   'type': 'LocationType'},
  'type': 'DigitalLocation'},
 'alternativeTitles': [],
 'id': 'v4t9q43f',
 'title': 'Horse restrained in horse-box for injection',
 'type': 'Work',
 'availabilities': [{'id': 'online',
   'label': 'Online',
   'type': 'Availability'}]}

### Displaying images on query


In [70]:
def get_full_res_url(iiif_url:str, resolution="full") -> str:
    """IIIF urls are the following format: {base}/{region}/{resolution}/{rotation}/{quality}.{format}
    example: "https://iiif.wellcomecollection.org/image/V0021817/full/300,/0/default.jpg"
             Here resolution="300,"
    Args:
        iiif_url (str): _description_
        resolution (str, optional): _description_. Defaults to "full".

    Returns:
        str: modified url to get image at expected resolution
    """
    if 'iiif' in iiif_url:
        # split the string at the fifth '/' symbol (right before resolution)
        parts = iiif_url.split('/', 5)
        # Keeps the left part
        left_part = "/".join(parts[:5]) 
        # Manually add the right part depending on expected resolution
        full_res_url = f"{left_part}/full/{resolution}/0/default.jpg"
    else:
        raise Exception("Not a valid IIIF URL.")
    return full_res_url

In [72]:
type(Image)

type

In [81]:
import requests
from IPython.display import Image, display

# Step 1: Query the API for digitized works related to "horse"
search_url = "https://api.wellcomecollection.org/catalogue/v2/images"
params = {
    "query": "cow",
    "pageSize": 1
}
response = requests.get(search_url, params=params)
data = response.json()

# Step 2: Loop through the works
for work in data.get("results", []):
    title = work.get("source").get("title")
    iiif_url = work.get("thumbnail", {}).get("url")
    
    if iiif_url:
        print(title)
        full_res_url = get_full_res_url(iiif_url)
        display(Image(url=full_res_url))



Adult cow heart.


## 2) Title and description embedding

### A) Word embedding 

### B) Sentence embedding

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

monolingual_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
multilingual_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

monolingual_embeddings = monolingual_model.encode(textual_data.titles)
multilingual_embeddings = multilingual_model.encode(textual_data.titles)

print(monolingual_embeddings)
print(multilingual_embeddings)


### C) Word2Vec embedding