# Embedding of the Collection

In this notebook we aim to epmbedd the entire collection to retrieve topics and words that can describe in different ways the collection. The final goal of the embedding is to define the objects that will be used for the final bucket of curiosity.

## 1) Data extraction: working with images from the Wellcome collection

In [1]:
import sys
import os
sys.path.append(os.path.abspath("../"))
from scripts.wellcome_utils import fetch_images, fetch_specific_image, fetch_specific_work, fetch_works

In [2]:
fetching = fetch_images(pageSize=100)
WellcomeColl = fetching['results']

In [3]:
fetching

{'type': 'ResultList',
 'pageSize': 100,
 'totalPages': 1289,
 'totalResults': 128885,
 'results': [{'locations': [{'url': 'https://iiif.wellcomecollection.org/image/L0040187/info.json',
     'credit': 'Wellcome Collection',
     'license': {'id': 'cc-by',
      'label': 'Attribution 4.0 International (CC BY 4.0)',
      'url': 'http://creativecommons.org/licenses/by/4.0/',
      'type': 'License'},
     'accessConditions': [{'method': {'id': 'view-online',
        'label': 'View online',
        'type': 'AccessMethod'},
       'status': {'id': 'open', 'label': 'Open', 'type': 'AccessStatus'},
       'type': 'AccessCondition'}],
     'locationType': {'id': 'iiif-image',
      'label': 'IIIF Image API',
      'type': 'LocationType'},
     'type': 'DigitalLocation'}],
   'source': {'id': 'yxcd6m5x',
    'title': 'Example of Arabic text',
    'type': 'Work'},
   'aspectRatio': 0.645,
   'thumbnail': {'url': 'https://iiif.wellcomecollection.org/image/L0040187/info.json',
    'credit': 'Wel

In [4]:
WellcomeColl

[{'locations': [{'url': 'https://iiif.wellcomecollection.org/image/L0040187/info.json',
    'credit': 'Wellcome Collection',
    'license': {'id': 'cc-by',
     'label': 'Attribution 4.0 International (CC BY 4.0)',
     'url': 'http://creativecommons.org/licenses/by/4.0/',
     'type': 'License'},
    'accessConditions': [{'method': {'id': 'view-online',
       'label': 'View online',
       'type': 'AccessMethod'},
      'status': {'id': 'open', 'label': 'Open', 'type': 'AccessStatus'},
      'type': 'AccessCondition'}],
    'locationType': {'id': 'iiif-image',
     'label': 'IIIF Image API',
     'type': 'LocationType'},
    'type': 'DigitalLocation'}],
  'source': {'id': 'yxcd6m5x',
   'title': 'Example of Arabic text',
   'type': 'Work'},
  'aspectRatio': 0.645,
  'thumbnail': {'url': 'https://iiif.wellcomecollection.org/image/L0040187/info.json',
   'credit': 'Wellcome Collection',
   'license': {'id': 'cc-by',
    'label': 'Attribution 4.0 International (CC BY 4.0)',
    'url': '

In [5]:
image_titles = [img["source"]["title"] for img in WellcomeColl]
image_ids = [img["source"]["id"] for img in WellcomeColl]

In [6]:
len(image_titles), len(image_ids)

(100, 100)

In [7]:
# runs approx 30 sec
corresponding_works = []
image_descrs = []
for img_id in image_ids:
    try:
        work = fetch_specific_work(img_id)
        corresponding_works.append(work)
        fetch_specific_image(img_id)
    except Exception as e:
        print(f"Error fetching image {img_id}: {e}")
len(work)

Error fetching image yxcd6m5x: 404 Client Error: Not Found for url: https://api.wellcomecollection.org/catalogue/v2/images/yxcd6m5x
Error fetching image ehkbxd9x: 404 Client Error: Not Found for url: https://api.wellcomecollection.org/catalogue/v2/images/ehkbxd9x
Error fetching image b5kqccbb: 404 Client Error: Not Found for url: https://api.wellcomecollection.org/catalogue/v2/images/b5kqccbb
Error fetching image qyakhepx: 404 Client Error: Not Found for url: https://api.wellcomecollection.org/catalogue/v2/images/qyakhepx
Error fetching image v75tpcju: 404 Client Error: Not Found for url: https://api.wellcomecollection.org/catalogue/v2/images/v75tpcju
Error fetching image bv4zwr3h: 404 Client Error: Not Found for url: https://api.wellcomecollection.org/catalogue/v2/images/bv4zwr3h
Error fetching image uzx7bfzh: 404 Client Error: Not Found for url: https://api.wellcomecollection.org/catalogue/v2/images/uzx7bfzh
Error fetching image rhv8bsd6: 404 Client Error: Not Found for url: https://

8

In [8]:
from pathlib import Path
from tqdm.auto import tqdm
import gzip
import json
import requests

In [9]:
# Download the Names of the Works in the Wellcome Collection
# https://developers.wellcomecollection.org/docs/examples/extracting-features-from-text
import shutil

snapshot_url = "https://data.wellcomecollection.org/catalogue/v2/images.json.gz"

data_dir = Path("../../data").resolve()
data_dir.mkdir(exist_ok=True)

file_name = Path(snapshot_url).parts[-1]
zipped_path = data_dir / file_name
unzipped_path = zipped_path.with_suffix("")


if not unzipped_path.exists():
    if not zipped_path.exists():
        r = requests.get(snapshot_url, stream=True)
        download_progress_bar = tqdm(
            unit="B",
            total=int(r.headers["Content-Length"]),
            desc=f"downloading {file_name}",
        )
        with open(zipped_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    download_progress_bar.update(len(chunk))

    with gzip.open(zipped_path, "rb") as f_in:
        with open(unzipped_path, "wb") as f_out:
            unzip_progress_bar = tqdm(
                unit="B",
                total=zipped_path.stat().st_size,
                desc=f"unzipping {file_name}",
            )
            while True:
                chunk = f_in.read(1024)
                if not chunk:
                    break
                f_out.write(chunk)
                unzip_progress_bar.update(len(chunk))


In [22]:
# Opening the entire archive won't fit into memory: read 1 line at a time

total_images = []
image_titles = []
image_ids = []

with open(unzipped_path, "r", encoding="utf-8") as f:

    for line in f:
        record = json.loads(line)
        total_images.append(record)
        image_titles.append(record["source"]['title'])
        image_ids.append(record["source"]['id'])

print(f"Total images: {len(total_images)}")
print(f"Total image titles: {len(image_titles)}")
print(f"Total image ids: {len(image_ids)}")

Total images: 128885
Total image titles: 128885
Total image ids: 128885


In [None]:
# Runs for approx 11 minutes 
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def fetch_and_process(img_id):
    try:
        work = fetch_specific_work(img_id)
        description = work.get('description', None)
    except Exception as e:
        print(f"Error fetching image {img_id}: {e}")
        return None, None
    return work, description

corresponding_works = []
image_descrs = []

with ThreadPoolExecutor(max_workers=20) as executor:
    futures = {executor.submit(fetch_and_process, img_id): img_id for img_id in image_ids}
    
    for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching images"):
        work, description = future.result()
        if work:
            corresponding_works.append(work)
            image_descrs.append(description)


Fetching images:   8%|▊         | 10881/128885 [00:54<09:15, 212.29it/s]

Error fetching image ck9wjgk9: HTTPSConnectionPool(host='api.wellcomecollection.org', port=443): Read timed out. (read timeout=10)


Fetching images:  90%|█████████ | 116580/128885 [10:14<00:58, 208.62it/s]

Error fetching image ejdrx7sr: HTTPSConnectionPool(host='api.wellcomecollection.org', port=443): Read timed out. (read timeout=10)


Fetching images:  94%|█████████▍| 121582/128885 [10:34<00:53, 135.99it/s]

Error fetching image rqshezhn: 410 Client Error: Gone for url: https://api.wellcomecollection.org/catalogue/v2/works/rqshezhn


Fetching images:  94%|█████████▍| 121731/128885 [10:35<00:31, 225.71it/s]

Error fetching image rqshezhn: 410 Client Error: Gone for url: https://api.wellcomecollection.org/catalogue/v2/works/rqshezhn


Fetching images: 100%|██████████| 128885/128885 [11:07<00:00, 193.05it/s]


In [None]:
# We have duplicates in the list of image ids TODO: remove them before 
unique_image_ids = set(image_ids)
print(f"Original length: {len(image_ids)}")
print(f"Length after converting to set: {len(unique_image_ids)}")
print(f"This means ")

Original length: 128885
Length after converting to set: 98448


In [None]:
# # Remove duplicates while preserving order
# unique_image_ids = list(dict.fromkeys(image_ids))
# print(f"\nOriginal length: {len(image_ids)}")
# print(f"Length after removing duplicates: {len(unique_image_ids)}")

In [12]:
# Takes a few hours to fecth (takes image by image)

# corresponding_works = []
# image_descrs = []
# for img_id in image_ids:
#     try:
#         work = fetch_specific_work(img_id)
#         corresponding_works.append(work)
#     except Exception as e:
#         print(f"Error fetching image {img_id}: {e}")
#     try:
#         image_descrs.append(work['description'])
#     except Exception as e:
#         #print(f"Some image had no description: {img_id}")
#         image_descrs.append(None)

# len(image_descrs)

In [None]:
failed_images_ids = ["ck9wjgk9", "ejdrx7sr", "rqshezhn", "rqshezhn"] # putting the last one twice because pop only removes the first instance
for failed_image_id in failed_images_ids:
    failed_index = image_ids.index(failed_image_id)
    image_ids.pop(failed_index)
    image_titles.pop(failed_index)

In [26]:
len(image_descrs), len(image_ids), len(image_titles)

(128881, 128881, 128881)

In [27]:
import pandas as pd
textual_data = pd.DataFrame({'ids': image_ids, 'titles': image_titles, 'work_descriptions': image_descrs})

In [28]:
textual_data

Unnamed: 0,ids,titles,work_descriptions
0,nf7cjmsg,"Hypertensive, vascular disease",
1,ycj8m76a,Thomas Linacre. Photogravure after Q. Metsys.,
2,uwjy4qtv,A man in a monk's habit is buried alive with a...,
3,g8ed63dg,"Goddess Isis feeding Horus, wall relief","<p>Whole-length, standing, holding staff. A ha..."
4,tfm3zjpu,Gin Lane by William Hogarth.,
...,...,...,...
128876,u2cscemq,LASS : Leicestershire AIDS Support Services.,<p>Leaflet giving information about supporting...
128877,hb8vukjp,Take care of love ... : Aids doesn't mean you ...,
128878,fzfs54aq,A large red ribbon decorated with hearts in th...,<p>Leaflet giving information about the work o...
128879,f7xbyna4,Wellington and Peel in the roles of the body-s...,


## 2) Title and description embedding

### A) Word embedding 

### B) Sentence embedding

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

monolingual_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
multilingual_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

monolingual_embeddings = monolingual_model.encode(sentences)
multilingual_embeddings = multilingual_model.encode(sentences)
print(monolingual_embeddings)
print(multilingual_embeddings)


### C) Word2Vec embedding