In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import urllib

# Kyoto Costume Institute Digital Archives

In [7]:
ITEMS_LIST = []

def get_items_per_decade(decade):
    try:
        r = requests.get(f"https://www.kci.or.jp/en/archives/digital_archives/{decade}s/index.html")
        page = BeautifulSoup(r.text, 'html.parser')
        cards = page.find_all(class_="card js-cardCategory")
        for i, card in enumerate(cards):
          id = str(decade) + '_' + str(i+1)
          spec = card.find(class_="card__spec__year").text 
          country = spec.split(' - ')[-1] if ' - ' in spec else 'unknown'
          item_info = {"ID": id, 
                       "Category": card.find(class_="card__title").text, 
                       "Decade": decade, 
                       "Country": None if 'unknown' in country.lower() else country}
          img_url = card.find(class_="card__image").get('src').replace("../../../..", "https://www.kci.or.jp")
          urllib.request.urlretrieve(img_url, f"kci_imgs/{id}.jpg")
          ITEMS_LIST.append(item_info)
       
    except Exception as e:
        if e is ConnectionError:
            return "ConErr"
        return None

In [8]:
for dec in range(1890, 2011, 10):
    get_items_per_decade(dec)

ITEMS_DATA = pd.DataFrame(ITEMS_LIST)
ITEMS_DATA

Unnamed: 0,ID,Category,Decade,Country
0,1890_1,Dress (Presentation Dress),1890,
1,1890_2,Day dress,1890,
2,1890_3,"Bracelet [lower], Belt [upper]",1890,Japan
3,1890_4,Vanity Case,1890,Japan
4,1890_5,Day Dress,1890,
...,...,...,...,...
237,2010_5,ShoesãNOVA Shoeã,2010,
238,2010_6,T-shirt,2010,
239,2010_7,Day Ensemble âMiss No. 5â,2010,
240,2010_8,"Backpack [Left], Bag [Right]",2010,


In [9]:
ITEMS_DATA.to_csv("kci_metadata.csv")

# THE NEW YORK PUBLIC LIBRARY DIGITAL COLLECTIONS
# German old master prints

In [96]:
def get_items_per_page(page_num):
    r = requests.get(f"https://digitalcollections.nypl.org/collections/2b332a20-e136-0132-f347-58d385a7bbd0?format=html&id=2b332a20-e136-0132-f347-58d385a7bbd0&per_page=250&page={page_num}#/?tab=navigation")
    page = BeautifulSoup(r.text, 'html.parser')
    items = page.find_all(class_="description")
    links = [i.find('a') for i in items]
    urls = [(a.text, a.get('href')) for a in links]
    return urls

URLS = []
for page_num in range(1, 5):
    URLS.extend(get_items_per_page(page_num))

In [97]:
def get_item_info(item_name, item_url):
    try:
        r = requests.get("https://digitalcollections.nypl.org" + item_url)
        page = BeautifulSoup(r.text, 'html.parser')
        info_block = page.find(id="item-content-data").text.replace("\n", ' ')

        created = info_block.split("Date Created:")[1].strip().split(' ')
        created = int(created[2]) if created[1] == '-' else int(created[0])
        medium_raw = info_block.split("Physical Description")[1].strip()
        medium = medium_raw[0]
        idx = 1
        while medium_raw[idx].islower():
            medium += medium_raw[idx]
            idx += 1
        inventory_number = int(info_block.split("TMS ID:")[1].strip().split(' ')[0].replace('TMS', ''))
        digitized = int(info_block.split(": Digitized")[0].split(' ')[-1])
        
        item_info = {"Inventory Number": inventory_number, 
                    "Title": item_name, 
                    "Medium": medium, 
                    "Date Created": created,
                    "Date Digitised": digitized}
        img_url = page.find('img').get('src')
        urllib.request.urlretrieve(img_url, f"nypl_imgs/{inventory_number}.jpg")
        return item_info
       
    except Exception as e:
        return e

In [98]:
ITEMS_LIST = []

for url in URLS:
   ITEMS_LIST.append(get_item_info(*url))

ITEMS_DATA = pd.DataFrame(ITEMS_LIST)
ITEMS_DATA.to_csv("nypl_metadata.csv")

# The University of Manchester - The Museum of Medicine and Health

In [None]:
! pip install selenium webdriver_manager==4.0.2

In [126]:
from time import sleep
import os
import logging

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

In [122]:
logging.getLogger('WDM').setLevel(logging.NOTSET)
os.environ['WDM_LOG'] = 'False'

# Driver set-up
service = Service(ChromeDriverManager().install().replace("THIRD_PARTY_NOTICES.", ''))
driver = webdriver.Chrome(service=service)

In [None]:
def get_items_per_page(page_num):
    page_url = f"https://www.digitalcollections.manchester.ac.uk/collections/historyofmedicine_mmh/{page_num}"
    driver.get(page_url)
    sleep(1)
    items = driver.find_elements(By.XPATH, '//div[@class="collections_carousel_item"]')
    urls = [i.find_element(By.TAG_NAME, 'a').get_attribute('href') for i in items]
    return urls

URLS = []
for page_num in range(1, 4):
    URLS.extend(get_items_per_page(page_num))

In [161]:
def get_item_info(item_url):
    driver.get(item_url)
    sleep(0.3)
    info_block = driver.find_element(By.XPATH, '//div[@class="panel-body"]').text

    inventory_number = info_block.split("Classmark:")[1].strip().split("\n")[0].replace('.', '_')
    print(inventory_number)
    title = info_block.split("Title:")[1].strip().split("\n")[0]
    origin = info_block.split("Origin Place:")[1].strip().split("\n")[0]
    if "Date of Creation:" in info_block:
        creation = info_block.split("Date of Creation:")[1].strip().split("\n")[0]
        creation = int(creation[:4]) if len(creation) > 4 else int(creation)
    else:
        creation = None
    acquisition = info_block.split("Date of Acquisition:")[1].strip().split("\n")[0]
    acquisition = int(acquisition[:4]) if len(acquisition) > 4 else int(acquisition)
    material = info_block.split("Material(s):")[1].strip().split("\n")[0].split('; ')[0]
    item_info = {"Inventory Number": inventory_number, 
                "Title": title, 
                "Country of Origin": origin, 
                "Main Material": material,
                "Date Created": creation,
                "Date Acquired": acquisition}
    return item_info
       

In [162]:
ITEMS_LIST = []

for url in URLS:
   try:
      ITEMS_LIST.append(get_item_info(url))
   except Exception as e:
      print(e)
      continue

ITEMS_DATA = pd.DataFrame(ITEMS_LIST)
ITEMS_DATA.to_csv("manchester_metadata.csv")

MMH_1970_76
MMH_1970_117
MMH_1970_264
MMH_1972_2
MMH_1978_104
MMH_1978_133
MMH_1979_132
MMH_1982_6
MMH_1984_86
MMH_1985_40
MMH_1994_10
MMH_1999_121
list index out of range
MMH_2001_87
MMH_2002_6
MMH_2002_134
MMH_2004_171
MMH_2004_199
MMH_2004_345
MMH_2004_444
MMH_2006_55
invalid literal for int() with base 10: '19th'
MMH_2007_41
MMH_2008_112
MMH_2015_27
MMH_2021_1
list index out of range
