In [155]:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from joblib import Parallel, delayed
import re
import json
import requests
import dotenv, os
from pprint import pprint

# Import and set up OCR pipeline

dotenv.load_dotenv(".env", override=True)
os.environ["USE_TORCH"] = "1"

print("hello, world!")

print('USE_TORCH = ', os.environ.get('USE_TORCH'))


model = ocr_predictor(det_arch="linknet_resnet18", reco_arch="crnn_mobilenet_v3_small", assume_straight_pages=True, det_bs=2048, reco_bs=2048, pretrained=True)

model.det_predictor.model.postprocessor.bin_thresh = 0.002
model.det_predictor.model.postprocessor.box_thresh = 0.002


print("setup complete")

hello, world!
USE_TORCH =  1
setup complete


In [156]:
# OCR PIPELINE :)
%%time
def worker(file_name):
    doc = DocumentFile.from_images(file_name)
    with torch.no_grad():
        res =  model(doc)
        # checks for valid rotation
        line = [block["lines"][0]["words"] for block in res.export()["pages"][0]["blocks"]]
        confidences = [word["confidence"] > .5 for block in line for word in block]
        validity = sum(confidences) / len(confidences)
        if validity < .85:
            print("Upside down image detected.")
            rotated = (imutils.rotate(cv2.imread(file_name), angle=180))
            cv2.imwrite(file_name, rotated)
            doc = DocumentFile.from_images(file_name)
            new_res = model(doc)
            line = [block["lines"][0]["words"] for block in new_res.export()["pages"][0]["blocks"]]
            confidences = [word["confidence"] > .5 for block in line for word in block]
            new_validity = sum(confidences) / len(confidences)
            if new_validity < .85:
                unreadable.append(file_name)
            if new_validity > validity:
                res = new_res
                validity = new_validity
        print(f"{progress[file_name]}: {file_name} | {validity}")
        return res

progress = {}
tasks = []
results = []
unreadable = []
folder = "pdfs/tmp/"
for i, name in enumerate(os.listdir(folder)):
    if "jpeg" in name:
        file_path = "pdfs/tmp/" + name
        tasks.append(file_path)
        progress[file_path] = i

print("setup complete",end='\n\n')

results = Parallel(n_jobs=-1)(delayed(worker)(x) for x in tasks)
print("finished")

UsageError: Line magic function `%%time` not found.


In [None]:
# %matplotlib widget
for result in results:
    result.show()

In [None]:
# Save results to file
with open("ocr-data.json", "w") as file:
    for result in results:
        json_output = result.export()
        json.dump(json_output, file)
        file.write('\n\r\n')

In [None]:
# Match results from OCR to filenames based on OCR logs
connections = []
with open("order.txt", "r") as file:
    for line in file.readlines():
        line = line.rstrip()
        if "jpeg" in line:
            title = line.split(":")
            res_title = title[1].split("|")[0][27:-6]
            connections.append([title[0], res_title])

connections.sort(key=lambda x: int(x[0]))
order = []
for itm in connections:
    order.append(itm[1])

In [None]:
# Read saved data and add it back to results variable
with open("ocr-data.json", "r") as file:
    all_data = file.read()
    split = all_data.split("\n\n\n")

In [None]:
# Pretty print of OCR results

# # Use this if not reading from saved data
# for result in results:
#     json_output = result.export()

# Use this if reading from saved data
for item in split:
    # a new line must be causing an issue in dividing up the results
    # somewhere in the json file.
    try:
        json_output = json.loads(item)
    except:
        break
   
    blocks = json_output["pages"][0]["blocks"]
    for block in blocks:
        for line in block["lines"]:
            words = line["words"]
            print("-------")
            line = []
            confidences = []
            for word in words:
                geo = word["geometry"]
                size = (geo[1][0] - geo[0][0]) * 100000 * (geo[1][1] - geo[0][1])
                if size > 35:
                    confidences.append(word["confidence"])
                    if word["confidence"] >= 0.5:
                        line.append(word["value"])
            if len(confidences) > 0:
                if sum(confidences)/ len(confidences) > 0.7:
                    print(" ".join(line))
                # else:
                    # print("ERROR: " + " ".join(line))
    print("\n\n\n\n")


In [None]:
# Creates low confidence data spreadsheet
file = open("output.csv", "w")
for i, result in enumerate(results):
    json_output = result.export()
    blocks = json_output["pages"][0]["blocks"]
    for block in blocks:
        words = block["lines"][0]["words"]
        line = []
        confidences = []
        for word in words:
            geo = word["geometry"]
            size = (geo[1][0] - geo[0][0]) * 100000 * (geo[1][1] - geo[0][1])

            if word["confidence"] < 0.5:
                conf = word["confidence"] * 100
                file.write("{}\t{:.2f}\t{:.2f}\t{}\n".format(word["value"], size, conf))
file.close()
print("done")

In [None]:
# Parse whitestudiocards to get title, desc, retrieval, etc.
def num_freq(searchee):
    count = 0
    for char in searchee:
        if char.isnumeric():
            count += 1
    return count / len(searchee)

cards = {}
# toggle depending on reading from saved data
toggle = split
# toggle = results
for card_num, result in enumerate(toggle):
    try:
        name = order[card_num]
        new_card = {"desc_list": [], "date": "", "year": []}
        cards[name] = new_card
        # toggle depending on reading from saved data
        # json_output = result.export()
        json_output = json.loads(result)
        blocks = json_output["pages"][0]["blocks"]
    except:
        pass

    for i, block in enumerate(blocks):
        for line in block["lines"]:
            words = line["words"]
            # print("-------")
            line = []
            confidences = []
            for word in words:
                geo = word["geometry"]
                size = (geo[1][0] - geo[0][0]) * 100000 * (geo[1][1] - geo[0][1])
                if size > 35:
                    confidences.append(word["confidence"])
                    if word["confidence"] >= 0.5:
                        line.append(word["value"])
            if len(confidences) > 0:
                if sum(confidences)/ len(confidences) > 0.7:
                    text = (" ".join(line))
                    if i == 0:
                        new_card["job"] = text
                    if i == 1 or i == 2:
                        flagged = False
                        if num_freq(text) > .7 :
                            new_card["retrieval"] = text
                            if i == 2:
                                flagged = True
                        else:
                            if "job" not in new_card:
                                new_card["job"] = text
                        if "retrieve" in text.lower() and "retrieval" not in new_card:
                            # cut off "retrieve : "
                            new_card["retrieval"] = text[11:]

                    if i >= 2:
                        if "retrieve" not in text.lower() and not flagged:
                            matches = re.findall(r"(18\d{2}|19\d{2}|20\d{2})", text)
                            if len(matches) > 0:
                                new_card["date"] = text
                                for match in matches:
                                    new_card["year"].append(int(match))
                            new_card["desc_list"].append(text)
                        flagged = False
                else:
                    # print("ERROR: " + " ".join(line))
                    pass

    for j in range(1, len(new_card["desc_list"]), 3):
        # line near last is mostly numbers, it's data that's not needed
        try:
            if num_freq(new_card["desc_list"][-j].replace(" ", "")) > .4:
                del new_card["desc_list"][-j]
            else:  # we've hit actual text
                break
        except: # Index error
            pass


    new_card["description"] = " ".join(new_card["desc_list"])
    # print("File: " + name)
    # for key, value in new_card.items():
    #     if key != "desc_list":
    #         print(f"{key.upper()}: {value}")
    # print("\n")


In [None]:
import requests, json
from requests.structures import CaseInsensitiveDict

"""
In this example 40 titles are extracted, yet 2,238 results should be reachable.
"""

def trawler(uuid):
    global count, headers
    url = f"https://api.repo.nypl.org/api/v2/collections/{uuid}"
    resp = requests.get(url, headers=headers)
    res = resp.json()
    response = res["nyplAPI"]["response"]

    if "collection" in response:  # We've not hit item level yet
        items = response["collection"]
         # Some items in the json are dicts and others are lists of dicts
        if isinstance(items, dict):
            items = [items]

        # If still a parent item or a container, keep traversing down
        for item in items:
            if int(item["numSubCollections"]) > 0:
                trawler(item["uuid"])
            elif item["type"] == "Container":
                trawler(item["uuid"])

    else:
        items = response["item"]
        if isinstance(items, dict):
            items = [items]

        for item in items:
            titles = item["mods"]["titleInfo"]
            
            if isinstance(titles, dict):
                titles = [titles]
    
            for title in titles:
                count += 1
                print(title["title"])
   
headers = CaseInsensitiveDict()
token = ""
with open("api_token.txt", "r") as file:
    token = file.read()

headers["Authorization"] = f"Token token=\"{token}\""

count = 0  # for keeping track of extracted titles

# UUID of White Studio collection
trawler("7c22cac0-c5b8-012f-4613-58d385a7bc34")
print(f"Total extracted titles: {count}")

The adventures of Lady Ursula keysheets.
The adventures of Lady Ursula keysheets.
After five keysheet.
The age of reason keysheet.
Alibi Bill keysheet.
Alice in Wonderland keysheet.
An American widow keysheet.
Anatole keysheet.
Androcles and the lion keysheets.
Androcles and the lion keysheets.
Ann Boyd keysheet.
Any house keysheet.
90 in the shade keysheet.
All for the ladies keysheets.
All for the ladies keysheets.
Adele keysheets.
Adele keysheets.
Adele keysheets.
Adele keysheets.
Adele keysheets.
All aboard keysheet.
All over town keysheets.
All over town keysheets.
All over town keysheets.
The amber empress keysheets.
The amber empress keysheets.
The amber empress keysheets.
The amber empress keysheets.
The amber empress keysheets.
America keysheets.
America keysheets.
American beauties keysheet.
Around the map keysheet.
Baron Trenck keysheet
Interior of City Center of Music and Drama, New York
Entrance Lobby of City Center of Music and Drama, New York
Interior of City Center of M

In [None]:
# getting digitzed White Studio cards together (just cleaning scrapped web data)
titles = set()
with open("input2.txt") as file:
    data = file.readlines()
    for line in data:
        line = line.rstrip()
        if len(line) > 1:
            if line[-1] == ".":
                line = line[:-1]
            titles.add(line)
with open("input.txt") as file:
    data = file.readlines()
    for line in data:
        line = line.rstrip()
        if len(line) > 2:
            titles.add(line)

with open("whitestudio-digitized.txt", "w") as file:
    for title in titles:
        file.write(f"{title.rstrip()}\n") 

In [None]:
# Matching White Studio cards to digital collections
old_count = 0
count = 0
# Compiled from manual review
mismatches = [4,7,12,21,24,26,27,35,36,37,38,39,41,42,43,48,49,50,57,60,68,73,74,80,84,85,86,87,88,89,93]
print("\t\t\t\tCatalog Card + Date | Digital Collections | Filename")
for card in cards:
    try:
        job = cards[card]["job"].lower()
        dates = cards[card]["year"]
        if len(dates) > 0:
            dates = " ".join([str(x) for x in dates])
        else:
            dates = ""
    except:
        # these cards have edge cases... will come back to later
        # print(card, cards[card])
        pass
    for title in titles:
        title = title.lower()
        match_count = len(set(re.findall(r'\b\w+\b', job)) & set(re.findall(r'\b\w+\b', title)))
        if match_count > 2:
            old_count += 1
            if old_count not in mismatches:
                count +=1
                print(f"Potential match #{count} found:\t{job} {dates} | {title} | {card}")
print(f"Total potential matches: {count}")

In [165]:
from bs4 import BeautifulSoup
import time

# Web scrape all white studios catalog

def get_class(url, name, page):
    test = None
    while test == None:
        resp = requests.get(url, headers=headers)
        html = resp.content
        soup = BeautifulSoup(html, "html.parser")
        test = soup.find("dd", attrs={"data": "definition-Call Number"})
        if test == None:
            print(f"{name} retrying page {page}... url {url}")
            # edge case, no class mark
            for case in edge_cases:
                if case in name:
                    return
            time.sleep(2)
    inner_span = test.find("span").find("span")
    if inner_span:
        call_number_text = inner_span.text.strip()
        with open("catalog-scraping-final.csv", "a") as file:
            file.write("{}\t{}\t{}\n".format(page, name, call_number_text))
        print(f"Succsess page {page}")
    else:
        print(f"Page {page} error: {url}")

        
edge_cases = ["Esther Morris gets women the vote", "Tyler Perry's Boo"]
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"}
for i in range(30,31):
    url = f"https://www.nypl.org/research/research-catalog/search?q=White%20Studio&search_scope=contributor&page={i}"
    results = None
    while results == None:
        resp = requests.get(url, headers=headers)
        html = resp.content
        soup = BeautifulSoup(html, "html.parser")
        results = soup.find('div', id='search-results-list')
        if results == None:
            print(f"a retrying page {i}... url: {url}")
            time.sleep(2)
    if resp.status_code == 200:
        count = 0
        for child in results:
            check = str(child.attrs.items())
            if "css-1792eun" in check or "css1792eun" in check:
                count += 1
                if "Available Online" not in child.text:
                    title = child.text.split(".")[0]
                    link = child.find("a", attrs={"role": "link"})["href"]
                    url = "https://www.nypl.org" + link
                    get_class(url, title, i)
    else:
        print(f"Page {i} error: {resp.status_code}")

output.close

Succsess page 30
Succsess page 30
Tyler Perry's Boo! : a Madea Halloween / Lionsgate and Tyler Perry Studios ; producers, Tyler Perry, Ozzie Areu, Will Areu ; writer, Tyler Perry ; director, Tyler Perry retrying page 30... url https://www.nypl.org/research/research-catalog/bib/b21136790
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30
Succsess page 30


<function TextIOWrapper.close()>

In [288]:
from bs4 import BeautifulSoup
import time

# Web scrape box entry on catalog
        
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"}
for i in range(1,8):
    url = f"https://www.nypl.org/research/research-catalog/bib/b16187984?item_page={i}"
    resp = requests.get(url, headers=headers)
    html = resp.content
    soup = BeautifulSoup(html, "html.parser")
    results = soup.find("table", attrs={"id": "bib-item-table"}).find("tbody")
    for child in results.children:
        test = child.find("td", attrs={"data-th": "Call Number"})
        print(test.find("span").text.rstrip())

output.close

*T-Vim 1956-002 MWEZ+ n.c. 16532, Box 1 (pp. 1-69)
*T-Vim 1956-002 MWEZ+ n.c. 16532, Box 2 (pp. 70-129)
*T-Vim 1956-002 MWEZ+ n.c. 16532, Box 3 (pp. 130-200)
*T-Vim 1956-002 MWEZ+ n.c. 16537, Box 1 (pp. 1-50)
*T-Vim 1956-002 MWEZ+ n.c. 16537, Box 2 (pp. 51-100)
*T-Vim 1956-002 MWEZ+ n.c. 16537, Box 3 (pp. 101-150)
*T-Vim 1956-002 MWEZ+ n.c. 16537, Box 4 (pp. 151-196)
*T-Vim 1956-002 MWEZ+ n.c. 16538, Box 1 (pp. 1-50)
*T-Vim 1956-002 MWEZ+ n.c. 16538, Box 2 (pp. 51-100)
*T-Vim 1956-002 MWEZ+ n.c. 16538, Box 3 (pp. 101-150)
*T-Vim 1956-002 MWEZ+ n.c. 16538, Box 4 (pp. 151-200)
*T-Vim 1956-002 MWEZ+ n.c. 16539, Box 1 (pp. 27-50)
*T-Vim 1956-002 MWEZ+ n.c. 16539, Box 2 (pp. 51-100)
*T-Vim 1956-002 MWEZ+ n.c. 16539, Box 3 (pp. 101-150)
*T-Vim 1956-002 MWEZ+ n.c. 16539, Box 4 (pp. 151-200)
*T-Vim 1956-002 MWEZ+ n.c. 16540, Box 1 (pp. 1-50)
*T-Vim 1956-002 MWEZ+ n.c. 16540, Box 2 (pp. 51-100)
*T-Vim 1956-002 MWEZ+ n.c. 16540, Box 3 (pp. 100-150)
*T-Vim 1956-002 MWEZ+ n.c. 16540, Box 4 (pp. 15

KeyboardInterrupt: 

In [287]:
# find matches between boxes in catalog and white studios in catalog
import csv

catalog_file = open("catalog-scraping-final.csv", "r")
catalog = csv.reader(catalog_file, delimiter='\t')

box_file = open("anniemarie.txt", "r")
box = box_file.read().splitlines()

for box_entry in box:
    box_list = box_entry.split()
    box_num = box_list[4][:-1]

    for catalog_entry in catalog:
        catalog_page_list = catalog_entry[2].split("p.")
        catalog_list = catalog_page_list[0].split()
        if "MWEZ" in catalog_list:
            catalog_num = catalog_list[3][:-1]
            if catalog_num == box_num:
                print("Catalog: ",end="")
                print(" ".join(catalog_entry[1:]))
                print(" ".join(box_list[5:]),end="\n\n")

Catalog: A Polish wedding keysheet *T-Vim 1956-002. MWEZ 16532, p. 4, 5
Box 1 (pp. 1-69)

Catalog: Preserving Mr *T-Vim 1956-002. MWEZ 16532, p. 148
Box 1 (pp. 1-69)

Catalog: Seven days keysheet *T-Vim 1956-002. MWEZ 16532, p. 139
Box 1 (pp. 1-69)

Catalog: The thief keysheet *T-Vim 1956-002. MWEZ 16532, p. 140
Box 1 (pp. 1-69)

Catalog: The price keysheet *T-Vim 1956-002. MWEZ 16532, p. 140
Box 1 (pp. 1-69)

Catalog: The thief keysheet *T-Vim 1956-002. MWEZ 16532, p. 140
Box 1 (pp. 1-69)

Catalog: The price keysheet *T-Vim 1956-002. MWEZ 16532, p. 140
Box 1 (pp. 1-69)

Catalog: The pigeon keysheet *T-Vim 1956-002. MWEZ 16532, p. 147
Box 1 (pp. 1-69)

Catalog: The pigeon keysheet *T-Vim 1956-002. MWEZ 16532, p. 147
Box 1 (pp. 1-69)

