In [21]:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from joblib import Parallel, delayed
import re
import json
import dotenv, os
from pprint import pprint

# Import and set up OCR pipeline

# Read settings from the local ".env" file; override=True is passed so values
# from the file take precedence over variables already in the environment.
dotenv.load_dotenv(".env", override=True)
# NOTE(review): this assignment runs after `doctr` has already been imported
# at the top of the cell. Presumably docTR chooses its backend at import time,
# in which case this line has no effect -- confirm, or move it above the
# doctr imports.
os.environ["USE_TORCH"] = "1"

print("hello, world!")

print('USE_TORCH = ', os.environ.get('USE_TORCH'))


# Build the pretrained OCR predictor (LinkNet detector + CRNN recogniser),
# treating pages as unrotated and using a batch size of 2048 for both stages.
model = ocr_predictor(det_arch="linknet_resnet18", reco_arch="crnn_mobilenet_v3_small", assume_straight_pages=True, det_bs=2048, reco_bs=2048, pretrained=True)

# Set both detection post-processing thresholds to 0.002 -- presumably so that
# faint text is still boxed; verify the effect on false positives.
model.det_predictor.model.postprocessor.bin_thresh = 0.002
model.det_predictor.model.postprocessor.box_thresh = 0.002


print("setup complete")

hello, world!
USE_TORCH =  1
setup complete


In [None]:
# OCR PIPELINE :)
%%time
def _first_line_validity(result):
    """Return the fraction of first-line words whose confidence exceeds 0.5.

    Only the first line of every block on the first exported page is inspected.
    Returns 0.0 when no words were found at all, so an empty page is treated
    as unreadable instead of raising ZeroDivisionError.
    """
    blocks = result.export()["pages"][0]["blocks"]
    passed = [
        word["confidence"] > .5
        for block in blocks
        for line in block["lines"][:1]  # slice tolerates a block without lines
        for word in line["words"]
    ]
    return sum(passed) / len(passed) if passed else 0.0


def worker(file_name):
    """Run OCR on one image, retrying it rotated by 180 degrees if it reads poorly.

    Args:
        file_name: path of the image; it is also the key into ``progress``.

    Returns:
        The OCR result of whichever orientation scored the higher validity.

    Side effects:
        When the first pass scores below 0.85 the image file is overwritten
        with its rotated version, and the path is appended to ``unreadable``
        if the rotated pass is still below 0.85. A progress line is printed.
    """
    # Imported locally because the notebook's import cell never brings these
    # names into scope, so a fresh kernel would otherwise fail with NameError.
    import cv2
    import imutils
    import torch

    doc = DocumentFile.from_images(file_name)
    with torch.no_grad():
        res = model(doc)
        # checks for valid rotation
        validity = _first_line_validity(res)
        if validity < .85:
            print("Upside down image detected.")
            rotated = imutils.rotate(cv2.imread(file_name), angle=180)
            # NOTE(review): the file stays rotated on disk even when the
            # rotated pass does not score better -- confirm this is wanted.
            cv2.imwrite(file_name, rotated)
            doc = DocumentFile.from_images(file_name)
            new_res = model(doc)
            new_validity = _first_line_validity(new_res)
            if new_validity < .85:
                # NOTE(review): under a process-based joblib backend this
                # presumably mutates the worker's copy of the list only.
                unreadable.append(file_name)
            if new_validity > validity:
                res = new_res
                validity = new_validity
        print(f"{progress[file_name]}: {file_name} | {validity}")
        return res

# Shared state for the OCR run: log index per image, the work list, the
# collected results and the images that stayed unreadable after rotation.
progress, tasks, results, unreadable = {}, [], [], []
folder = "pdfs/tmp/"
for idx, name in enumerate(os.listdir(folder)):
    if "jpeg" not in name:
        continue
    image_path = "pdfs/tmp/" + name
    tasks.append(image_path)
    # idx counts every directory entry, so logged indices may skip values.
    progress[image_path] = idx

print("setup complete",end='\n\n')

# Fan the images out over all available cores; results keep the task order.
results = Parallel(n_jobs=-1)(delayed(worker)(path) for path in tasks)
print("finished")

In [None]:
# %matplotlib widget
# Display every OCR result produced by the pipeline cell, one after another.
for ocr_result in results:
    ocr_result.show()

In [None]:
# Save results to file
# Each exported result is written as one JSON document followed by the
# record separator that the loading cell looks for.
with open("ocr-data.json", "w") as out_file:
    for ocr_result in results:
        json.dump(ocr_result.export(), out_file)
        out_file.write('\n\r\n')

In [None]:
# Match results from OCR to filenames based on OCR logs
# Collect [index, title] pairs from the progress lines of the OCR log.
connections = []
with open("order.txt", "r") as log_file:
    for raw_line in log_file:
        entry = raw_line.rstrip()
        if "jpeg" not in entry:
            continue
        fields = entry.split(":")
        # Drops the first 27 and last 6 characters of the path field --
        # presumably the folder/prefix and the ".jpeg " tail; verify against
        # the actual log format.
        connections.append([fields[0], fields[1].split("|")[0][27:-6]])

# Restore processing order by the numeric index logged with each file.
connections.sort(key=lambda pair: int(pair[0]))
order = [pair[1] for pair in connections]

In [None]:
# Read saved data and add it back to results variable
with open("ocr-data.json", "r") as file:
    all_data = file.read()

# The saved documents are separated by '\n\r\n', but reading in text mode
# translates '\r\n' into '\n', so splitting on that literal never matches and
# the whole file comes back as one undecodable chunk. Walk the text with a
# JSON decoder instead: it finds each document's end regardless of which
# whitespace separates the documents.
_decoder = json.JSONDecoder()
split = []  # one JSON string per saved OCR result, in file order
_pos = 0
while _pos < len(all_data):
    if all_data[_pos].isspace():
        # raw_decode does not skip leading whitespace itself
        _pos += 1
        continue
    try:
        _, _end = _decoder.raw_decode(all_data, _pos)
    except json.JSONDecodeError as err:
        # Keep what was read so far instead of losing everything.
        print(f"Stopped reading ocr-data.json at character {_pos}: {err}")
        break
    split.append(all_data[_pos:_end])
    _pos = _end

In [None]:
# Pretty print of OCR results

# # Use this if not reading from saved data
# for result in results:
#     json_output = result.export()

# Use this if reading from saved data
for item in split:
    # A chunk that is not valid JSON (for example an empty tail left by the
    # record separator) is skipped; only JSON errors are caught so that real
    # problems and keyboard interrupts still surface.
    try:
        json_output = json.loads(item)
    except json.JSONDecodeError:
        continue

    blocks = json_output["pages"][0]["blocks"]
    for block in blocks:
        for line in block["lines"]:
            words = line["words"]
            print("-------")
            kept_words = []   # values of words that are large and confident
            confidences = []  # confidences of all words that are large enough
            for word in words:
                geo = word["geometry"]
                # scaled area of the word's bounding box
                size = (geo[1][0] - geo[0][0]) * 100000 * (geo[1][1] - geo[0][1])
                if size > 35:
                    confidences.append(word["confidence"])
                    if word["confidence"] >= 0.5:
                        kept_words.append(word["value"])
            # Print the line only when its average confidence is high enough.
            if len(confidences) > 0:
                if sum(confidences)/ len(confidences) > 0.7:
                    print(" ".join(kept_words))
                # else:
                    # print("ERROR: " + " ".join(kept_words))
    print("\n\n\n\n")


In [None]:
# Creates low confidence data spreadsheet
# `with` closes the file even if a result is malformed part-way through.
with open("output.csv", "w") as file:
    for i, result in enumerate(results):
        json_output = result.export()
        blocks = json_output["pages"][0]["blocks"]
        for block in blocks:
            # Only the first line of each block is inspected.
            words = block["lines"][0]["words"]
            for word in words:
                if word["confidence"] >= 0.5:
                    continue
                geo = word["geometry"]
                # scaled area of the word's bounding box
                size = (geo[1][0] - geo[0][0]) * 100000 * (geo[1][1] - geo[0][1])
                conf = word["confidence"] * 100
                # The row template has four fields but only three values were
                # supplied, which raised IndexError on the first row. The
                # result index is written as the fourth column.
                # NOTE(review): confirm the index is the intended last field.
                file.write("{}\t{:.2f}\t{:.2f}\t{}\n".format(word["value"], size, conf, i))
print("done")

In [28]:
# Parse whitestudiocards to get title, desc, retrieval, etc.
def num_freq(searchee):
    """Return the fraction of characters in ``searchee`` that are numeric.

    Args:
        searchee: the string to inspect.

    Returns:
        A float between 0 and 1; 0.0 for an empty string (which previously
        raised ZeroDivisionError).
    """
    if len(searchee) == 0:
        return 0.0
    return sum(1 for char in searchee if char.isnumeric()) / len(searchee)

cards = {}
# toggle depending on reading from saved data
toggle = split
# toggle = results
for card_num, result in enumerate(toggle):
    # Resolve the card name and decode its OCR payload first. If either step
    # fails the entry is skipped; falling through would re-process the
    # previous card's blocks and corrupt that card's fields.
    try:
        name = order[card_num]
        # toggle depending on reading from saved data
        # json_output = result.export()
        json_output = json.loads(result)
        blocks = json_output["pages"][0]["blocks"]
    except (IndexError, KeyError, TypeError, ValueError) as err:
        print(f"Skipping OCR result #{card_num}: {err!r}")
        continue

    new_card = {"desc_list": [], "date": "", "year": []}
    cards[name] = new_card

    for i, block in enumerate(blocks):
        for line in block["lines"]:
            words = line["words"]
            # print("-------")
            kept_words = []   # values of words that are large and confident
            confidences = []  # confidences of all words that are large enough
            for word in words:
                geo = word["geometry"]
                # scaled area of the word's bounding box
                size = (geo[1][0] - geo[0][0]) * 100000 * (geo[1][1] - geo[0][1])
                if size > 35:
                    confidences.append(word["confidence"])
                    if word["confidence"] >= 0.5:
                        kept_words.append(word["value"])
            # Lines without usable words or with low average confidence are ignored.
            if len(confidences) == 0 or sum(confidences) / len(confidences) <= 0.7:
                # print("ERROR: " + " ".join(kept_words))
                continue

            text = " ".join(kept_words)
            # Set when a block-2 line is recognised as the retrieval number so
            # it is not also added to the description. Reset for every line,
            # which also covers cards whose first readable line is in block 3+.
            flagged = False
            if i == 0:
                new_card["job"] = text
            if i == 1 or i == 2:
                if num_freq(text) > .7:
                    # mostly digits: treated as the retrieval number
                    new_card["retrieval"] = text
                    if i == 2:
                        flagged = True
                else:
                    if "job" not in new_card:
                        new_card["job"] = text
                if "retrieve" in text.lower() and "retrieval" not in new_card:
                    # cut off "retrieve : "
                    new_card["retrieval"] = text[11:]

            if i >= 2:
                if "retrieve" not in text.lower() and not flagged:
                    matches = re.findall(r"(18\d{2}|19\d{2}|20\d{2})", text)
                    if len(matches) > 0:
                        new_card["date"] = text
                        for match in matches:
                            new_card["year"].append(int(match))
                        print(new_card["year"])

                    new_card["desc_list"].append(text)

    # NOTE(review): the offset advances in steps of 3 while entries are being
    # deleted, so not every trailing line is examined -- confirm the intent.
    for j in range(1, len(new_card["desc_list"]), 3):
        # line near last is mostly numbers, it's data that's not needed
        try:
            if num_freq(new_card["desc_list"][-j].replace(" ", "")) > .4:
                del new_card["desc_list"][-j]
            else:  # we've hit actual text
                break
        except (IndexError, ZeroDivisionError):
            # IndexError: earlier deletions shrank the list below j entries.
            # ZeroDivisionError: the entry may be empty once spaces are removed.
            pass


    new_card["description"] = " ".join(new_card["desc_list"])
    # print("File: " + name)
    # for key, value in new_card.items():
    #     if key != "desc_list":
    #         print(f"{key.upper()}: {value}")
    # print("\n")


[1926]
[1924]
[1935]
[1935]
[1925]
[1930]
[1936]
[1935]
[1921]
[1935]
[1923]
[1923]
[1925]
[1924]
[1930]
[1936]
[1930]
[1934]
[1936]
[1935]
[1925]
[1924]
[1932]
[1935]
[1923]
[1925]
[1921, 1922]
[1928]
[1917]
[2066]
[1930]
[1935]
[1925]
[1925, 1927]
[1926]
[1925]
[1925]
[1924]
[1916]
[1929]
[1932]
[1926]
[1920]
[1926]
[1926]
[1916]
[1935]
[1910]
[1928]
[1933]
[1928]
[1936]
[1928]
[1925]
[1926]
[1922]
[1918]
[1923, 1924]
[1923]
[1936]
[1933]
[1923]
[1909]
[1930]
[1930]
[1918]
[1918, 1917]
[1929]
[1931, 1932]
[1930]
[1920]
[1936]
[1931]
[1931]
[1923]
[1935]
[1921]
[1917]
[1931]
[1924]
[1930]
[1935]
[1925]
[1931]
[1925]
[1923]
[1923, 1922]
[1930]
[1920]
[1923]
[1924]
[1929]
[1929, 1925]
[1925]
[1928]
[1936]
[1910]
[1928]
[1935]
[1935]
[1920]
[1930]
[1931]
[1926]
[1935]
[1927]
[1925]
[1924]
[1923]
[1925, 1926]
[1935]
[1919]
[1925]
[1925]
[1935]
[1927]
[1926]
[1935]
[1929]
[1930]
[1931]
[1923, 1923]
[1924]
[1935]
[1924]
[1930]
[1926]
[1931]
[1926]
[1926, 1926]
[1925]
[1922]
[1936]
[1925]
[1

In [5]:
import requests, json
from requests.structures import CaseInsensitiveDict

# trying to interface with API of NYPL site to get list of 
# digitized whitestudio card collection. Turns out the API is broken lol.
def _as_list(value):
    """Return ``value`` as a list: a missing value becomes [], a lone dict is wrapped."""
    if value is None:
        return []
    if isinstance(value, dict):
        return [value]
    return value


def trawler(uuid):
    """Print the titles below a collection and recurse into its sub-collections.

    Fetches the collection ``uuid`` from the API, dumps the raw response,
    prints the title of every item in it (incrementing the global ``count``
    for each) and then calls itself for every listed sub-collection.

    Args:
        uuid: identifier of the collection to fetch.
    """
    global count, headers
    url = f"https://api.repo.nypl.org/api/v2/collections/{uuid}"
    resp = requests.get(url, headers=headers)
    res = resp.json()
    print(json.dumps(res, indent=4))
    response = res["nyplAPI"]["response"]

    # "item" may be absent, a single dict or a list of dicts.
    for item in _as_list(response.get("item")):
        count += 1
        print(item["mods"]["titleInfo"]["title"])

    # This used to be the `else` of the loop above, which runs on every call
    # because that loop never breaks; a plain loop states this directly. The
    # entry gets the same dict-or-list treatment as "item", so a single
    # sub-collection is no longer iterated key by key.
    for collection in _as_list(response.get("collection")):
        try:
            print("COLLECTION: " + collection["mods"]["titleInfo"]["title"])
            trawler(collection["uuid"])
        except (KeyError, TypeError, ValueError, requests.RequestException) as err:
            # Best effort: report the failing sub-collection and carry on.
            print(f"Skipping sub-collection: {err!r}")

headers = CaseInsensitiveDict()


token = ""
with open("api_token.txt", "r") as file:
    # Strip surrounding whitespace: a trailing newline in the token file would
    # otherwise be embedded in the Authorization header value.
    token = file.read().strip()

headers["Authorization"] = f"Token token=\"{token}\""

# Running total of items printed by trawler().
count = 0
# UUID of White Studio Collection
trawler("c54c7e80-c5ba-012f-7863-58d385a7bc34")
print(count)

{
    "nyplAPI": {
        "request": {
            "uuid": "c54c7e80-c5ba-012f-7863-58d385a7bc34"
        },
        "response": {
            "headers": {
                "status": "success",
                "code": "200",
                "message": "ok"
            },
            "numSubCollections": "1",
            "numItems": "497",
            "numResults": "497",
            "collection": {
                "apiUri": "https://api.repo.nypl.org/api/v2/mods/37ef4c00-c5bb-012f-528b-58d385a7bc34",
                "levelUri": "https://api.repo.nypl.org/api/v2/collections/37ef4c00-c5bb-012f-528b-58d385a7bc34",
                "grandParentLevelUri": "https://api.repo.nypl.org/api/v2/collections/7c22cac0-c5b8-012f-4613-58d385a7bc34",
                "uuid": "37ef4c00-c5bb-012f-528b-58d385a7bc34",
                "type": "Container",
                "numSubCollections": "0",
                "numItems": "1",
                "depth": "3",
                "parentUUIDs": "[\"c54c7e80-c5ba-01

In [None]:
# Gathering the digitized White Studio card titles (cleaning scraped web data).
titles = set()

# First source: keep lines longer than one character, minus a trailing period.
with open("input2.txt") as file:
    for raw_line in file.readlines():
        cleaned = raw_line.rstrip()
        if len(cleaned) <= 1:
            continue
        titles.add(cleaned[:-1] if cleaned[-1] == "." else cleaned)

# Second source: keep lines longer than two characters as they are.
with open("input.txt") as file:
    for raw_line in file.readlines():
        cleaned = raw_line.rstrip()
        if len(cleaned) > 2:
            titles.add(cleaned)

# Write the de-duplicated titles, one per line.
with open("whitestudio-digitized.txt", "w") as file:
    for title in titles:
        file.write(f"{title.rstrip()}\n")

In [None]:
# Matching White Studio cards to digital collections
word_pattern = re.compile(r'\b\w+\b')
# Lower-case and tokenise every title once instead of once per card.
title_words = [
    (title.lower(), set(word_pattern.findall(title.lower())))
    for title in titles
]

count = 0
for card in cards:
    job = cards[card].get("job")
    if job is None:
        # these cards have edge cases... will come back to later
        # Skip them; falling through would compare the previous card's job
        # again and report duplicate matches.
        # print(card, cards[card])
        continue
    job = job.lower()
    job_words = set(word_pattern.findall(job))
    for title, words in title_words:
        # A potential match shares more than two distinct words.
        match_count = len(job_words & words)
        if match_count > 2:
            count += 1
            print(f"Potential match #{count} found:\t{job} | {title}")
    # if match_count > limit and "non-player-character" not in name:
print(f"Total potential matches: {count}")

In [None]:
# %matplotlib widget

# Replace the card order with a single hard-coded name, then display every
# OCR result currently held in `results`.
order = ["greytiming1146"]
for ocr_result in results:
    ocr_result.show()