# Imports

In [2]:
import requests
import json
import traceback
import itertools
import re
import random
import logging
import mysql.connector
from faker import Faker

from python.db_connection import DbConnection as DBC
from python.fakeinfgen import fakeInfos

## Create necessary instances and configs

In [4]:
# Module-level Faker instance.
fake = Faker()

In [6]:
# Log format: timestamp, severity and message. Only attributes present on every
# LogRecord are referenced, so plain logging calls (without an `extra` dict)
# can be formatted.
FORMAT = '%(asctime)-15s %(levelname)-8s %(message)s'
logging.basicConfig(format=FORMAT)

In [8]:
# Instance of the project's DbConnection class; its insert_* methods are called further below.
dbc = DBC()

# Functions

In [10]:
def flattenObj(obj):
    """Flatten a nested dict/list structure into a single-level dict.

    Keys of nested containers are joined with "_"; list elements are keyed by
    their position. Only ``str`` and ``int`` leaves are kept. Any input that is
    neither a dict nor a list yields an empty dict.
    """
    if not isinstance(obj, (dict, list)):
        return {}

    # a list is walked like a dict keyed by element position
    entries = enumerate(obj) if isinstance(obj, list) else obj.items()

    flat = {}
    for name, value in entries:
        if isinstance(value, (str, int)):
            flat[str(name)] = value

        # nested containers are flattened recursively and prefixed with the
        # current key; scalar values contribute nothing here
        for sub_name, sub_value in flattenObj(value).items():
            flat[f"{name}_{sub_name}"] = sub_value

    return flat

In [12]:
def nestedDictGet(item, *keys):
    """Walk ``item`` along ``keys`` and return the value found there.

    Args:
        item: Nested structure of dicts and lists (e.g. decoded JSON).
        *keys: Dict keys or list indexes, applied one after another. A list
            index may be given as an int or as a digit string.

    Returns:
        The value reached after applying all keys. If a ``str`` or ``int`` is
        reached before the keys are used up, that value is returned directly.
        ``None`` is returned when a key is missing, a dict value is falsy, a
        list index is not a valid position, or the current value cannot be
        traversed at all (e.g. ``None``).
    """
    nextBaseItem = item
    for key in keys:
        # lists can only be entered through a valid numeric position;
        # anything else means the path does not exist
        if isinstance(nextBaseItem, list):
            if str(key).isdigit() and int(key) < len(nextBaseItem):
                nextBaseItem = nextBaseItem[int(key)]
                continue
            return None

        # str and int are the information wanted, directly return it
        if isinstance(nextBaseItem, (str, int)):
            return nextBaseItem

        # only dicts can be looked up by key; avoids AttributeError on
        # None or other non-container values
        if not isinstance(nextBaseItem, dict):
            return None

        # a missing key and a falsy value are both treated as "not found"
        value = nextBaseItem.get(key)
        if not value:
            return None

        nextBaseItem = value

    return nextBaseItem

In [14]:
def extractListSubject(subjects):
    """Collect the distinct subject terms from a ``subject`` value.

    Args:
        subjects: A single string, a dict, a list (flattened with
            ``flattenObj`` before its entries are collected), or anything
            else (treated as "no subjects").

    Returns:
        A list of the collected values without duplicates, in the order in
        which they were first seen. Entries whose key contains "@authority"
        and entries whose value is a dict are skipped. The collected values
        must be hashable.
    """
    # a single subject is still returned as a list so that callers iterating
    # over the result get whole terms instead of single characters
    if isinstance(subjects, str):
        return [subjects]

    topics = []
    if isinstance(subjects, list):
        # flatten the whole list once; the resulting dict is handled by the
        # dict branch of the recursive call
        topics += extractListSubject(flattenObj(subjects))

    if isinstance(subjects, dict):
        for key, val in subjects.items():
            if "@authority" in key or isinstance(val, dict):
                continue
            topics += val if isinstance(val, list) else [val]

    # drop duplicates while keeping first-seen order, so the result is
    # identical from run to run (set ordering of strings is not)
    return list(dict.fromkeys(topics))

In [16]:
def getInformationFromDifferentPaths(flattend, PATHS):
    """Return the first non-None value stored under one of ``PATHS``.

    Each path is converted to ``str`` and looked up in ``flattend`` in the
    given order. ``None`` is returned when no path yields a value.
    """
    candidates = (flattend.get(str(path)) for path in PATHS)
    return next((found for found in candidates if found is not None), None)

In [18]:
def extractOneGenre(genre_flattend, GENRE_PATHS= ["0_#text", "#text"]):
    """Return the first genre value found under ``GENRE_PATHS`` in the flattened genre dict, or None."""
    genre = getInformationFromDifferentPaths(genre_flattend, GENRE_PATHS)
    return genre


In [20]:
def extractCreator(name_flattend, NAME_PATHS = ["0_namePart_0", "namePart_0", "0_namePart"]):
    """Return the first name value found under ``NAME_PATHS`` in the flattened name dict, or None."""
    creator_name = getInformationFromDifferentPaths(name_flattend, NAME_PATHS)
    return creator_name

In [22]:
def extractPublisher(publisher_flattend, PUBLISHER_PATHS = ["publisher_0", "publisher_1"]):
    """Return the first publisher value found under ``PUBLISHER_PATHS`` in the flattened dict, or None."""
    publisher_name = getInformationFromDifferentPaths(publisher_flattend, PUBLISHER_PATHS)
    return publisher_name

In [24]:
def extractDate(date_flattend, DATE_PATHS = ["dateCreated", "dateCreated_0_#text", "dateIssued", "dateIssued_#text", "dateIssued_0_#text", "dateIssued_0"]):
    """Return the first run of digits of the date found under ``DATE_PATHS``.

    ``None`` is returned when no (truthy) date value exists or when the value
    contains no digits.
    """
    raw_date = getInformationFromDifferentPaths(date_flattend, DATE_PATHS)
    if not raw_date:
        return None

    # keep only the digits because the value sometimes carries extra
    # characters such as a leading c (= circa)
    digits = re.search(r'\d+', str(raw_date))
    if digits is None:
        return None
    return digits.group(0)

In [26]:
def extractEdition(edition_flattend, EDITION_PATHS = ["edition"]):
    """Return the first run of digits of the edition found under ``EDITION_PATHS``.

    ``None`` is returned when no (truthy) edition value exists or when the
    value contains no digits.
    """
    raw_edition = getInformationFromDifferentPaths(edition_flattend, EDITION_PATHS)
    if not raw_edition:
        return None

    # keep only the first number contained in the edition text
    digits = re.search(r'\d+', str(raw_edition))
    if digits is None:
        return None
    return digits.group(0)

In [28]:
def extractAbstract(abstract_flattend, ABSTRACT_PATHS = ["#text", "0"]):
    """Return the first abstract value found under ``ABSTRACT_PATHS`` in the flattened dict, or None."""
    abstract_text = getInformationFromDifferentPaths(abstract_flattend, ABSTRACT_PATHS)
    return abstract_text

In [42]:
def extractInformation(item):
    """Extract the fields shared by all resource types from one record.

    Args:
        item: One record (nested dicts/lists) with optional "titleInfo",
            "name", "originInfo", "abstract", "subject" and "genre" entries.

    Returns:
        A tuple ``(title, subtitle, publish_date, abstract, creator,
        subjects, genre)``. Fields that cannot be found are ``None``, except
        ``subjects`` (result of ``extractListSubject``) and ``genre``, which
        falls back to a fake word when the record has no genre entry.
    """
    titleInfo_flattend = flattenObj(nestedDictGet(item, "titleInfo"))

    # title/subtitle sit either directly on titleInfo or on its first list entry
    title = getInformationFromDifferentPaths(titleInfo_flattend, ["title", "0_title"])
    subtitle = getInformationFromDifferentPaths(titleInfo_flattend, ["subTitle", "0_subTitle"])

    name_flattend = flattenObj(nestedDictGet(item, "name"))
    creator = extractCreator(name_flattend)

    date_flattend = flattenObj(nestedDictGet(item, "originInfo"))
    publish_date = extractDate(date_flattend)

    abstract_flattend = flattenObj(nestedDictGet(item, "abstract"))
    abstract = extractAbstract(abstract_flattend)

    subjects = extractListSubject(nestedDictGet(item, "subject"))

    # The local Faker has to exist and be seeded *before* the genre fallback
    # draws from it: assigning `fake` later in the function would make the
    # earlier use fail with UnboundLocalError. Formatting the seed with an
    # f-string also tolerates a missing (None) title.
    fake = Faker()
    Faker.seed(f"{title}{subjects}")

    genre_flattend = flattenObj(nestedDictGet(item, "genre"))
    genre = extractOneGenre(genre_flattend) if genre_flattend != {} else fake.word()

    return title, subtitle, publish_date, abstract, creator, subjects, genre

In [32]:
def extractPicture(item):
    """Run the common extraction for a picture record.

    The extracted values are only unpacked here; nothing is returned or
    stored, so the function returns ``None``.
    """
    extracted = extractInformation(item)
    title, subtitle, publish_date, abstract, creator, subjects, genre = extracted

In [34]:
def extractBook(item):
    """Build the value dicts for one book record.

    Values that the record does not provide are filled with generated fake
    data. The generators are seeded from the record's title, language and
    subjects.

    Args:
        item: One record (nested dicts/lists) describing a text resource.

    Returns:
        A tuple ``(val_schlagwort, val_verlag, val_buch, val_person,
        val_autor, val_sorte)``: a list of keyword dicts followed by the
        dicts for publisher, book, person, author and genre.
    """
    title, subtitle, publish_date, abstract, creator, subjects, genre = extractInformation(item)

    language_flattend = flattenObj(nestedDictGet(item, "language"))
    language = language_flattend.get("languageTerm_1_#text")

    originInfo_flattend = flattenObj(nestedDictGet(item, "originInfo"))
    edition = extractEdition(originInfo_flattend)

    publisher = extractPublisher(originInfo_flattend)

    # factor 100 to create cent values when dividing again
    random.seed(title)
    price = random.randint(100, 10000)/100

    # f-string formatting keeps the seed unchanged for string values but no
    # longer raises TypeError when title or language is None
    fake = Faker()
    Faker.seed(f"{title}{language}{subjects}")

    # a single subject string must be treated as one keyword, not iterated
    # character by character
    keywords = [subjects] if isinstance(subjects, str) else subjects

    val_verlag = {
        "Kurzname" : publisher,
        "Name" : fake.company(),
        "Postleitzahl" : fake.postalcode(),
        "Strasse" : fake.street_address(),
        "Internetadresse" : fake.domain_name(),
        "Beschreibung" : fake.paragraph()
    }

    val_schlagwort = [
        {"Wort": word if word else fake.word(), "Beschreibung": fake.paragraph()}
        for word in keywords
    ]

    val_buch = {
        "ISBN": fake.isbn13().replace("-",""),
        "Titel": title,
        "Untertitel": subtitle,
        "VerlagId": "",
        "Erscheinungsjahr": publish_date,
        "SorteId": "",
        "Kurzbeschreibung": abstract if abstract else fake.paragraph(),
        "Preis": str(price),
        "Auflage": edition,
        "Sprache": language
    }

    creator = creator if creator else fake.name()
    # first and last whitespace-separated part of the creator name
    name_parts = creator.split(" ")
    first_name, last_name = name_parts[0], name_parts[-1]
    val_person = {
        "Vorname": first_name,
        "Name": last_name,
        "Email": f"{first_name}@{last_name}.{fake.tld()}",
        "Geburtsdatum": fake.date_of_birth()
    }

    val_autor = {
        "PersonenId": "",
        "Beschreibung": fake.paragraph()
    }

    val_sorte = {
        "Name": genre,
        "Beschreibung": fake.paragraph()
    }

    return val_schlagwort, val_verlag, val_buch, val_person, val_autor, val_sorte

# Testing of extraction

In [39]:
# Fetch one page (250 records) of text resources. The timeout keeps the cell
# from blocking forever when the server does not answer.
r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=*&limit=250&sort=recordIdentifier&resourceType=text", timeout=60)
# Alternative queries used during testing (other resource types / search terms):
# r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=*&limit=250&sort=recordIdentifier")
#r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=*&resourceType=still%20image&limit=250&sort=recordIdentifier")
#r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=mona+lisa&resourceType=still%20image&limit=250&sort=recordIdentifier")
#r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=*&resourceType=still%20image&limit=250&sort=recordIdentifier")
#r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=*&limit=250&resourceType=moving%20image&sort=recordIdentifier")

In [40]:
# Report the HTTP status, decode the JSON body and report how many records it holds.
print(f"status_code: {r.status_code}")
obj = json.loads(r.text)
print(f"lenght: {len(obj['items']['mods'])}")

status_code: 200
lenght: 250


In [44]:
# Trial run: extract and insert only the first fetched record, dispatching on
# its resource type. Processing stops at the first failure.
for num, item in enumerate(obj["items"]["mods"][:1]):
    try:
        resource_type = item["typeOfResource"]
        if resource_type == "text":
            val_schlagwort, val_verlag, val_buch, val_person, val_autor, val_sorte = extractBook(item)
            dbc.insert_book(val_schlagwort, val_verlag, val_buch, val_person, val_autor, val_sorte)
        elif resource_type == "moving image":
            # title, subtitle, publish_date, abstract, creator, subjects
            dbc.insert_video(extractInformation(item))
        else:
            dbc.insert_bild(extractInformation(item))
    except Exception:
        print(f"item index: {num}")
        # format_exc() returns the traceback text; print_exc() returns None,
        # which would only log the string "None"
        logging.error(traceback.format_exc())
        logging.warning(item)
        break

In [37]:
def extractListSubject(subjects):
    """Collect the distinct subject terms from a ``subject`` value.

    NOTE: this redefines the function of the same name declared earlier in
    this file.

    Args:
        subjects: A single string, a dict, a list (flattened with
            ``flattenObj`` before its entries are collected), or anything
            else (treated as "no subjects").

    Returns:
        A list of the collected values without duplicates, in the order in
        which they were first seen. Entries whose key contains "@authority"
        and entries whose value is a dict are skipped. The collected values
        must be hashable.
    """
    # a single subject is still returned as a list so that callers iterating
    # over the result get whole terms instead of single characters
    if isinstance(subjects, str):
        return [subjects]

    topics = []
    if isinstance(subjects, list):
        # flatten the whole list once; the resulting dict is handled by the
        # dict branch of the recursive call
        topics += extractListSubject(flattenObj(subjects))

    if isinstance(subjects, dict):
        for key, val in subjects.items():
            if "@authority" in key or isinstance(val, dict):
                continue
            topics += val if isinstance(val, list) else [val]

    # drop duplicates while keeping first-seen order, so the result is
    # identical from run to run (set ordering of strings is not)
    return list(dict.fromkeys(topics))

# Ad-hoc check: run the book extraction on the record at index 6 of `obj`.
val_schlagwort, val_verlag, val_buch, val_person, val_autor, val_sorte = extractBook(obj["items"]["mods"][6])

# API Call (no functions)

In [47]:
# First request is only needed to learn the total number of records.
r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=*&limit=250&sort=recordIdentifier", timeout=60)
obj = json.loads(r.text)

# abort flag; initialised here so the check after the inner loop never hits
# an undefined name when a page is processed without errors
test_break = False

# obj.get("pagination").get("numFound") exists always
for start_value in range( obj.get("pagination").get("numFound") // 250 + 1):
    # progress output: number of records requested so far
    print(f"{(start_value+1)*250}")

    r = requests.get(f"https://api.lib.harvard.edu/v2/items.json?q=*&limit=250&start={start_value*250}&sort=recordIdentifier", timeout=60)
    obj = json.loads(r.text)

    # enumerate so the error message can name the failing entry of this page
    for num, item in enumerate(obj["items"]["mods"]):
        try:
            # NOTE(review): insert_book is called with six separate arguments
            # elsewhere in this file but receives a single tuple here -
            # confirm which signature DbConnection expects.
            dbc.insert_book(extractInformation(item))
        except Exception:
            print(f"\nError in Round {start_value} Entry {num}:\n\n")
            # format_exc() returns the traceback text; print_exc() returns None
            print(traceback.format_exc())
            print("\n\n")
            print(item)
            test_break = True
            break
    if test_break: break

9500
9750
10000
10250
10500
10750
11000
11250
11500
11750
12000
12250
12500


# Test single entries

In [37]:
# Re-fetch one specific record: page `start_value` (250 records per page),
# position `entry` within that page.
start_value = 37
entry = 200

# the timeout keeps the cell from blocking forever when the server does not answer
r = requests.get(f"https://api.lib.harvard.edu/v2/items.json?q=*&limit=250&start={start_value*250}&sort=recordIdentifier", timeout=60)
item = json.loads(r.text)["items"]["mods"][entry]

In [46]:
# Show the extraction result for the single record held in `item`.
extractInformation(item)

('Khan of Ezinepazar',
 None,
 None,
 None,
 None,
 ['battlements', 'pillars', 'portals', 'arches', 'windows', 'inscriptions'])