# Imports

In [1]:
import requests
import json
import traceback
import itertools
import re
import random
import logging
import mysql.connector
from faker import Faker
from base64 import a85encode

from python.db_connection import DbConnection as DBC
from python.fakeinfgen import fakeInfos

## Create neccesary instances and configs

In [2]:
# fake = Faker()

In [3]:
FORMAT = '%(asctime)-15s %(clientip)s %(user)-8s %(message)s'
logging.basicConfig(filename="/logs/reqs.log",
                    filemode='w',
                    format=FORMAT)

FileNotFoundError: [Errno 2] No such file or directory: '/logs/reqs.log'

In [4]:
dbc = DBC()

In [5]:
def timeit(method):
    import time
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()        
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % \
                  (method.__name__, (te - ts) * 1000))
        return result
    
    return timed

# Functions

In [6]:
def flattenObj(obj):
    dictVal = {}
    
    if isinstance(obj,(dict, list)):
        
        # add indexes to list for iteration
        obj = dict(enumerate(obj)) if isinstance(obj, list) else obj
        
        # loop over entrys and store them in new dict
        for key, val in obj.items():
            if isinstance(val,(str, int)):
                dictVal[str(key)] = val
            
            # flatten again of the new value is a dict or list
            flattened = flattenObj(val)
            for key_new, val_new in flattened.items():
                dictVal[f"{key}_{key_new}"] = val_new
    
    return dictVal

In [7]:
def nestedDictGet(item, *keys):
    nextBaseItem = item
    for key in keys:        
        # test if type is list and get the index
        # beforeand check if the given value is a 
        # integer and a valid position in the dict
        if isinstance(nextBaseItem, list):
            if str(key).isdigit():
                if int(key) < len(nextBaseItem):
                    nextBaseItem = nextBaseItem[int(key)]
                    continue
            # return none if the index ist not valid for the
            # list and is a not existend as a key in the dict
            elif nextBaseItem.get(key) is None:
                return None
        
        # str and int are the information wanted, directly return it
        if isinstance(nextBaseItem,(str,int)):
            return nextBaseItem
        
        # test if the key exists onthe dict
        # otherwise returns null
        if not nextBaseItem.get(key):
            return None
        
        # as a default get the next nested object based on the key
        nextBaseItem = nextBaseItem.get(key)
        
    return nextBaseItem

In [8]:
def extractListSubject(subjects):
    if isinstance(subjects, str):
        return subjects

    topics = []
    if isinstance(subjects, list):
        for item in subjects:
            flattened = flattenObj(subjects)
            topics += extractListSubject(flattened)
    
    if isinstance(subjects, dict):
        for key, val in subjects.items():
            if not key.find("@authority") >= 0 and not isinstance(val, dict):
                topics += val if isinstance(val,list) and not isinstance(val, dict) else [val] # extractListSubject(val))
    
    # have every topic only one time by 
    # first cating to set and then back to list
    return list(set(topics))

In [9]:
def getInformationFromDifferentPaths(flattend, PATHS):
    information = None
    for path in PATHS:
        val = flattend.get(str(path))
        if val is not None:
            information = val
            break
    return information

In [10]:
def extractOneGenre(genre_flattend, GENRE_PATHS= ["0_#text", "#text"]):
    return getInformationFromDifferentPaths(genre_flattend, GENRE_PATHS)


In [11]:
def extractCreator(name_flattend, NAME_PATHS = ["0_namePart_0", "namePart_0", "0_namePart"]):
    return getInformationFromDifferentPaths(name_flattend, NAME_PATHS)

In [12]:
def extractPublisher(publisher_flattend, PUBLISHER_PATHS = ["publisher_0", "publisher_1"]):
    return getInformationFromDifferentPaths(publisher_flattend, PUBLISHER_PATHS)

In [13]:
def extractDate(date_flattend, DATE_PATHS = ["dateCreated", "dateCreated_0_#text", "dateIssued", "dateIssued_#text", "dateIssued_0_#text", "dateIssued_0"]):
    publish_date = getInformationFromDifferentPaths(date_flattend, DATE_PATHS)
    
    # remove chars because sometimes c (=circa) is included
    search = re.search(r'\d+', str(publish_date))
    return search[0] if publish_date and search else None

In [14]:
def extractEdition(edition_flattend, EDITION_PATHS = ["edition"]):
    edition = getInformationFromDifferentPaths(edition_flattend, EDITION_PATHS)
    
    # remove chars because sometimes c (=circa) is included
    search = re.search(r'\d+', str(edition))
    return search[0] if edition and search else None

In [15]:
def extractAbstract(abstract_flattend, ABSTRACT_PATHS = ["#text", "0"]):
    return getInformationFromDifferentPaths(abstract_flattend, ABSTRACT_PATHS)

In [16]:
def extractURL(location_flattend, LOCATION_PATHS = ["0_url_1_#text"]):
    return getInformationFromDifferentPaths(location_flattend, LOCATION_PATHS)

In [17]:
def extractInformation(item):
    titleInfo_flattend = flattenObj(nestedDictGet(item, "titleInfo"))
    
    title = titleInfo_flattend.get("title") if titleInfo_flattend.get("title") is not None else titleInfo_flattend.get("0_title")
    subtitle = titleInfo_flattend.get("subTitle") if titleInfo_flattend.get("subTitle") is not None else titleInfo_flattend.get("0_subTitle")
    
    name_flattend = flattenObj(nestedDictGet(item, "name"))
    creator = creator = extractCreator(name_flattend)
        
    date_flattend = flattenObj(nestedDictGet(item, "originInfo"))
    publish_date = extractDate(date_flattend)
    
    abstract_flattend = flattenObj(nestedDictGet(item, "abstract"))
    abstract = extractAbstract(abstract_flattend)
    
    subjects = extractListSubject(nestedDictGet(item, "subject"))
    
    fake = Faker()
    Faker.seed(title + str(subjects))

    genre_flattend = flattenObj(nestedDictGet(item, "genre")) 
    genre = extractOneGenre(genre_flattend) if genre_flattend != {} else fake.word()
    
    return title, subtitle, publish_date, abstract, creator, subjects, genre

In [18]:
def extractPicture(item):
    title, subtitle, publish_date, abstract, creator, subjects, genre = extractInformation(item)
    
    # only gets small preview
    location_flattend = flattenObj(nestedDictGet(item, "location"))
    url = extractURL(location_flattend)
    image_a85 = a85encode(requests.get(url).content) if url else None

    fake = Faker()
    Faker.seed(title + str(subjects))

    val_sorte = {
        "Name": genre,
        "Beschreibung": fake.paragraph()
    }

    creator = creator if creator else fake.name()
    val_person = {
        "Vorname": creator.split(" ")[0],
        "Name": creator.split(" ")[-1],
        "Email": f"{creator.split(' ')[0]}@{creator.split(' ')[-1]}.{fake.tld()}",
        "Geburtsdatum": fake.date_of_birth()
    }

    val_maler = {
        "PersonenId": "",
        "Beschreibung": fake.paragraph()
    }

    val_nichttextmedien = {
        "Titel": title,
        "Untertitel": subtitle,
        "Erscheinungsjahr": publish_date,
        "Kurzbeschreibung": abstract if abstract else fake.paragraph(),
        "SorteId": "",
        "Typ": ""
    }

    val_bild = {
        "NichtTextMedienId": "",
        "Bild": image_a85,
        "MalerId": ""
    }

    return val_sorte, val_person, val_maler, val_nichttextmedien, val_bild



In [19]:
def extractVideo(item):
    title, subtitle, publish_date, abstract, creator, subjects, genre = extractInformation(item)

    language_flattend = flattenObj(nestedDictGet(item, "language"))
    language = language_flattend.get("languageTerm_1_#text")

    fake = Faker()
    Faker.seed(title+language+str(subjects))

    val_sorte = {
        "Name": genre,
        "Beschreibung": fake.paragraph()
    }

    val_nichttextmedien = {
        "Titel": title,
        "Untertitel": subtitle,
        "Erscheinungsjahr": publish_date,
        "Kurzbeschreibung": abstract if abstract else fake.paragraph(),
        "SorteId": "",
        "Typ": ""
    }

    val_video = {
        "NichtTextMedienId": "",
        "Sprache": language
    }

    return val_sorte, val_nichttextmedien, val_video


In [20]:
def extractBook(item):
    title, subtitle, publish_date, abstract, creator, subjects, genre = extractInformation(item)
    
    language_flattend = flattenObj(nestedDictGet(item, "language"))
    language = language_flattend.get("languageTerm_1_#text")
    
    originInfo_flattend = flattenObj(nestedDictGet(item, "originInfo"))
    edition = extractEdition(originInfo_flattend)
    
    publisher = extractPublisher(originInfo_flattend)
    
    # factor 100 to create cent values when dividing again
    random.seed(title)
    price = random.randint(100, 10000)/100

    fake = Faker()
    Faker.seed(title+language+str(subjects))

    val_verlag = {
        "Kurzname" : publisher,
        "Name" : fake.company(),
        "Postleitzahl" : fake.postalcode(),
        "Strasse" : fake.street_address(),
        "Internetadresse" : fake.domain_name(),
        "Beschreibung" : fake.paragraph()
    }

    val_schlagwort = [
        {   "Wort": word if word else fake.word(), "Beschreibung": fake.paragraph() } for word in subjects
    ]

    val_buch = {
        "ISBN": fake.isbn13().replace("-",""),
        "Titel": title,
        "Untertitel": subtitle,
        "VerlagId": "",
        "Erscheinungsjahr": publish_date,
        "SorteId": "",
        "Kurzbeschreibung": abstract if abstract else fake.paragraph(),
        "Preis": str(price),
        "Auflage": edition,
        "Sprache": language
    }

    creator = creator if creator else fake.name()
    val_person = {
        "Vorname": creator.split(" ")[0],
        "Name": creator.split(" ")[-1],
        "Email": f"{creator.split(' ')[0]}@{creator.split(' ')[-1]}.{fake.tld()}",
        "Geburtsdatum": fake.date_of_birth()
    }

    val_autor = {
        "PersonenId": "",
        "Beschreibung": fake.paragraph()
    }

    val_sorte = {
        "Name": genre,
        "Beschreibung": fake.paragraph()
    }
    
    return val_schlagwort, val_verlag, val_buch, val_person, val_autor, val_sorte

# Testing of extraction

In [25]:
# r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=*&limit=250&sort=recordIdentifier&resourceType=text")
# r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=*&limit=250&sort=recordIdentifier")
# r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=*&resourceType=still%20image&limit=250&sort=recordIdentifier")
#r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=mona+lisa&resourceType=still%20image&limit=250&sort=recordIdentifier")
#r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=*&resourceType=still%20image&limit=250&sort=recordIdentifier")
# r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=*&limit=250&resourceType=moving%20image&sort=recordIdentifier")

In [26]:
print("status_code: " + str(r.status_code))
obj = json.loads(r.text)
print("lenght: " + str(len(obj["items"]["mods"])))

status_code: 200
lenght: 250


In [27]:
for num, item in enumerate(obj["items"]["mods"]):
    try:
        if item["typeOfResource"] == "text":
            val_schlagwort, val_verlag, val_buch, val_person, val_autor, val_sorte = extractBook(item)
            dbc.insert_book(val_schlagwort, val_verlag, val_buch, val_person, val_autor, val_sorte)
        elif item["typeOfResource"] == "moving image":
            val_sorte, val_nichttextmedien, val_video = extractVideo(item)
            dbc.insert_video(val_sorte, val_nichttextmedien, val_video)
        else:
            val_sorte, val_person, val_maler, val_nichttextmedien, val_bild = extractPicture(item)
            dbc.insert_bild(val_sorte, val_person, val_maler, val_nichttextmedien, val_bild)
    except Exception as e:
        logging.error(traceback.print_exc())
        logging.warning(item)

text
'extractBook'  53.32 ms
text
'extractBook'  37.98 ms
text
'extractBook'  39.57 ms
text
'extractBook'  42.81 ms
text
'extractBook'  40.15 ms
text
'extractBook'  42.14 ms
text
'extractBook'  40.73 ms
text
'extractBook'  40.96 ms
text
'extractBook'  44.53 ms
text
'extractBook'  46.28 ms
text
'extractBook'  41.21 ms
text
'extractBook'  42.39 ms
text
'extractBook'  39.84 ms
text
'extractBook'  40.63 ms
text
'extractBook'  43.47 ms
text
'extractBook'  41.75 ms
text
'extractBook'  38.91 ms
text
'extractBook'  44.90 ms
text
'extractBook'  37.12 ms
text
'extractBook'  41.51 ms
text
'extractBook'  44.05 ms
text
'extractBook'  38.24 ms
text
'extractBook'  37.72 ms
text
'extractBook'  38.96 ms
text
'extractBook'  37.35 ms
text
'extractBook'  41.71 ms
text
'extractBook'  39.02 ms
text
'extractBook'  40.00 ms
text
'extractBook'  37.89 ms
text
'extractBook'  38.72 ms
text
'extractBook'  43.22 ms
text
'extractBook'  40.36 ms
text
'extractBook'  44.56 ms
text
'extractBook'  42.50 ms
text
'extractB

# API Call (no functions)

In [47]:
r = requests.get("https://api.lib.harvard.edu/v2/items.json?q=*&limit=250&sort=recordIdentifier")
obj = json.loads(r.text)

# obj.get("pagination").get("numFound") exists always
for start_value in range( obj.get("pagination").get("numFound") // 250 + 1):
    print(f"{(start_value+1)*250}")
    
    r = requests.get(f"https://api.lib.harvard.edu/v2/items.json?q=*&limit=250&start={start_value*250}&sort=recordIdentifier")
    obj = json.loads(r.text)
    
   for item in obj["items"]["mods"]:
    try:
        if item["typeOfResource"] == "text":
            val_schlagwort, val_verlag, val_buch, val_person, val_autor, val_sorte = extractBook(item)
            dbc.insert_book(val_schlagwort, val_verlag, val_buch, val_person, val_autor, val_sorte)
        elif item["typeOfResource"] == "moving image":
            val_sorte, val_nichttextmedien, val_video = extractVideo(item)
            dbc.insert_video(val_sorte, val_nichttextmedien, val_video)
        else:
            val_sorte, val_person, val_maler, val_nichttextmedien, val_bild = extractPicture(item)
            dbc.insert_bild(val_sorte, val_person, val_maler, val_nichttextmedien, val_bild)
    except Exception as e:
        logging.error(e)

9500
9750
10000
10250
10500
10750
11000
11250
11500
11750
12000
12250
12500
