In [211]:
import glob
import bs4
import json
import requests
from tqdm import tqdm
import os
import pprint
import copy

class Item:
    soup = None
    
    dirname = "tei3"
    uri_prefix = "https://nakamura196.github.io/saji"
    
    dict4div1 = {}
    
    metadata = {
        "element": "item",
        "created": "9999-99-99",
        "children": [],
        "note" : []
    }
    
    def __init__(self, file):
        soup = bs4.BeautifulSoup(open(file), 'xml')
        
        self.metadata["title"] = file.split("/")[-1].split(".")[0]
        
        self.soup = soup
        # self.dict4div1 = getDict4div1()
        
        with open('../docs/etc/dict.json', encoding="utf8") as f:
            map = {}
            df = json.load(f)
            for key in df:
                map[key.upper()] = df[key].upper()
            self.dict4div1= map
            
        self.getCanvases()
            
    def getValueAndAttrs(self, e):
        item = {
            "text": e.text
        }
        
        map = e.attrs
        
        for key in map:
            item[key] = map[key]
            
        return item
    
    def extractElements(self, div, type):
        values = []
        if not div:
            return values
        metadata= div.find_all(type)
        for value in metadata:
            # self.metadata[type].append(value.text)
            values.append(self.getValueAndAttrs(value))
        return values
            
    def extractDates(self, div):
        values = []
        metadata= div.find_all("date")
        for value in metadata:
            item = {
                "value" : value.text
            }
            
            '''
            if value.get("when-custom"):
                item["when-custom"] 
            wc = value.get("when-custom")
            print("wc", wc)
            if wc:
                self.metadata["date"].append(wc)
            '''
            
            map = value.attrs
            
            for key in map:
                item[key] = map[key]
                    
            values.append(item)
        return values
                
    def extractCerts(self, div):
        values = []
        metadata= div.find_all("date")
        for value in metadata:
            if value.get("cert"):
                value = "date_cert_" + value.get("cert")
                # self.metadata.cert.append(value)
                values.append(value)
                
        return values
                
    def extractCreated(self):
        # created = self.metadata.created
        # if len(created) > 0:
        created = self.metadata["created"] 
        if False:
            created = "2022-02-26"
        else:
            pass
            # self.metadata["created"] = "unknown"
            
    def extractDivs(self):
        self.metadata["children"] = self.extractDiv1s()
            
    def extractDiv1s(self):
        children = []
        divs = self.soup.find_all("div1")
        for div in divs:
            if div.get("type"):
                value = div.get("type")
                element = "div1"
                
                div_new = {
                    "element" : element,
                    "type": value,
                    "type_formatted": self.getFormattedType(value),
                    "children" : self.extractDiv2s(div),
                    "fulltext" : self.extractFullText(div),
                    "cert" : self.extractCerts(div),
                    "date" : self.extractDates(div),
                    "persName": self.extractElements(div, "persName"),
                    "placeName": self.extractElements(div, "placeName"),
                    "title": self.extractDivTitle(div, element),
                }
                
                # 画像情報の取得
                imageInfo = self.extractImageInfo(div)
                for key in imageInfo:
                    div_new[key] = imageInfo[key]
                
                children.append(div_new)
        return children
        
    
    def getFormattedType(self, value):
        for key in self.dict4div1:
            if key in value.upper():
                return self.dict4div1[key]
            
        return "[Missing] "+value.upper()
                
    def extractDiv2s(self, div_):
        children = []
        divs = div_.find_all("div2")
        for div in divs:
            if div.get("type"):
                element = "div2"
                value = div.get("type")
                div_new = {
                    "element" : element,
                    "type": value,
                    "type_formatted": self.getFormattedType(value),
                    "children" :self.extractDiv3s(div),
                    
                    
                    "fulltext" : self.extractFullText(div),
                    "cert" : self.extractCerts(div),
                    "date" : self.extractDates(div),
                    "persName": self.extractElements(div, "persName"),
                    "placeName": self.extractElements(div, "placeName"),
                    "title": self.extractDivTitle(div, element)
                }
                
                imageInfo = self.extractImageInfo(div)
                for key in imageInfo:
                    div_new[key] = imageInfo[key]
                
                children.append(div_new)
        return children

    def extractDivTitle(self, div, type):
        id = "None"
        if div.get("facs"):
            id = div.get("facs").replace("#", "")
        return self.metadata["title"] + "-" + type + "-" + id
    
    def extractDiv3s(self, div_):
        children = []
        divs = div_.find_all("div3")
        for div in divs:
            if div.get("type"):
                value = div.get("type")
                element = "div3"
                div_new = {
                    "element" : element,
                    "type": value,
                    "type_formatted": self.getFormattedType(value),
                    "fulltext" : self.extractFullText(div),
                    "cert" : self.extractCerts(div),
                    "date" : self.extractDates(div),
                    "persName": self.extractElements(div, "persName"),
                    "placeName": self.extractElements(div, "placeName"),
                    "title": self.extractDivTitle(div, element)
                }
                
                imageInfo = self.extractImageInfo(div)
                for key in imageInfo:
                    div_new[key] = imageInfo[key]
                
                children.append(div_new)
        return children
    
    def extractNotes(self):
        notesStmt = self.soup.find("notesStmt")
        values = self.extractElements(notesStmt, "note")
        self.metadata["note"] = values
        
    def attachFullText(self):
        self.metadata["fulltext"] = self.extractFullText(self.soup)
                
    def extractFullText(self, div):
        fulltext = div.text.strip() # .replace("\n", " ").strip()
        return fulltext
    
    '''
    def extractFullText_org(self, div=self.soup):
        fulltext = self.soup.text.replace("\n", " ").strip()
        self.metadata["fulltext"] = fulltext
    '''
    
    def getCanvases(self):
        
        manifest = self.soup.find("surfaceGrp").get("facs")
        
        id = manifest.split("/")[-2]
        
        file = "../docs/iiif/" + id + "/manifest.json"
        
        if not os.path.exists(file):
        
            df = requests.get(manifest).json()
            
            fw = open(file, 'w')
            json.dump(df, fw, ensure_ascii=False, indent=4,
                    sort_keys=True, separators=(',', ': '))
            fw.close()
            
        with open(file) as f:
            df = json.load(f)
            
        canvases = df["sequences"][0]["canvases"]
        
        map = {}
        for i in range(len(canvases)):
            canvas = canvases[i]
            map[canvas['@id']] = canvas["images"][0]["resource"]["service"]["@id"]
            
            if i == 0:
                self.metadata["canvas"] = canvas["@id"]
        self.canvases = map
        
        # 以下、重複。要検討。
        self.manifest =  manifest
        self.metadata["manifest"] = manifest
        
    def extractMedia(self):
        self.metadata["thumbnail"] = self.soup.find("graphic").get("url").replace("/original/", "/medium/")
        self.metadata["tei_url"] = self.uri_prefix + "/"+ self.dirname+"/" + self.metadata["title"] + ".xml"
        
    def extractSourceDesc(self):
        sourceDesc = self.soup.find("sourceDesc").find("p")
        # print(sourceDesc.text)
        try:
            metadata_json = json.loads(sourceDesc.text)
            for field in metadata_json:
                value_array = metadata_json[field]
                field_fixed = field.replace("saji:", "")
                self.metadata[field_fixed] = value_array
        except Exception as e:
            print(e)
            
    def extractImageInfo(self, div):
        facs_id = div.get("facs")
        
        if not facs_id:
            return {}
        
        facs_ids = facs_id.replace("#", "").split(" ")
        
        # 二つのIDの場合、どうするか
        
        for facs_id in facs_ids:
        
            zone = self.soup.find(attrs={"xml:id" : facs_id})
            
            # zoneがない場合が存在します。
            if not zone:
                continue

            surface = zone.parent

            graphic = surface.find("graphic")

            canvas_uri = graphic.get("n")

            ulx = int(zone.get("ulx"))
            uly = int(zone.get("uly"))
            lrx = int(zone.get("lrx"))
            lry = int(zone.get("lry"))

            x = ulx
            y = uly

            w = lrx - x
            h = lry - y

            member_id = canvas_uri + "#xywh=" + str(x) + ","+str(y)+","+str(w)+","+str(h)

            canvases = self.canvases

            image = canvases[canvas_uri] + "/{},{},{},{}/200,/0/default.jpg".format(x, y, w, h)

            return {
                "thumbnail" : image,
                "member" : member_id,
                "canvas" : canvas_uri,
                # "manifest": self.manifest
            }
        
        return {}
        
            
    def convert2json(self):
        return self.metadata
    
    @staticmethod
    def getDateValue(_date):
        """
        dateノードのValueの取得

        Parameters
        --------------
        date : xml element

        Returns
        -------
        dateValue : string
            XXXX-XX-XXの形の日付文字列

        """

        dateValue = None

        if _date.get("when-custom"):
            dateValue = _date.get("when-custom")
        elif _date.get("from-custom"):
            dateValue = _date.get("from-custom")
        elif _date.get("to-custom"):
            dateValue = _date.get("to-custom")

        return dateValue
    
    @staticmethod
    def getDateType(_date):
        """
        dateノードのタイプの取得

        Parameters
        --------------
        date : xml element

        Returns
        -------
        n_type : string
            ノードのタイプ。when, from, toなど。

        """

        type = None

        if _date.get("when-custom"):
            type = "when"
        elif _date.get("from-custom"):
            type = "from"
        elif _date.get("to-custom"):
            type = "to"

        return type
    
    @staticmethod
    def getDataValueAndType(_date):
            return getDateValue(_date), getDateType(_date)


def handleFile(file):
    
    
    item = Item(file)
    item.extractMedia()
    item.extractCreated()
    item.extractDivs()
    item.extractNotes()
    item.extractSourceDesc()
    item.attachFullText()
    
    return copy.deepcopy(item.convert2json())

dirname = "tei3"
dir = "../docs/"+dirname
files = glob.glob(dir+"/*.xml")

items = []

for i in tqdm(range(len(files))):
    file = files[i]
    
    res = handleFile(file)
    
    # pprint.pprint(res)
    
    items.append(res)
    
    # print(res)
    
    # break
    
fw = open("data/items.json", 'w')
json.dump(items, fw, ensure_ascii=False, indent=4,
        sort_keys=True, separators=(',', ': '))
fw.close()

fw = open("data/items.min.json", 'w')
json.dump(items, fw)
fw.close()

 11%|████▍                                    | 116/1060 [00:00<00:01, 576.07it/s]

Invalid control character at: line 6 column 92 (char 528)


 38%|███████████████▋                         | 404/1060 [00:00<00:01, 539.86it/s]

Invalid control character at: line 9 column 95 (char 786)
Invalid control character at: line 5 column 81 (char 451)
Invalid control character at: line 6 column 100 (char 563)


 54%|██████████████████████                   | 570/1060 [00:01<00:00, 512.00it/s]

Invalid control character at: line 6 column 101 (char 570)
Invalid control character at: line 4 column 91 (char 354)
Invalid control character at: line 6 column 97 (char 549)
Invalid control character at: line 7 column 94 (char 643)


 68%|███████████████████████████▉             | 723/1060 [00:01<00:00, 485.26it/s]

Invalid control character at: line 6 column 100 (char 547)
Invalid control character at: line 8 column 100 (char 743)


 95%|██████████████████████████████████████  | 1007/1060 [00:02<00:00, 432.58it/s]

Expecting value: line 1 column 86 (char 85)
Invalid control character at: line 6 column 99 (char 506)


100%|████████████████████████████████████████| 1060/1060 [00:02<00:00, 483.63it/s]


Invalid control character at: line 5 column 61 (char 409)


In [213]:
with open('data/items.json') as f:
    df = json.load(f)

selections = []
for item in df:
    members = []
    manifest = item["manifest"]
    selection = {
        "@id": "https://nakamura196.github.io/saji/data/curation.json/range1",
        "@type": "sc:Range",
        "label": "Automatic curation by TEI",
        "members": members,
        "within" : {
            "label" : item["title"],
            "@type" : "sc:Manifest",
            "@id" : manifest
        }
    }
    
    print(item["title"])
    
    related = "https://tei-eaj.github.io/aozora_tei/tools/visualization/facs/?url=" + item["tei_url"]
    
    div1s = item["children"]
    
    for div1 in div1s:
        
        if "member" in div1:
        
            member1 = {
                "label": div1["title"],
                "@type": "sc:Canvas",
                "@id": div1["member"],
                "metadata": [
                   {
                       "label": "type",
                       "value": div1["type"]
                   },
                    {
                       "label": "type_formatted",
                       "value": div1["type_formatted"]
                   },
                    {
                       "label": "element",
                       "value": div1["element"]
                   },
                ],
                "thumbnail": item["thumbnail"],
                "related ": related 
            }
            members.append(member1)
        
        div2s = div1["children"]
        
        for div2 in div2s:
            div3s = div2["children"]
            
            for div3 in div3s:
                member3 = {
                    
                }
                # members.append(member3)
    
    # for key in item:
    
    member = {
        "label": item["title"],
        "@type": "sc:Canvas",
        "@id": item["canvas"],
        "metadata": [
           {
               "label": "element",
               "value": item["element"]
           }
        ],
        "thumbnail": item["thumbnail"],
        "related": related
    }
    members.append(member)
        
    
    selections.append(selection)
    
curation = {
    "@context": [
        "http://iiif.io/api/presentation/2/context.json",
        "http://codh.rois.ac.jp/iiif/curation/1/context.json"
    ],
    "@id": "https://nakamura196.github.io/saji/data/curation2.json",
    "@type": "cr:Curation",
    "selections" : selections,
    "label": "オスマン・トルコ語文書群のデータ整理"
}

fw = open("../docs/data/curation2.json", 'w')
json.dump(curation, fw, ensure_ascii=False, indent=4,
        sort_keys=True, separators=(',', ': '))
fw.close()

fw = open("../docs/data/curation2.min.json", 'w')
json.dump(curation, fw)
fw.close()

DSCN0586
DSCN2391
DSCN2385
DSCN0592
DSCN2434
DSCN1883
DSCN9207
DSCN9213
DSCN0237
DSCN2408
DSCN1673
DSCN2187
DSCN0974
DSCN9005
DSCN0747
DSCN2150
DSCN9011
DSCN9039
DSCN0196
DSCN0182
DSCN0633
DSCN9171
DSCN0155
DSCN0141
DSCN2030
DSCN9165
DSCN2018
DSCN0169
DSCN9159
DSCN0394
DSCN1934
DSCN1908
DSCN0357
DSCN2226
DSCN9373
DSCN9367
DSCN1060
DSCN1712
DSCN2233
DSCN9366
DSCN0342
DSCN9372
DSCN2227
DSCN1921
DSCN0381
DSCN0395
DSCN2019
DSCN0626
DSCN2031
DSCN2025
DSCN0154
DSCN0829
DSCN2179
DSCN9038
DSCN2145
DSCN0746
DSCN0975
DSCN0961
DSCN0949
DSCN2192
DSCN2409
DSCN0578
DSCN2347
DSCN9212
DSCN0550
DSCN0236
DSCN2421
DSCN0222
DSCN9206
DSCN1882
DSCN0544
DSCN2353
DSCN1869
DSCN1855
DSCN2390
DSCN1841
DSCN1699
DSCN1857
DSCN1843
DSCN9210
DSCN2345
DSCN2351
DSCN0546
DSCN1658
DSCN0220
DSCN2379
DSCN1664
DSCN2184
DSCN2190
DSCN2147
DSCN0181
DSCN2033
DSCN0624
DSCN0618
DSCN0383
DSCN1937
DSCN0397
DSCN9364
DSCN2231
DSCN9370
DSCN9358
DSCN0368
DSCN2218
DSCN9359
DSCN1705
DSCN0433
DSCN9371
DSCN2224
DSCN2230
DSCN1739
DSCN1936
D