# TEI Tools

> Tools for working with TEI/XML

In [None]:
#| default_exp api

In [None]:
#| hide
from nbdev.showdoc import *
from pprint import pprint
import glob
# %matplotlib inline 
# import matplotlib.pyplot as plt

In [None]:
#| export
from bs4 import BeautifulSoup

class TeiTools:
    '''
    Static helpers for inspecting and transforming TEI/XML files.
    '''

    def __init__(self):
        '''
        This is the main class for the tei-tools package.

        Instances hold no state; every helper is a static method.
        '''

    @staticmethod
    def analyze(path, freq=None):
        '''
        Count the elements and attributes contained in a TEI/XML file.

        The result is a nested dict of the form
        `{top_level_child: {element: {attribute: count}}}`, where
        `top_level_child` is the name of a direct child of `<TEI>`.
        Elements that carry no attribute are counted under the key "none".

        Parameters:  
        * path: str  -  Path to the TEI/XML file  
        * freq: dict  -  A dictionary to store the results. It is updated in
          place so counts can be accumulated over several files; a new dict
          is created when omitted.

        Returns:  
        * dict  -  The updated frequency dictionary

        Raises:  
        * ValueError  -  If the document has no `<TEI>` element
        '''
        # A `{}` default would be created once and shared by every call,
        # silently accumulating counts across unrelated files.
        if freq is None:
            freq = {}

        # Binary mode lets the parser honour the encoding declared in the XML
        # prolog instead of the platform's locale encoding.
        with open(path, "rb") as f:
            soup = BeautifulSoup(f, "xml")

        root = soup.find("TEI")
        if root is None:
            raise ValueError(f"No <TEI> element found in {path!r}")

        for child in root.find_all(recursive=False):
            group = freq.setdefault(child.name, {})

            for element in child.find_all(recursive=True):
                counts = group.setdefault(element.name, {})

                # Attribute-less elements are tallied under the pseudo
                # attribute "none" so that they still appear in the result.
                for field in (element.attrs or ["none"]):
                    counts[field] = counts.get(field, 0) + 1

        return freq

    @staticmethod
    def visualize(data):
        '''
        Visualize the given analysis data (currently a no-op).

        Parameters:  
        * data: dict  -  Data to visualize

        Returns:  
        * None
        '''
        # NOTE: plotting is disabled because matplotlib is not imported by
        # this module. The disabled draft is kept below for reference: it sums
        # the attribute counts per element, sorts them in ascending order and
        # draws a horizontal bar chart.
        #
        # freq = {}
        # for group in data:
        #     for element in data[group]:
        #         for attr in data[group][element]:
        #             if element not in freq:
        #                 freq[element] = 0
        #             freq[element] += data[group][element][attr]
        #
        # freq = {k: v for k, v in sorted(freq.items(), key=lambda item: item[1])}
        #
        # plt.barh(list(freq.keys()), list(freq.values()))
        # plt.show()
        return None

    @staticmethod
    def analyzeDir(path):
        '''
        Count the elements and attributes of every XML file matching a glob pattern.

        Parameters:  
        * path: str  -  Glob pattern for the TEI/XML files e.g. "data/*.xml"
          (`**` is expanded recursively)

        Returns:  
        * dict  -  Frequencies accumulated over all matched files, in the
          same shape as the result of `analyze`; empty when nothing matches
        '''
        # Imported locally: the notebook only imports `glob` in a hidden,
        # non-exported cell, so the exported module would otherwise raise
        # NameError here.
        import glob

        freq = {}

        # Sorted so that the key order of the result does not depend on the
        # order in which the file system lists the files.
        for file in sorted(glob.glob(path, recursive=True)):
            TeiTools.analyze(file, freq=freq)

        return freq

    @staticmethod
    def addWordElement(path, target="TEI"):
        '''
        Wrap every character of the text inside `target` in a `<w>` element.

        Each non-blank text node below the first `target` element is replaced
        by a `<seg>` element containing one `<w>` per character. Leading and
        trailing whitespace of a text node is dropped.

        Parameters:  
        * path: str  -  Path to the TEI/XML file  
        * target: str  -  Target element name to add word element

        Returns:  
        * BeautifulSoup  -  The modified XML document

        Raises:  
        * ValueError  -  If the document has no `target` element
        '''
        # Comments, processing instructions, declarations etc. are string
        # nodes too; they all derive from PreformattedString.
        from bs4.element import PreformattedString

        with open(path, "rb") as f:
            soup = BeautifulSoup(f, "xml")

        root = soup.find(target)
        if root is None:
            raise ValueError(f"No <{target}> element found in {path!r}")

        # find_all returns a materialized list, so replacing nodes while
        # looping over it is safe.
        for node in root.find_all(string=True, recursive=True):
            # Do not turn comments or processing instructions into content.
            if isinstance(node, PreformattedString):
                continue

            text = str(node).strip()
            if not text:
                continue

            seg = soup.new_tag("seg")

            for char in text:
                w = soup.new_tag("w")
                w.string = char
                seg.append(w)

            node.replace_with(seg)

        return soup

    @staticmethod
    def save(path, soup):
        '''
        Write a BeautifulSoup object to the given path as pretty-printed XML.

        Parameters:  
        * path: str  -  Path of the TEI/XML file to write
        * soup: BeautifulSoup  -  The modified XML file
        '''
        # prettify() returns text; write it as UTF-8 regardless of the
        # platform's locale encoding.
        with open(path, "w", encoding="utf-8") as f:
            f.write(soup.prettify())

## 準備

<!-- サンプルデータとして、東京大学史料編纂所が公開しているTEI/XML（main.xml）をダウンロード -->

In [None]:
#| hide
# !git clone https://github.com/kouigenjimonogatari/kouigenjimonogatari.github.io example

In [None]:
# Fetch the sample TEI/XML file into example/.
# -p keeps mkdir from failing when the directory already exists (e.g. on re-runs).
!mkdir -p example
!wget https://www.hi.u-tokyo.ac.jp/collection/degitalgallary/wakozukan/tei/data/main.xml -O example/main.xml 

mkdir: example: File exists
--2022-09-22 18:32:08--  https://www.hi.u-tokyo.ac.jp/collection/degitalgallary/wakozukan/tei/data/main.xml
www.hi.u-tokyo.ac.jp (www.hi.u-tokyo.ac.jp) をDNSに問いあわせています... 52.192.145.245, 52.197.208.150
www.hi.u-tokyo.ac.jp (www.hi.u-tokyo.ac.jp)|52.192.145.245|:443 に接続しています... 接続しました。
HTTP による接続要求を送信しました、応答を待っています... 200 OK
長さ: 24518 (24K) [text/xml]
`example/main.xml' に保存中


2022-09-22 18:32:08 (1.51 MB/s) - `example/main.xml' へ保存完了 [24518/24518]



## 要素の分析

TEI/XMLに含まれる要素および属性を一覧します。

In [None]:
show_doc(TeiTools.analyze)

---

### TeiTools.analyze

>      TeiTools.analyze (path, freq={})

指定したファイルに含まれる要素および属性を抽出する

Parameters:  
* path: str  -  Path to the TEI/XML file  
* freq: dict  -  A dictionary to store the results

In [None]:
# Tally the elements and attributes of the sample file downloaded above.
input_path = "example/main.xml"
results = TeiTools.analyze(input_path)
# Bare expression so the notebook displays the nested result dict.
results

{'teiHeader': {'fileDesc': {'none': 1},
  'titleStmt': {'none': 1},
  'title': {'none': 1},
  'publicationStmt': {'none': 1},
  'publisher': {'none': 1},
  'sourceDesc': {'none': 1},
  'listPerson': {'none': 1},
  'person': {'xml:id': 9},
  'persName': {'none': 9},
  'note': {'type': 27, 'subtype': 1, 'source': 1},
  'listPlace': {'none': 1},
  'place': {'xml:id': 5},
  'placeName': {'none': 5}},
 'text': {'body': {'none': 1},
  'div': {'type': 3, 'none': 2},
  'ab': {'style': 14, 'type': 15, 'xml:id': 15, 'none': 2},
  'lb': {'style': 14, 'none': 6},
  'seg': {'type': 23, 'xml:id': 23},
  'add': {'xml:id': 10},
  'note': {'target': 4, 'type': 4},
  'p': {'none': 6},
  'persName': {'corresp': 9},
  'name': {'type': 3, 'xml:id': 3},
  'placeName': {'corresp': 5}},
 'facsimile': {'surface': {'source': 1},
  'zone': {'xml:id': 14, 'ulx': 14, 'uly': 14, 'lrx': 14, 'lry': 14}}}

In [None]:
show_doc(TeiTools.visualize)

---

### TeiTools.visualize

>      TeiTools.visualize (data)

指定したデータを可視化する

Parameters:  
* data: dict  -  Data to visualize

In [None]:
# visualize() currently has no plotting code enabled, so this call produces no output.
TeiTools.visualize(results)

In [None]:
show_doc(TeiTools.analyzeDir)

---

### TeiTools.analyzeDir

>      TeiTools.analyzeDir (path)

指定したパスに含まれるXMLファイルに含まれる要素および属性を抽出する

Parameters:  
* path: str  -  Path to the TEI/XML file e.g. "data/*.xml"

In [None]:
# Accumulate the counts over every file matched by the glob pattern.
# NOTE: the pattern also matches example/main2.xml once the addWordElement
# cell further below has been run, which adds <seg>/<w> counts to the result.
input_path = "example/*.xml"
results = TeiTools.analyzeDir(input_path)
# Bare expression so the notebook displays the nested result dict.
results

{'teiHeader': {'fileDesc': {'none': 2},
  'titleStmt': {'none': 2},
  'title': {'none': 2},
  'publicationStmt': {'none': 2},
  'publisher': {'none': 2},
  'sourceDesc': {'none': 2},
  'listPerson': {'none': 2},
  'person': {'xml:id': 18},
  'persName': {'none': 18},
  'note': {'type': 54, 'subtype': 2, 'source': 2},
  'listPlace': {'none': 2},
  'place': {'xml:id': 10},
  'placeName': {'none': 10},
  'seg': {'none': 131},
  'w': {'none': 2945}},
 'text': {'body': {'none': 2},
  'div': {'type': 6, 'none': 4},
  'ab': {'style': 28, 'type': 30, 'xml:id': 30, 'none': 4},
  'lb': {'style': 28, 'none': 12},
  'seg': {'type': 46, 'xml:id': 46, 'none': 190},
  'add': {'xml:id': 20},
  'note': {'target': 8, 'type': 8},
  'p': {'none': 12},
  'persName': {'corresp': 18},
  'name': {'type': 6, 'xml:id': 6},
  'placeName': {'corresp': 10},
  'w': {'none': 3569}},
 'facsimile': {'surface': {'source': 2},
  'zone': {'xml:id': 28, 'ulx': 28, 'uly': 28, 'lrx': 28, 'lry': 28},
  'seg': {'none': 17},
 

In [None]:
# visualize() currently has no plotting code enabled, so this call produces no output.
TeiTools.visualize(results)

## wordタグを付与する

In [None]:
show_doc(TeiTools.addWordElement)

---

### TeiTools.addWordElement

>      TeiTools.addWordElement (path, target='TEI')

指定したファイルに含まれるwordにタグを付与する

Parameters:  
* path: str  -  Path to the TEI/XML file  
* target: str  -  Target element name to add word element

Returns:  
* BeautifulSoup  -  The modified XML document

In [None]:
# Replace each non-blank text node with a <seg> holding one <w> per character,
# then write the result to a separate file so the downloaded original is kept.
input_path = "example/main.xml"
soup = TeiTools.addWordElement(input_path)
TeiTools.save("example/main2.xml", soup)

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()