<a href="https://colab.research.google.com/github/lucagiovannini7/baroque-networks/blob/main/Textgrid_to_Dracor_Converter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TextGrid-to-DraCor Conversion Script

* Input: links to XML files from the TextGrid Repository
* Output: (almost) DraCor-ready XML files

Modified version of the [Emothe-to-Dracor conversion script](https://github.com/lucagiovannini7/baroque-networks/blob/main/MassConvertEmoThe.ipynb).

Written mostly by [Daniil Skorinkin](https://github.com/DanilSko), with some help from [Luca Giovannini](https://github.com/LucaGiovannini7), 2022.

## Imports

In [None]:
!pip install wget

import os
import re
import subprocess
import time
from datetime import datetime
from string import punctuation

import requests
import wget
from bs4 import BeautifulSoup, Tag
from tqdm import tqdm

## Reading data

In [None]:
# URLs of the TextGrid XML files to download and convert.
xmls_addresses = [
    'http://textgridlab.org/1.0/tgcrud-public/rest/textgrid:sssj.0/data',
    'http://textgridlab.org/1.0/tgcrud-public/rest/textgrid:n2m1.0/data',
]

In [None]:
# Sanity check: number of source URLs listed in xmls_addresses.
len(xmls_addresses)

In [None]:
# Create the download directory; exist_ok keeps this cell re-runnable
# (a plain `mkdir` fails once the directory exists).
os.makedirs('source_xmls', exist_ok=True)

In [None]:
# Download every source XML listed in xmls_addresses into source_xmls/.
for url in tqdm(xmls_addresses):
    # e.g. '.../textgrid:sssj.0/data' -> 'sssj.0_data'
    filename = url.split(':')[-1].replace('/','_')
    time.sleep(0.1)  # short pause between consecutive requests
    # NOTE(review): verify=False switches off TLS certificate validation for any
    # https request (e.g. after a redirect) - confirm it is still needed.
    response = requests.get(url, verify=False, timeout=60)
    # Stop on HTTP errors instead of saving an error page as if it were a play.
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    filetext = response.text
    with open(f'source_xmls/{filename}','w', encoding='utf-8') as filetowrite:
        filetowrite.write(filetext)

## Transformation of files

### Transformation functions

In [None]:
def add_partic_desc(soup):
    """Build a particDesc from the `who` attributes already present on <sp> elements.

    Collects one (who, speaker label) pair for every <sp> that has both a
    `who` attribute and a <speaker> child, then hands the pairs to
    add_particdesc_to_header.

    soup: parsed document (BeautifulSoup); modified in place.
    """
    set_of_char_pairs = set()  # set of (character ID, speaker label) pairs
    for sp in soup.find_all('sp'):
        speaker = sp.speaker
        # Speeches lacking an ID or a <speaker> cannot contribute a pair; skip
        # them explicitly rather than swallowing every exception.
        if 'who' not in sp.attrs or speaker is None:
            continue
        set_of_char_pairs.add((sp['who'], speaker.text.strip('.: ')))
    add_particdesc_to_header(soup, set_of_char_pairs)

In [None]:
def guess_gender(persName):
    """Guess a character's sex from the last letter of its name.

    The name is lower-cased and stripped of leading/trailing '.:[]()'
    characters; a result ending in 'a' or 'e' gives 'FEMALE', anything else
    (including an empty name) gives 'MALE'.
    """
    normalized = persName.lower().strip('.:[]()')
    looks_feminine = normalized.endswith(('a', 'e'))
    return 'FEMALE' if looks_feminine else 'MALE'

In [None]:
# Elements deleted together with their content (see remove_tags).
tags_to_remove = ['sponsor', 'funder', 'appInfo',
                  'respStmt', 'principal', 'notesStmt',
                  'encodingDesc', 'langUsage', 'pubPlace',
                  'extent', 'authority', 'editionStmt', 'creation', 'lb'
                 ]

# Elements deleted only when their direct parent has the given name
# (see remove_tags_parent_check).
tags_to_check_parent_and_remove = {'date':'publicationStmt'}

# Element name -> attributes to delete from every such element (see remove_attrs).
tags_with_attrs_to_remove = {'TEI':['xmlns:','xmlns:jxb','xmlns:tei','xmlns:tgl','xmlns:tgr','xmlns:tgs','xmlns:tns','xmlns:xsi'],
                             'teiHeader':['xmlns:xi','xmlns:fn','xmlns:a'],
                             'head':['type', 'xml:id'], 'keywords':['scheme'],
                             'stage':['xml:id','rend'], 'editor':['role'],
                             'p':['type','xml:id', 'n', 'rend'], 'sp':['xml:id'], 'l':['xml:id', 'rend'],
                             'lb':['xml:id'], 'castItem':['xml:id','rend'],'pb':['xml:id','type'],
                             'div':['xml:id', 'n', 'type', 'subtype'],'speaker':['xml:id'],
                             'role':['xml:id'], 'seg':['xml:id', 'rend'],'hi':['rend','xml:id']}

# Attribute values to rewrite (see rename_attrs).
# Leftover from the Emothe converter; the entry used to be listed twice.
attrs_to_be_renamed = [
    {'tag':'div', 'attr':'type',
     'old_value':'elenco',
     'new_value':'Dramatis_Personae'}
]

# Old element name -> (new element name, attributes to set)
# (see rename_tags_and_add_attr).
tags_to_rename_and_add_attr = {'div1':('div', {'type':'act'}), 'div2':('div', {'type':'scene'})}

# Elements deleted when the given attribute has the given value
# (see check_attr_and_remove). Leftovers from the Emothe converter.
tags_to_check_attr_and_remove = [
    {'tag':'author', 'attr':'ana',
     'value':'fiable'},
    {'tag':'author', 'attr':'key',
     'value':'archivo'},
    {'tag':'title', 'attr':'key',
     'value':'archivo'},
    {'tag':'title', 'attr':'key',
     'value':'orden'}
]

# Elements replaced by their own children (see replace_with_children).
tags_to_replace_with_children = ['lg','hi','seg']

In [None]:
def remove_tags(soup, tags_to_remove):
    """Delete every element whose name is listed, together with its content.

    soup: parsed document (BeautifulSoup); modified in place.
    tags_to_remove: iterable of element names.
    """
    for tag_name in tags_to_remove:
        # find_all is the current name of the deprecated camelCase alias findAll.
        for tag_instance in soup.find_all(tag_name):
            tag_instance.decompose()

In [None]:
def remove_tags_parent_check(soup, tags_to_check_parent_and_remove):
    """Delete elements of a given name only when their direct parent matches.

    soup: parsed document (BeautifulSoup); modified in place.
    tags_to_check_parent_and_remove: dict mapping an element name to the
        parent name under which that element is deleted.
    """
    for tag_name, parent_name in tags_to_check_parent_and_remove.items():
        # find_all is the current name of the deprecated camelCase alias findAll.
        for tag_instance in soup.find_all(tag_name):
            parent = tag_instance.parent
            # An element without a parent cannot match any parent name.
            if parent is not None and parent.name == parent_name:
                tag_instance.decompose()

In [None]:
def rename_attrs(soup, attrs_to_be_renamed):
    """Rewrite attribute values on matching elements.

    soup: parsed document (BeautifulSoup); modified in place.
    attrs_to_be_renamed: iterable of dicts with the keys 'tag', 'attr',
        'old_value' and 'new_value'; every <tag> whose attr equals old_value
        gets that attribute set to new_value.
    """
    for rule in attrs_to_be_renamed:
        attrname = rule['attr']
        # find_all is the current name of the deprecated camelCase alias findAll.
        matches = soup.find_all(rule['tag'], attrs={attrname: rule['old_value']})
        for tag_instance in matches:
            tag_instance[attrname] = rule['new_value']

In [None]:
def remove_attrs(soup, tags_with_attrs_to_remove):
    """Delete the listed attributes from every element of the given names.

    soup: parsed document (BeautifulSoup); modified in place.
    tags_with_attrs_to_remove: dict mapping an element name to the list of
        attribute names to delete from each such element.
    """
    for tag_name, attrs in tags_with_attrs_to_remove.items():
        # find_all is the current name of the deprecated camelCase alias findAll.
        for tag_instance in soup.find_all(tag_name):
            for attr in attrs:
                del tag_instance[attr]

In [None]:
def check_attr_and_remove(soup, tags_to_check_attr_and_remove):
    """Delete elements that carry a given attribute value.

    soup: parsed document (BeautifulSoup); modified in place.
    tags_to_check_attr_and_remove: iterable of dicts with the keys 'tag',
        'attr' and 'value'; every <tag> whose attr equals value is deleted
        together with its content.
    """
    for rule in tags_to_check_attr_and_remove:
        # find_all is the current name of the deprecated camelCase alias findAll.
        matches = soup.find_all(rule['tag'], attrs={rule['attr']: rule['value']})
        for tag_instance in matches:
            tag_instance.decompose()

In [None]:
def rename_tags_and_add_attr(soup, tags_to_rename_and_add_attr):
    """Rename elements and set fixed attributes on the renamed elements.

    soup: parsed document (BeautifulSoup); modified in place.
    tags_to_rename_and_add_attr: dict mapping an old element name to a
        (new name, {attribute: value}) pair.
    """
    for tagtofind, spec in tags_to_rename_and_add_attr.items():
        newname = spec[0]
        newattrs = spec[1]
        # find_all is the current name of the deprecated camelCase alias findAll.
        for tag_instance in soup.find_all(tagtofind):
            tag_instance.name = newname
            for attr, value in newattrs.items():
                tag_instance[attr] = value

In [None]:
def replace_with_children(soup, tags_to_replace_with_children):
    """Replace every element of the listed names by its own children.

    soup: parsed document (BeautifulSoup); modified in place.
    tags_to_replace_with_children: iterable of element names.
    """
    for tag_name in tags_to_replace_with_children:
        # find_all / unwrap are the current names of the deprecated camelCase
        # aliases findAll / replaceWithChildren.
        for tag_instance in soup.find_all(tag_name):
            tag_instance.unwrap()

In [None]:
## creating an artificial particDesc

def add_who_and_partic(soup):
    """Assign a `who` ID to every speech and build the particDesc from them.

    Calls add_who on each <sp>, collects one (who, speaker label) pair per
    speech and passes the pairs to add_particdesc_to_header.

    soup: parsed document (BeautifulSoup); modified in place.
    """
    set_of_char_pairs = set() # set of (character id, character name) pairs for particDesc
    for sp in soup.find_all('sp'):
        add_who(sp)
        speaker = sp.speaker
        # add_who leaves an <sp> without <speaker> untouched, so such a speech
        # can still carry a `who` from the source but has no label to record.
        if 'who' in sp.attrs and speaker is not None:
            set_of_char_pairs.add((sp['who'], speaker.text.strip('.:! ')))
    add_particdesc_to_header(soup, set_of_char_pairs)


def add_particdesc_to_header(soup, set_of_char_pairs):
    """Write a particDesc/listPerson built from (ID, name) pairs into profileDesc.

    A missing <profileDesc> is created inside <teiHeader>. Any <particDesc>
    already present in profileDesc is replaced, so repeated calls leave a
    single one. Each ID yields one <person> with xml:id, a <persName> and a
    `sex` guessed by guess_gender.

    soup: parsed document (BeautifulSoup); modified in place.
    set_of_char_pairs: iterable of (character ID, character name) tuples.
    """
    profileDesc = soup.find('profileDesc')
    if profileDesc is None:
        profileDesc = Tag(name = 'profileDesc')
        teiHeader = soup.find('teiHeader')
        teiHeader.append(profileDesc)
    # Drop particDesc elements left by an earlier call; otherwise the header
    # ends up with several of them.
    for stale_particDesc in profileDesc.find_all('particDesc'):
        stale_particDesc.decompose()
    particDesc = Tag(name = 'particDesc')
    profileDesc.append(particDesc)
    listPerson = Tag(name = 'listPerson')
    particDesc.append(listPerson)

    used_ids = set()
    # Sorted so that the order of persons, and the name kept for an ID that
    # occurs with several labels, is the same on every run.
    for current_id, name in sorted(set_of_char_pairs):
        if current_id in used_ids:
            continue
        person = Tag(name = 'person')
        person['xml:id'] = current_id
        persName = Tag(name = 'persName')
        person.append(persName)
        persName.append(name)
        person['sex'] = guess_gender(name)
        listPerson.append(person)
        used_ids.add(current_id)



def add_who(sp):
    """Derive a character ID from the speech's <speaker> text and store it in `who`.

    The text is lower-cased, trimmed, whitespace runs become a single '_',
    and punctuation and a few typographic symbols are dropped. An <sp>
    without a <speaker> child is left untouched.

    sp: an <sp> element; its `who` attribute is set in place.
    """
    speaker = sp.find('speaker')
    if speaker is None:
        return
    speaker_id = speaker.text.lower().strip().strip('. ')
    speaker_id = re.sub(r'\s+', '_', speaker_id)
    speaker_id = re.sub('[\'▪〈◊〉●—‛…ᵒ]', '', speaker_id)
    # string.punctuation contains '_', so it must be exempted here; otherwise
    # the word separators inserted above are stripped again.
    speaker_id = ''.join([i for i in speaker_id if i == '_' or i not in punctuation])
    # Removing symbols can leave doubled or dangling separators.
    speaker_id = re.sub(r'_+', '_', speaker_id).strip('_')
    sp['who'] = speaker_id

In [None]:
def recreate_titleStmt(soup):
    """Replace the first <titleStmt> by a DraCor-style one.

    The new titleStmt receives the old title text, the author name split at
    the first comma into surname and forename, and the author's `key` value
    (without its 'pnd:'-style prefix) as PND idno. Missing pieces are filled
    with "insert ..." placeholders. The new element is inserted before
    <publicationStmt>.

    soup: parsed document (BeautifulSoup); modified in place.
    """
    old_titleStmt = soup.titleStmt
    title_text = old_titleStmt.title.text
    # Remember the author of the old titleStmt before it is destroyed; it is
    # the fallback when no other <author> exists in the document.
    old_author = old_titleStmt.find('author')
    old_author_text = old_author.text if old_author is not None else ''
    old_author_key = old_author.get('key') if old_author is not None else None
    old_titleStmt.decompose()

    # As before, prefer the first <author> that remains in the document.
    author_name = soup.find('author')
    if author_name is not None:
        author_text = author_name.text
        author_key = author_name.get('key') or old_author_key
    else:
        author_text, author_key = old_author_text, old_author_key

    # e.g. 'pnd:118540238' -> '118540238'; a key without prefix is kept whole.
    pnd_as_string = author_key.split(':', 1)[-1] if author_key else ''

    new_titleStmt = '''
    <titleStmt>
      <title></title>
      <subtitle/>
      <author>
        <persName>
          <forename></forename>
          <surname></surname>
        </persName>
        <idno type="wikidata"></idno>
        <idno type="pnd"></idno>
      </author>
    </titleStmt>
'''
    titlesoup = BeautifulSoup(new_titleStmt, 'xml')
    # Work on the element itself rather than on the parsed document object.
    titleStmt = titlesoup.titleStmt

    name_parts = author_text.split(',')
    if len(name_parts) > 1:
        surname, forename = name_parts[0].strip(), name_parts[1].strip()
    else:
        # No "Surname, Forename" pattern: use placeholders for both parts
        # instead of gluing a placeholder onto a half-filled surname.
        surname, forename = "insert surname", "insert forename"
    titleStmt.find('surname').append(surname)
    titleStmt.find('forename').append(forename)
    titleStmt.find('title').append(str(title_text))
    titleStmt.find('idno',attrs={'type':'pnd'}).append(pnd_as_string or "insert pnd here")

    soup.publicationStmt.insert_before(titleStmt)

In [None]:
def recreate_sourceDesc(soup):
    """Replace <sourceDesc> by a DraCor-style one pointing to TextGrid.

    Reads the TextGrid ID from the <idno type="TextGridUri"> inside
    <publicationStmt> and the title of the original source from the old
    <sourceDesc>, then appends a new <sourceDesc> to <fileDesc>.

    soup: parsed document (BeautifulSoup); modified in place.
    """
    textgrid_uri = soup.publicationStmt.find('idno',attrs={'type':'TextGridUri'}).text
    # Keep everything after the last ':' (e.g. 'textgrid:sssj.0' -> 'sssj.0');
    # a fixed [-6:] slice is only right for IDs of exactly six characters.
    textgrid_id = textgrid_uri.strip().split(':')[-1]
    originalSource = soup.sourceDesc.title.text
    soup.sourceDesc.decompose()
    digitalSource_as_string = f'''
    <sourceDesc>
      <bibl type="digitalSource">
        <name>TextGrid Repository</name>
        <idno type="URL">http://www.textgridrep.org/textgrid:{textgrid_id}</idno>
        <availability>
          <licence>
            <ab>CC-BY-3.0</ab>
            <ref target="http://creativecommons.org/licenses/by/3.0/de/legalcode">Lizenzvertrag</ref>
          </licence>
        </availability>
      </bibl>
      <bibl type="originalSource">
        <title></title>
      </bibl>
    </sourceDesc>
    '''
    digitalSourcesoup = BeautifulSoup(digitalSource_as_string, 'xml')
    digitalSourcesoup.title.append(originalSource)
    # Append the element itself rather than the parsed document object.
    soup.fileDesc.append(digitalSourcesoup.sourceDesc)


In [None]:
def add_standoff(soup):
    """Insert a placeholder <standOff> block right after <teiHeader>.

    The block holds print, premiere and written events dated "9999" and a
    wikidata relation whose targets read "INSERT".

    soup: parsed document (BeautifulSoup); modified in place.
    """
    template = '''
    <standOff>
        <listEvent>
        <event type="print" when="9999">
        <desc/>
        </event>
        <event type="premiere" when="9999">
        <desc/>
        </event>
        <event type="written" when="9999">
        <desc/>
        </event>
        </listEvent>
        <listRelation>
        <relation name="wikidata" active="INSERT" passive="INSERT"/>
        </listRelation>
    </standOff>
    '''
    parsed_template = BeautifulSoup(template, 'xml')
    soup.teiHeader.insert_after(parsed_template.standOff)

In [None]:
def replace_pbstmt(soup):
    """Replace <publicationStmt> by the DraCor publication statement.

    A document without <publicationStmt> is left unchanged. The new
    statement is placed right after <titleStmt>; when there is no
    <titleStmt> it takes the place of the old statement.

    soup: parsed document (BeautifulSoup); modified in place.
    """
    old_pbstmt = soup.find('publicationStmt')
    if old_pbstmt is None:
        return
    pubstmt_as_string = """
      <publicationStmt>
        <publisher xml:id="dracor">DraCor</publisher>
        <idno type="URL">https://dracor.org</idno>
        <availability>
          <licence>
            <ab>CC0 1.0</ab>
            <ref target="https://creativecommons.org/publicdomain/zero/1.0/">Licence</ref>
          </licence>
        </availability>
      </publicationStmt>
    """
    pbsoup = BeautifulSoup(pubstmt_as_string, 'xml')
    pbstmt = pbsoup.publicationStmt
    titleStmt = soup.titleStmt
    if titleStmt is None:
        # Swap in place so the old statement is never dropped without a successor.
        old_pbstmt.replace_with(pbstmt)
        return
    old_pbstmt.decompose()
    titleStmt.insert_after(pbstmt)

In [None]:
def enhance_keywords(soup):
    """Add a placeholder wikidata <classCode> next to the first <keywords>.

    The classCode is inserted as the following sibling of <keywords>: in TEI
    both are children of <textClass>, and <keywords> itself does not admit a
    <classCode> child. A document without <keywords> is left unchanged and a
    notice is printed.

    soup: parsed document (BeautifulSoup); modified in place.
    """
    classcode = '<classCode scheme="http://www.wikidata.org/entity/">insert wikidata code</classCode>'
    keywords_tag = soup.find('keywords')
    if keywords_tag is None:
        print('no <keywords> element found, classCode placeholder not added')
        return
    pbsoup = BeautifulSoup(classcode, 'xml')
    keywords_tag.insert_after(pbsoup.find('classCode'))

In [None]:
def add_revisionDesc(soup, when=None, who='LG'):
    """Append a <revisionDesc> that records the conversion to <teiHeader>.

    revisionDesc is added as the last child of teiHeader (in TEI it is a
    child of teiHeader, not of profileDesc).

    soup: parsed document (BeautifulSoup); modified in place.
    when: date of the change as 'YYYY-MM-DD'; defaults to today's date.
    who: initials put in parentheses in front of the change note.
    """
    if when is None:
        when = datetime.now().strftime('%Y-%m-%d')
    revisionDesc_as_string = '''
    <revisionDesc>
      <listChange>
        <change/>
      </listChange>
    </revisionDesc>
    '''
    revDescsoup = BeautifulSoup(revisionDesc_as_string, 'xml')
    # Set the values through the tree so they are escaped on output.
    change = revDescsoup.find('change')
    change['when'] = when
    change.string = f'({who}) conversion from source'
    # Append the element itself rather than the parsed document object.
    soup.teiHeader.append(revDescsoup.revisionDesc)

In [None]:
# Leftover from the EmoThe script, kept for reference only: the function below
# sits inside a string literal, so it is never defined, and process_file does
# not call it.
'''
def replace_titlepage_with_head(soup):
    try:
        titlePage = soup.find('titlePage')
        titletext = titlePage.find('title').text
        head = soup.new_tag('head')
        head.append(titletext)
        titlePage.insert_after(head)
        titlePage.decompose()
    except:
        pass
'''

In [None]:
# Leftover from the EmoThe script, kept for reference only: the function below
# sits inside a string literal, so it is never defined, and process_file does
# not call it.
'''
def split_author_name(soup):
    author = soup.find('author')
    text = author.text
    splitname = text.split(',')
    if len(splitname) == 2:
        author.clear()
        forename = soup.new_tag('forename')
        forename.append(splitname[0].strip())
        author.append(forename)
        surname = soup.new_tag('surname')
        surname.append(splitname[1].strip())
        author.append(surname)
'''

### Applying transformation

In [None]:
# Create the output directory; exist_ok keeps this cell re-runnable
# (a plain `mkdir` fails once the directory exists).
os.makedirs('transformed', exist_ok=True)

In [None]:
def process_file(path_to_file):
    """Convert one downloaded TextGrid file and write it to transformed/.

    Parses the file, applies the transformation functions in a fixed order
    and saves the result as 'transformed/<surname>_<title>.xml'.

    path_to_file: path of the source XML file (read as UTF-8).
    Returns the path of the file that was written.
    """
    with open(path_to_file, 'r', encoding='utf-8') as openfile:
        file_as_text = openfile.read()
    # The source handle is closed here; the transformation needs only the text.
    soup = BeautifulSoup(file_as_text, 'xml')
    # add_partic_desc is not called here: add_who_and_partic below resets `who`
    # on every <sp> that has a <speaker> and builds the particDesc itself, so
    # calling both left two <particDesc> elements in the header.
    remove_tags(soup, tags_to_remove)
    remove_tags_parent_check(soup, tags_to_check_parent_and_remove)
    rename_attrs(soup, attrs_to_be_renamed)
    remove_attrs(soup, tags_with_attrs_to_remove)
    rename_tags_and_add_attr(soup, tags_to_rename_and_add_attr)
    add_who_and_partic(soup)
    add_standoff(soup)
    recreate_titleStmt(soup)
    recreate_sourceDesc(soup)
    replace_pbstmt(soup)
    enhance_keywords(soup)
    add_revisionDesc(soup)
    check_attr_and_remove(soup, tags_to_check_attr_and_remove)
    replace_with_children(soup, tags_to_replace_with_children)
    base_name = soup.surname.text + '_' + soup.title.text
    # Title text can contain path separators, line breaks or other characters
    # that are not usable in a file name; replace each run with '_'.
    base_name = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', base_name).strip()
    new_path = 'transformed/' + base_name + '.xml'
    print(new_path)
    with open(new_path, 'w', encoding='utf-8') as output:
        output.write(str(soup))
    return new_path

In [None]:
# Convert every file found in source_xmls/, echoing each name before it is processed.
for xmlfilename in os.listdir('source_xmls'):
    source_path = os.path.join('source_xmls', xmlfilename)
    print(xmlfilename)
    process_file(source_path)

### Adding indents

In [None]:
#upload the xml formatter

if not os.path.isfile('format.conf'):
    wget.download('https://raw.githubusercontent.com/lucagiovannini7/baroque-networks/main/dracor-xmls/xmlformat.pl')

!cp xmlformat.pl /usr/local/bin/xmlformat

!chmod 755 -R /usr/local/bin/xmlformat


In [None]:
# Download the format.conf file unless a local copy already exists.

format_conf_present = os.path.isfile('format.conf')
if not format_conf_present:
    wget.download('https://raw.githubusercontent.com/dracor-org/gerdracor/main/format.conf')

In [None]:
# Re-indent every converted file with xmlformat. The result is written to
# '<name>_indented.xml'; the unformatted file is deleted only after success.
for filename in os.listdir('transformed'):
    # Skip non-XML files and the results of an earlier run of this cell.
    if not filename.endswith('.xml') or filename.endswith('_indented.xml'):
        continue
    file_path = os.path.join('transformed', filename)
    indented_path = file_path[:-len('.xml')] + '_indented.xml'
    try:
        # Argument list instead of a shell string: titles with spaces, quotes
        # or '$' can neither break the command nor be interpreted by a shell.
        with open(indented_path, 'wb') as indented_file:
            result = subprocess.run(
                ['xmlformat', '--config-file=format.conf', file_path],
                stdout=indented_file,
            )
        succeeded = result.returncode == 0
    except OSError:
        # Raised e.g. when the xmlformat command is not installed.
        succeeded = False
    if succeeded:
        os.remove(file_path)
    else:
        # Keep the unformatted file and discard the partial output.
        if os.path.exists(indented_path):
            os.remove(indented_path)
        print('indentation failed:', filename)