# Emothe-to-DraCor Conversion Script

This Python script converts 150+ texts from the [Biblioteca Digital Artelope](https://emothe.uv.es/biblioteca/) ([Emothe project](https://emothe.uv.es/), University of Valencia) into DraCor-ready TEI XML files.

Written by [Daniil Skorinkin](https://github.com/DanilSko) (2022).

## Imports

In [None]:
from bs4 import BeautifulSoup, Tag
import re
import os
!pip install wget
import wget
import time
import requests
from tqdm import tqdm
from datetime import datetime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=83f8e22cd7eb65e99be571b4d5c1ca5e9f532b38078c77a81b604e01ac6ad0b5
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
#soup.findAll('div',type='elenco')

## Reading data

In [None]:
# URLs of the Emothe source XML files to download (only one address is listed here).
xmls_addresses = ['https://emothe.uv.es/biblioteca/textosXML/EMOTHE0361_ElCaballeroDeOlmedo.xml']

In [None]:
len(xmls_addresses)

1

In [None]:
# Create the download directory; unlike a bare `mkdir`, this is a no-op when
# the cell is re-run and the directory already exists.
os.makedirs('source_xmls', exist_ok=True)

In [None]:
# Download every source XML into source_xmls/, named after the last URL segment.
for url in tqdm(xmls_addresses):
    filename = url.split('/')[-1]
    time.sleep(0.1)  # short pause between consecutive requests
    # NOTE(review): verify=False turns off TLS certificate verification.
    # Kept as in the original; confirm it is still required for this host.
    response = requests.get(url, verify=False)
    if not response.ok:
        # Do not store an HTTP error page as if it were a source XML file.
        print(f'download failed ({response.status_code}):', url)
        continue
    filetext = response.text
    # Explicit encoding so the result does not depend on the platform default.
    with open(f'source_xmls/{filename}', 'w', encoding='utf-8') as filetowrite:
        filetowrite.write(filetext)

100%|██████████| 1/1 [00:00<00:00,  1.35it/s]


## Transformation of files

### Transformation functions

In [None]:
def add_partic_desc(soup):
    """Collect the speakers of the play and add a <particDesc> to the header.

    Every <sp> element that has a ``who`` attribute and a <speaker> child
    contributes one ``(who, speaker label)`` pair; the label is the speaker
    text with leading/trailing '.', ':' and spaces removed. The resulting set
    is handed to ``add_particdesc_to_header``.

    Args:
        soup: parsed document; modified in place.
    """
    set_of_char_pairs = set()  # set of (ID, label string) pairs
    for sp in soup.find_all('sp'):
        speaker = sp.speaker
        # Speeches without a `who` attribute or without a <speaker> child are
        # skipped (the original silently swallowed the resulting error).
        if 'who' not in sp.attrs or speaker is None:
            continue
        set_of_char_pairs.add((sp['who'], speaker.text.strip('.: ')))
    add_particdesc_to_header(soup, set_of_char_pairs)

In [None]:
def guess_gender(persName):
    """Guess a character's sex from the last letter of their name.

    The name is lower-cased and stripped of leading/trailing '.', ':', '[',
    ']', '(' and ')' characters; a name ending in 'a' yields 'FEMALE',
    anything else yields 'MALE'.
    """
    cleaned_name = persName.lower().strip('.:[]()')
    return 'FEMALE' if cleaned_name.endswith('a') else 'MALE'

In [None]:
def add_particdesc_to_header(soup, set_of_char_pairs):
    """Build a <particDesc>/<listPerson> and attach it to the TEI header.

    For each ``(id, name)`` pair a <person> is created whose ``xml:id`` is the
    id without '#' characters at either end, whose ``sex`` comes from
    ``guess_gender(name)`` and whose <persName> child holds the name. The list
    is appended to the document's <profileDesc> (a new one is created when the
    document has none), and that <profileDesc> is appended to <teiHeader>.

    Args:
        soup: parsed document; modified in place.
        set_of_char_pairs: iterable of (id, name) pairs.
    """
    header_profile = soup.find('profileDesc')
    if header_profile is None:
        header_profile = Tag(name='profileDesc')

    cast_list = Tag(name='listPerson')
    for pair in set_of_char_pairs:
        char_id, char_name = pair[0], pair[1]
        entry = Tag(name='person')
        entry['xml:id'] = char_id.strip('#')
        entry['sex'] = guess_gender(char_name)
        name_tag = Tag(name='persName')
        name_tag.append(char_name)
        entry.append(name_tag)
        cast_list.append(entry)

    participants = Tag(name='particDesc')
    participants.append(cast_list)
    header_profile.append(participants)

    soup.find('teiHeader').append(header_profile)

In [None]:
# Elements deleted outright, together with their content (see remove_tags).
tags_to_remove = ['sponsor', 'funder', 'appInfo',
                  'respStmt', 'principal', 'notesStmt',
                  'encodingDesc', 'langUsage', 'pubPlace',
                  'extent', 'authority', 'editionStmt', 'revisionDesc'
                 ]

# tag name -> parent name: the tag is deleted only when it is a direct child
# of that parent (see remove_tags_parent_check).
tags_to_check_parent_and_remove = {'date':'publicationStmt'}

# tag name -> attributes stripped from every such tag (see remove_attrs).
tags_with_attrs_to_remove = {'author':['key'], 'stage':['xml:id'], 'editor':['role'],
                             'l':['xml:id', 'n'], 'sp':['xml:id'], 'div1':['xml:id', 'n'],
                             'div':['xml:id', 'n'], 'div2':['xml:id', 'n'],
                             'role':['xml:id']}

# Attribute value rewrites (see rename_attrs). The original listed this same
# rule twice; applying it once has the same effect.
attrs_to_be_renamed = [
    {'tag':'div', 'attr':'type',
     'old_value':'elenco',
     'new_value':'Dramatis_Personae'}
]

# old tag name -> (new tag name, attributes set on the renamed tag)
# (see rename_tags_and_add_attr).
tags_to_rename_and_add_attr = {'div1':('div', {'type':'act'}), 'div2':('div', {'type':'scene'})}

# Elements deleted when the given attribute has the given value
# (see check_attr_and_remove).
tags_to_check_attr_and_remove = [
    {'tag':'author', 'attr':'ana',
     'value':'fiable'},
    {'tag':'author', 'attr':'key',
     'value':'archivo'},
    {'tag':'title', 'attr':'key',
     'value':'archivo'},
    {'tag':'title', 'attr':'key',
     'value':'orden'}
]

# Wrapper elements removed while their children are kept in place
# (see replace_with_children).
tags_to_replace_with_children = ['lg']

In [None]:
def remove_tags(soup, tags_to_remove):
    """Delete every element whose name is in ``tags_to_remove``, content included.

    Args:
        soup: parsed document; modified in place.
        tags_to_remove: iterable of tag names.
    """
    for tag_name in tags_to_remove:
        for element in soup.find_all(tag_name):
            element.decompose()

In [None]:
def remove_tags_parent_check(soup, tags_to_check_parent_and_remove):
    """Delete elements of a given name only when they sit under a given parent.

    Args:
        soup: parsed document; modified in place.
        tags_to_check_parent_and_remove: mapping of tag name to the name the
            direct parent must have for the tag to be deleted.
    """
    for tag_name, required_parent in tags_to_check_parent_and_remove.items():
        for element in soup.find_all(tag_name):
            if element.parent.name != required_parent:
                continue
            element.decompose()

In [None]:
def rename_attrs(soup, attrs_to_be_renamed):
    """Rewrite attribute values according to a list of rules.

    Each rule is a dict with the keys 'tag', 'attr', 'old_value' and
    'new_value': every <tag> whose ``attr`` equals ``old_value`` gets
    ``new_value`` instead.

    Args:
        soup: parsed document; modified in place.
        attrs_to_be_renamed: list of rule dicts.
    """
    for rule in attrs_to_be_renamed:
        tag_name, attr_name = rule['tag'], rule['attr']
        old_value, new_value = rule['old_value'], rule['new_value']
        for element in soup.find_all(tag_name, attrs={attr_name: old_value}):
            element[attr_name] = new_value

In [None]:
def remove_attrs(soup, tags_with_attrs_to_remove):
    """Strip the listed attributes from every element of the listed names.

    Args:
        soup: parsed document; modified in place.
        tags_with_attrs_to_remove: mapping of tag name to the list of
            attribute names to delete from each such tag.
    """
    for tag_name, attr_names in tags_with_attrs_to_remove.items():
        for element in soup.find_all(tag_name):
            for attr_name in attr_names:
                del element[attr_name]

In [None]:
def check_attr_and_remove(soup, tags_to_check_attr_and_remove):
    """Delete elements that carry a specific attribute value.

    Each rule is a dict with the keys 'tag', 'attr' and 'value'; every <tag>
    whose ``attr`` equals ``value`` is removed together with its content.

    Args:
        soup: parsed document; modified in place.
        tags_to_check_attr_and_remove: list of rule dicts.
    """
    for rule in tags_to_check_attr_and_remove:
        tag_name = rule['tag']
        attr_filter = {rule['attr']: rule['value']}
        for element in soup.find_all(tag_name, attrs=attr_filter):
            element.decompose()

In [None]:
def rename_tags_and_add_attr(soup, tags_to_rename_and_add_attr):
    """Rename elements and set extra attributes on the renamed elements.

    Args:
        soup: parsed document; modified in place.
        tags_to_rename_and_add_attr: mapping of old tag name to a
            ``(new name, {attribute: value})`` pair.
    """
    for old_name, spec in tags_to_rename_and_add_attr.items():
        new_name, extra_attrs = spec[0], spec[1]
        # The matches are collected before any of them is renamed.
        for element in soup.find_all(old_name):
            element.name = new_name
            for attr_name, attr_value in extra_attrs.items():
                element[attr_name] = attr_value

In [None]:
def replace_with_children(soup, tags_to_replace_with_children):
    """Remove the listed wrapper elements while keeping their children in place.

    Args:
        soup: parsed document; modified in place.
        tags_to_replace_with_children: iterable of tag names to unwrap.
    """
    for tag_name in tags_to_replace_with_children:
        for wrapper in soup.find_all(tag_name):
            wrapper.unwrap()

In [None]:
def add_standoff(soup):
    """Insert a placeholder <standOff> block right after <teiHeader>.

    The block holds three events (print, premiere, written) dated "9999" and
    a wikidata relation whose ``active``/``passive`` values are "INSERT".

    Args:
        soup: parsed document; modified in place.
    """
    placeholder_markup = '''
    <standOff>
        <listEvent>
        <event type="print" when="9999">
        <desc/>
        </event>
        <event type="premiere" when="9999">
        <desc/>
        </event>
        <event type="written" when="9999">
        <desc/>
        </event>
        </listEvent>
        <listRelation>
        <relation name="wikidata" active="INSERT" passive="INSERT"/>
        </listRelation>
    </standOff>
    '''
    placeholder = BeautifulSoup(placeholder_markup, 'xml').standOff
    soup.teiHeader.insert_after(placeholder)

In [None]:
def replace_pbstmt(soup):
    """Replace the document's <publicationStmt> with the DraCor boilerplate.

    The new statement is inserted directly after <titleStmt>. When the
    document has no <publicationStmt>, nothing is changed. When it has no
    <titleStmt>, the old statement is swapped for the new one in place; the
    original deleted the old statement there and then silently failed to
    insert the replacement.

    Args:
        soup: parsed document; modified in place.
    """
    old_pubstmt = soup.find('publicationStmt')
    if old_pubstmt is None:
        return

    pubstmt_as_string = """
          <publicationStmt>
            <publisher xml:id="dracor">DraCor</publisher>
            <idno type="URL">https://dracor.org</idno>
            <availability>
              <licence>
                <ab>CC0 1.0</ab>
                <ref target="https://creativecommons.org/publicdomain/zero/1.0/">Licence</ref>
              </licence>
            </availability>
          </publicationStmt>
        """
    pbstmt = BeautifulSoup(pubstmt_as_string, 'xml').publicationStmt

    title_stmt = soup.titleStmt
    if title_stmt is None:
        # No anchor to insert after: keep the position of the old statement.
        old_pubstmt.replace_with(pbstmt)
        return

    old_pubstmt.decompose()
    title_stmt.insert_after(pbstmt)

In [None]:
def replace_textclass(soup):
    """Replace <textClass> with a placeholder to be filled in by hand.

    The placeholder is inserted directly after <particDesc>. The original
    raised AttributeError when the document had no <textClass> or no
    <particDesc>; now a missing <textClass> is simply not removed, and when
    <particDesc> is missing the old <textClass> is swapped in place. If
    neither element exists the document is left unchanged.

    Args:
        soup: parsed document; modified in place.
    """
    textClass_as_string = """
      <textClass>
        <keywords>
          <term type="genreTitle">insert genre</term>
        </keywords>
        <classCode scheme="http://www.wikidata.org/entity/">insert wikidata code</classCode>
      </textClass>
    """
    txtclasstag = BeautifulSoup(textClass_as_string, 'xml').textClass

    old_textclass = soup.find('textClass')
    partic_desc = soup.find('particDesc')
    if partic_desc is not None:
        if old_textclass is not None:
            old_textclass.decompose()
        partic_desc.insert_after(txtclasstag)
    elif old_textclass is not None:
        # No <particDesc> to anchor on: keep the old element's position.
        old_textclass.replace_with(txtclasstag)

In [None]:
def replace_titlepage_with_head(soup):
    """Replace <titlePage> with a <head> holding the title page's title text.

    The document is left unchanged when it has no <titlePage>, or when the
    <titlePage> contains no <title>. These are the cases the original covered
    with a blanket ``except: pass``.

    Args:
        soup: parsed document; modified in place.
    """
    titlePage = soup.find('titlePage')
    if titlePage is None:
        return
    title = titlePage.find('title')
    if title is None:
        return
    head = soup.new_tag('head')
    head.append(title.text)
    # Put the new <head> where the title page was, then drop the title page.
    titlePage.insert_after(head)
    titlePage.decompose()

In [None]:
def split_author_name(soup):
    """Split the first <author>'s text into <forename> and <surname> children.

    Only applies when the author text contains exactly one comma; otherwise
    the element is left as it is. A document without any <author> is left
    unchanged (the original raised AttributeError in that case).

    Args:
        soup: parsed document; modified in place.
    """
    author = soup.find('author')
    if author is None:
        return
    splitname = author.text.split(',')
    if len(splitname) != 2:
        return
    author.clear()
    # NOTE(review): the part before the comma becomes <forename> and the part
    # after it <surname>. If the source writes names as "Surname, Forename"
    # these two are swapped — verify against the data.
    forename = soup.new_tag('forename')
    forename.append(splitname[0].strip())
    author.append(forename)
    surname = soup.new_tag('surname')
    surname.append(splitname[1].strip())
    author.append(surname)

### Applying transformation

In [None]:
# Create the output directory; unlike a bare `mkdir`, this is a no-op when
# the cell is re-run and the directory already exists.
os.makedirs('transformed', exist_ok=True)

In [None]:
def process_file(path_to_file):
    """Transform one source XML file and write the result next to the others.

    The file is parsed, run through all transformation steps in a fixed
    order, and written to a path derived from the input: '_transformed' is
    added before the file extension and 'source_xmls' in the path is replaced
    by 'transformed'.

    Args:
        path_to_file: path of the source XML file.
    """
    # Explicit encoding so reading does not depend on the platform default;
    # the file is closed before the transformation starts.
    with open(path_to_file, 'r', encoding='utf-8') as openfile:
        file_as_text = openfile.read()

    soup = BeautifulSoup(file_as_text, 'xml')
    # Runs first: it reads the `who` attributes and <speaker> texts.
    add_partic_desc(soup)
    remove_tags(soup, tags_to_remove)
    remove_tags_parent_check(soup, tags_to_check_parent_and_remove)
    rename_attrs(soup, attrs_to_be_renamed)
    # NOTE(review): this strips `key` from every <author> before
    # check_attr_and_remove runs below, so the author/key/'archivo' rule in
    # tags_to_check_attr_and_remove can no longer match — confirm whether the
    # two steps should be swapped.
    remove_attrs(soup, tags_with_attrs_to_remove)
    rename_tags_and_add_attr(soup, tags_to_rename_and_add_attr)
    replace_pbstmt(soup)
    add_standoff(soup)
    replace_textclass(soup)
    check_attr_and_remove(soup, tags_to_check_attr_and_remove)
    replace_with_children(soup, tags_to_replace_with_children)
    replace_titlepage_with_head(soup)
    split_author_name(soup)

    # Only touch the real extension; str.replace('.xml', ...) would rewrite
    # every '.xml' occurring anywhere in the path.
    root, extension = os.path.splitext(path_to_file)
    new_path = f'{root}_transformed{extension}'
    new_path = new_path.replace('source_xmls', 'transformed')
    with open(new_path, 'w', encoding='utf-8') as output:
        output.write(str(soup))

In [None]:
#test on one:
#process_file('source_xmls/EMOTHE0383_LosAmantes.xml')

In [None]:
# Process every XML file in source_xmls. Matching on the suffix (rather than
# '.xml' anywhere in the name) skips files such as 'play.xml.bak'.
for xmlfilename in os.listdir('source_xmls'):
    if xmlfilename.endswith('.xml'):
        process_file(os.path.join('source_xmls', xmlfilename))

### Adding indents

In [None]:
#upload the xml formatter

if not os.path.isfile('format.conf'):
    wget.download('https://raw.githubusercontent.com/lucagiovannini7/baroque-networks/main/dracor-xmls/xmlformat.pl')

!cp xmlformat.pl /usr/local/bin/xmlformat

!chmod 755 -R /usr/local/bin/xmlformat


In [None]:
# Fetch format.conf from the dracor-org/gerdracor repository unless a local
# copy already exists.

format_conf_url = 'https://raw.githubusercontent.com/dracor-org/gerdracor/main/format.conf'
if not os.path.isfile('format.conf'):
    wget.download(format_conf_url)

In [None]:
import subprocess

# Run xmlformat over every transformed file, writing '<name>_indented.xml'.
# The unindented file is deleted only after xmlformat succeeded; os.system
# never raised, so the original deleted it even when formatting had failed
# and its 'indentation failed' branch could not be reached.
for filename in os.listdir('transformed'):
    if '_transformed.xml' not in filename:
        continue
    file_path = os.path.join('transformed', filename)
    indented_path = f'{file_path.replace(".xml", "")}_indented.xml'
    try:
        with open(indented_path, 'wb') as indented_file:
            # Argument list instead of a shell string: no quoting problems
            # with unusual file names.
            completed = subprocess.run(
                ['xmlformat', '--config-file=format.conf', file_path],
                stdout=indented_file,
            )
        succeeded = completed.returncode == 0
    except OSError:
        # e.g. xmlformat is not installed or the output cannot be written
        succeeded = False
    if succeeded:
        os.remove(file_path)
    else:
        print('indentation failed:', filename)
        # Do not leave an empty or partial output file behind.
        if os.path.exists(indented_path):
            os.remove(indented_path)