In [None]:
import csv
from xml.etree.ElementTree import Element, SubElement, tostring
from xml.dom import minidom
from google.colab import files

In [None]:
# Input CSV paths (relative, so they resolve against the current working directory).
filename = "Brodsky_XML_Locations.csv"  # main locations table, passed to csv_to_places_xml as main_csv
coordinates = "Brodsky_Coordinates.csv"  # coordinates table, passed to csv_to_places_xml as coord_csv

In [None]:
def csv_to_places_xml(main_csv, coord_csv, main_delimiter=",", coord_delimiter=";"):
    """Build a pretty-printed ``<places>`` XML document from two CSV files.

    One ``<place>`` element is emitted per distinct, non-empty "Location key"
    in ``main_csv`` (the first occurrence wins; later duplicates are skipped).
    Each place carries an ``xml:id`` attribute, an optional ``type`` attribute
    (from "Location type"), a ``<placeName xml:lang="en">`` (from "Location")
    and a ``<location>`` holding optional ``<country>``, ``<city>`` and
    ``<geo>`` children. "Country" / "City" values that are empty or the
    literal ``NA`` are omitted. ``<geo>`` is written as ``"<lat> <lon>"`` only
    when ``coord_csv`` has both "Lat" and "Long" for that key.

    Args:
        main_csv: Path to the locations CSV. Columns read: "Location key",
            "Location type", "Location", "Country", "City".
        coord_csv: Path to the coordinates CSV. Columns read: "Location key",
            "Lat", "Long".
        main_delimiter: Field delimiter of ``main_csv`` (default ``,``).
        coord_delimiter: Field delimiter of ``coord_csv`` (default ``;``).

    Returns:
        The XML document as a ``str``, indented by two spaces, with an XML
        declaration stating ``utf-8``.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Clark-notation prefix for the reserved XML namespace; ElementTree
    # serializes it with the built-in ``xml:`` prefix.
    xml_ns = "{http://www.w3.org/XML/1998/namespace}"

    def cell(row, column):
        # DictReader fills columns missing from a short row with None, so
        # ``row.get(column, "")`` alone can still return None.
        return (row.get(column) or "").strip()

    coords_dict = {}
    # utf-8-sig drops a leading BOM (common in spreadsheet exports) that would
    # otherwise become part of the first header name; plain UTF-8 is unaffected.
    with open(coord_csv, newline='', encoding='utf-8-sig') as f_coords:
        for row in csv.DictReader(f_coords, delimiter=coord_delimiter):
            key = cell(row, "Location key")
            if key:
                coords_dict[key] = (cell(row, "Lat"), cell(row, "Long"))

    seen_keys = set()
    root = Element("places")

    with open(main_csv, newline='', encoding='utf-8-sig') as f:
        for row in csv.DictReader(f, delimiter=main_delimiter):
            location_key = cell(row, "Location key")
            if not location_key or location_key in seen_keys:
                continue
            seen_keys.add(location_key)

            place = SubElement(root, "place")
            place.set(xml_ns + "id", location_key)

            location_type = cell(row, "Location type")
            if location_type:
                place.set("type", location_type)

            place_name = SubElement(place, "placeName")
            place_name.set(xml_ns + "lang", "en")
            place_name.text = cell(row, "Location")

            # <location> is always emitted, even when it ends up empty.
            location_elem = SubElement(place, "location")
            for column, tag in (("Country", "country"), ("City", "city")):
                value = cell(row, column)
                if value and value != "NA":
                    SubElement(location_elem, tag).text = value

            lat, lon = coords_dict.get(location_key, ("", ""))
            if lat and lon:
                SubElement(location_elem, "geo").text = f"{lat} {lon}"

    # Round-trip through minidom purely to get indented output.
    rough_string = tostring(root, encoding="utf-8")
    reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent="  ", encoding="utf-8").decode("utf-8")

In [None]:
# Build the <places> XML from the locations and coordinates CSVs, then print it for inspection.
xml_output = csv_to_places_xml(filename, coordinates)
print(xml_output)

<?xml version="1.0" encoding="utf-8"?>
<places>
  <place xml:id="roman_empire" type="ancient_country">
    <placeName xml:lang="en">Roman Empire</placeName>
    <location>
      <country>Italy</country>
    </location>
  </place>
  <place xml:id="venetian_lagoon" type="bay">
    <placeName xml:lang="en">Venetian Lagoon</placeName>
    <location>
      <country>Italy</country>
      <city>Venice</city>
    </location>
  </place>
  <place xml:id="pension_accademia" type="hotel">
    <placeName xml:lang="en">Pension Accademia</placeName>
    <location>
      <country>Italy</country>
      <city>Venice</city>
      <geo>45.4320649 12.3270656</geo>
    </location>
  </place>
  <place xml:id="venice" type="city">
    <placeName xml:lang="en">Venice</placeName>
    <location>
      <country>Italy</country>
      <city>Venice</city>
    </location>
  </place>
  <place xml:id="adriatic_sea" type="sea">
    <placeName xml:lang="en">Adriatic Sea</placeName>
    <location>
      <country>Italy</co

In [None]:
def csv_to_persons_xml(csv_file, delimiter=","):
    """Build a pretty-printed ``<persons>`` XML document from a CSV file.

    One ``<person>`` element is emitted per distinct, non-empty "Name key"
    (the first occurrence wins; later duplicates are skipped). Each person
    carries an ``xml:id`` attribute and a ``<persName xml:lang="en">`` taken
    from "Person / Entity", followed, in this order, by:

    * ``<type>``    - one per comma-separated item of "Type"
    * ``<role>``    - one per comma-separated item of "Role"
    * ``<birth>``   - from "Birth_year", value in both ``when`` and the text
    * ``<death>``   - from "Death_year", value in both ``when`` and the text
    * ``<date>``    - from "Year", value in both ``when`` and the text
    * ``<country>`` - one per comma-separated item of "Country"

    A column whose value is empty or the literal ``NA`` produces no element.

    Args:
        csv_file: Path to the persons CSV.
        delimiter: Field delimiter of ``csv_file`` (default ``,``).

    Returns:
        The XML document as a ``str``, indented by two spaces, with an XML
        declaration stating ``utf-8``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Clark-notation prefix for the reserved XML namespace; ElementTree
    # serializes it with the built-in ``xml:`` prefix.
    xml_ns = "{http://www.w3.org/XML/1998/namespace}"

    # (CSV column, XML tag, is_dated) in output order. Dated fields become a
    # single element with a ``when`` attribute; the others are split on commas
    # into one element per item.
    field_specs = (
        ("Type", "type", False),
        ("Role", "role", False),
        ("Birth_year", "birth", True),
        ("Death_year", "death", True),
        ("Year", "date", True),
        ("Country", "country", False),
    )

    def cell(row, column):
        # DictReader fills columns missing from a short row with None, so
        # ``row.get(column, "")`` alone can still return None.
        return (row.get(column) or "").strip()

    seen_keys = set()
    root = Element("persons")

    # utf-8-sig drops a leading BOM (common in spreadsheet exports) that would
    # otherwise become part of the first header name; plain UTF-8 is unaffected.
    with open(csv_file, newline='', encoding='utf-8-sig') as f:
        for row in csv.DictReader(f, delimiter=delimiter):
            name_key = cell(row, "Name key")
            if not name_key or name_key in seen_keys:
                continue
            seen_keys.add(name_key)

            person_elem = SubElement(root, "person")
            person_elem.set(xml_ns + "id", name_key)

            pers_name = SubElement(person_elem, "persName")
            pers_name.set(xml_ns + "lang", "en")
            pers_name.text = cell(row, "Person / Entity")

            for column, tag, is_dated in field_specs:
                value = cell(row, column)
                if not value or value == "NA":
                    continue
                if is_dated:
                    dated_elem = SubElement(person_elem, tag)
                    dated_elem.set("when", value)
                    dated_elem.text = value
                else:
                    for item in value.split(","):
                        item = item.strip()
                        if item:
                            SubElement(person_elem, tag).text = item

    # Round-trip through minidom purely to get indented output.
    rough_string = tostring(root, encoding="utf-8")
    reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent="  ", encoding="utf-8").decode("utf-8")

In [None]:
# Path to the persons CSV (relative to the current working directory);
# build the <persons> XML from it and print it for inspection.
personsfile = "Brodsky_XML_Persons.csv"
persons_output = csv_to_persons_xml(personsfile)
print(persons_output)

<?xml version="1.0" encoding="utf-8"?>
<persons>
  <person xml:id="god">
    <persName xml:lang="en">God</persName>
    <type>christianity</type>
  </person>
  <person xml:id="chort">
    <persName xml:lang="en">Chort</persName>
    <type>literature_character</type>
    <date when="1831">1831</date>
    <country>Russia</country>
  </person>
  <person xml:id="solokha">
    <persName xml:lang="en">Solokha</persName>
    <type>literature_character</type>
    <date when="1831">1831</date>
    <country>Russia</country>
  </person>
  <person xml:id="st_sofia">
    <persName xml:lang="en">Saint Sofia</persName>
    <type>christianity</type>
  </person>
  <person xml:id="st_faith">
    <persName xml:lang="en">Saint Faith</persName>
    <type>christianity</type>
  </person>
  <person xml:id="st_hope">
    <persName xml:lang="en">Saint Hope</persName>
    <type>christianity</type>
  </person>
  <person xml:id="st_charity">
    <persName xml:lang="en">Saint Charity</persName>
    <type>christiani