# XML Updater Tool

Using relational data to automatically update XML markup, and vice versa.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import os, pickle

In [2]:
# Home directory of the current user; the Box-synced folders live beneath it.
hdir = os.path.expanduser("~")
# Folder holding the pickled corpora that are loaded further down.
pickle_path = f"{hdir}/Box/Notes/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"
# Folder holding the CSV files exported from the database.
# NOTE(review): "corresondences" looks like a typo, but it has to match the
# directory name on disk, so it is left untouched — confirm.
data_path = f"{hdir}/Box/Notes/Digital_Humanities/Datasets/exported_database_data/basic_corresondences"
# Second name bound to the very same path string.
ext_corp_path = data_path

### Read in CSV Database Files

In [21]:
# Column names are supplied through `names`, so the frame has exactly the
# columns 'id' and 'name'.
locs = pd.read_csv(f"{data_path}/location_data.csv", names=["id", "name"])

Some IDs have more than one name stored in a single field, separated by a line-break character (`\x0b`, vertical tab). Splitting on `\x0b` separates them.

In [23]:
# Example: take the name field of the first row with id 5 and split it on '\x0b'.
locs.loc[locs['id'] == 5, 'name'].iloc[0].split('\x0b')

['سمرقند', 'ثمرقند']

Create a DataFrame with one row per (id, name) pair, so that an ID with multiple names appears on multiple rows.

In [46]:
# Expand every record into one (id, name) row per spelling: the name field is
# split on '\x0b', so an id with several names yields several rows.
# Missing values are replaced with '' first, so a missing name becomes an empty
# string (which can be split) rather than NaN.
# The rows are flattened with a nested comprehension instead of
# sum(list_of_lists, []), which re-copies the accumulated list on every
# addition and is therefore quadratic in the number of records.
locs = pd.DataFrame(
    [
        (record.id, spelling)
        for record in locs.fillna('').itertuples()
        for spelling in record.name.split('\x0b')
    ],
    columns=['id', 'name'],
)

In [47]:
# Preview the first ten rows of the expanded table.
locs.head(10)

Unnamed: 0,id,name
0,1,حصار
1,2,کندرود
2,3,بخارا
3,4,ولایت بلخ
4,5,سمرقند
5,5,ثمرقند
6,6,خوقند
7,7,کابل
8,8,قزان
9,9,تاشکند


In [48]:
# Number of non-null entries in each column.
locs.count()

id      1460
name    1460
dtype: int64

Add count columns that show which names are unique (shared by only one ID) and which IDs are unique (carrying only one name).

In [50]:
# For every row: how many (non-null) ids appear under the same name.
locs['name_count'] = locs.groupby('name')['id'].transform('count')

# Inspect the rows for one particular name.
locs[locs['name'].eq("حصار")]

Unnamed: 0,id,name,Counts,name_count
0,1,حصار,2,2
794,351,حصار,2,2


In [52]:
# For every row: how many (non-null) names appear under the same id.
locs['id_count'] = locs.groupby('id')['name'].transform('count')

# Inspect the rows for one particular name.
locs[locs['name'].eq("سمرقند")]

Unnamed: 0,id,name,Counts,name_count,id_count
4,5,سمرقند,1,1,2


Unpickle the XML corpora.

In [53]:
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# this on a pickle file that comes from a trusted source.
with open(f"{pickle_path}/xml_corpora.pkl", "rb") as f:
    # The loaded object is unpacked into exactly five names.
    (
        ind_man_docs,
        hyd_man_docs,
        trans_man_docs,
        combo_xml_final,
        combo_xml_all,
    ) = pickle.load(f)

In [54]:
# List the keys available in combo_xml_all.
combo_xml_all.keys()

dict_keys(['ser818', 'ser179', 'ser183', 'ser187', 'ser212', 'ser215', 'ser237', 'ser537', 'ser561', 'ser596', 'ser626', 'ser706', 'ser72', 'ser91', 'IVANUz_1936_ser185', 'NLR_f-940_ser190', 'RGVIA_400-1-1015_ser143', 'TsGARUz_i126-1-938-2_ser82', 'TsGARUz_i126_1_1160_ser193', 'TsGARUZ_i126_1_1729_101_ser213', 'TsGARUz_i126_1_1730_19_ser218', 'TsGARUz_i126_1_1730_22_ser217', 'TsGARUz_i126_1_1730_2_ser188', 'TsGARUZ_i126_1_1730_81_ser227', 'TsGARUZ_i126_1_1986_1_ser201', 'TsGARUz_i126_1_1990_20_ser186', 'TsGARUZ_i126_1_1990_3_ser192', 'TsGARUz_R-2678_ser184', 'ser560', 'ser808', 'ser809', 'ser811', 'ser812', 'ser813', 'ser814', 'ser815', 'ser816', 'ser817', 'ser842', 'ser843', 'ser857', 'ser876', 'ser877', 'ser898'])

In [57]:
# Parse one document from the corpus. The parser is named explicitly: without
# it, Beautiful Soup picks whichever parser happens to be installed (and emits
# a warning), so the resulting tree can differ from machine to machine.
# "lxml" is the parser that produces the <html><body> wrapper visible in the
# str(tree) output further down, so the existing behaviour is kept.
# NOTE(review): because this is an HTML parser, the document ends up wrapped in
# <html><body>; if the tree is meant to be written back out as XML, the "xml"
# parser may be the better choice — confirm before switching.
tree = BeautifulSoup(combo_xml_all["ser898"], features="lxml")

In [58]:
# Take the first <location> element of the document; indexing with [0] raises
# IndexError when the document contains no such element.
first = tree.find_all('location')[0]

In [59]:
# Does the element already carry a 'locid' attribute?
first.has_attr('locid')

False

In [60]:
# Attach a 'locid' attribute to the element.
# NOTE(review): 5 is a hard-coded demonstration value (in the table above, id 5
# is listed with the name سمرقند, not with this element's text) — presumably a
# placeholder until the id is looked up from locs; confirm.
first['locid'] = 5

In [61]:
# Display the element to confirm that the attribute was added.
first

<location locid="5">بلجوان</location>

In [63]:
# All rows of locs whose name is exactly equal to the element's text.
hits = locs[locs['name'].eq(first.get_text())]

In [64]:
# Number of rows that matched the element's text.
hits.shape[0]

2

In [53]:
# List every <location> element in the parsed document.
tree.find_all('location')

[<location locid="5">بلجوان</location>, <location>فیض اباد</location>]

In [56]:
# Serialise the (modified) tree back to a single string.
str(tree)

'<?xml-model href="../../../../../Projects/xml_development_eurasia/schemas/persian_documents_schema_basic.rnc" type="application/relax-ng-compact-syntax"?><html><body><document serial="898">\n<div type="heading">\n<!-- inscriptio -->\n<ts type="inscriptio"></ts>\n\t\t جناب حضرت وزارت پناهی امیدگاهی و صاحب دولتم سلمه الله تعالی\n\t\t<lb></lb>\n</div>\n<div type="section">\n<!-- left column -->\n<ts type="apprecatio"></ts>\n\t\t عرضه داشت اینغلام\n\t\t<honorific type="inferior">رضاجوی</honorific>\n<honorific type="inferior">جانسپار</honorific>\n<flag type="meaning">خرمان</flag>\n\t\t کثیر الاخلاص\n\t\t<lb></lb>\n\t\t وافر الاعتقاد و خبر خواه عقیدت نهاد قلیل الخدمت کثیر الامید بجناب\n\t\t<lb></lb>\n\t\t ذاة خجسته صفات زیب بخش امارت و زینت افزای بساط\n\t\t<lb></lb>\n\t\t عزت و حرمت ترازندۀ لوای معدلت و نیک نامی فرازنده اعلام\n\t\t<lb></lb>\n<diplo type="orthography">حشمت</diplo> و انتظام ناظم مناظم و امور دین و دولت بساط\n\t\t<lb></lb>\n\t\t شهریاری حافظ مسند عزت و نامداری مقرب حضرت خاقان 