# Search BDTNS by Sign

In [1]:
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import re

In [2]:
# Load the OGSL sign-list table. pd.read_pickle accepts the path directly and
# takes care of opening and closing the file.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
o = pd.read_pickle("../test fasttext/output/ogsl.p")

In [3]:
# d : sign name -> content of the 'utf8' column for that sign.
d = {name: glyph for name, glyph in zip(o['name'], o['utf8'])}
# d2: sign value -> sign name (for a value listed more than once, the last row wins).
d2 = {value: name for value, name in zip(o['value'], o['name'])}

In [4]:
# Load the BDTNS line-by-line DataFrame. pd.read_pickle accepts the path directly.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
df = pd.read_pickle('output/bdtns.p')

In [5]:
# Characters that separate signs inside a word: determinative braces and hyphens.
separators = list('{}-')
# Characters that separate the components of a |compound| sign, plus the pipes themselves.
separators2 = list('.+|')
# Operators that may occur inside compound signs (not used by the active code in signs()).
operators = list('&%@×')
# Every character in `flags` is deleted from a line by translating it with `table`.
flags = "][!?<>⸢⸣⌈⌉*/"
table = str.maketrans('', '', flags)

In [6]:
def signs(row):
    """Convert one line of transliteration into a string of sign names.

    Processing steps:

    1. Delete the flag characters listed in ``table`` and lower-case the line.
    2. Treat every character in ``separators`` as a sign boundary and split
       the line into individual signs.
    3. For a qualified sign such as ``x(y)`` keep only the qualifier ``y``.
    4. Split a ``|compound|`` on the characters in ``separators2`` and add each
       component as a separate sign.
    5. For a sign containing ``×`` that is not written between pipes, look up
       each component in ``d2`` and wrap the result in pipes.
    6. Replace every sign that is a key of ``d2`` by the corresponding sign
       name; anything else is kept unchanged.

    Parameters
    ----------
    row : str
        One line of transliteration.

    Returns
    -------
    str
        The sign names, separated by single spaces and upper-cased.

    Notes
    -----
    Relies on the module-level names ``table``, ``separators``,
    ``separators2`` and ``d2``.
    """
    row = row.translate(table).lower()
    for sep in separators:  # braces and hyphens all delimit signs
        row = row.replace(sep, ' ')

    row_l = []
    for sign in row.split():
        if sign[-1] == ')' and '(' in sign:  # qualified sign - keep only the qualifier
            sign = sign.split('(')[1][:-1]
        if '|' in sign:
            # Break |DU.DU| and |DU+DU| into their components. The surrounding
            # pipes are removed even when the compound contains an operator
            # (e.g. |DU&DU|); only '.', '+' and '|' act as split points.
            for sep in separators2:
                sign = sign.replace(sep, ' ')
            row_l.extend(sign.split())
        elif '×' in sign:
            # Containing sign written without pipes: normalise each component
            # to its sign name and add the pipes.
            parts = [d2.get(part, part) for part in sign.split('×')]
            row_l.append('|' + '×'.join(parts) + '|')
        else:
            row_l.append(sign)

    # Map values to sign names once, after the whole row has been collected.
    # Doing this inside the loop dropped the components of a |compound| at the
    # end of a row and repeated the lookup for every sign.
    sign_names = [d2.get(sign, sign) for sign in row_l]
    return ' '.join(sign_names).upper()

In [7]:
# Compute the sign-name sequence for every line in the "text" column and store it
# in a new "sign_names" column. progress_apply (enabled by tqdm.pandas() in the
# import cell) behaves like apply but displays a progress bar.
df["sign_names"] = df["text"].progress_apply(signs)

HBox(children=(IntProgress(value=0, max=1156363), HTML(value='')))




# The Search Function
The search function takes as input any style of transliteration recognized in [OGSL](http://oracc.org/ogsl), in upper or lower case. Signs may be connected with hyphens or spaces; determinatives may be written between curly brackets ({d}En-ki) or on the line (d-nin-gisz-zi-da). Shin may be represented by š, c, or sz, and sign index numbers may be written on the line or with Unicode subscript numbers ('e₂' and 'e2' are equivalent, but 'é' will yield no results). '{d}Nin-giš-zi-da-ke₄', 'd-nin-ŋeš-zi-da-ke₄', and 'AN nin gisz ZI da ke4' all return the same results.

The search engine will find any matching sequence of signs, independent of the transliteration, thus 'nig2 sig' will also find 'ninda sig'.

The search results are listed in a DataFrame. If there are 25 results or fewer, the DataFrame provides links to the [BDTNS](http://bdtns.filol.csic.es) pages of the matching texts.

In [11]:
# Translation table applied to the search string: index digits and 'x' become
# Unicode subscripts, braces and hyphens become spaces (sign boundaries), and
# 'c' becomes 'š'. `num` and `ind` must have the same length.
num = '0123456789x{}-c'
ind = '₀₁₂₃₄₅₆₇₈₉ₓ   š'
tab = str.maketrans(num, ind)
# HTML link template; both placeholders receive the BDTNS number. HTML attributes
# are separated by whitespace only, so there is no comma between href and target.
anchor = '<a href="http://bdtns.filol.csic.es/{}" target="_blank">{}</a>'

The regular expression in `contains()` must be re-written. It does not match a DIRI compound at the beginning or end of the search expression. 'nin-gir₂' has 0 matches, '{d}nin-gir₂' has many matches. 'nig₂-diri-ga' has matches, but 'nig₂-diri' does not.

In [32]:
# Read the query and normalise it: lower case, 'sz' -> 'š', then apply `tab`
# (digits/x -> subscripts, braces and hyphens -> spaces, 'c' -> 'š').
search = input()
search = search.lower().replace('sz', 'š').translate(tab).strip()

# Replace each value by its sign name where d2 has one, and upper-case the
# tokens BEFORE escaping them so that case folding never touches regex escapes.
search_l = search.split()
search_l = [d2.get(token, token).upper() for token in search_l]
search_l = [re.escape(token) for token in search_l]
s = ' '.join(search_l)
print(s)

show = ['bdtns_no', 'line_label', 'text']
# Match whole signs only: the expression must be preceded by whitespace or the
# start of the string and followed by whitespace or the end of the string.
# Raw strings keep the backslashes intact for the regex engine.
pattern = r'(?<!\S)' + s + r'(?!\S)'
results = df.loc[df['sign_names'].str.contains(pattern, regex=True), show].copy()
print(str(len(results)) + ' hits')
if len(results) <= 25:  # add links only for 25 hits or fewer
    results['bdtns_no'] = [anchor.format(val, val) for val in results['bdtns_no']]
    results = results.style
results

d nin-gir2-su-kid
AN \|SAL\.TUG₂\| GIR₂ SU KID
16 hits


Unnamed: 0,bdtns_no,line_label,text
271220,183,o.i 10,SAG×SIG₇ igi e₂ {d}Nin-gir₂-su-ke₄ ba-ak
277507,335,r. 6,ugula Sipa-{d}Nin-gir₂-su-ke₄-in-pa₃
353277,1030,o. 4,Ur-{d}Lamma sanga /{d}Nin-gir₂-su-ke₄
422008,32435,o. 4,Sipa-{d}Nin-gir₂-su-
451423,20704,o.i 5,Lu₂-{d}[nin]-⌈gir₂⌉-su-ke₄
464212,22480,t.o. 7,Ur-{d}Ba-u₂ sanga {d}Nin-gir₂-su-ke₄
484556,27765,r.i 41,0.0.3 Sipa-{d}Nin-⌈gir₂⌉-[su-ke₄]-in-⌈pa₃⌉
522771,35295,o. 4,kišib Sipa-{d}Nin-gir₂-su-ke₄-i₃-pa₃
522778,35295,s. 4,Sipa-{d}Nin-gir₂-su-ke₄-i₃-pa₃
600494,53505,o. 6,Ur-{d}Nin-gir₂-su-ke₄


In [9]:
# Spot-check: all lines whose transliteration contains the value esir₂.
contains_esir = df['text'].str.contains('esir₂')
df.loc[contains_esir]

Unnamed: 0,bdtns_no,line_label,text,comments,sign_names
1049,038646,o. 2,siki esir₂-a sa₁₀-sa₁₀-de₃,,SIK₂ |LAGAB×KUL| A |NINDA₂×ŠE| |NINDA₂×ŠE| NE
3110,038728,r. 20,u₃ Umma{ki}-a ma₂ esir₂ ba-al-la,,|IGI.DIB| |GIŠ.KUŠU₂| KI A MA₂ |LAGAB×KUL| BA ...
3710,038738,o.i 16,0.4.1 4 1/2 sila₃ esir₂ e₂-a,,0.4.1 4 12 SILA₃ |LAGAB×KUL| E₂ A
3802,038738,r.ii 13,0.0.2 8 sila₃ esir₂ e₂-a ku₃-bi igi-4-gal₂ 2 1...,,0.0.2 8 SILA₃ |LAGAB×KUL| E₂ A KU₃ BI IGI 4 IG...
3808,038738,r.ii 19,13 gu₂ esir₂ had₂ ku₃-bi 1 1/3 gin₂ la₂ 6 še,,13 GU₂ |LAGAB×KUL| UD KU₃ BI 1 13 DUN₃@G LAL 6 ŠE
3995,038744,o.ii 1,a-ša₃ nag-esir₂,,A ŠA₃ |KA×A| |LAGAB×KUL|
5094,158579,r. 1,7 kid-sig₅ TUN₃?-ba esir₂ su-ba,,7 KID |IGI.ERIN₂| DUN₃ BA |LAGAB×KUL| SU BA
5209,158573,o. 1,150 gu₂ esir₂ had₂,,150 GU₂ |LAGAB×KUL| UD
5211,158573,o. 3,4.0.0 esir₂ e₂-a gur,,4.0.0 |LAGAB×KUL| E₂ A GUR
6769,158612,o. 1,0.0.2 esir₂ e₂-a,,0.0.2 |LAGAB×KUL| E₂ A


In [None]:
# Debugging aid: plain substring test of the search expression `s` against the
# sign names of the row at position 195.
# NOTE(review): `s` is regex-escaped in the search cell, so this literal test can be
# False for expressions containing escaped characters such as | or . — confirm intent.
s in df.iloc[195]['sign_names']

In [None]:
# Display the current (regex-escaped) search expression built in the search cell.
s

In [None]:
# Spot-check: all lines whose transliteration contains 'diri'.
contains_diri = df["text"].str.contains('diri')
df.loc[contains_diri]

In [None]:
# Spot-check: lines whose sign names contain the literal sequence 'SI.A'.
# The pattern is a raw string so that the escaped dot reaches the regex engine
# unchanged and Python does not warn about an invalid escape sequence.
df[df['sign_names'].str.contains(r'SI\.A')]

In [None]:
# Spot-check: literal (non-regex) search for a fixed sequence of sign names.
has_sequence = df['sign_names'].str.contains('A₂ SAL.KUR KA', regex=False)
df.loc[has_sequence]