# Search BDTNS by Sign

In [None]:
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import re
import pickle

In [None]:
# Unpickle the OGSL data; its 'name', 'utf8' and 'value' entries feed the
# lookup dictionaries built in the next cell.
with open("../test fasttext/output/ogsl.p", mode="rb") as ogsl_file:
    o = pickle.load(ogsl_file)

In [None]:
# d:  sign name -> entry of the 'utf8' column
# d2: sign value (reading) -> sign name
# As with dict(zip(...)), a later duplicate key overwrites an earlier one.
d = {name: glyph for name, glyph in zip(o['name'], o['utf8'])}
d2 = {value: name for value, name in zip(o['value'], o['name'])}

In [None]:
# Unpickle the BDTNS data; the cells below use it as a DataFrame with
# (at least) the columns 'id_text', 'line_label' and 'text'.
with open('output/bdtns.p', mode='rb') as bdtns_file:
    df = pickle.load(bdtns_file)

In [None]:
# Characters that separate individual signs within a transliterated line.
separators = list('{}-')
# Characters that join (or enclose) the components of a compound sign name.
separators2 = list('.+|')
# Operator characters; not referenced by the code visible in this notebook.
operators = list('&%@×')
# Flag characters that are deleted from a line before it is split into signs.
flags = "][!?<>⸢⸣⌈⌉*/"
# Translation table mapping every flag character to None (i.e. delete it).
table = str.maketrans('', '', flags)

In [None]:
def signs(row):
    """Convert one transliterated line into a space-separated string of sign names.

    Processing steps:

    1. Delete the flag characters (``table``) and lower-case the line.
    2. Replace the sign separators (``separators``) with spaces and split.
    3. Replace every token that is a known sign value by its sign name (``d2``).
    4. For a qualified sign ``x(y)`` keep only the qualifier ``y``.
    5. Split compounds written with '.' or '+' into their component signs.
    6. Wrap '×' compounds that lack pipes as ``|A×B|``, resolving each
       component through ``d2`` first.
    7. Resolve every resulting token through ``d2`` once more and upper-case.

    Parameters
    ----------
    row : str
        A single transliterated line.

    Returns
    -------
    str
        The upper-cased sign names separated by single spaces; an empty
        string when the line contains no signs.
    """
    row = row.translate(table).lower()
    for sep in separators:  # first split row into signs
        row = row.replace(sep, ' ').strip()
    tokens = [d2.get(token, token) for token in row.split()]

    row_l = []
    for sign in tokens:
        if sign.endswith(')') and '(' in sign:  # qualified sign - get only the qualifier
            sign = sign.split('(')[1][:-1]
        if '.' in sign or '+' in sign:
            # Compound that is a plain sequence of signs: emit each component.
            for sep in separators2:
                sign = sign.replace(sep, ' ').strip()
            row_l.extend(sign.split())
            continue
        if '×' in sign and '|' not in sign:
            parts = [d2.get(part, part) for part in sign.split('×')]
            sign = '|' + '×'.join(parts) + '|'
        row_l.append(sign)

    # Resolve names once, after the loop. Doing this inside the loop was
    # skipped by the `continue` above, so a line ending in a '.'/'+' compound
    # lost its final signs (and the work was repeated for every token).
    sign_names = [d2.get(sign, sign) for sign in row_l]
    return ' '.join(sign_names).upper()

In [None]:
# Add a 'sign_names' column by running signs() on every entry of the 'text'
# column; progress_apply (enabled by tqdm.pandas()) shows a progress bar.
df["sign_names"] = df["text"].progress_apply(signs)

# The Search Function
The search function takes as input any style of transliteration recognized in [OGSL](http://oracc.org/ogsl) in upper or lower case. Signs may be connected with hyphens or spaces, determinatives may be written between curly brackets ({d}En-ki), or on the line (d-nin-gisz-zi-da). Shin may be represented by š, c, or sz and sign index numbers may be written on the line, or with Unicode subscript numbers ('e₂' and 'e2' are equivalent, but 'é' will yield no results). '{d}Nin-giš-zi-da-ke₄', 'd-nin-ŋeš-zi-da-ke₄', or 'AN nin gisz ZI da ke4' will all return the same results.  

The search engine will find any matching sequence of signs, independent of the transliteration, thus 'nig2 sig' will also find 'ninda sig'.

The search results are listed in a DataFrame. If there are 25 results or fewer, the DataFrame provides links to the [BDTNS](http://bdtns.filol.csic.es) pages of the matching texts.

In [None]:
# Characters accepted in a search string (num) and their replacements (ind),
# position by position: digits and 'x' become subscript index characters,
# '{', '}' and '-' become spaces, 'c' becomes 'š' and '*' becomes '×'.
num = '0123456789x{}-c*'
ind = '₀₁₂₃₄₅₆₇₈₉ₓ   š×'
tab = str.maketrans(num, ind)
# HTML link template: first field completes the URL, second is the link text.
# Attributes are separated by whitespace only (a stray comma after the href
# value made the tag malformed).
anchor = '<a href="http://bdtns.filol.csic.es/{}" target="_blank">{}</a>'

In [None]:
def search(search):
    """Find all lines whose sign-name sequence contains the queried signs.

    The query is lower-cased, 'sz' is replaced by 'š', and ``tab`` converts
    digits to subscripts, '{', '}' and '-' to spaces, 'c' to 'š' and '*' to
    '×'. Each token that is a known sign value is replaced by its sign name
    (``d2``). Compounds written with '.' or '+' are split into their
    components; '×' compounds are normalized to ``|A×B|``. The resulting
    upper-cased sequence is matched against ``df['sign_names']``.

    The resolved sign sequence and the number of hits are printed.

    Parameters
    ----------
    search : str
        Sign values or sign names separated by spaces or hyphens. (The
        parameter shadows the function name inside the body; it is kept for
        backward compatibility.)

    Returns
    -------
    pandas.DataFrame or pandas Styler
        The 'id_text', 'line_label' and 'text' columns of the matching rows.
        With 25 hits or fewer, 'id_text' is turned into HTML links and the
        Styler of the frame is returned instead of the frame itself.
    """
    query = search.lower().replace('sz', 'š').translate(tab).strip()
    row_l = []
    for sign in (d2.get(token, token) for token in query.split()):
        if '.' in sign or '+' in sign:
            # Compound that is a plain sequence of signs: search each component.
            for sep in separators2:
                sign = sign.replace(sep, ' ').strip()
            row_l.extend(sign.split())
        elif '×' in sign:
            parts = [d2.get(part, part) for part in sign.replace('|', '').split('×')]
            row_l.append('|' + '×'.join(parts) + '|')
        else:
            row_l.append(sign)

    sign_seq = ' '.join(re.escape(token) for token in row_l).upper()
    show = ['id_text', 'line_label', 'text']
    if row_l:
        # Negative look-arounds instead of \b: \b requires a word character on
        # one side, so it can never match next to the '|' that opens or closes
        # a compound such as |KA×GAR|. For sequences that begin and end with a
        # word character the two forms are equivalent.
        pattern = r'(?<!\w)' + sign_seq + r'(?!\w)'
        hits = df['sign_names'].str.contains(pattern, regex=True)
        results = df[show].loc[hits].copy()
    else:
        # An empty query would otherwise match almost every row.
        results = df[show].iloc[:0].copy()
    print(sign_seq)
    print(str(len(results)) + ' hits')
    if len(results) <= 25:  # add links only for 25 hits or less
        results['id_text'] = [anchor.format(val, val) for val in results['id_text']]
        results = results.style
    return results

# Search Instructions
Search for a sequence of sign values in any transliteration system recognized by [OGSL](http://oracc.org/ogsl). Thus, sugal₇, sukkal, or luh, in upper or lower case will all return the same results.

The Shin may be represented by š, c, or sz in upper or lower case.

Sign indexes may be represented by regular numbers or by index numbers (sig₇ or sig7).

Compound signs (such as diri) are resolved into their component signs if the compound represents a simple sequence of signs. Thus diri is resolved as SI A, but gu₇ is resolved as |KA×GAR|.

To search for a compound sign by sign name, enter it between pipes (|). The "times" sign may be represented by \* (enter |UR₂×A| or |UR₂\*A|).

In [None]:
# Read the search string interactively.
s = input()

In [None]:
# Run the search on the string stored in `s` and display the result.
search(s)

In [None]:
# Scratch check: rows whose transliteration ('text') contains 'esir₂'.
df[df['text'].str.contains('esir₂')]

In [None]:
# Scratch check: is the raw search string a substring of the sign names of
# the row at position 195?
s in df.iloc[195]['sign_names']

In [None]:
# Display the current search string.
s

In [None]:
# Scratch check: rows whose transliteration ('text') contains 'diri'.
df[df["text"].str.contains('diri')]

In [None]:
# Scratch check: rows whose sign names contain the literal text 'SI.A'.
# Raw string so that the backslash reaches the regex engine unchanged;
# '\.' in a normal string literal is an invalid escape sequence.
df[df['sign_names'].str.contains(r'SI\.A')]

In [None]:
# Scratch check: rows whose sign names contain the literal text
# 'A₂ SAL.KUR KA' (regex=False, so '.' is not treated as a wildcard).
df[df['sign_names'].str.contains('A₂ SAL.KUR KA', regex=False)]

In [None]:
# Display only the three columns that search() reports.
show = ['id_text', 'line_label', 'text']
df[show]

In [None]:
# Display the full DataFrame.
df

In [None]:
%%timeit
# Time a representative query (the cell magic must stay on the first line).
search('diri-ga')

In [None]:
# Look up the sign name that d2 stores for the reading 'diri'.
d2['diri']