In [1]:
import pandas as pd
from collections import Counter
from itertools import chain
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
## CONSTANTS
lang2code = {
    "Gitksan" : "git", 
    "Arapaho" : "arp", 
    "Lezgi" : "lez", 
    "Nyangbo" : "nyb", 
    "Tsez" : "ddo",
    "Uspanteko" : "usp",  
    "Natugu" : "ntu"
    }
code2lang = {v:k for k,v in lang2code.items()}

PATH = "/content/drive/MyDrive/2023glossingST/"

In [20]:
# def escape_parentheses(string):
#     escaped_string = re.sub(r'([()])', r'\\\1', string)
#     return escaped_string
specials = {'(', ')', '[', ']', '*', '|', ''}
def escape(string):
    escaped_string = ''
    for char in string:
        if char in specials:
            escaped_string += "\\" + char
        else:
            escaped_string += char
    return escaped_string

In [4]:
CODE = "ddo"
LANG = code2lang[CODE]
DATA = f"{PATH}data/{LANG}/{CODE}-train-track1-uncovered"

In [21]:
from collections import defaultdict

data = defaultdict(list)
with open(DATA, "r", encoding="utf-8") as fin:
  for line in fin.readlines():
    line = escape(line)

    if line.startswith("\\t"):
      data["source"].append(line.strip("\\t ").strip("\n"))
    if line.startswith("\g"):
      data["target"].append(line.strip("\g ").strip("\n"))
    if line.startswith("\l"):
      data["trans"].append(line.strip("\l ").strip("\n"))

In [23]:
FRAG_LEN = 2
WINDOW_SIZE = 2

In [24]:
df = pd.DataFrame.from_dict(data)
df["tokenized_src"] = df["source"].map(lambda x:x.split())
df["tokenized_trg"] = df["target"].map(lambda x:x.split())
df["tokenized_trans"] = df["trans"].map(lambda x:x.split())


In [25]:
TOKENS = list(chain(*df["tokenized_src"].tolist()))
print(len(TOKENS))

37458


In [26]:
len(df.source)

3558

In [27]:
counts = Counter()
for t in TOKENS:
  counts.update([t])

In [28]:
tokens = {t for t,c in counts.most_common() if c > 30}

In [30]:
def get_tok2gloss(df):
  toks = df.tokenized_src
  gls = df.tokenized_trg
  tok2gloss = defaultdict(dict)
  for i, (t,g) in enumerate(zip(toks, gls)):
    for n,x in enumerate(t):
      tok2gloss[i][x]=g[n]
  return tok2gloss

In [31]:
tok2gloss = get_tok2gloss(df)

In [32]:
import re
SEP, BD, BLANK = " [SEP] ", "@", "#"

def slice(line, fragment_len=FRAG_LEN):
  slices = []
  for i in range(len(line)-1):
    frag = line[i:i+fragment_len]
    if set(frag).intersection(tokens) or set(frag).intersection(specials):
      continue
    else:
      slices.append(" ".join(frag))
  return slices

def extract_template(line, fragment):
  format = lambda x: r"{}".format(x)
  return re.sub(format(fragment), BLANK, format(line))

def segment(ex, direction,windows=WINDOW_SIZE):
  ex = ex.split()
  if direction == "l": ex = ex[::-1]
  splits = []
  sid = 0
  for i in range(1, WINDOW_SIZE + 1):
    split = ex[sid:sid+i]
    if direction == "l":
      splits.append(split[::-1])
    else:
      splits.append(split)
    
  return splits


env_process = lambda x,y:segment(x,y) if x else [BD] * WINDOW_SIZE

In [33]:
def extract_environment(template):
  envs = []
  contexts = template.split(BLANK)
  for i in range(len(contexts) - 1):
    prev, post = env_process(contexts[i],"l"), env_process(contexts[i+1],"r")
    for pr, po in zip(prev, post):
      envs.append(SEP.join([" ".join(pr), " ".join(po)]))

  return envs

In [34]:
ex = df.tokenized_src[6]
frags = slice(ex)
line = " ".join(ex)
template = extract_template(line, frags[0])
template

'Ža nesi # nełał hunix, oc’ira ƛˤeba adäztow nełqor roƛ’i boqno zawru, k’ačaɣˤ c’oxno.'

In [35]:
col = "tokenized_src"
MIN = 5
# seqs=df.tokenized
# glosses = df.tokenized_g
envss=[]
f2t = defaultdict(set)
t2f = defaultdict(set)
e2t = defaultdict(set)
f2e = defaultdict(set)
frag_ids = []

for i in range(len(df)):
  seq = df[col][i]
  if len(seq) < MIN:
    continue
  fragments = slice(seq)
  for f in fragments:
    if f[0] in specials:
      f = "\\"+f
    frag_ids.append((f, i))
    template = extract_template(" ".join(seq),f)
    envs = extract_environment(template)
    if len(envs) < 2:
      continue
    for e in envs:
      f2e[f].add(e)
      e2t[e].add((template,i))
      
  

In [36]:
matched = set()
new = {}

for frag, id in frag_ids:
  if (frag, id) in new:
    continue


  ref = f2e[frag]
  for frag2,id2 in frag_ids:
    if frag2 == frag:
      continue
    comp = f2e[frag2]
    matches = ref.intersection(comp)
    if matches:
      for environment in matches:
        #print(environment)
        new_temps = e2t[environment]
        new_temps = [(e, i) for e, i in new_temps if not i == id2]
        #print(new_temps)
        new[(frag2,id2)] = new_temps
 

In [38]:
def translate_template(template_id):
  template, id = template_id
  translated_template = " ".join([BLANK if s == BLANK else tok2gloss[id][s] for s in template.split()])
  return translated_template

def translate_fragment(frag_id):
  frag, id = frag_id
  return " ".join([tok2gloss[id][s] for s in frag.split()])

In [39]:
sub = lambda fragment, templates: [re.sub(BLANK,fragment,temp) for temp in templates]

In [40]:
def get_examples(new):
  p = set()
  for frag in new:
    try:
      temps = new[frag]
      trg_frag = translate_fragment(frag)
      trg_temp = [translate_template(t) for t in temps]

      src_frag = frag[0]
      src_temp = [t for t,i in temps]
      target = (sub(trg_frag,trg_temp))
      src = sub(src_frag,src_temp)
      for i,(t,s) in enumerate(zip(target,src)):
        p.add((s,t))
    except KeyError:
      continue
  return [i for i in p]




In [41]:
new_examples = get_examples(new)

In [42]:
len(list(chain(*[i.split() for i,j in new_examples])))

88461

In [43]:
len(new_examples)

7690

In [44]:
source, target = df["source"].tolist(), df["target"].tolist()

In [45]:
for s, t in new_examples:
  source.append(s)
  target.append(t)