In [1]:
import json
from collections import defaultdict
import re
from tabulate import tabulate
import pandas as pd
from IPython.display import HTML, display
#%load_ext google.colab.data_table

In [2]:
!pip install gdown

You should consider upgrading via the 'c:\users\mao-chang ku\appdata\local\programs\python\python37\python.exe -m pip install --upgrade pip' command.


In [3]:
!gdown --id "1ERabe16qZQogWJkmmPUyx6DegZM2uKp3" -O "corpus.jsonl"

Downloading...
From: https://drive.google.com/uc?id=1ERabe16qZQogWJkmmPUyx6DegZM2uKp3
To: c:\Users\Mao-Chang Ku\Desktop\atayal\corpus.jsonl

  0%|          | 0.00/24.8k [00:00<?, ?B/s]
100%|██████████| 24.8k/24.8k [00:00<00:00, 24.2MB/s]


In [4]:
# Read the downloaded file and parse its entire content as a single JSON
# document; the result is bound to the module-level name `corpus`.
with open("corpus.jsonl", mode="r", encoding="utf-8") as corpus_file:
  corpus = json.loads(corpus_file.read())


In [5]:
# Inverted index over the Atayal text: token['word'] -> list of
# [article_idx, sentence_idx, token_idx] locators, appended in corpus order.
word_index = {}

for article_idx, article in enumerate(corpus):
  for sentence_idx, sentence in enumerate(article['atayal']):
    for token_idx, token in enumerate(sentence):
      # setdefault creates the (empty) bucket the first time a word is seen.
      word_index.setdefault(token['word'], []).append(
          [article_idx, sentence_idx, token_idx])

In [6]:
# Inverted index over the Mandarin text: segment string -> list of
# [article_idx, sentence_idx, segment_idx] locators, appended in corpus order.
# NOTE(review): each entry of article['mandarin'] is assumed to be a list of
# strings (the concordance code reads element [0]) — confirm against the
# corpus schema.
sent_index = {}

for article_idx, article in enumerate(corpus):
  for sentence_idx, sentence in enumerate(article['mandarin']):
    for segment_idx, segment in enumerate(sentence):
      # The third slot is the segment's own position inside `sentence`, so the
      # cell relies only on variables it defines itself and runs correctly on
      # a fresh kernel regardless of what earlier cells left behind.
      sent_index.setdefault(segment, []).append(
          [article_idx, sentence_idx, segment_idx])

In [7]:
# Show every [article_idx, sentence_idx, token_idx] locator recorded for 'qutux'.
word_index['qutux']

[[0, 1, 4],
 [0, 2, 0],
 [0, 3, 6],
 [0, 6, 1],
 [0, 9, 1],
 [0, 27, 1],
 [1, 1, 6],
 [1, 3, 4],
 [1, 5, 2],
 [1, 9, 3],
 [1, 9, 4],
 [1, 12, 1],
 [1, 18, 1],
 [1, 23, 1],
 [1, 31, 8],
 [1, 32, 1],
 [1, 44, 6]]

In [8]:
def word_freq(word):
    """Return how many times ``word`` occurs in the indexed Atayal text.

    The count is the number of locators stored for ``word`` in the
    module-level ``word_index``.

    Args:
        word: Exact word form to look up (no regex matching is done here).

    Returns:
        The number of occurrences; 0 when ``word`` was never indexed, instead
        of raising ``KeyError``.
    """
    # .get with an empty default keeps unseen words from raising and never
    # inserts a new key into the index.
    return len(word_index.get(word, ()))

In [9]:
# Number of locators stored for 'qutux', i.e. its frequency in the index.
word_freq('qutux')

17

In [10]:
# Total token count: add up the frequency of every distinct word in the index.
token_sum = sum(word_freq(word) for word in word_index)
print(token_sum)

919


In [11]:
def query(token_value, window_size=5, language='atayal'):
  """Build a concordance table for every index entry matching ``token_value``.

  Args:
    token_value: Pattern handed to ``filter_a_regex_token`` to select
      locators, and forwarded to ``generate_concordance_line``.
    window_size: Number of context tokens on each side of the keyword;
      forwarded to ``generate_concordance_line``.
    language: ``'atayal'`` yields the keyword-in-context layout; any other
      value accepted by ``filter_a_regex_token`` yields the sentence-pair
      layout.

  Returns:
    A ``pandas.DataFrame`` with one row per matching locator. Columns are
    ``article_id, sentence_id, left, keyword, right, Atayal, Mandarin`` for
    Atayal queries and ``article_id, sentence_id, keyword, Mandarin, Atayal``
    otherwise.
  """
  rows = [
      generate_concordance_line(token_value, locator, window_size, language)
      for locator in filter_a_regex_token(token_value, language)
  ]

  # Sentence-pair layout for non-Atayal queries.
  if language != 'atayal':
    return pd.DataFrame(
        rows,
        columns=['article_id', 'sentence_id', 'keyword', 'Mandarin', 'Atayal'])

  # Keyword-in-context layout for Atayal queries.
  return pd.DataFrame(
      rows,
      columns=['article_id', 'sentence_id', 'left', 'keyword', 'right',
               'Atayal', 'Mandarin'])


def generate_concordance_line(token_value, locator, window_size, language):
  """Build one concordance row (a dict) for a single locator.

  Args:
    token_value: The query string; used as the ``keyword`` field for
      non-Atayal rows.
    locator: Sequence whose first two items are the article and sentence
      indexes into the module-level ``corpus``. For Atayal rows the third
      item is the index of the keyword token inside that sentence.
    window_size: Maximum number of context tokens kept on each side of the
      keyword (Atayal rows only).
    language: ``'atayal'`` for a keyword-in-context row; any other value
      produces a sentence-pair row.

  Returns:
    For ``'atayal'``: a dict with keys ``article_id, sentence_id, left,
    keyword, right, Atayal, Mandarin``. Otherwise: a dict with keys
    ``article_id, sentence_id, keyword, Mandarin, Atayal``. Text fields are
    words joined by single spaces with no leading or trailing whitespace.
  """
  a_idx, s_idx = locator[0], locator[1]
  atayal_tokens = corpus[a_idx]['atayal'][s_idx]
  # Element [0] of the Mandarin entry is the sentence text shown in the table.
  mandarin_sentence = corpus[a_idx]['mandarin'][s_idx][0]
  # Whole Atayal sentence as plain text.
  atayal_sentence = ' '.join(token["word"] for token in atayal_tokens)

  if language == 'atayal':
    t_idx = locator[2]

    # Clamp the context window to the sentence boundaries.
    safe_left_bound = max(0, t_idx - window_size)
    safe_right_bound = min(len(atayal_tokens), t_idx + window_size + 1)

    return {
        "article_id": a_idx,
        "sentence_id": s_idx,
        # Left context: tokens before the keyword.
        "left": ' '.join(
            token["word"] for token in atayal_tokens[safe_left_bound:t_idx]),
        "keyword": atayal_tokens[t_idx]["word"],
        # Right context: tokens after the keyword.
        "right": ' '.join(
            token["word"]
            for token in atayal_tokens[t_idx + 1:safe_right_bound]),
        "Atayal": atayal_sentence,
        "Mandarin": mandarin_sentence,
    }

  return {
      "article_id": a_idx,
      "sentence_id": s_idx,
      "keyword": token_value,
      "Mandarin": mandarin_sentence,
      "Atayal": atayal_sentence,
  }

def filter_a_regex_token(token_value, language):
  """Collect the locators of every index key that matches a regex.

  Args:
    token_value: Regular expression tested against each key with
      ``re.search`` (a match anywhere in the key counts).
    language: ``"atayal"`` searches ``word_index``; ``"mandarin"`` searches
      ``sent_index``.

  Returns:
    A flat list of locators, grouped by matching key in the index's
    iteration order.

  Raises:
    ValueError: If ``language`` is neither ``"atayal"`` nor ``"mandarin"``.
  """
  if language not in ("atayal", "mandarin"):
    raise ValueError('參數 language 只能是 "atayal" 或者 "mandarin"')

  # Only the index that is actually needed is looked up.
  lookup = word_index if language == "atayal" else sent_index

  hits = []
  for key, locators in lookup.items():
    if re.search(token_value, key):
      hits.extend(locators)
  return hits


In [12]:
# Concordance for Atayal words matching 'qutux', using the default window of 5 tokens per side.
query('qutux')

Unnamed: 0,article_id,sentence_id,left,keyword,right,Atayal,Mandarin
0,0,1,raral raral hiya maki,qutux,"ryax, trang qmisan ka ghzyaq","raral raral hiya maki qutux ryax, trang qmisa...",很久很久以前有這麼一天，是值非常寒冷的冬季，天上瑞雪亦紛飛。
1,0,2,,qutux,cinklgan squliq minnwah mcisal squ,qutux cinklgan squliq minnwah mcisal squ lpyu...,有一群泰雅族人到遠方親戚的部落探訪之後，頂著刺骨寒風白雪，此刻正走在回鄉的路途中。
2,0,3,"nha qasa ga, mluw kya",qutux,baytunux na mkrakis kneril uzi,"cinklgan nha qasa ga, mluw kya qutux baytunux...",這一隊人馬之中跟隨著一位年輕又標緻的青春少女。
3,0,6,kmal,qutux,qu bnkis mha: “llaqi! hngaw,kmal qutux qu bnkis mha: “llaqi! hngaw ru aki...,這時隊中一位長老對大家說：「孩子們！讓我們休息片刻吧，順便烤火取暖」。
4,0,9,maki,qutux,qu mrkyas na mlikuy minnluw,"maki qutux qu mrkyas na mlikuy minnluw lha, t...",大夥中有一位跟隨的年輕男士，他被稱為瑪萊‧巴杜先生。
5,0,27,kmal,qutux,yaki mha: “aki ta minsuna,kmal qutux yaki mha: “aki ta minsuna cikay ha...,其中一位老婦人對他們說：「我們應該喘喘息一下，反正我們快要到家了好嗎？」大夥都同意她的看法，...
6,1,1,squ raral hiya qa maki,qutux,"squliq, hlngat iyal qu pqzinah","nanu squ raral hiya qa maki qutux squliq, hln...",據說，曾有這樣一位泰雅人先輩，他的腳程無人能出其右，是一位健步如飛的小伙子，
7,1,3,ana su wayal lama,qutux,"ryax qu squliq hiya mga,",ana su wayal lama qutux ryax qu squliq hiya m...,即使有比他先走上一天，他卻能輕易地在路途中追上比他先走的人們。
8,1,5,minnxal maki,qutux,"ryax mga, mkayal kwara squliq","minnxal maki qutux ryax mga, mkayal kwara squ...",據說有一天，所有同樣和長腳耶哥立先生同住在一個部落的泰雅族人，聚會討論時一位長老這樣說：
9,1,9,nanu yasa qu,qutux,qutux ngasal si ptbuci smxu,nanu yasa qu qutux qutux ngasal si ptbuci smx...,因此家家戶戶各自忙著輾米


In [13]:
# Find Mandarin segments containing '一個' and show each with the Atayal sentence at the same article/sentence index.
query('一個', language='mandarin')

Unnamed: 0,article_id,sentence_id,keyword,Mandarin,Atayal
0,0,33,一個,就在這個節骨眼上，突然有一個身影“wax”，跌落下去的就是不畏寒冷且強壯的瑪萊‧巴杜先生，,"taqu ke’ kasa lgwah, wayal mha “wax” suruw na..."
1,1,5,一個,據說有一天，所有同樣和長腳耶哥立先生同住在一個部落的泰雅族人，聚會討論時一位長老這樣說：,"minnxal maki qutux ryax mga, mkayal kwara squ..."
2,1,17,一個,因為沒有一個獵手能勝任尾隨追擊野獸的任務，我腳長動作比較快速」，大家言聽計從就分頭進行。,swaga iyat simu pthuyay tpucing mhzyaw qsinuw...
3,1,31,一個,長腳耶哥立先生快手快腳揀了一個特大號的肉塊，隨即走向山涯邊離他們不遠的地方蹲坐狼吞虎嚥。,"mharu tuliq Yekliy mrhuw hiya, si nya kyapi q..."
