<a href="https://colab.research.google.com/github/morrisalp/taatik/blob/master/Get_Hebrew_Wiktionary_transliterations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
! pip install wikitextparser



In [0]:
import requests
import pandas as pd
from functools import lru_cache
from multiprocessing import Pool
from tqdm import tqdm
import wikitextparser as wtp
from google.colab import files

In [0]:
def all_pages():
  """Yield the title of every page listed by Hebrew Wiktionary's `allpages` query.

  Pages are requested from the MediaWiki API in batches of 500. After each
  batch the `continue` object returned by the server (if any) is sent back
  with the next request; iteration stops when a response has no `continue`.

  Yields:
    The `title` string of each entry in `query.allpages`.

  Raises:
    KeyError: if a response lacks the `query` / `allpages` keys.
  """
  endpoint = 'https://he.wiktionary.org/w/api.php'
  base_params = {
      'action': 'query',
      'list': 'allpages',
      'aplimit': 500,
      'format': 'json',
  }
  continuation = {}
  while True:
    # Pass parameters as a dict so that requests percent-encodes them.
    # Continuation tokens are page titles and may contain characters such as
    # '&', '+' or '#', which would corrupt a hand-built query string.
    params = {**base_params, **continuation}
    obj = requests.get(endpoint, params = params).json()
    for page in obj['query']['allpages']:
      yield page['title']
    if 'continue' not in obj:
      break
    # Forward the whole continuation object, not just `apcontinue`.
    continuation = obj['continue']

In [0]:
@lru_cache(maxsize = None)
def get_wikitext(word):
  """Return the wikitext of the Hebrew Wiktionary page titled `word`.

  Results are memoized with `lru_cache`, so each title is fetched at most once
  per process.

  Args:
    word: page title to look up.

  Returns:
    The `*` field of the first revision of the first page in the response, or
    '' when the response does not contain it (e.g. the page has no revisions).
  """
  # Pass the title via `params` so that requests percent-encodes it; titles
  # containing '&', '+', '#' or '?' would otherwise break the query string.
  params = {
      'action': 'query',
      'titles': word,
      'prop': 'revisions',
      'rvprop': 'content',
      'format': 'json',
  }
  obj = requests.get('https://he.wiktionary.org/w/api.php', params = params).json()
  try:
    return list(obj['query']['pages'].values())[0]['revisions'][0]['*']
  except (KeyError, IndexError):
    # Missing keys or empty page/revision lists both mean "no content".
    return ''

In [0]:
# Collect every title yielded by all_pages() into the set `he`;
# the %time magic reports how long the full listing takes.
%time he = set(all_pages())

CPU times: user 656 ms, sys: 38 ms, total: 694 ms
Wall time: 15.1 s


In [0]:
def get_translit_df(word):
  """Build a table of transliterations found on the Wiktionary page for `word`.

  Each section of the page is scanned for "ניתוח דקדוקי" templates; every
  "הגייה" argument of such a template becomes one row.

  Args:
    word: page title; its wikitext is obtained through get_wikitext().

  Returns:
    A DataFrame with columns `word` (the stripped title), `nikkud` (the
    stripped title of the section the template was found in) and
    `transliteration` (the argument value, stripped and with the ''' bold
    markers removed). The frame is empty when nothing is found.
  """
  target_template = "ניתוח דקדוקי"
  target_argument = "הגייה"
  columns = ["word", "nikkud", "transliteration"]

  parsed = wtp.parse(get_wikitext(word))
  frames = []
  for section in parsed.sections:
    transliterations = [
        argument.value.strip().replace("'''", "")
        for template in wtp.parse(section.contents).templates
        # Names keep surrounding whitespace/newlines as written in the
        # wikitext (e.g. "{{ניתוח דקדוקי\n|הגייה = ..."), so compare stripped.
        if template.name.strip() == target_template
        for argument in template.arguments
        if argument.name.strip() == target_argument
    ]
    if not transliterations:
      continue
    frames.append(pd.DataFrame({
        "word": word.strip(),
        # The untitled lead section may report its title as None rather
        # than '' (depends on the installed wikitextparser -- TODO confirm).
        "nikkud": (section.title or "").strip(),
        "transliteration": transliterations,
    }, columns = columns))

  # pd.concat raises ValueError on an empty list, so return an empty frame.
  if not frames:
    return pd.DataFrame(columns = columns)
  return pd.concat(frames)

In [0]:
# Sanity check: show the transliteration table produced for a single word.
get_translit_df('אכסניה')

Unnamed: 0,nikkud,transliteration,word
0,אַכְסַנְיָה,akhsanya,אכסניה


In [0]:
# Run get_translit_df over every title in `he` using 8 worker processes.
# pool.imap yields the per-word frames in input order as they complete, which
# lets tqdm show progress; the frames are then concatenated into one DataFrame.
with Pool(processes = 8) as pool:
  df = pd.concat(list(tqdm(pool.imap(get_translit_df, he), total = len(he))))

100%|██████████| 23851/23851 [15:59<00:00, 24.27it/s]


In [0]:
# Write the rows sorted by `word` to a CSV file (without the index column),
# then hand the file to google.colab's files.download.
df.sort_values(by = 'word').to_csv('hebrew_wiki_translit.csv', index = False)
files.download('hebrew_wiki_translit.csv')