# Create OGSL table
Create a DataFrame that associates every sign value that is recognized in [ORACC](http://oracc.org) with a Unicode code point (or a sequence of Unicode code points). The table may be used to translate a text in transliteration into a sequence of Unicode code points for use in `fasttext`. The data are derived from the ORACC Global Sign List ([OGSL](http://build-oracc.museum.upenn.edu)).


In [1]:
import pandas as pd
import zipfile
import json
import os
import sys
util_dir = os.path.abspath('../utils')
sys.path.append(util_dir)
from utils import *
import pickle

## 0 Create Directories, if Necessary
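The two directories needed for this script are `jsonzip` (for the downloaded ZIP file) and `output` (for the resulting table). They are created with `os.makedirs()`; the argument `exist_ok = True` prevents an error when a directory already exists, so the cell can safely be re-run.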

In [3]:
# Make sure both working directories exist: 'jsonzip' receives the downloaded
# archive and 'output' receives the pickled table. exist_ok=True keeps the
# cell re-runnable when the directories are already present.
for directory in ('jsonzip', 'output'):
    os.makedirs(directory, exist_ok=True)

## 1 Download the ZIP file

In [4]:
# Fetch the OGSL project archive with the oracc_download() helper from utils.
# Judging by the recorded output, the archive is saved as jsonzip/ogsl.zip and
# the call returns the list of projects it handled (TODO confirm against
# utils.oracc_download).
project = ["ogsl"] # oracc_download() expects a list, even for a single project
oracc_download(project)

Saving http://oracc.org/ogsl/json/ogsl.zip as jsonzip/ogsl.zip.


ogsl: 0.00B [00:00, ?B/s]

['ogsl']

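## <a name="head21"></a>2 The `parsejson()` function
The function `parsejson()` walks through the `signs` dictionary of the OGSL JSON data and produces one record, with the fields `value`, `name`, `utf8`, and `hex`, for every value of every sign. Signs that have no values are skipped; a missing `utf8` or `hex` field is represented by an empty string. The records are collected in the list `s_l`.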

In [5]:
def parsejson(data_json, records=None):
    """Flatten the OGSL sign list into one record per sign value.

    Parameters
    ----------
    data_json : dict
        Parsed OGSL JSON. Must contain a ``"signs"`` key that maps each sign
        name to a dictionary; that dictionary may hold ``"values"`` (an
        iterable of sign values), ``"utf8"`` and ``"hex"``.
    records : list, optional
        List that receives the records. When omitted, the module-level list
        ``s_l`` is used, which preserves the behaviour of calling
        ``parsejson(data_json)`` after initialising ``s_l = []``.

    Returns
    -------
    None
        The records are appended to ``records`` in place. Nothing is returned,
        so calling the function on the last line of a cell does not print
        the whole list.

    Raises
    ------
    KeyError
        If ``data_json`` has no ``"signs"`` key.
    NameError
        If ``records`` is omitted and no global ``s_l`` exists.

    Each record is a new dictionary with the keys ``value``, ``name``,
    ``utf8`` and ``hex`` (in that order). Signs without ``"values"`` produce
    no records; a missing ``"utf8"`` or ``"hex"`` becomes an empty string.
    """
    if records is None:
        # Legacy call style: fall back to the notebook-level accumulator.
        records = s_l
    for sign_name, sign_data in data_json["signs"].items():
        # utf8 and hex belong to the sign, not to the individual value, so
        # look them up once per sign.
        utf8 = sign_data.get("utf8", "")
        hex_code = sign_data.get("hex", "")
        for sign_value in sign_data.get("values", ()):
            # Build a fresh dict per row so that rows never share state.
            records.append({
                "value": sign_value,
                "name": sign_name,
                "utf8": utf8,
                "hex": hex_code,
            })
    return

## 3 Main Process

In [6]:
# Module-level state for parsejson(): `s_l` collects the records (one per
# sign value); `s` is the scratch record that parsejson() may fill in as it
# goes. Both are reset here so that re-running the cell does not duplicate rows.
s = {}
s_l = []
file = "jsonzip/ogsl.zip"
filename = "ogsl/ogsl-sl.json"
# Open the archive in a `with` block so the file handle is closed as soon as
# the sign list has been read, even if reading or decoding fails.
with zipfile.ZipFile(file) as z:
    signlist = z.read(filename).decode('utf-8')
data_json = json.loads(signlist)                # make it into a json object (essentially a dictionary)
parsejson(data_json)

## 4 Make Dataframe

In [7]:
# Turn the list of records into a table. Naming the columns explicitly pins
# the schema (and the column order), so the frame has the expected four
# columns even when `s_l` happens to be empty.
df = pd.DataFrame(s_l, columns=["value", "name", "utf8", "hex"])
df

Unnamed: 0,value,name,utf8,hex
0,ʾu₄,A,𒀀,x12000
1,a,A,𒀀,x12000
2,aia₂,A,𒀀,x12000
3,aya₂,A,𒀀,x12000
4,barₓ,A,𒀀,x12000
...,...,...,...,...
9919,1(|AŠ×DIŠ@t|),|AŠ×DIŠ@t|,,
9920,ensiₓ,|PA.SI|,,
9921,nirahₓ,BAU153,,
9922,unuₓ,|GUD.KU|,,


In [8]:
# Serialise the sign table to output/ogsl.p; the file is opened in binary
# mode because pickle writes bytes.
with open("output/ogsl.p", "wb") as pickle_file:
    pickle.dump(df, pickle_file)