In [25]:
import json
from functools import reduce
import re

# Load the inscription records. The context manager closes the handle as soon
# as parsing finishes, and the explicit encoding keeps the non-ASCII sign
# characters from being decoded with a platform-dependent default.
with open('Data/LinearBInscription.js', encoding='utf-8') as json_file:
    inscriptions = json.load(json_file)

# Markup stripped from every word before it is recorded: square brackets,
# embedded newlines, the tokens "inf.", "mut.", "sup.", "vac." and "vest.",
# slashes, and the double brackets.
# Raw string so the backslashes reach the regex engine untouched; the dots are
# escaped so only the literal tokens are removed (an unescaped "." would also
# swallow e.g. "info" or "vac" followed by any character).
EDITORIAL_MARKUP_RE = re.compile(r"(\[|\]|\n|inf\.|mut\.|sup\.|vac\.|vest\.|/|⟦|⟧)")

# Flat list of (word, site, word) tuples, one per non-empty cleaned word.
# The word appears twice because downstream code unpacks three fields.
lb_words = []
for info in inscriptions:
    # Each record is indexed positionally; element 1 is the mapping that
    # carries the "words" and "site" entries read below.
    inscription = info[1]
    words = inscription["words"]
    site = inscription["site"].strip()
    for word in words:
        word = EDITORIAL_MARKUP_RE.sub("", word).strip()
        if not word:
            # Nothing left once the markup is removed.
            continue
        lb_words.append((word, site, word))



In [3]:
import pandas as pd
import numpy as np
from IPython.display import display
# Ask pandas to expose a LaTeX representation of frames to the notebook
# front end (used when the notebook is converted to LaTeX/PDF).
try:
    pd.set_option("display.latex.repr", True)
except KeyError:
    # pandas 2.0 replaced the display.latex.* options with Styler options.
    # The lookup failure is an OptionError, which subclasses KeyError, so
    # catching KeyError works on every pandas version.
    pd.set_option("styler.render.repr", "latex")

# Never truncate displayed frames; one call per option keeps the intent obvious.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Styler table styles for a caption: centred, slightly enlarged, black text.
styles = [
    {
        "selector": "caption",
        "props": [
            ("text-align", "center"),
            ("font-size", "120%"),
            ("color", "black"),
        ],
    }
]


In [45]:
from collections import defaultdict
def siteCountTable(bigrams):
    """
    Tabulate how often each bigram occurs, broken down by its number of find sites.

    Only bigrams found at more than one distinct site are kept. Every input
    tuple counts as one occurrence; occurrences are NOT de-duplicated by word.

    Parameters
    ----------
    bigrams : iterable of (bigram, site, word) tuples
        Any iterable, including a one-shot generator. The ``word`` element is
        unpacked but not used.

    Returns
    -------
    pandas.DataFrame
        Pivot table indexed by "Bigram". The columns (named "No of Sites")
        hold one integer label per distinct site count plus the string label
        "Total". A bigram's occurrence count appears under its own site count
        and under "Total"; all other cells are 0. Rows are sorted by "Total",
        descending. If no bigram occurs at more than one site, an empty frame
        with only the "Total" column is returned.
    """
    # Single pass: collect the distinct sites and the occurrence count of each
    # bigram, so the input is consumed once and no per-bigram rescans are needed.
    sites_by_bigram = defaultdict(set)
    occurrences = defaultdict(int)
    for bigram, site, _word in bigrams:
        sites_by_bigram[bigram].add(site)
        occurrences[bigram] += 1

    # Two long-format rows per qualifying bigram: one under its site count and
    # one under "Total" (the latter exists so the table can be sorted by it).
    records = []
    for bigram, sites in sites_by_bigram.items():
        if len(sites) < 2:
            continue
        total = occurrences[bigram]
        records.append((bigram, len(sites), total))
        records.append((bigram, "Total", total))

    if not records:
        # Without any rows the pivot has no "Total" column to sort by.
        return pd.DataFrame(
            index=pd.Index([], name="Bigram"),
            columns=pd.Index(["Total"], name="No of Sites"),
        )

    df = pd.DataFrame(records, columns=["Bigram", "No of Sites", "count"])
    table = pd.pivot_table(
        df,
        values="count",
        index=["Bigram"],
        columns=["No of Sites"],
        # The string alias avoids the FutureWarning newer pandas raises for np.sum.
        aggfunc="sum",
        fill_value=0,
    ).sort_values(by="Total", ascending=False)
    return table

In [46]:
# Tabulate only the entries whose cleaned word is longer than one character;
# the table is the cell's displayed result.
siteCountTable(list(filter(lambda entry: len(entry[0]) > 1, lb_words)))

No of Sites,2,3,4,5,Total
Bigram,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
𐀞𐀫,0,272,0,0,272
𐀒𐀺,0,218,0,0,218
𐀁𐀐,0,177,0,0,177
𐀒𐀷,0,151,0,0,151
𐀵𐀰,0,0,0,132,132
deest,0,0,0,124,124
v.↓,0,0,114,0,114
𐄐𐄈,0,0,0,113,113
𐀐𐀐𐀕𐀙,94,0,0,0,94
𐀈𐀁𐀫,91,0,0,0,91
