In [2]:
import json
# Load the Linear A inscriptions and collect every multi-character token that
# is tagged as a word, as (word, site code, word) triples. The word appears
# twice so the triples have the same (key, site, word) shape that the later
# tabulation code unpacks.
#
# The file is opened in a `with` block so the handle is always closed, and
# the encoding is pinned to UTF-8 (the JSON holds non-BMP Linear A characters,
# so relying on the platform default encoding is fragile).
with open('InvestigatingThePhaistosDiscAfterDavisData/LinearAWords.json',
          encoding='utf-8') as json_file:
    inscriptions = json.load(json_file)

la_words = []
for inscription in inscriptions:
    word_tags = inscription["tagsForWords"]

    for word_tag in word_tags:
        tags = word_tag["tags"]
        if "word" not in tags:
            continue
        # Strip U+1076B from the token before measuring its length.
        word = word_tag["word"].replace('\U0001076b', '')
        # Skip single characters, and also tokens left empty by the strip
        # above (previously an empty string slipped through as a "word").
        if len(word) < 2:
            continue
        # The first two characters of the inscription name serve as the site code.
        la_words.append((word, inscription["name"][:2], word))


In [3]:

# Tally how often each word occurs at each site in a single pass over
# la_words (the previous version re-scanned the whole list for every entry).
# Entries of la_words are (word, site, word) triples.
pair_counts = {}
for bg, site, _word in la_words:
    pair_counts[(site, bg)] = pair_counts.get((site, bg), 0) + 1

# One (site, word, occurrences) row per distinct site/word pair, in order of
# first appearance.
site_bg_count = [
    (site, bg, count) for (site, bg), count in pair_counts.items()
]

# Set of sites at which each word is attested.
sites_by_bg = {}
for site, bg, _count in site_bg_count:
    sites_by_bg.setdefault(bg, set()).add(site)

# Words attested at more than one site. As before, a word is listed once per
# site/word row, so it repeats once for every site it occurs at.
bg_more_than_one_site = [
    bg for _, bg, _ in site_bg_count
    if len(sites_by_bg[bg]) > 1
]

In [4]:
import pandas as pd
import numpy as np
from IPython.display import display
pd.set_option("display.latex.repr", True)
pd.set_option("display.max_rows", None, "display.max_columns", None)
styles = [dict(selector="caption", 
    props=[("text-align", "center"),
    ("font-size", "120%"),
    ("color", 'black')])]


In [16]:
from collections import defaultdict
def siteCountTable(bigrams):
    """
    Tabulate the bigrams that occur at more than one find site.

    Parameters
    ----------
    bigrams : iterable of (bigram, site, word) triples
        Every triple counts as one instance of ``bigram`` at ``site``; the
        third element is ignored. To count only unique instances (e.g. one
        per distinct word), de-duplicate the triples before calling. The
        iterable is consumed once, so a generator is accepted.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Bigram", with one column per distinct number of sites and
        a "Total" column. A bigram's instance count sits in the column for the
        number of sites it occurs at and in "Total"; all other cells are 0.
        Rows are sorted by "Total" in descending order. If no bigram occurs at
        two or more sites, an empty table with only the "Total" column is
        returned.
    """
    # (bigram, site) -> number of instances, gathered in a single pass.
    occurrences = defaultdict(int)
    for bg, site, _word in bigrams:
        occurrences[(bg, site)] += 1

    # Per bigram: the set of sites it occurs at and its overall instance count.
    sites_per_bigram = defaultdict(set)
    total_per_bigram = defaultdict(int)
    for (bg, site), count in occurrences.items():
        sites_per_bigram[bg].add(site)
        total_per_bigram[bg] += count

    # Two rows per multi-site bigram: one keyed by its number of sites and one
    # keyed by "Total", which gives the pivot a column to sort by.
    rows = []
    for bg, sites in sites_per_bigram.items():
        if len(sites) < 2:
            continue
        rows.append((bg, len(sites), total_per_bigram[bg]))
        rows.append((bg, "Total", total_per_bigram[bg]))

    if not rows:
        # Nothing spans several sites; pivoting an empty frame would have no
        # "Total" column to sort by, so return an empty table directly.
        return pd.DataFrame(
            index=pd.Index([], name="Bigram"),
            columns=pd.Index(["Total"], name="No of Sites"),
        )

    df = pd.DataFrame(rows, columns=["Bigram", "No of Sites", "count"])
    # The string "sum" is the supported aggfunc spelling; passing np.sum
    # triggers a FutureWarning on recent pandas versions.
    table = pd.pivot_table(
            df, values='count', index=['Bigram'],
            columns=['No of Sites'], aggfunc="sum", fill_value=0
        ).sort_values(
            by="Total",
            ascending=False
        )
    return table

In [14]:
# Number of (word, site, word) entries collected in la_words; tokens not
# tagged "word" and single-character words were skipped when building it.
len(la_words)

1343

In [17]:
# la_words holds the whole word in the "bigram" position, so this tabulates
# complete words that occur at more than one site code (the first two
# characters of the inscription name).
siteCountTable(la_words)

No of Sites,2,3,4,5,Total
Bigram,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
𐙂𐘁,0,37,0,0,37
𐘇𐘳𐘚𐙕𐘮𐘱,0,0,0,11,11
𐘇𐘬,0,10,0,0,10
𐙂𐘰𐘯,8,0,0,0,8
𐘱𐘞𐘞𐘴𐘋,0,0,0,7,7
𐘤𐘘𐘃,0,0,0,7,7
𐘀𐘙,6,0,0,0,6
𐙁𐘆,6,0,0,0,6
𐘆𐘅𐘉,6,0,0,0,6
𐘚𐘢𐘅𐙁,0,0,0,6,6
