In [227]:
import json
from functools import reduce
import re

# Load the inscription corpus. The context manager closes the handle once the
# JSON is parsed, and the explicit encoding keeps the Linear B code points
# readable regardless of the platform's default locale.
with open('Data/LinearBInscription.js', encoding='utf-8') as json_file:
    inscriptions = json.load(json_file)

# Editorial noise stripped from every word: square brackets, newlines, the
# abbreviations "inf.", "mut.", "sup.", "vac.", "vest.", slashes and the
# ⟦ ⟧ brackets. A raw string avoids invalid-escape warnings, and the dots are
# escaped so they match a literal period instead of any character.
_NOISE_RE = re.compile(r"(\[|\]|\n|inf\.|mut\.|sup\.|vac\.|vest\.|\/|⟦|⟧)")

# (word, site) pairs for every non-empty cleaned word in the corpus.
lb_words = []
for info in inscriptions:
    # Each entry is indexable; position 1 holds the record with the "words"
    # and "site" keys (presumably position 0 is an identifier — TODO confirm).
    inscription = info[1]
    words = inscription["words"]
    site = inscription["site"].strip()
    for word in words:
        word = _NOISE_RE.sub("", word).strip()
        if not word:
            # Nothing but editorial markup was present; skip it.
            continue
        lb_words.append((word, site))



In [228]:
def getNgrams(words, n):
    """
    Slide a window of length ``n`` over every word and collect the slices.

    Parameters
    ----------
    words : iterable of (word, site) pairs
    n : int
        Window length (1 for single characters, 2 for bigrams, ...).

    Returns
    -------
    list of (ngram, site, word) tuples, in input order and then left-to-right
    within each word. Words shorter than ``n`` contribute nothing.
    """
    return [
        (token[start:start + n], location, token)
        for token, location in words
        for start in range(len(token) - n + 1)
    ]



# Create a table showing instances of bigrams that occur at more than one find site.


In [224]:
import pandas as pd
import numpy as np
from IPython.display import display
pd.set_option("display.latex.repr", True)
pd.set_option("display.max_rows", None, "display.max_columns", None)
styles = [dict(selector="caption", 
    props=[("text-align", "center"),
    ("font-size", "120%"),
    ("color", 'black')])]


In [229]:
from collections import defaultdict
def siteCountTable(bigrams):
    """
    Tabulate n-grams that are attested at more than one find site.

    Despite the name, any n-gram length works; the "Bigram" label is kept
    because it is the index name of the returned table.

    Parameters
    ----------
    bigrams : iterable of (ngram, site, word) tuples
        As produced by ``getNgrams``. The word element is not used. The
        iterable is consumed in a single pass, so a generator is accepted.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Bigram", with one column per distinct number of sites
        (column axis named "No of Sites") plus a "Total" column. Each row holds
        the n-gram's occurrence count in the column matching its number of
        sites, 0 elsewhere, and the same count under "Total". Every occurrence
        is counted, including repeats within one word or one site. Rows are
        sorted by "Total", descending. If no n-gram occurs at two or more
        sites, an empty table with only the "Total" column is returned.
    """
    # One pass: the set of sites and the occurrence count for each n-gram.
    sites_by_ngram = defaultdict(set)
    occurrences = defaultdict(int)
    for ngram, site, _word in bigrams:
        sites_by_ngram[ngram].add(site)
        occurrences[ngram] += 1

    # Two long-format rows per qualifying n-gram: one keyed by its number of
    # sites and one keyed by "Total" (the column the table is sorted by).
    rows = []
    for ngram, sites in sites_by_ngram.items():
        if len(sites) < 2:
            continue
        total = occurrences[ngram]
        rows.append((ngram, len(sites), total))
        rows.append((ngram, "Total", total))

    if not rows:
        # Without this guard the sort below fails: there is no "Total" column
        # to sort by when nothing qualifies.
        return pd.DataFrame(
            index=pd.Index([], name="Bigram"),
            columns=pd.Index(["Total"], name="No of Sites"),
        )

    df = pd.DataFrame(rows, columns=["Bigram", "No of Sites", "count"])
    # aggfunc="sum" is the string form of the np.sum reduction; passing the
    # NumPy callable is deprecated in recent pandas.
    table = pd.pivot_table(
        df,
        values="count",
        index=["Bigram"],
        columns=["No of Sites"],
        aggfunc="sum",
        fill_value=0,
    ).sort_values(by="Total", ascending=False)
    return table

# Syllabograms by Site

In [230]:
# n=1: every single character of each cleaned word, paired with its find site.
# NOTE(review): the variable is called `la_bigrams` although it holds 1-grams
# built from `lb_words`; the name is left unchanged in case other cells read it.
la_bigrams = getNgrams(lb_words, 1)
# Bare last expression so the notebook renders the resulting table.
siteCountTable(la_bigrams)

No of Sites,2,3,4,5,6,7,8,9,10,11,12,13,14,15,Total
Bigram,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
𐄇,0,0,0,0,0,0,3096,0,0,0,0,0,0,0,3096
𐀵,0,0,0,0,0,0,0,0,0,2101,0,0,0,0,2101
𐀍,0,0,0,0,0,0,0,0,0,0,0,0,0,1960,1960
𐀒,0,0,0,0,0,0,0,0,0,0,0,0,0,1863,1863
𐀃,0,0,0,0,0,0,0,1856,0,0,0,0,0,0,1856
𐀁,0,0,0,0,0,0,0,0,0,0,1819,0,0,0,1819
𐀫,0,0,0,0,0,0,0,0,0,0,1736,0,0,0,1736
𐀀,0,0,0,0,0,0,0,0,0,0,1603,0,0,0,1603
𐀐,0,0,0,1476,0,0,0,0,0,0,0,0,0,0,1476
𐀊,0,0,0,0,0,0,0,0,0,1463,0,0,0,0,1463


# Trigrams by Site

In [231]:
# n=3: every three-character window of each cleaned word, paired with its site.
# This rebinds `la_bigrams`, replacing the value assigned by the earlier cell.
# NOTE(review): the rendered output includes Latin-letter trigrams (e.g. "est",
# "ill", "sig"), so `lb_words` still contains non-syllabogram tokens — confirm
# whether those should be filtered out before counting.
la_bigrams = getNgrams(lb_words, 3)
# Bare last expression so the notebook renders the resulting table.
siteCountTable(la_bigrams)

No of Sites,2,3,4,5,Total
Bigram,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
est,0,0,0,124,124
dee,0,0,0,124,124
ees,0,0,0,124,124
𐀐𐀐𐀕,0,116,0,0,116
v.↓,0,0,114,0,114
lum,0,0,0,113,113
ill,0,0,0,113,113
igi,0,0,0,113,113
llu,0,0,0,113,113
sig,0,0,0,113,113
