In [3]:
from pathlib import Path
import pandas as pd
from lz4.frame import compress
import math
import numpy as np
from english_words import get_english_words_set
web2lowerset = get_english_words_set(['web2'], lower=True)
shorter = [x for x in web2lowerset if 5< len(x) < 10]

In [4]:
data_file = "data/movies.csv"
data_tokens = list(pd.read_csv(data_file).columns)
data_tokens.append(data_file)


In [5]:
dt_by_slug = dict()
for notation in Path("galleries/movies/").glob("*"):
    print(notation.stem)
    for spec in notation.glob("*"):
        if spec.is_dir(): continue
        contents = spec.read_text()
        tokens = [dt for dt in data_tokens if dt in contents]
        if spec.stem not in dt_by_slug:
            dt_by_slug[spec.stem] = tokens
        elif dt_by_slug[spec.stem] != tokens:
            print(spec.stem, dt_by_slug[spec.stem], tokens)
            dt_by_slug[spec.stem] = tokens


seaborn
ggplot2
vega-lite
matplotlib
plotly_express


In [6]:

dialects = dict(
  verbose=["\n".join([shorter[i]] + list(dt)) for i, dt in enumerate(dt_by_slug.values())], 
  terse=["\n".join(["true" if x == '1' else "false" for x in bin(i)[2:].rjust(int(math.log2(40)) + 1, "0")] + list(dt)) for i, dt in enumerate(dt_by_slug.values())]
)

In [7]:
def num_tokens(specs):
    tokens = set()
    for s in specs:
        for t in s.split("\n"):
            tokens.add(t)
    return len(tokens)

In [8]:
{k: num_tokens(v) for k, v in dialects.items()}

{'verbose': 47, 'terse': 9}

In [9]:
def lc(x):
  return len(compress(x.encode("utf8")))
  
def nmi(s1,s2):
  if s1 == s2: return 0
  return 2*lc(s1+s2) - lc(s1) - lc(s2)

def distance_matrix(specs):
  return np.array([[nmi(s1, s2) for s2 in specs] for s1 in specs])

def mean_remoteness(specs):
  return np.mean(np.median(distance_matrix(specs), axis=1))


In [10]:
{k: mean_remoteness(v) for k, v in dialects.items()}

{'verbose': 72.3125, 'terse': 76.325}