In [55]:
import numpy as np
import pandas as pd
import spacy
import string
from spacy.lang.en import English
import json
from collections import Counter, defaultdict

In [56]:
# Collect per-domain corpus statistics from the domain-specific JSON splits.
#
# Every accumulator is rebuilt from scratch here, so re-running this cell does
# not double count.
#
# As read by the loops below, each JSON file is a mapping
#   image path -> game id -> list of utterance dicts,
# where every utterance dict carries the raw text under 'Message_Text' and
# every game id is convertible to int.

# NOTE: machine-specific absolute path; point this at your local dataset copy.
DATA_DIR = "/Users/mario/code/pb_speaker_adaptation/dataset/chains-domain-specific"
DOMAINS = ['appliances', 'food', 'indoor', 'outdoor', 'vehicles', 'all']
SPLITS = ['train', 'val', 'test_seen', 'test_unseen']

tokenizer = English().tokenizer

vocabs = defaultdict(Counter)      # domain -> token -> frequency
n_utt = defaultdict(int)           # domain -> number of utterances
utt_lens = defaultdict(list)       # domain -> token count of each utterance
unique_images = defaultdict(set)   # domain -> image paths seen
unique_games = defaultdict(set)    # domain -> game ids seen (as int)

for domain in DOMAINS:
    for split in SPLITS:
        data_file = f"{DATA_DIR}/{domain}/{split}.json"
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(data_file, "r", encoding="utf-8") as f:
            subset = json.load(f)

        for img_path, games in subset.items():
            unique_images[domain].add(img_path)

            for game_id, utterances in games.items():
                unique_games[domain].add(int(game_id))

                for utt in utterances:
                    n_utt[domain] += 1

                    # Tokens are stripped and lower-cased before counting;
                    # utterance length is the number of tokenizer tokens.
                    tokens = [
                        tok.text.strip().lower()
                        for tok in tokenizer(utt['Message_Text'])
                    ]
                    vocabs[domain].update(tokens)
                    utt_len = len(tokens)
                    utt_lens[domain].append(utt_len)


In [43]:
# Number of distinct token types collected for each domain.
for domain, vocab in vocabs.items():
    n_types = len(vocab)
    print(f'{domain}: {n_types}')

appliances: 1271
food: 1646
indoor: 2477
outdoor: 2858
vehicles: 1738
all: 6038


In [70]:
# Utterance count per domain, followed by its share (in %) of the 'all' count.
for domain, count in n_utt.items():
    share = count / n_utt["all"] * 100
    print(f'{domain}: {count} {share:.1f}')

appliances: 4310 9.4
food: 5682 12.4
indoor: 12088 26.4
outdoor: 16427 35.9
vehicles: 7234 15.8
all: 45741 100.0


In [45]:
# Number of distinct image paths seen in each domain.
for domain, images in unique_images.items():
    n_images = len(images)
    print(f'{domain}: {n_images}')

appliances: 36
food: 36
indoor: 96
outdoor: 108
vehicles: 48
all: 324


In [64]:
# Stop-word list: spaCy's English stop words plus a few corpus-specific fillers.
# `|` builds a NEW set; the previous `|=` on an alias mutated spaCy's shared
# module-level STOP_WORDS set in place, leaking these extras to any other code
# using spaCy in the same kernel.
stopwords_en = spacy.lang.en.stop_words.STOP_WORDS | {
    'no', 'noo', 'nope', 'yes', 'yeah', 'ok', 'oh', 'ha', 'i', 'you', ' ',
}
punctuation = set(string.punctuation)


def stopword(x):
    """Return True if token ``x`` should be dropped from the cleaned vocabulary.

    A token is dropped when it is in ``stopwords_en``, is numeric
    (``str.isnumeric``), or consists solely of ASCII punctuation characters.
    The empty string is also dropped, because ``all`` over an empty iterable
    is True.
    """
    return (
        x in stopwords_en
        or x.isnumeric()
        or all(c in punctuation for c in x)
    )


# Per-domain vocabulary with stop words, numbers and punctuation removed.
# NOTE(review): the overlap cells visible later in this notebook read `vocabs`,
# not `vocabs_clean` - confirm whether that is intended.
vocabs_clean = {
    d: Counter({w: fr for w, fr in vocabs[d].items() if not stopword(w)})
    for d in vocabs
}

In [65]:
# For every individual domain, print the percentage of its vocabulary that
# occurs in none of the other individual domains ('all' is excluded).
for domain in vocabs:
    if domain != 'all':
        id_vocab = set(vocabs[domain])

        # Union of the vocabularies of all other individual domains.
        ood_vocab = {
            w
            for _domain in vocabs
            if _domain not in ('all', domain)
            for w in vocabs[_domain]
        }

        overlap = id_vocab.intersection(ood_vocab)
        overlap_percentage = float(len(overlap)) / len(id_vocab) * 100

        print(f'{domain}: {100 - overlap_percentage:.1f}')

appliances: 29.5
food: 43.3
indoor: 44.2
outdoor: 47.0
vehicles: 36.0


In [67]:
# Pairwise vocabulary overlap between individual domains, printed as
# |intersection| / |union| * 100 (the aggregate 'all' entry is skipped).
for domain in vocabs:
    if domain == 'all':
        continue

    print(domain)
    id_vocab = set(vocabs[domain])

    for _domain in vocabs:
        # Skip the aggregate entry and the domain itself.
        if _domain in ('all', domain):
            continue

        ood_vocab = set(vocabs[_domain])
        overlap = id_vocab.intersection(ood_vocab)
        universe = id_vocab.union(ood_vocab)
        overlap_percentage = float(len(overlap)) / len(universe) * 100

        print(f'  {_domain}: {overlap_percentage:.1f}')

appliances
  food: 22.9
  indoor: 23.1
  outdoor: 21.0
  vehicles: 23.2
food
  appliances: 22.9
  indoor: 22.1
  outdoor: 18.3
  vehicles: 20.6
indoor
  appliances: 23.1
  food: 22.1
  outdoor: 26.0
  vehicles: 23.3
outdoor
  appliances: 21.0
  food: 18.3
  indoor: 26.0
  vehicles: 26.2
vehicles
  appliances: 23.2
  food: 20.6
  indoor: 23.3
  outdoor: 26.2
