In [1]:
import numpy as np
import pandas as pd
import json
import string
import random
import spacy
import math
from collections import defaultdict, Counter
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Seed Python's `random` module so the random.shuffle calls further down are
# repeatable when the notebook is run top to bottom in a fresh kernel.
random.seed(0)

In [3]:
def _read_json(path):
    """Parse the JSON file at `path` and return the decoded object."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Original v2 splits, loaded as-is from disk.
train_set = _read_json('../v2/train.json')
val_set = _read_json('../v2/val.json')
test_set = _read_json('../v2/test.json')

### Merge splits

In [4]:
# Pool the three original splits into one mapping:
#   image path -> game id -> chain
# Later splits win if the same (image path, game id) pair occurs more than once.
dataset = defaultdict(dict)

for split in (train_set, val_set, test_set):
    for img_path, games in split.items():
        # Assign game by game so that an image with no games creates no entry.
        for game_id, chain in games.items():
            dataset[img_path][game_id] = chain

### Fine-grained image domains

**Vehicles**: person_motorcycle, car_motorcycle, bus_truck, car_truck  

**Accessories**: person_suitcase, person_umbrella  

**Food**: bowl_dining_table, cup_dining_table, cake_dining_table  

**Appliances**: person_oven, dining_table_refrigerator, person_refrigerator  

**Laptops**: dining_table_laptop, couch_laptop  

**Bed and couch**: person_bed, person_couch

**Outdoor**: person_surfboard, person_elephant, person_bicycle

# NOTE(review): chains_train, chains_val and chains_test are not assigned
# anywhere above this point in the notebook, so on a fresh top-to-bottom run
# these statements would raise NameError. Presumably they were run after
# building a fine-grained version of the split -- confirm, or remove.
# Each dump writes one container as indented JSON; default=str stringifies any
# value json cannot serialize natively.
with open('../data/chains-fine-domain/train.json', 'w') as f_out:
    json.dump(chains_train, fp=f_out, indent=2, default=str)
    
with open('../data/chains-fine-domain/val.json', 'w') as f_out:
    json.dump(chains_val, fp=f_out, indent=2, default=str)
    
with open('../data/chains-fine-domain/test.json', 'w') as f_out:
    json.dump(chains_test, fp=f_out, indent=2, default=str)

----

### Coarse-grained image domains

**Vehicles**: person_motorcycle, car_motorcycle, bus_truck, car_truck  

**Outdoor**: person_suitcase, person_umbrella, person_surfboard, person_elephant, person_bicycle, person_car, person_train, person_bench, person_truck

**Food**: bowl_dining_table, cup_dining_table, cake_dining_table  

**Appliances**: person_oven, dining_table_refrigerator, person_refrigerator  

**Indoor**: person_bed, person_couch, person_tv, couch_dining_table, person_teddy_bear, chair_couch, dining_table_laptop, couch_laptop


In [6]:
# Coarse domain -> list of the fine-grained domains it groups together.
d2dlist = {
    'vehicles': [
        'person_motorcycle', 'car_motorcycle', 'bus_truck', 'car_truck',
    ],
    'outdoor': [
        'person_suitcase', 'person_umbrella', 'person_surfboard',
        'person_elephant', 'person_bicycle', 'person_car', 'person_train',
        'person_bench', 'person_truck',
    ],
    'food': [
        'bowl_dining_table', 'cup_dining_table', 'cake_dining_table',
    ],
    'appliances': [
        'person_oven', 'dining_table_refrigerator', 'person_refrigerator',
    ],
    'indoor': [
        'dining_table_laptop', 'couch_laptop', 'person_bed', 'person_couch',
        'person_tv', 'couch_dining_table', 'person_teddy_bear', 'chair_couch',
    ],
}

# Inverse lookup: fine-grained domain -> coarse domain.
d2d = {
    fine_domain: coarse_domain
    for coarse_domain, fine_domains in d2dlist.items()
    for fine_domain in fine_domains
}

In [7]:
# Per-split containers: coarse domain -> image path -> game id -> chain.
# NOTE: these containers and n_utts are rebuilt from scratch by the later
# seen/unseen split cell; this cell still advances the `random` state.
chains_train, chains_val, chains_test = (
    {domain: defaultdict(dict) for domain in d2d.values()} for _ in range(3)
)
split_chains = {'train': chains_train, 'val': chains_val, 'test': chains_test}

# n_utts[split][domain] accumulates len(chain); it is keyed by both the
# fine-grained and the coarse domain name.
n_utts = defaultdict(lambda: defaultdict(int))

for img_path, games in dataset.items():
    old_domain, _ = img_path.split('/')

    new_domain = d2d.get(old_domain)
    if new_domain is None:
        continue  # skip domains that have not been clustered

    # Shuffle this image's games and cut them at 70% / 85% of their count,
    # giving the train / val / test groups for the image.
    game_ids = list(games.keys())
    random.shuffle(game_ids)
    n_games = len(game_ids)
    train, val, test = np.split(game_ids, [int(n_games*0.7), int(n_games*0.85)])
    groups = (('train', train), ('val', val), ('test', test))

    for game_id, chain in games.items():
        # First group containing the game id wins; none at all is an error.
        set_name = next((name for name, ids in groups if game_id in ids), None)
        if set_name is None:
            raise ValueError()

        split_chains[set_name][new_domain][img_path][game_id] = chain
        n_utts[set_name][old_domain] += len(chain)
        n_utts[set_name][new_domain] += len(chain)


In [8]:
# NOTE: the printed labels say "games", but every count in this cell is a
# number of image paths (the keys of `dataset`).
print('Total games:', len(dataset))

# Keep only image paths whose fine-grained domain has a coarse cluster, both
# as a flat list and grouped by coarse domain.
relevant_img_paths_list = []
relevant_img_paths = defaultdict(list)

for img_path in dataset:
    old_domain, _ = img_path.split('/')
    if old_domain in d2d:
        relevant_img_paths_list.append(img_path)
        relevant_img_paths[d2d[old_domain]].append(img_path)

print('Number of domains:', len(relevant_img_paths))
print('Number of games left:', sum(len(paths) for paths in relevant_img_paths.values()))

# Within each coarse domain, shuffle the image paths in place and hold out the
# first 6% (rounded down) as "unseen" images.
unseen_img_paths = []
for d, domain_paths in relevant_img_paths.items():
    n_unseen = int(len(domain_paths)*0.06)
    print(d, len(domain_paths))
    print(n_unseen)
    random.shuffle(domain_paths)
    unseen_img_paths_d, _ = np.split(domain_paths, [n_unseen])
    unseen_img_paths.extend(unseen_img_paths_d)

print(len(unseen_img_paths))
# An earlier variant (since disabled) shuffled all relevant image paths pooled
# together and held out 5% of them instead of sampling per domain.

Total games: 360
Number of domains: 5
Number of games left: 324
outdoor 108
6
vehicles 48
2
indoor 96
5
food 36
2
appliances 36
2
17


## TODO: split image ids within domains instead!

In [9]:
# Number of image paths held out as "unseen" across all coarse domains.
len(unseen_img_paths)

17

In [10]:
def _empty_split():
    """Return a fresh container: coarse domain -> image path -> game id -> chain."""
    return {domain: defaultdict(dict) for domain in d2d.values()}


chains_train = _empty_split()
chains_val = _empty_split()
chains_test = _empty_split()
chains_test_unseen = _empty_split()
chains_test_seen = _empty_split()

# n_utts[split][domain] accumulates len(chain); it is keyed by both the
# fine-grained and the coarse domain name.
n_utts = defaultdict(lambda: defaultdict(int))


def _file_chain(target, set_name, img_path, game_id, old_domain, new_domain):
    """Store one game's chain in `target` and add its length to n_utts[set_name]."""
    chain = dataset[img_path][game_id]
    target[new_domain][img_path][game_id] = chain
    n_utts[set_name][old_domain] += len(chain)
    n_utts[set_name][new_domain] += len(chain)


for img_path in relevant_img_paths_list:
    old_domain, _ = img_path.split('/')
    new_domain = d2d[old_domain]

    if img_path in unseen_img_paths:
        # Held-out image: every game goes to both `test` and `test_unseen`.
        for game_id in dataset[img_path]:
            _file_chain(chains_test, 'test', img_path, game_id, old_domain, new_domain)
            _file_chain(chains_test_unseen, 'test_unseen', img_path, game_id, old_domain, new_domain)
        continue

    # Seen image: shuffle its games and cut them at 74% / 90% of their count,
    # giving the train / val / test groups for the image.
    game_ids = list(dataset[img_path].keys())
    random.shuffle(game_ids)
    train, val, test = np.split(game_ids, [int(len(game_ids)*0.74), int(len(game_ids)*0.90)])

    for game_id in dataset[img_path]:
        if game_id in train:
            _file_chain(chains_train, 'train', img_path, game_id, old_domain, new_domain)
        elif game_id in val:
            _file_chain(chains_val, 'val', img_path, game_id, old_domain, new_domain)
        elif game_id in test:
            # Test games of seen images are tracked under `test` and `test_seen`.
            _file_chain(chains_test, 'test', img_path, game_id, old_domain, new_domain)
            _file_chain(chains_test_seen, 'test_seen', img_path, game_id, old_domain, new_domain)
        else:
            raise ValueError()


In [11]:
# Sum the n_utts counters over the coarse domains only (n_utts also holds
# fine-grained keys, which would double count), one total per split.
train_n, val_n, test_n, test_seen_n, test_unseen_n = (
    sum(n_utts[split_name][d] for d in d2dlist)
    for split_name in ('train', 'val', 'test', 'test_seen', 'test_unseen')
)

# `test` should equal `test_seen` + `test_unseen`, so both totals should match.
tot = train_n + val_n + test_n
tot2 = train_n + val_n + test_seen_n + test_unseen_n

print(tot, tot2)

45741 45741


In [12]:
# Share of the overall n_utts total that falls in train / val / test.
train_n / tot, val_n / tot, test_n / tot

(0.6909118733739971, 0.1525327386808334, 0.15655538794516954)

In [13]:
# Same shares, with the test portion broken down into test_seen / test_unseen.
train_n / tot, val_n / tot, test_seen_n / tot, test_unseen_n / tot

(0.6909118733739971,
 0.1525327386808334,
 0.10380184079928292,
 0.05275354714588662)

In [15]:
# (path template, per-domain container) for every file written per coarse domain.
domain_outputs = [
    ('../data/chains-domain-specific/{}/train.json', chains_train),
    ('../data/chains-domain-specific/{}/val.json', chains_val),
    ('../data/chains-domain-specific/{}/test.json', chains_test),
    ('../data/chains-domain-specific/{}/test_seen.json', chains_test_seen),
    ('../data/chains-domain-specific/{}/test_unseen.json', chains_test_unseen),
]

for d in d2dlist:
    for path_template, chains_by_domain in domain_outputs:
        # default=str stringifies any value json cannot serialize natively.
        with open(path_template.format(d), 'w') as f_out:
            json.dump(chains_by_domain[d], fp=f_out, indent=2, default=str)

In [16]:
# Domain-agnostic versions of each split: image path -> game id -> chain.
speaker_dataset_train = defaultdict(dict)
speaker_dataset_val = defaultdict(dict)
speaker_dataset_test = defaultdict(dict)
speaker_dataset_test_seen = defaultdict(dict)
speaker_dataset_test_unseen = defaultdict(dict)

# (per-domain source, merged target) pairs, flattened across coarse domains.
merge_plan = [
    (chains_train, speaker_dataset_train),
    (chains_val, speaker_dataset_val),
    (chains_test, speaker_dataset_test),
    (chains_test_seen, speaker_dataset_test_seen),
    (chains_test_unseen, speaker_dataset_test_unseen),
]

for d in d2dlist:
    for chains_by_domain, merged in merge_plan:
        for img_path, games in chains_by_domain[d].items():
            for game_id, chain in games.items():
                merged[img_path][game_id] = chain


In [17]:
# Write the merged (all-domain) speaker datasets, one JSON file per split.
# The file name is built from the split name alone; nothing here depends on a
# loop variable left over from an earlier cell.
speaker_outputs = {
    'train': speaker_dataset_train,
    'val': speaker_dataset_val,
    'test': speaker_dataset_test,
    'test_seen': speaker_dataset_test_seen,
    'test_unseen': speaker_dataset_test_unseen,
}

for split_name, speaker_dataset in speaker_outputs.items():
    out_path = '../data/chains-domain-specific/speaker/{}.json'.format(split_name)
    # default=str stringifies any value json cannot serialize natively.
    with open(out_path, 'w') as f_out:
        json.dump(speaker_dataset, fp=f_out, indent=2, default=str)