In [1]:
import requests


def read_code_schema():
    response = requests.get(spreadsheet_urls['code_schema'])
    rows = response.text.split('\r\n')
    headers = rows.pop(0).split('\t')
    schema = []
    level1 = None
    level2 = None
    for row in rows:
        row_cells = row.split('\t')
        row_json = {header: row_cells[hi] for hi, header in enumerate(headers)}
        if row_json['Level 1'] != '':
            level1 = row_json['Level 1']
        else:
            row_json['Level 1'] = level1
        if row_json['Level 2'] != '':
            level2 = row_json['Level 2']
        else:
            row_json['Level 2'] = level2            
        schema.append(row_json)
    return schema


spreadsheet_urls = {
    'code_schema': 'https://docs.google.com/spreadsheets/d/1gwXlwLR-kc1wNyKTfBA5ZyVuUq_WXjl55neSq763eC0/export?gid=736749451&format=tsv'
}

code_schema = read_code_schema()

In [2]:
domain_codes = [code_info for code_info in code_schema if code_info['Level 1'] == 'Domain']
domain_tags = [code_info['Copy-paste strings'] for code_info in domain_codes]
domain_tags

['domain;aerospace',
 'domain;books',
 'domain;children',
 'domain;conversational agents',
 'domain;creativity',
 'domain;data',
 'domain;digital library ',
 'domain;disabilities',
 'domain;DIY',
 'domain;e-commerce',
 'domain;education',
 'domain;games',
 'domain;government',
 'domain;health',
 'domain;history',
 'domain;learning',
 'domain;legal',
 'domain;libraries',
 'domain;literature',
 'domain;misinformation',
 'domain;movies',
 'domain;multilinguality',
 'domain;museum',
 'domain;music',
 'domain;news',
 'domain;older people',
 'domain;politics',
 'domain;privacy',
 'domain;research',
 'domain;science',
 'domain;social media',
 'domain;social sciences',
 'domain;visualization',
 'domain;wikipedia',
 'domain;WWW',
 'domain;...',
 'domain;domain-agnostic',
 'domain;unknown',
 'domain;IIR',
 'domain;search engines',
 'domain;search',
 'domain;mobile search',
 'domain;digital libraries',
 'domain;search-as-learning',
 'domain;collaborative search',
 'domain;personal information',
 

In [3]:
domain_codes

[{'Level 1': 'Domain',
  'Level 2': 'Aerospace',
  'Example': '',
  'Copy-paste strings': 'domain;aerospace',
  'Note': 'although this is perphaps also the work context, I assume the most important aspect here is that the documents focus on aerospace information.',
  'Category': 'document:topic'},
 {'Level 1': 'Domain',
  'Level 2': 'Books / E-Books',
  'Example': '',
  'Copy-paste strings': 'domain;books',
  'Note': '',
  'Category': 'document:type'},
 {'Level 1': 'Domain',
  'Level 2': 'Children',
  'Example': '',
  'Copy-paste strings': 'domain;children',
  'Note': '',
  'Category': 'user:group'},
 {'Level 1': 'Domain',
  'Level 2': 'Conversational agents',
  'Example': '',
  'Copy-paste strings': 'domain;conversational agents',
  'Note': '',
  'Category': 'application:technology'},
 {'Level 1': 'Domain',
  'Level 2': 'Creativity',
  'Example': '',
  'Copy-paste strings': 'domain;creativity',
  'Note': '',
  'Category': 'task:work;user:issue'},
 {'Level 1': 'Domain',
  'Level 2': 'D

In [27]:
from collections import Counter


def read_zotero_file(zotero_file):
    with open(zotero_file, 'rt') as fh:
        for line in fh:
            if len(line.strip()) == 0:
                continue
            elif line.startswith('<'):
                continue
            try:
                code, tag = line.strip().split(' ', 1)
                yield code, tag
            except ValueError:
                print(line)
                raise
    
tag_freq = Counter()
zotero_file = '../data/zotero/BIRRD-Zotero/BIRRD-Zotero.txt'


for code, tag in read_zotero_file(zotero_file):
    if code != 'K1':
        continue
    tag_freq.update([tag])
    
for tag, freq in tag_freq.most_common():
    if tag.startswith('domain'):
        if tag in domain_codes:
            continue
        print(tag)
        #print(f"{tag: <40}{freq: >8}")
        

domain;IIR
domain;search engines
domain;search
domain;social media
domain;mobile search
domain;digital libraries
domain;search-as-learning
domain;collaborative search
domain;personal information
domain;exploratory search
domain;images
domain;e-mail
domain;question answering
domain;recommender systems
domain;information seeking
domain;evaluation
domain;Wikipedia
domain;
domain;videos
domain;archives/museums
domain;UX
domain;information literacy
domain;relevance assessment
domain;data visualization
domain;mobile devices
domain;memory
domain;personal information management
domain;tourism
domain;text
domain;speech
domain:IIR
domain;medical
domain;crowdsourcing
domain;research data
domain;distraction
domain;trust
domain;Twitter
domain;digital humanities
domain;serendipity
domain;diversity
domain;methodology
domain;tasks
domain;usability
domain;web archiving
domain:WWW
domain;ir
domain;personalization
domain;profiling
domain; user engagement
domain;academic
domain;collaboration
domain;health

In [12]:
for code_info in domain_codes:
    domain = code_info['Copy-paste strings'].replace('domain;', '').replace('domain:', '').strip()
    categories = code_info['Category'].split(';')
    specific_cats = [f"{cat}:{domain.replace(' ', '_').replace(';', ':')}" for cat in categories]
    print('; '.join(specific_cats))


document:topic:aerospace
document:type:books
user:group:children
application:technology:conversational_agents
task:work:creativity; user:issue:creativity
document:type:data
application:digital_library
user:issue:disabilities; user:group:disabilities
task:work:DIY
task:work:e-commerce
task:work:education
document:type:games
document:topic:government
document:topic:health
document:topic:history; task:work:history
task:context:learning
document:topic:legal; task:work:legal
task:context:libraries
document:type:literature
document:issue:misinformation
document:type:movies
document:issue:multilinguality
task:context:museum
document:genre:music
document:genre:news
user:group:older_people
document:topic:politics
user:issue:privacy
task:work:research
document:topic:science; task:work:science
document:genre:social_media
document:topic:social_sciences; task:work:social_sciences
application:feature:visualization
document:genre:wikipedia
document:genre:WWW
~:...
~:domain-agnostic
~:unknown
research