In [1]:
import glob
import os
import json

from lxml import etree

## Fabritius

In [6]:
# Parse the French Fabritius XML export; `tree` is explored in the cell below.
tree = etree.parse('../data/fabritius/xml/fr.xml')

For now, we are only interested in the `Iconography` field in each record:

In [7]:
# Gather the distinct tags of all child elements that occur under the
# `Iconography` node of any record.
terms_ = set()
for record in tree.findall('record'):
    iconography = record.find('Iconography')
    if iconography is None:
        # Record carries no iconographic description at all.
        continue
    terms_.update(child.tag for child in iconography)
print(terms_)

{'conceptualTerms', 'iconographicInterpretation', 'subjectInterpretiveHistory', 'specificSubjectIdentification', 'subjectTerms', 'iconographicTerms', 'generalSubjectDescription'}


## Subject Terms

Let's start with the field `subjectTerms`:

In [8]:
def parse_lil_field(field):
    """Split one descriptor into its main term and its secondary terms.

    Everything before the first ``(`` is the main term; everything from that
    parenthesis onwards is the list of secondary terms, separated by ``;``.
    A secondary term may itself consist of several parts joined by ``' : '``;
    those parts are returned sorted.

    Parameters
    ----------
    field : str
        A single descriptor, e.g. ``"homme (debout ; barbe : vieux)"``.

    Returns
    -------
    tuple
        ``(main_term, sub_terms)`` where ``main_term`` is a string
        (``"<UNK>"`` when nothing precedes the parenthesis) and ``sub_terms``
        is a list of sorted lists of strings (empty when there is no ``(``).
    """
    head, paren, tail = field.partition('(')
    if not paren:
        # No parenthesis at all: the whole field is the main term.
        return field.strip(), []

    main_term = head.strip() or '<UNK>'
    # Drop every parenthesis, then cut the remainder into secondary terms.
    inner = (paren + tail).strip().replace('(', '').replace(')', '')
    sub_terms = [sorted(chunk.strip().split(' : ')) for chunk in inner.split(';')]
    return main_term, sub_terms

def parse_subjectTerms(field):
    """Parse the text of a ``subjectTerms`` element into structured descriptors.

    The text is whitespace-normalised, cut into individual descriptors
    (separated by ``;``, where a ``;`` inside parentheses belongs to the
    descriptor's secondary terms) and each descriptor is handed to
    ``parse_lil_field``.

    Parameters
    ----------
    field : str or None
        Raw element text. ``None`` (an empty XML element) and blank strings
        are accepted and produce an empty result.

    Returns
    -------
    list of dict
        One single-key dictionary per descriptor, in document order, mapping
        the main term to its list of secondary terms.
    """
    # An empty XML element has ``.text is None``; there is nothing to parse.
    if field is None:
        return []

    # Collapse all runs of whitespace (including newlines) to single spaces.
    text = ' '.join(field.split())
    if not text:
        return []

    # First cut: top-level descriptors. A ") ;" marks the end of a
    # parenthesised descriptor; a plain " ; " is only a safe separator when
    # the text holds no closing parenthesis.
    if ') ; ' in text:
        chunks = [chunk.strip() for chunk in text.split(') ;')]
    elif ' ; ' in text and ')' not in text:
        chunks = [chunk.strip() for chunk in text.split(' ; ')]
    else:
        chunks = [text]

    # Second cut: a chunk without any parenthesis may still bundle several
    # plain descriptors.
    entries = []
    for chunk in chunks:
        if ' ; ' in chunk and '(' not in chunk:
            entries.extend(chunk.split(' ; '))
        else:
            entries.append(chunk)

    parsed = []
    for entry in entries:
        main_term, sub_terms = parse_lil_field(entry.strip())
        parsed.append({main_term: sub_terms})
    return parsed

def parse_xml_file(path):
    """Extract the parsed subject terms of every record in an XML file.

    Parameters
    ----------
    path : str
        Location of the XML file to parse.

    Returns
    -------
    dict
        Maps the text of each record's ``RecordID`` to the output of
        ``parse_subjectTerms`` for that record. Records that lack an
        ``Iconography`` element, or a ``subjectTerms`` element inside it,
        are left out.
    """
    parsed = {}
    for rec in etree.parse(path).findall('record'):
        rec_id = rec.find('RecordID').text
        icon = rec.find('Iconography')
        subject = icon.find('subjectTerms') if icon is not None else None
        if subject is not None:
            parsed[rec_id] = parse_subjectTerms(subject.text)
    return parsed

In [9]:
# Convert each XML export into a JSON file with the same base name.
xml_paths = sorted(glob.glob('../data/fabritius/xml/*.xml'))
for path in xml_paths:
    print(path)
    meta = parse_xml_file(path)
    bn = os.path.basename(path).replace('.xml', '.json')
    target = f'../data/fabritius/json/{bn}'
    # ensure_ascii=False keeps accented characters readable in the output.
    with open(target, 'w', encoding='utf8') as json_file:
        json.dump(meta, json_file, ensure_ascii=False, indent=4)

../data/fabritius/xml/en.xml
../data/fabritius/xml/fr.xml
../data/fabritius/xml/nl.xml


As such, the structure of a parsed record will be:
   - an ordered list of one-key dictionaries,
   - each of which has a "main descriptor" as its single key (which can be "<UNK>"),
   - whose value, the secondary descriptor, is a list of lists, because a secondary descriptor can be a set of multiple terms

## Iconclass

In [1]:
import iconclass

In [2]:
# Look up a single notation to inspect the structure of the returned dict.
iconclass.get('25G41')

{'c': ['25G41(...)',
  '25G411',
  '25G412',
  '25GG41',
  '25G41(+0)',
  '25G41(+1)',
  '25G41(+2)',
  '25G41(+3)'],
 'txt': {'it': 'fiori',
  'pt': 'flores',
  'zh': '花卉',
  'nl': 'Bloemen….(met NAAM)',
  'de': 'Blumen',
  'fi': 'kukat',
  'fr': 'fleurs',
  'en': 'flowers'},
 'kw': {'it': ['fiore'],
  'pt': ['flor'],
  'nl': ['bloem'],
  'de': ['Blume'],
  'fi': ['kukka'],
  'fr': ['fleur'],
  'es': ['flor'],
  'en': ['flower']},
 'n': '25G41',
 'p': ['2', '25', '25G', '25G4', '25G41']}

From the [website](https://labs.brill.com/ictestset/):
    
> The keys in the returned dict are:
> - 'n' The notation of this node (the same thing as the first parameter of the .get() call)
> - 'p' Path to this node, starting from the root of the hierarchy
> - 'c' Children (descendants) of this node
> - 'kw' Keywords, a dict keyed on two-letter language code containing keywords associated with this entry
> - 'txt' Textual descriptions of this notation, a dict of strings keyed on two-letter language code.

Idea:
    - get frequency and co-occurrence data from Brill's iconclass dataset
    - map ontologies using fasttext and co-occurrence probabilities (some kind of beam search?)

In [3]:
import json

# JSON is UTF-8 by specification; state the encoding explicitly so the read
# does not depend on the platform's default (the data contains non-ASCII text).
with open('../data/iconclass_img/data.json', encoding='utf8') as f:
    ic = json.load(f)

In [4]:
# Number of entries loaded from data.json.
len(ic)

87744

In [5]:
# Each value of `ic` becomes one "document"; the keys are not needed.
documents = list(ic.values())

In [6]:
# Sanity check: there should be one document per entry of `ic`.
print(len(documents))

87744


In [7]:
# Peek at the first ten documents.
documents[:10]

[['31A235', '31A24(+1)', '61B(+54)', '61B:31A2212(+1)', '61B:31D14'],
 ['41D92', '25G41'],
 ['11H', '11I35', '11I36'],
 ['11F25', '11FF25', '41E2'],
 ['25F3', '25F38(PHEASANT)', '25H', '25H11', '25H213', '34B232(+51)'],
 ['22E12',
  '31AA2352',
  '31B52',
  '41C122',
  '41C326',
  '41C71',
  '54A43(+11)',
  '58AA1',
  '86(ESSEN, TRINKEN UND DAS SPIEL DER VENUS SCHADEN DORT, WO etc.)',
  '98B(SARDANAPALUS)'],
 ['25I1', '46C24', '25H13', '46C513', '48C161', '48C149', '25I151'],
 ['31D14', '41A2', '48C7311', '48C75'],
 ['31A231', '31A24(+1)', '61B(+55)', '61B:31A2212(+1)'],
 ['61B(+53)', '61B:31D14']]

In [8]:
# Show the French keywords of every notation in the first ten documents.
# Lookups that raise KeyError or TypeError are skipped without output.
for doc in documents[:10]:
    for term in doc:
        try:
            keywords = iconclass.get(term)['kw']['fr']
        except (KeyError, TypeError):
            continue
        print(term, keywords)

31A235 ['asseoir']
31A24(+1) ['position', 'tête']
41D92 ['femme']
25G41 ['fleur']
11H ['saint']
11F25 ['Mater Dolorosa', 'buste', 'en pied', 'mère', 'tête', 'à mi-corps']
41E2 ["illusion d'optique", 'peinture', "trompe l'oeil"]
25F3 ['oiseau']
25F38(PHEASANT) ['faisan']
25H ['paysage', 'paysage idéal']
25H11 ['montagne']
25H213 ['rivière']
34B232(+51) ['coq', 'poule', 'poulet', 'volaille']
22E12 ['brûler']
31AA2352 ['siège']
31B52 ['alcool', 'ivresse']
41C122 ['verser']
41C326 ['bocal', 'pichet']
41C71 ['alcool', 'boisson']
54A43(+11) ['Misura', 'Ripa', 'tempérance']
58AA1 ['destruction']
98B(SARDANAPALUS) ['Sardanapale']
25I1 ['cité idéale', 'veduta', 'ville']
46C24 ['voilier', 'yacht']
25H13 ['côte', 'mer']
46C513 ['voyageur']
48C161 ['colonne']
48C149 ['ruine']
25I151 ['fontaine']
31D14 ['adulte', 'homme']
41A2 ['intérieur', 'maison']
48C7311 ['violon']
48C75 ['instrument de musique', 'musicien']
31A231 ['debout', 'jambe']
31A24(+1) ['position', 'tête']


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

def identity(x):
    """Return ``x`` unchanged; passed to ``CountVectorizer`` as its ``analyzer``."""
    return x

# The documents are already lists of notations, so the identity analyzer makes
# every notation one token. `max_features=1000` caps the vocabulary size.
vec = CountVectorizer(analyzer=identity, max_features=1000)
# Dense (n_documents, n_features) count matrix.
X = vec.fit_transform(documents).toarray()
print(X.shape)

(87744, 1000)


In [10]:
import pandas as pd

# scikit-learn 1.0 introduced `get_feature_names_out()` and 1.2 removed the old
# `get_feature_names()`; use whichever the installed version provides.
feature_names = (
    vec.get_feature_names_out()
    if hasattr(vec, 'get_feature_names_out')
    else vec.get_feature_names()
)
df = pd.DataFrame(X, columns=feature_names)
df.head()

Unnamed: 0,Unnamed: 1,11(+5):61B,11A,11A23,11C12,11C13,11C23,11D,11D12,11D3,...,92D156,92D18(ARROW),92D18(BOW),92D18(QUIVER),92D1916,92L1,92L3,92L41,94C,98
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Notation-by-notation co-occurrence matrix: entry (a, b) is the dot product of
# the per-document count columns of notations a and b.
df_asint = df.astype(int)
coocc = df_asint.T.dot(df_asint)
coocc.head()

Unnamed: 0,Unnamed: 1,11(+5):61B,11A,11A23,11C12,11C13,11C23,11D,11D12,11D3,...,92D156,92D18(ARROW),92D18(BOW),92D18(QUIVER),92D1916,92L1,92L3,92L41,94C,98
,740,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11(+5):61B,0,146,0,0,0,0,0,15,0,2,...,0,0,0,0,0,0,0,0,0,0
11A,0,0,73,0,6,0,1,5,1,0,...,0,0,0,0,4,0,0,0,0,0
11A23,0,0,0,191,7,1,4,0,5,2,...,0,4,5,5,0,0,0,0,0,0
11C12,0,0,6,7,69,2,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# https://stackoverflow.com/questions/58701337/how-to-construct-ppmi-matrix-from-a-text-corpus

import numpy as np

def pmi(df, positive=True):
    """Compute (positive) pointwise mutual information from a count matrix.

    Each cell is replaced by ``log(observed / expected)``, where the expected
    count is ``row_total * column_total / grand_total``.

    Parameters
    ----------
    df : pandas.DataFrame or numpy.ndarray
        Two-dimensional matrix of co-occurrence counts. It is not modified.
    positive : bool, default True
        If true, negative scores are clipped to 0 (PPMI).

    Returns
    -------
    Same type as ``df``
        Matrix of PMI scores. Cells whose score is undefined (zero observed
        count, or a row/column whose total is zero) are set to 0.
    """
    col_totals = df.sum(axis=0)
    row_totals = df.sum(axis=1)
    total = col_totals.sum()
    # log(0) and 0/0 are expected for sparse counts; silence the warnings and
    # clean the resulting -inf / NaN cells up afterwards.
    with np.errstate(divide='ignore', invalid='ignore'):
        expected = np.outer(row_totals, col_totals) / total
        scores = np.log(df / expected)
    # -inf comes from zero counts, NaN from all-zero rows or columns.
    scores[~np.isfinite(scores)] = 0.0
    if positive:
        scores[scores < 0] = 0.0
    return scores

In [35]:
# Positive PMI of the co-occurrence counts (negative scores clipped to 0).
ppmi = pmi(coocc, positive=True)

In [65]:
from itertools import combinations

# Score every unordered pair of notations exactly once. `.at` is the
# label-based scalar accessor; the chained `ppmi.loc[a][b]` built a complete
# row Series for each pair before picking a single value out of it.
pairs = []
for a, b in combinations(ppmi.index.tolist(), 2):
    pairs.append((ppmi.at[a, b], a, b))

In [69]:
# Highest score first; equal scores fall back to comparing the notations.
pairs = sorted(pairs, reverse=True)

for sc, a, b in pairs[:200]:
    print('===================')
    try:
        kw_a = iconclass.get(a)['kw']['en']
        kw_b = iconclass.get(b)['kw']['en']
    except Exception:
        # Best effort: not every notation resolves to English keywords. Show
        # the raw notations so the failing pair stays visible. Unlike a bare
        # `except:`, this lets KeyboardInterrupt stop the loop.
        print(a, '&&&', b, sc, '(no English keywords)')
        continue
    print(kw_a, '&&&', kw_b, sc)

['zoo'] &&& ['zoo'] 7.099737129167545
['handling', 'weapon'] &&& ['weapon'] 7.036006383948521
['Mostri', 'Ripa', 'animal', 'fabulous animal', 'fiction', 'grotesque'] &&& ['Mostri', 'animal', 'monster', 'shape'] 6.799204055124994
['city'] &&& ['fortress'] 6.629723928849419
['beard', 'old man'] &&& ['monstrance', 'ostensorium'] 6.554838716947378
['position-war', 'war'] &&& ['capturing (city)'] 6.445761224967057
['country', 'region'] &&& ['Armee', 'Heer', 'army', 'armée', 'battle', 'esercito'] 6.415016087304869
['country', 'region'] &&& ['position-war', 'war'] 6.40488941848694
['Mary Magdalene (St.)', 'book', 'crown', 'crown of thorns', 'crucifix', 'jar', 'mirror', 'musical instrument', 'ointment', 'palm-branch', 'rosary', 'scourge', 'scroll', 'whore'] &&& ['jar', 'ointment'] 6.372464457842415
['priest'] &&& ['caricature', 'criticizing', 'mocking'] 6.317653228460017
['hisser', 'hochziehen', 'hoisting', 'horse', 'pulling', 'sollevare', 'tirare', 'tirer', 'ziehen'] &&& ['harness'] 6.2632578

['storm at sea'] &&& ['shipwreck'] 4.705430516897925
['czar', 'emperor', 'tsar'] &&& ['badge', 'chain', 'insignia'] 4.701721963548689
['cab', 'carriage'] &&& ['ornament'] 4.697415367449593
['monstrance', 'ostensorium'] &&& ['book', 'codex'] 4.694707301430838
['hunting', 'occupations'] &&& ['Folge', 'Reihe', 'Serie', 'abstract idea', 'allegory', 'idea', 'personification', 'serie', 'series', 'série'] 4.689187969807946
['city'] &&& ['capturing (city)'] 4.686970015975543
['woman'] &&& ['Mary Magdalene (St.)', 'book', 'crown', 'crown of thorns', 'crucifix', 'jar', 'mirror', 'musical instrument', 'ointment', 'palm-branch', 'rosary', 'scourge', 'scroll', 'whore'] 4.6788896662021155
['John the Baptist (St.)', 'baptismal cup', 'book', 'cross', 'cup', 'honeycomb', 'lamb', 'reed', 'staff'] &&& ['lamb'] 4.672379809043606
['hare'] &&& ['song-bird'] 4.660848867386315
['John (St.)', 'apostle', 'book', 'cauldron', 'chalice', 'eagle', 'evangelist', 'palm-branch', 'scroll', 'snake'] &&& ['Mary Magdalene

## Show labels

In [None]:
import gensim.downloader as api
from gensim import corpora
from gensim.matutils import softcossim

# NOTE(review): `softcossim` and `KeyedVectors.similarity_matrix` appear to
# have been removed in gensim 4.0 — confirm against the installed version
# before running this cell.

sent_1 = 'Dravid is a cricket player and a opening batsman'.split()
sent_2 = 'Leo is a cricket player too He is a batsman,baller and keeper'.split()

# Download the FastText model
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')

# Prepare a dictionary and a corpus.
# Deliberately not named `documents`: that name holds the Iconclass notation
# lists used by the cells above and must not be overwritten by this demo.
sentences = [sent_1, sent_2]
dictionary = corpora.Dictionary(sentences)

# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary)

# Convert the sentences into bag-of-words vectors (the token lists are kept
# under their own names so the cell can be re-run).
bow_1 = dictionary.doc2bow(sent_1)
bow_2 = dictionary.doc2bow(sent_2)

# Compute soft cosine similarity
print(softcossim(bow_1, bow_2, similarity_matrix))
#> 0.7909639717134869

In [None]:
# See: fastText aligned word vectors — https://fasttext.cc/docs/en/aligned-vectors.html