## Get the artwork DataFrame

In [1]:
import pandas as pd
import json
import requests
import wget
import time
import os
from SPARQLWrapper import SPARQLWrapper, JSON

def get_sparql_dataframe(service, query):
    """
    Run a SPARQL query and return its result bindings as a Pandas data frame.

    Parameters
    ----------
    service
        Endpoint handed to ``SPARQLWrapper`` (the caller passes a URL string).
    query
        SPARQL query text.

    Returns
    -------
    pandas.DataFrame
        One column per variable listed in the response's ``head.vars`` and
        one row per entry in ``results.bindings``. A variable that has no
        binding in a given row is stored as ``None``.
    """
    client = SPARQLWrapper(service)
    client.setQuery(query)
    client.setReturnFormat(JSON)

    # Decode the JSON body straight from the underlying response object.
    payload = json.load(client.query().response)
    columns = payload['head']['vars']

    # Each binding maps variable name -> {'value': ...}; missing variables
    # fall back to an empty dict so that .get('value') yields None.
    records = [
        [binding.get(var, {}).get('value') for var in columns]
        for binding in payload['results']['bindings']
    ]

    return pd.DataFrame(records, columns=columns)
  
# SPARQL query: one row per artwork with its image URL, label and abstract;
# the labels of all its motifs are concatenated into one ';'-separated string.
q = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX : <http://h-da.de/fbi/artontology/>

SELECT ?artwork ?url ?name ?abstract (group_concat(?motif;separator=';') as ?motifs)
WHERE {
  ?artwork rdf:type :artwork;
     rdfs:label ?name;
     :image ?url;
     :abstract ?abstract;
     :motif/rdfs:label ?motif.
}
group by ?artwork ?url ?name ?abstract
"""

# Endpoint the query is sent to.
ds = "http://neuds.de:3030/artontology"
df = get_sparql_dataframe(ds, q)

# The id is whatever follows the last '/' of the artwork URI.
df['id'] = df['artwork'].map(lambda uri: uri.rsplit('/', 1)[-1])
df.head()

Unnamed: 0,artwork,url,name,abstract,motifs,id
0,http://www.wikidata.org/entity/Q29019486,https://upload.wikimedia.org/wikipedia/commons...,Saint Proculus of Pozzuoli and his mother Sant...,Saints Proculus and Nicea is a 1636-1637 paint...,Nicea;Saint Procolo;Saint Procolo and Nicea,Q29019486
1,http://www.wikidata.org/entity/Q7527543,https://upload.wikimedia.org/wikipedia/commons...,Statue of John A. Macdonald,The Sir John A. Macdonald statue is a bronze s...,John A. Macdonald,Q7527543
2,http://www.wikidata.org/entity/Q3944494,https://upload.wikimedia.org/wikipedia/commons...,The Holy Family with Saint Catherine of Alexan...,Holy Family with Saint Catherine of Alexandria...,Virgin Mary;Catherine of Alexandria;boy;Child ...,Q3944494
3,http://www.wikidata.org/entity/Q368788,https://upload.wikimedia.org/wikipedia/commons...,Pietà,Pietà is a painting by the Italian Renaissance...,Virgin Mary;Jesus Christ;man;woman,Q368788
4,http://www.wikidata.org/entity/Q3630743,https://upload.wikimedia.org/wikipedia/commons...,Self-Portrait at the Age of 63,Self-Portrait at the Age of 63 is a self-portr...,Rembrandt;man,Q3630743


## create model

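Prepare the corpus: tokenize the abstracts, drop stopwords and rare tokens, then build the dictionary, the bag-of-words corpus and the TF-IDF model.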

In [10]:
from collections import defaultdict
from gensim.utils import simple_preprocess
from gensim import corpora
from gensim import models
from gensim import similarities

# Raw documents: one abstract per artwork.
text_corpus = df['abstract'].values

# Small hand-picked set of very common words to discard.
stoplist = set('for a of the and to in'.split(' '))

# A token is kept only if it occurs strictly more than this many times
# across the whole corpus.
MIN_TOKEN_COUNT = 3

# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in text_corpus]

# Count word frequencies over the whole corpus
# (defaultdict is already imported at the top of this cell)
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than MIN_TOKEN_COUNT times
processed_corpus = [[token for token in text if frequency[token] > MIN_TOKEN_COUNT]
                    for text in texts]
# Map each remaining token to an integer id
art_dict = corpora.Dictionary(processed_corpus)
# Bag-of-words representation of every document
bow_corpus = [art_dict.doc2bow(text) for text in processed_corpus]
# Fit the TF-IDF model on the bag-of-words corpus
tfidf = models.TfidfModel(bow_corpus)

## tests

In [17]:
# Fetch the abstract of artwork Q3630743, lowercase it and split on whitespace.
words = (
    df.loc[df['id'] == 'Q3630743', 'abstract']
    .values[0]
    .lower()
    .split()
)
# Show the TF-IDF weights of that abstract as (token id, weight) pairs.
print(tfidf[art_dict.doc2bow(words)])

[(4, 0.03914012914889163), (6, 0.06393841288682726), (7, 0.007054869085089224), (18, 0.0009280072527049965), (19, 0.030456457869645865), (26, 0.01117035765207907), (34, 0.07990437425779509), (35, 0.0820346801897768), (37, 0.0491973845526996), (46, 0.03234060587361309), (49, 0.08261169672403249), (65, 0.10313427813835543), (76, 0.012723314984494945), (77, 0.02900833133146859), (86, 0.11481248072126692), (90, 0.06294919270537178), (109, 0.05961919481930718), (113, 0.020161725454353), (115, 0.02682094643053469), (121, 0.06655619075700493), (132, 0.05078901017176469), (135, 0.021181315547168605), (138, 0.05553040989065565), (146, 0.11670848601006444), (147, 0.1096038114387776), (148, 0.09475721812502487), (149, 0.09952211777936648), (150, 0.11400741409730408), (151, 0.12381316058135125), (152, 0.05634978280958678), (153, 0.1199031760153161), (154, 0.10608760262390701), (155, 0.05399836130963105), (156, 0.09561213321333133), (157, 0.07757084989432693), (158, 0.08140278407075766), (159, 0.09

## ideas

Simple document similarity: cosine similarity between the raw bag-of-words vectors of two abstracts.

In [50]:
from gensim.matutils import cossim as csim
# Lowercased, whitespace-split abstracts of the two artworks being compared.
s1 = df.loc[df['id'] == 'Q179900', 'abstract'].values[0].lower().split()
s2 = df.loc[df['id'] == 'Q205259', 'abstract'].values[0].lower().split()

# Bag-of-words vectors over the corpus dictionary.
v1, v2 = art_dict.doc2bow(s1), art_dict.doc2bow(s2)

# Number of entries in the corpus dictionary.
size_art = len(art_dict)

# Cosine similarity of the two bag-of-words vectors.
print(csim(v1, v2))

0.17818653576961846


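LDA: cosine similarity between the LDA topic vectors of the same two abstracts, using a previously trained model and dictionary loaded from disk.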

In [54]:
import pickle
# Load the pickled LDA model and its dictionary.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.
with open('nlp_lda.pkl', 'rb') as model_file:
    lda_model = pickle.load(model_file)
with open('nlp_dict.pkl', 'rb') as dict_file:
    lda_dict = pickle.load(dict_file)

# Lowercased, whitespace-split abstracts of the two artworks being compared.
s1 = df[df['id'] == 'Q179900']['abstract'].values[0].lower().split()
s2 = df[df['id'] == 'Q205259']['abstract'].values[0].lower().split()

# Bag-of-words vectors over the dictionary that was loaded with the model.
v1 = lda_dict.doc2bow(s1)
v2 = lda_dict.doc2bow(s2)

# Transform both documents with the model loaded above. The previous code
# indexed an undefined name `lda`, which fails on a fresh kernel.
lda1 = lda_model[v1]
lda2 = lda_model[v2]

# Cosine similarity of the two transformed vectors.
print(csim(lda1, lda2))

0.776296337124219
