-
Notifications
You must be signed in to change notification settings - Fork 1
/
expand_records.py
165 lines (129 loc) · 6.03 KB
/
expand_records.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
from database import db as database
from lxml import etree
from extract_and_annotate_entities import *
def get_author_viaf_id(record):
    """Return the VIAF id of the book's author, or '' when none is found.

    The author name is taken from ``record['creator']`` and looked up through
    the VIAF AutoSuggest endpoint; the first suggestion whose ``nametype`` is
    ``'personal'`` wins.

    Args:
        record: mapping with a ``'creator'`` entry (a string, or None).

    Returns:
        The ``viafid`` value of the first personal-name suggestion, or ''
        when the creator is empty, starts with one of the excluded city
        headings, or has no personal-name suggestion.
    """
    # A null creator is treated like an empty one instead of raising.
    creator_field = record['creator'] or ''
    # Skip creator fields that hold city names rather than people.
    if creator_field.startswith(('Pisa <', 'Livorno <', 'Prato <', 'Siena <', 'Lucca <', 'Firenze', 'Arezzo <')):
        return ''
    # Keep only the part before ' <' to fix strings such as
    # "Gurrieri, Francesco <1937- >".
    creator = creator_field.split(' <')[0]
    if creator == '':
        return ''
    # Let requests URL-encode the name: characters such as '&', '#' or '+'
    # would otherwise corrupt the query string.
    response = requests.get('http://www.viaf.org/viaf/AutoSuggest',
                            params={'query': creator})
    # 'result' may be absent or null when there are no suggestions.
    suggestions = response.json().get('result') or []
    for suggestion in suggestions:
        if suggestion.get('nametype') == 'personal' and suggestion.get('viafid'):
            return suggestion['viafid']
    return ''
def get_author_wikipedia_info(record):
    """Return a one-sentence Italian Wikipedia summary of the book's author.

    Args:
        record: mapping whose ``'creator'`` entry holds the author name,
            optionally followed by a `` <...>`` qualifier that is dropped
            before the lookup.

    Returns:
        The summary string, or None when the creator is empty or the lookup
        fails for any reason (the enrichment is best-effort).
    """
    try:
        wikipedia.set_lang('it')
        creator = (record['creator'] or '').split(' <')[0]
        if creator == '':
            return None
        return wikipedia.summary(creator, sentences=1)
    except Exception:
        # Best-effort lookup: any failure simply means "no summary". Unlike a
        # bare ``except:``, this lets KeyboardInterrupt and SystemExit
        # propagate so the process can still be stopped.
        return None
def get_author_wikipedia_page(viaf_id):
    """Return the Italian Wikipedia link listed in the author's VIAF record.

    Downloads ``https://viaf.org/viaf/<viaf_id>/viaf.xml`` and scans its
    ``ns1:xLink`` elements, resolving the prefix through the namespace map
    of the document root.

    Args:
        viaf_id: VIAF identifier interpolated into the request URL.

    Returns:
        The text of the first ``xLink`` containing ``'it.wikipedia.org'``,
        or None when there is no such link.
    """
    response = requests.get('https://viaf.org/viaf/{}/viaf.xml'.format(viaf_id))
    root = etree.fromstring(response.content)
    for link in root.findall('.//ns1:xLink', root.nsmap):
        url = link.text
        # An empty element has ``text`` set to None; skip it rather than
        # failing on the ``in`` test.
        if url and 'it.wikipedia.org' in url:
            return url
    return None
def get_opere(viaf_id):
    """Return titles of other works by the same author, taken from VIAF.

    Downloads ``https://viaf.org/viaf/<viaf_id>/viaf.xml`` and reads the
    ``ns1:title`` child of each ``ns1:work`` element, stopping once five
    titles have been collected.

    Args:
        viaf_id: VIAF identifier interpolated into the request URL.

    Returns:
        A list of at most five title strings (possibly empty).
    """
    response = requests.get('https://viaf.org/viaf/{}/viaf.xml'.format(viaf_id))
    root = etree.fromstring(response.content)
    opere = []
    for work in root.findall('.//ns1:work', root.nsmap):
        title_element = work.find('ns1:title', root.nsmap)
        # Skip works without a usable title: a missing element would raise
        # on ``.text`` and a None entry would break a later str.join().
        if title_element is None or not title_element.text:
            continue
        opere.append(title_element.text)
        if len(opere) > 4:
            break
    return opere
def do_expand(records_url='http://127.0.0.1:5000/api/v1/records'):
    """Rebuild the expanded-record and entity tables, yielding progress.

    This is a generator. For every record fetched from ``records_url`` it
    looks up author information (VIAF id, other works, Wikipedia page and
    summary), extracts and annotates entities from the record's text, and
    yields one progress string of the form ``"data: <percent>%%<message>\\n\\n"``
    (the final one is ``"data: 100%%done\\n\\n"``). After the last record the
    collected rows are inserted and the clean-up queries are run.

    Args:
        records_url: endpoint returning the JSON list of records; defaults
            to the previously hard-coded local API address.

    Yields:
        One progress string per processed record. If the endpoint returns
        no records, a single "done" message is yielded and the database is
        left untouched.
    """
    progress = 0
    print(' [*] connecting to database...')
    db = database.get_db()
    print(' [*] requesting records via API...')
    records = requests.get(records_url).json()
    if not records:
        # Nothing to expand: avoid dividing by zero below and keep the
        # existing tables as they are.
        yield "data: {}%%{}\n\n".format('100', 'done')
        return

    # Clear the previous expansion only once the records are available, so a
    # failed fetch does not leave pending deletes on the connection.
    db.execute('delete from expanded_records')
    db.execute('delete from entities')
    db.execute('delete from entity_for_record')
    db.execute("delete from sqlite_sequence where name='expanded_records'")
    db.execute("delete from sqlite_sequence where name='entity_for_record'")

    total = len(records)
    step = 100 / total
    expanded_rows = []
    entity_rows = []
    link_rows = []
    insert_expanded = "INSERT INTO expanded_records(id, viaf_id, author_other_works, author_wiki_page, author_wiki_info) VALUES(?, ?, ?, ?, ?)"
    insert_entity = "INSERT INTO entities(entity_id, title, abstract, image_url, coords, uri) VALUES(?, ?, ?, ?, ?, ?)"
    insert_link = "INSERT INTO entity_for_record(record_id, entity_id) VALUES(?, ?)"

    for position, record in enumerate(records, start=1):
        record_id = record['id']
        print(' [*] RECORD WITH ID {}'.format(record_id))
        viaf_id = get_author_viaf_id(record)
        altre_opere = ''
        wiki_page = ''
        wiki_info = ''
        # Extract author information.
        if viaf_id != '':
            opere = get_opere(viaf_id)
            wiki = get_author_wikipedia_page(viaf_id)
            if opere:
                altre_opere = "~~".join(opere)
            if wiki is not None:
                wiki_page = wiki
                # NOTE(review): nesting reconstructed from a flattened
                # source -- confirm the summary lookup belongs under the
                # wiki-page check.
                summary = get_author_wikipedia_info(record)
                if summary is not None:
                    wiki_info = summary
        expanded_rows.append((record_id, viaf_id, altre_opere, wiki_page, wiki_info))

        # Text to run entity extraction on: description, subjects, author
        # and secondary author.
        text_to_extract_from = clean(record['description']) + ', ' + record['subject'] + ', ' + \
            author_cleanup(record['creator']) + ', ' + author_cleanup(record['contributor'])
        print(' [*] text: {}'.format(text_to_extract_from))
        # Extracted entities (list of strings).
        entities = spacy_extract_entities(text_to_extract_from)
        print(' [*] received entities: {}'.format(entities))
        # Annotate the entities.
        annotated_entities = query_wikipedia(entities)
        print(' [*] received annotated entities: {}'.format(annotated_entities))
        print()
        # Queue the annotated entities for insertion.
        for entity in annotated_entities:
            if entity['coords'] != '':
                entity_rows.append((entity['id'], entity['title'], entity['abstract'], entity['image'], entity['coords'], entity['uri']))
                # NOTE(review): nesting reconstructed -- confirm the link
                # row is only wanted for entities that have coordinates.
                link_rows.append((record_id, entity['id']))

        # Detect the last record by position rather than by comparing its id
        # with the record count, which only works when ids are exactly 1..N.
        if position == total:
            yield "data: {}%%{}\n\n".format('100', 'done')
        else:
            progress += step
            desc = "Expanding record with id {}...".format(record_id)
            yield "data: {}%%{}\n\n".format(str(progress), desc)

    print(' [*] inserting expanded records to the table...')
    # Run the INSERTs against the database.
    db.executemany(insert_expanded, expanded_rows)
    print(' [*] inserting entities...')
    try:
        db.executemany(insert_entity, entity_rows)
        db.executemany(insert_link, link_rows)
        # Remove duplicated entities.
        db.execute("DELETE FROM entities WHERE id NOT IN (SELECT MIN(id) FROM entities GROUP BY entity_id)")
        db.execute("DELETE FROM entity_for_record WHERE id NOT IN (SELECT MIN(id) FROM entity_for_record GROUP BY record_id, entity_id)")
        # Remove ambiguous entities.
        db.execute("DELETE FROM entities WHERE abstract=''")
        # Remove records whose only associated entity is their place of
        # publication. The statement is assembled with explicit spaces so it
        # does not depend on source indentation to separate its tokens.
        db.execute(
            "DELETE FROM records WHERE id IN(SELECT e.record_id FROM entity_for_record e, records r, places p, entities en "
            "WHERE e.record_id = r.id AND r.published_in = p.id AND e.entity_id = en.entity_id "
            "GROUP BY e.record_id HAVING COUNT(*) = 1 AND en.title = p.name)")
        db.commit()
        print(' [*] done!')
    except Exception as e:
        print(e)