Skip to content

Commit

Permalink
parole accentate
Browse files Browse the repository at this point in the history
  • Loading branch information
pdonorio committed Jun 28, 2017
1 parent 5e4c534 commit 192d77d
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 9 deletions.
1 change: 1 addition & 0 deletions operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
# RETHINKDB 2 ELASTICSEARCH

# r2e.make()
# FIXME(review): the full r2e.make() run above is commented out and the call
# below passes skip_lexique=True instead -- presumably a temporary setting;
# confirm which invocation is intended before relying on this script.
r2e.make(skip_lexique=True)
# r2e.make(only_xls=True)

Expand Down
14 changes: 14 additions & 0 deletions operations/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

import lxml.html
import lxml.etree


def convert(html_text):
    """Return the text content of ``html_text`` with the markup removed.

    The input is parsed as an HTML document with lxml and the text of the
    whole document is returned.

    Falsy input (``None``, ``''``, ``b''``) is returned unchanged without
    touching the parser, and so is any input that lxml rejects with a
    ``ParserError`` (the empty-document case).
    """
    # Nothing to strip; this also keeps None away from lxml, which would
    # otherwise fail with an exception that the handler below does not catch.
    if not html_text:
        return html_text

    try:
        document = lxml.html.document_fromstring(html_text)
    except lxml.etree.ParserError:
        # empty document
        return html_text

    return document.text_content()
21 changes: 12 additions & 9 deletions operations/rethink2elastic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
import logging
import datetime
from operations import html
from beeprint import pp
from restapi.resources.services.rethink import RethinkConnection, RDBquery
from restapi.resources.services.uploader import ZoomEnabling
Expand All @@ -20,7 +21,6 @@
RDB_TABLE2 = "datadocs"
noimages = {}
toberemoved = [
'e03aa189-b244-4782-8517-2a3edb3010fd',
# 'd2d5fcb6-81cc-4654-9f65-a436f0780c67' # prova
]

Expand Down Expand Up @@ -276,14 +276,9 @@ def suggest_transcription(transcription, key, probability=0.5, extrait=None):
for token in words['tokens']:
for word in token['token'].split("'"):
token['cleanlabel'] = key.split('_')[0]

if len(word) > 2:

# if 'scytalosagittipelliger' in word:
# print("TEST", extrait, word.encode())
# # exit(1)

add_suggestion(key, word, probability, extra=token)

return True


Expand Down Expand Up @@ -497,7 +492,7 @@ def single_update(doc):
langue = image['language']

transcription = image["transcriptions"].pop(0)
suggest_transcription(transcription, key, .25, elobj['extrait'])
# suggest_transcription(transcription, key, .25, elobj['extrait'])
if 'language' in image:
key += '_' + image['language'].lower()
docobj[key] = transcription
Expand All @@ -507,14 +502,22 @@ def single_update(doc):

for language, translation in image["translations"].items():
key = 'traduction'
suggest_transcription(transcription, key, .20, elobj['extrait'])
# suggest_transcription(transcription, key, .20, elobj['extrait'])

key = 'traduction_' + language.lower()
logger.debug("Found translations: %s" % language)
# suggest_transcription(transcription, key, .20)
docobj[key] = translation
langue += ' ' + language

# before completing
for key, value in docobj.items():
# clean html
docobj[key] = html.convert(value)
# add suggestion
name = key.split('_')[0]
suggest_transcription(docobj[key], name, .3, elobj['extrait'])

docobj['thumbnail'] = ZoomEnabling.get_thumbname(image['filename'])
elobj['doc'] = docobj
elobj['langue'] = langue.lower()
Expand Down

0 comments on commit 192d77d

Please sign in to comment.