Skip to content

Commit

Permalink
Merge af6c8c2 into 4186eee
Browse files Browse the repository at this point in the history
  • Loading branch information
langhabel committed Jan 31, 2019
2 parents 4186eee + af6c8c2 commit 3984627
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 59 deletions.
16 changes: 16 additions & 0 deletions oldp/apps/nlp/ner/base.py
Expand Up @@ -2,6 +2,7 @@

from oldp.apps.nlp.base import SpacyNLP
from oldp.apps.nlp.ner.strategy_factories import UniversalNERStrategyFactory
from oldp.apps.nlp.preprocessing import HtmlConcealer


class EntityExtractor:
Expand All @@ -17,3 +18,18 @@ def prepare(self, text):
def extract(self, entity_type) -> Generator:
strategy = self.factory.get_strategy(entity_type)
return strategy.extract(self.doc)


class HtmlEntityExtractor(EntityExtractor):
    """Entity extractor for HTML input.

    The HTML is reduced to plain text before it is handed to the parent
    extractor; the positions of the extracted entities are translated back so
    that they refer to the original HTML string.
    """

    # Concealer of the most recently prepared text; stays None until prepare() ran.
    html_concealer = None

    def prepare(self, text):
        """Conceal the markup in ``text`` and prepare the parent with the plain text.

        :param text: HTML string to extract entities from.
        """
        self.html_concealer = HtmlConcealer(text)
        # get_content() runs the concealing pass itself. Calling conceal()
        # explicitly beforehand made the pass run twice, and the second run
        # misreads text produced by unescaping (e.g. "&lt;b&gt;" -> "<b>") as
        # markup and strips it.
        super().prepare(self.html_concealer.get_content())

    def extract(self, entity_type) -> Generator:
        """Yield ``(value, start, end)`` with positions mapped back to the HTML.

        :param entity_type: entity type passed through to the parent's extract().
        """
        for value, start, end in super().extract(entity_type):
            start, end = self.html_concealer.concealed_to_html_pos(start, end)
            yield value, start, end
43 changes: 43 additions & 0 deletions oldp/apps/nlp/preprocessing.py
@@ -0,0 +1,43 @@
import html
import re

import numpy as np


class HtmlConcealer:
    """Reduce an HTML string to plain text while tracking character origins.

    ``content`` holds the current (progressively concealed) text and
    ``pos_table[i]`` holds the index in the original HTML string that
    character ``content[i]`` stems from. Both always have the same length.
    """

    _TAG_REGEX = r'<[^>]+>'
    _WHITESPACE_REGEX = r'\n|\xa0'
    _ENTITY_PATTERN = re.compile(r'&#\d{1,4};|&\w{1,6};')

    def __init__(self, html_str):
        """
        :param html_str: the HTML string to conceal.
        """
        self.content = html_str
        self.pos_table = np.arange(len(self.content))
        # Needed to translate an exclusive end position that lies behind the
        # last concealed character.
        self._html_length = len(html_str)
        self._concealed = False

    def conceal(self):
        """Strip tags, unescape entities and turn newlines/nbsp into spaces.

        Runs at most once per instance: a second pass would treat text that
        was produced by unescaping (e.g. ``&lt;b&gt;`` -> ``<b>``) as markup.
        """
        if self._concealed:
            return
        self.remove_pattern(self._TAG_REGEX)
        self.replace_html_special_ents()
        self.remove_pattern(self._WHITESPACE_REGEX, replace_with=' ')
        self._concealed = True

    def get_content(self):
        """Return the concealed text, concealing first if not done yet."""
        self.conceal()
        return self.content

    def concealed_to_html_pos(self, pos_start, pos_end):
        """Translate a ``[pos_start, pos_end)`` span of the concealed text.

        :param pos_start: index of the first character in the concealed text.
        :param pos_end: exclusive end index in the concealed text; may equal
            the length of the concealed text.
        :return: ``(start, end)`` as plain ints referring to the original HTML.
        """
        return self._html_index(pos_start), self._html_index(pos_end)

    def remove_pattern(self, regex, replace_with=''):
        """Replace every match of ``regex`` in the content by ``replace_with``.

        The content is scanned once from left to right; inserted replacement
        text is not scanned again, so a replacement that itself matches
        ``regex`` cannot cause an endless loop.

        :param regex: regular expression (string or compiled pattern).
        :param replace_with: text substituted for each match.
        :return: tuple of the updated content and position table.
        """
        self._rewrite(re.compile(regex), lambda match: replace_with)
        return self.content, self.pos_table

    def replace_html_special_ents(self):
        """Replace HTML character references by the characters they denote.

        Each reference is unescaped exactly once (``&amp;lt;`` becomes
        ``&lt;``, not ``<``) and references unknown to ``html.unescape`` are
        left untouched.
        """
        self._rewrite(self._ENTITY_PATTERN,
                      lambda match: html.unescape(match.group(0)))

    def _html_index(self, pos):
        """Map one concealed index to its index in the original HTML."""
        if pos == len(self.pos_table):
            # Exclusive end directly behind the last concealed character.
            return self._html_length
        return int(self.pos_table[pos])

    def _rewrite(self, pattern, replacement_for):
        """Substitute all matches in one pass, keeping pos_table aligned.

        :param pattern: compiled regular expression.
        :param replacement_for: callable returning the replacement string for
            a match object.
        """
        text_parts = []
        pos_parts = []
        cursor = 0
        for match in pattern.finditer(self.content):
            start, end = match.span()
            if start == end:
                # Zero-width match: nothing to replace.
                continue
            replacement = replacement_for(match)
            if replacement == match.group(0):
                # Nothing changes (e.g. unknown entity): keep the text as is.
                continue
            text_parts.append(self.content[cursor:start])
            pos_parts.append(self.pos_table[cursor:start])
            if replacement:
                text_parts.append(replacement)
                # The i-th replacement character inherits the origin of the
                # i-th matched character; surplus characters share the origin
                # of the last matched one.
                offsets = np.minimum(np.arange(len(replacement)), end - start - 1)
                pos_parts.append(self.pos_table[start + offsets])
            cursor = end
        text_parts.append(self.content[cursor:])
        pos_parts.append(self.pos_table[cursor:])
        self.content = ''.join(text_parts)
        self.pos_table = np.concatenate(pos_parts)
22 changes: 22 additions & 0 deletions oldp/apps/nlp/tests/test_preprocessing.py
@@ -0,0 +1,22 @@
from django.test import TestCase

from oldp.apps.nlp.preprocessing import HtmlConcealer


class PreprocessingTestCase(TestCase):
    """Tests for HtmlConcealer: concealed text and the mapping back to HTML positions."""

    def test_html_concealing(self):
        """Tags are dropped, the entity is unescaped and newlines become spaces."""
        html = '<h2>Tenor</h2>\n\n<ul class="ol"><li><p>1. Unter Ab&#228;nderung des Beschlusses der Kammer'
        concealer = HtmlConcealer(html)
        concealer.conceal()
        # Each of the two newlines behind </h2> is replaced by one space, so
        # "Tenor" is followed by two spaces. This is also what the slice
        # [16:26] in test_html_concealing_pos_table relies on.
        self.assertEqual('Tenor  1. Unter Abänderung des Beschlusses der Kammer', concealer.get_content())

    def test_html_concealing_pos_table(self):
        """A span in the concealed text maps to the matching span in the HTML."""
        html = '<h2>Tenor</h2>\n\n<ul class="ol"><li><p>1. Unter Ab&#228;nderung des Beschlusses der Kammer'
        concealer = HtmlConcealer(html)
        concealer.conceal()
        concealed_word = concealer.get_content()[16:26]
        html_word = html[47:62]
        self.assertEqual(concealed_word, 'Abänderung')
        self.assertEqual(html_word, 'Ab&#228;nderung')
        self.assertEqual(concealer.concealed_to_html_pos(16, 26), (47, 62))
27 changes: 2 additions & 25 deletions oldp/apps/processing/processing_steps/extract_entities.py
@@ -1,33 +1,13 @@
import re

from bs4 import BeautifulSoup

from oldp.apps.nlp.models import Entity, NLPContent
from oldp.apps.nlp.ner.base import EntityExtractor


def get_text_from_html(html):
    """Return the text of ``html`` with every whitespace run of two or more collapsed to one space.

    :param html: HTML string, parsed by BeautifulSoup with the ``lxml`` parser.
    """
    plain_text = BeautifulSoup(html, 'lxml').get_text()
    whitespace_run = re.compile(r'\s\s+')
    return whitespace_run.sub(' ', plain_text)
from oldp.apps.nlp.ner.base import HtmlEntityExtractor


class EntityProcessor: # TODO Can this be all done in ProcessingStep?
SERIALIZATION_SEPERATOR = '^'
entity_types = []

def __init__(self):
    """Initialise the processor; only delegates to the next class in the MRO."""
    super().__init__()

def clean_content(self, content):
    """Overwrite every HTML tag in ``content`` with spaces of the same length.

    The length of the string, and with it every character position outside
    of the tags, stays unchanged.

    :param content: string that may contain HTML tags.
    :return: ``content`` with each tag blanked out.
    """
    def blank_out(tag_match):
        # One space per character of the matched tag.
        return ' ' * len(tag_match.group(0))

    return re.sub(r'<[^>]+>', blank_out, content)

def extract_and_load(self,
text: str,
owner: NLPContent,
Expand All @@ -38,10 +18,7 @@ def extract_and_load(self,
# Remove existing entities
owner.nlp_entities.all().delete()

# Clean HTML
text = self.clean_content(text)

extractor = EntityExtractor(lang=lang)
extractor = HtmlEntityExtractor(lang=lang)
extractor.prepare(text)

# Extract for each type
Expand Down
39 changes: 5 additions & 34 deletions oldp/apps/processing/tests/test_processing_steps.py
Expand Up @@ -2,14 +2,7 @@

from oldp.apps.cases.models import Case
from oldp.apps.nlp.models import Entity
from oldp.apps.processing.processing_steps.extract_entities import get_text_from_html, \
EntityProcessor
from django.test import TestCase

from oldp.apps.cases.models import Case
from oldp.apps.nlp.models import Entity
from oldp.apps.processing.processing_steps.extract_entities import get_text_from_html, \
EntityProcessor
from oldp.apps.processing.processing_steps.extract_entities import EntityProcessor


class EntityProcessorTestCase(TestCase):
Expand Down Expand Up @@ -37,44 +30,22 @@ def test_extract_and_load(self):
'Grundsicherungsleistungen in Gestalt einer Regelleistung von 347 Euro ' \
'und für Kosten der Unterkunft von 193,19 Euro.'
entities = [537.52, 347, 190.52, 347, 193.19]

positions = [(410, 421), (438, 446), (490, 501), (813, 821), (856, 867)]
case = Case.objects.get(pk=1)

processor = EntityProcessor()
processor.entity_types = [Entity.MONEY]
processor.extract_and_load(get_text_from_html(case_content), case, lang='de')
processor.extract_and_load(case_content, case, lang='de')

for i, entity in enumerate(case.nlp_entities.all()):
self.assertEqual(entity.value_float, entities[i])
self.assertEqual((entity.pos_start, entity.pos_end), positions[i])

def test_html_content(self):

case = Case.objects.get(pk=1888)

processor = EntityProcessor()
processor.entity_types = [Entity.MONEY, Entity.ORGANIZATION, Entity.LOCATION, Entity.PERSON]
# processor.extract_and_load(get_text_from_html(case_content), case, lang='de')
processor.extract_and_load(case.content, case, lang='de')

print(case.nlp_entities.all())



class HtmlCleaning(TestCase):

def test_get_text_from_html(self):
case_content = "<h2>Tenor</h2>\n\n<div>\n <dl class=\"RspDL\">\n <dt/>\n " \
" <dd>\n <p>Auf die Revision des Beklagten wird das " \
"Urteil des ... " \
"in der Fassung des Erg&#228;nzungsurteils ... zum Nachteil des Beklagten " \
"entschieden worden ist.</p>\n </dd>\n </dl> <dl " \
"class=\"RspDL\">\n <dt/>\n <dd>\n <p/>\n " \
" </dd>\n </dl>\n <dl class=\"RspDL\">\n " \
"<dt/>\n <dd>\n <p>Im Umfang der Aufhebung wird die " \
"Berufung ... zur&#252;ckgewiesen. Die weitergehende " \
"Berufung bleibt zur&#252;ckgewiesen.</p>\n </dd>\n </dl>\n "
self.assertEqual(u'Tenor Auf die Revision des Beklagten wird das Urteil des ... in der '
u'Fassung des Ergänzungsurteils ... zum Nachteil des Beklagten '
u'entschieden worden ist. Im Umfang der Aufhebung wird die Berufung ... '
u'zurückgewiesen. Die weitergehende Berufung bleibt zurückgewiesen. ',
get_text_from_html(case_content))
self.assertGreater(case.nlp_entities.all().count(), 50)
1 change: 1 addition & 0 deletions requirements/processing.txt
Expand Up @@ -9,3 +9,4 @@ nltk==3.2.2
spacy==2.0.16
cssselect==1.0.0
lxml==4.2.5
numpy==1.15.3

0 comments on commit 3984627

Please sign in to comment.