Merge af6c8c2 into 4186eee

openlegaldata · Jan 31, 2019 · 3984627 · 3984627
2 parents 4186eee + af6c8c2
commit 3984627
Show file tree

Hide file tree

Showing 6 changed files with 89 additions and 59 deletions.
diff --git a/oldp/apps/nlp/ner/base.py b/oldp/apps/nlp/ner/base.py
@@ -2,6 +2,7 @@
 
 from oldp.apps.nlp.base import SpacyNLP
 from oldp.apps.nlp.ner.strategy_factories import UniversalNERStrategyFactory
+from oldp.apps.nlp.preprocessing import HtmlConcealer
 
 
 class EntityExtractor:
@@ -17,3 +18,29 @@ def prepare(self, text):
     def extract(self, entity_type) -> Generator:
         strategy = self.factory.get_strategy(entity_type)
         return strategy.extract(self.doc)
+
+
+class HtmlEntityExtractor(EntityExtractor):
+    """Entity extractor that accepts HTML and reports positions in that HTML.
+
+    The markup is concealed (via ``HtmlConcealer``) before the parent class
+    prepares the text; offsets yielded by ``extract`` are translated back to
+    the original HTML string.
+    """
+
+    # Concealer for the most recently prepared text; None until ``prepare`` ran.
+    html_concealer = None
+
+    def prepare(self, text):
+        """Conceal the HTML in ``text`` and prepare the resulting content."""
+        self.html_concealer = HtmlConcealer(text)
+        # get_content() runs the concealing pass itself; calling conceal()
+        # beforehand as well would process the already concealed text a
+        # second time.
+        super().prepare(self.html_concealer.get_content())
+
+    def extract(self, entity_type) -> Generator:
+        """Yield ``(value, start, end)`` with offsets into the original HTML."""
+        for value, start, end in super().extract(entity_type):
+            start, end = self.html_concealer.concealed_to_html_pos(start, end)
+            yield value, start, end
diff --git a/oldp/apps/nlp/preprocessing.py b/oldp/apps/nlp/preprocessing.py
@@ -0,0 +1,98 @@
+import html
+import re
+
+import numpy as np
+
+
+class HtmlConcealer:
+    """Strip markup from an HTML string while tracking original positions.
+
+    ``content`` holds the current text; ``pos_table[i]`` is the index in the
+    original HTML string that character ``i`` of ``content`` stems from.
+    """
+
+    TAG_PATTERN = re.compile(r'<[^>]+>')
+    WHITESPACE_PATTERN = re.compile(r'\n|\xa0')
+    # Decimal, hexadecimal and named character references.
+    ENTITY_PATTERN = re.compile(r'&#[0-9]+;|&#[xX][0-9a-fA-F]+;|&\w{1,32};')
+
+    def __init__(self, html_str):
+        self.content = html_str
+        self.pos_table = np.arange(len(self.content))
+        self._html_length = len(html_str)
+        self._concealed = False
+
+    def conceal(self):
+        """Remove tags, decode character references, normalise whitespace.
+
+        Only the first call does any work, so ``conceal()`` followed by
+        ``get_content()`` does not strip or decode the already decoded text
+        again (e.g. ``&lt;b&gt;`` stays ``<b>`` instead of vanishing).
+        """
+        if self._concealed:
+            return
+        self._concealed = True
+        self.remove_pattern(self.TAG_PATTERN)
+        self.replace_html_special_ents()
+        self.remove_pattern(self.WHITESPACE_PATTERN, replace_with=' ')
+
+    def get_content(self):
+        """Return the concealed text, concealing first if still necessary."""
+        self.conceal()
+        return self.content
+
+    def concealed_to_html_pos(self, pos_start, pos_end):
+        """Map a ``[pos_start, pos_end)`` span of the content to HTML offsets."""
+        return self._html_pos(pos_start), self._html_pos(pos_end)
+
+    def remove_pattern(self, regex, replace_with=''):
+        """Replace every match of ``regex`` in the content by ``replace_with``.
+
+        Returns the updated ``(content, pos_table)`` pair.
+        """
+        self._substitute(re.compile(regex), lambda match: replace_with)
+        return self.content, self.pos_table
+
+    def replace_html_special_ents(self):
+        """Decode character references such as ``&#228;`` or ``&nbsp;``."""
+        self._substitute(self.ENTITY_PATTERN,
+                         lambda match: html.unescape(match.group(0)))
+
+    def _html_pos(self, pos):
+        """Return the HTML offset for content index ``pos``."""
+        # An exclusive end index may equal len(content); that position has no
+        # table entry and corresponds to the end of the original HTML.
+        if pos >= len(self.pos_table):
+            return self._html_length
+        return int(self.pos_table[pos])
+
+    def _substitute(self, pattern, replacement_for):
+        """Rewrite content and pos_table in one left-to-right pass.
+
+        A single ``finditer`` pass never rescans its own output, so a
+        replacement that still matches ``pattern`` (an unknown reference like
+        ``&foo;`` is returned unchanged by ``html.unescape``) cannot cause an
+        endless loop, and ``&amp;lt;`` is decoded only once.
+        """
+        text, table = self.content, self.pos_table
+        pieces, positions = [], []
+        cursor = 0
+        for match in pattern.finditer(text):
+            start, end = match.span()
+            if start == end:
+                continue  # nothing was matched, nothing to replace
+            replacement = replacement_for(match)
+            pieces.append(text[cursor:start])
+            positions.append(table[cursor:start])
+            if replacement:
+                pieces.append(replacement)
+                # Character i of the replacement maps to character i of the
+                # match, clamped to the last matched character.
+                offsets = np.minimum(np.arange(len(replacement)),
+                                     end - start - 1)
+                positions.append(table[start + offsets])
+            cursor = end
+        pieces.append(text[cursor:])
+        positions.append(table[cursor:])
+        self.content = ''.join(pieces)
+        self.pos_table = np.concatenate(positions)
diff --git a/oldp/apps/nlp/tests/test_preprocessing.py b/oldp/apps/nlp/tests/test_preprocessing.py
@@ -0,0 +1,26 @@
+from django.test import TestCase
+
+from oldp.apps.nlp.preprocessing import HtmlConcealer
+
+
+class PreprocessingTestCase(TestCase):
+    """Checks for the HTML concealing helper."""
+
+    def test_html_concealing(self):
+        """Tags vanish, the character reference is decoded, newlines become spaces."""
+        markup = '<h2>Tenor</h2>\n\n<ul class="ol"><li><p>1. Unter Ab&#228;nderung des Beschlusses der Kammer'
+        expected = 'Tenor  1. Unter Abänderung des Beschlusses der Kammer'
+        subject = HtmlConcealer(markup)
+        subject.conceal()
+        self.assertEqual(expected, subject.get_content())
+
+    def test_html_concealing_pos_table(self):
+        """A span of the concealed text maps back onto the original markup."""
+        markup = '<h2>Tenor</h2>\n\n<ul class="ol"><li><p>1. Unter Ab&#228;nderung des Beschlusses der Kammer'
+        concealed_span, html_span = (16, 26), (47, 62)
+        subject = HtmlConcealer(markup)
+        subject.conceal()
+        concealed_text = subject.get_content()
+        self.assertEqual(concealed_text[slice(*concealed_span)], 'Abänderung')
+        self.assertEqual(markup[slice(*html_span)], 'Ab&#228;nderung')
+        self.assertEqual(subject.concealed_to_html_pos(*concealed_span), html_span)
diff --git a/oldp/apps/processing/processing_steps/extract_entities.py b/oldp/apps/processing/processing_steps/extract_entities.py
@@ -1,33 +1,13 @@
-import re
-
-from bs4 import BeautifulSoup
-
 from oldp.apps.nlp.models import Entity, NLPContent
-from oldp.apps.nlp.ner.base import EntityExtractor
-
-
-def get_text_from_html(html):
-    soup = BeautifulSoup(html, 'lxml')
-    return re.sub(r'\s\s+', ' ', soup.get_text())
+from oldp.apps.nlp.ner.base import HtmlEntityExtractor
 
 
 class EntityProcessor:  # TODO Can this be all done in ProcessingStep?
-    SERIALIZATION_SEPERATOR = '^'
     entity_types = []
 
     def __init__(self):
         super(EntityProcessor, self).__init__()
 
-    def clean_content(self, content):
-        # HTML Tags
-        pattern = re.compile(r'<[^>]+>')
-
-        for m in re.finditer(pattern, content):
-            mask = ' ' * (m.end(0) - m.start(0))
-            content = content[:m.start(0)] + mask + content[m.end(0):]
-
-        return content
-
     def extract_and_load(self,
                          text: str,
                          owner: NLPContent,
@@ -38,10 +18,7 @@ def extract_and_load(self,
         # Remove existing entities
         owner.nlp_entities.all().delete()
 
-        # Clean HTML
-        text = self.clean_content(text)
-
-        extractor = EntityExtractor(lang=lang)
+        extractor = HtmlEntityExtractor(lang=lang)
         extractor.prepare(text)
 
         # Extract for each type

diff --git a/oldp/apps/processing/tests/test_processing_steps.py b/oldp/apps/processing/tests/test_processing_steps.py
@@ -2,14 +2,7 @@
 
 from oldp.apps.cases.models import Case
 from oldp.apps.nlp.models import Entity
-from oldp.apps.processing.processing_steps.extract_entities import get_text_from_html, \
-    EntityProcessor
-from django.test import TestCase
-
-from oldp.apps.cases.models import Case
-from oldp.apps.nlp.models import Entity
-from oldp.apps.processing.processing_steps.extract_entities import get_text_from_html, \
-    EntityProcessor
+from oldp.apps.processing.processing_steps.extract_entities import EntityProcessor
 
 
 class EntityProcessorTestCase(TestCase):
@@ -37,44 +30,22 @@ def test_extract_and_load(self):
                        'Grundsicherungsleistungen in Gestalt einer Regelleistung von 347 Euro ' \
                        'und für Kosten der Unterkunft von 193,19 Euro.'
         entities = [537.52, 347, 190.52, 347, 193.19]
-
+        positions = [(410, 421), (438, 446), (490, 501), (813, 821), (856, 867)]
         case = Case.objects.get(pk=1)
 
         processor = EntityProcessor()
         processor.entity_types = [Entity.MONEY]
-        processor.extract_and_load(get_text_from_html(case_content), case, lang='de')
+        processor.extract_and_load(case_content, case, lang='de')
 
         for i, entity in enumerate(case.nlp_entities.all()):
             self.assertEqual(entity.value_float, entities[i])
+            self.assertEqual((entity.pos_start, entity.pos_end), positions[i])
 
     def test_html_content(self):
-
         case = Case.objects.get(pk=1888)
 
         processor = EntityProcessor()
         processor.entity_types = [Entity.MONEY, Entity.ORGANIZATION, Entity.LOCATION, Entity.PERSON]
-        # processor.extract_and_load(get_text_from_html(case_content), case, lang='de')
         processor.extract_and_load(case.content, case, lang='de')
 
-        print(case.nlp_entities.all())
-
-
-
-class HtmlCleaning(TestCase):
-
-    def test_get_text_from_html(self):
-        case_content = "<h2>Tenor</h2>\n\n<div>\n         <dl class=\"RspDL\">\n       <dt/>\n   " \
-                       "         <dd>\n               <p>Auf die Revision des Beklagten wird das " \
-                       "Urteil des ... " \
-                       "in der Fassung des Erg&#228;nzungsurteils ... zum Nachteil des Beklagten " \
-                       "entschieden worden ist.</p>\n            </dd>\n         </dl>       <dl " \
-                       "class=\"RspDL\">\n            <dt/>\n            <dd>\n          <p/>\n  " \
-                       "          </dd>\n         </dl>\n         <dl class=\"RspDL\">\n         " \
-                       "<dt/>\n            <dd>\n            <p>Im Umfang der Aufhebung wird die " \
-                       "Berufung ... zur&#252;ckgewiesen. Die weitergehende " \
-                       "Berufung bleibt zur&#252;ckgewiesen.</p>\n         </dd>\n         </dl>\n "
-        self.assertEqual(u'Tenor Auf die Revision des Beklagten wird das Urteil des ... in der '
-                         u'Fassung des Ergänzungsurteils ... zum Nachteil des Beklagten '
-                         u'entschieden worden ist. Im Umfang der Aufhebung wird die Berufung ... '
-                         u'zurückgewiesen. Die weitergehende Berufung bleibt zurückgewiesen. ',
-                         get_text_from_html(case_content))
+        self.assertGreater(case.nlp_entities.all().count(), 50)
diff --git a/requirements/processing.txt b/requirements/processing.txt
@@ -9,3 +9,4 @@ nltk==3.2.2
 spacy==2.0.16
 cssselect==1.0.0
 lxml==4.2.5
+numpy==1.15.3