Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 6 changed files with 89 additions and 59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import html | ||
import re | ||
|
||
import numpy as np | ||
|
||
|
||
class HtmlConcealer:
    """Reduce an HTML string to plain text while tracking character origins.

    ``content`` holds the current text and ``pos_table`` is a NumPy integer
    array of the same length: ``pos_table[i]`` is the index in the original
    HTML string of the character that is now at ``content[i]``.
    """

    _TAG = re.compile(r'<[^>]+>')
    _ENTITY = re.compile(r'&#\d{1,4};|&\w{1,6};')
    _LINE_BREAK_OR_NBSP = re.compile(r'\n|\xa0')

    def __init__(self, html_str):
        """Store *html_str* and start with an identity position table."""
        self.content = html_str
        self.pos_table = np.arange(len(self.content))
        # Remembered so the exclusive end of the text can be mapped back.
        self._source_len = len(html_str)
        self._concealed = False

    def conceal(self):
        """Strip tags, decode character references and normalise whitespace.

        Runs at most once per instance. A second pass would treat text that
        was produced by unescaping (e.g. ``&lt;b&gt;`` -> ``<b>``) as markup
        and drop it.
        """
        if self._concealed:
            return
        self.remove_pattern(self._TAG)
        self.replace_html_special_ents()
        self.remove_pattern(self._LINE_BREAK_OR_NBSP, replace_with=' ')
        self._concealed = True

    def get_content(self):
        """Return the concealed text, concealing first if not done yet."""
        self.conceal()
        return self.content

    def concealed_to_html_pos(self, pos_start, pos_end):
        """Map a span of the concealed text back to original HTML offsets.

        Returns a ``(start, end)`` tuple of ints. ``pos_end`` may equal the
        length of the concealed text (an exclusive end); it then maps to the
        length of the original HTML string.
        """
        return self._to_html_pos(pos_start), self._to_html_pos(pos_end)

    def remove_pattern(self, regex, replace_with=''):
        """Replace every non-empty match of *regex* with *replace_with*.

        Matches are found in a single left-to-right pass, so the call always
        terminates even when *replace_with* itself matches *regex*.

        Returns the updated ``(content, pos_table)`` pair.
        """
        self._substitute(re.compile(regex), replace_with)
        return self.content, self.pos_table

    def replace_html_special_ents(self):
        """Decode HTML character references (``&auml;``, ``&#228;``, ...).

        Each reference is decoded exactly once. References unknown to
        ``html.unescape`` are left in the text unchanged.
        """
        self._substitute(self._ENTITY, lambda m: html.unescape(m.group(0)))

    def _to_html_pos(self, pos):
        """Translate one concealed-text offset into an original HTML offset."""
        if pos == len(self.pos_table):
            return self._source_len
        return int(self.pos_table[pos])

    def _substitute(self, pattern, repl):
        """Apply *repl* (a string or a match -> string callable) to all matches.

        ``content`` and ``pos_table`` are rebuilt together so that they keep
        the same length.
        """
        text_parts = []
        table_parts = []
        cursor = 0
        for m in pattern.finditer(self.content):
            start, end = m.span()
            if start == end:
                # An empty match removes nothing; skipping it guarantees progress.
                continue
            new_text = repl(m) if callable(repl) else repl
            matched = self.pos_table[start:end]

            text_parts.append(self.content[cursor:start])
            table_parts.append(self.pos_table[cursor:start])
            text_parts.append(new_text)
            if len(new_text) <= len(matched):
                # Replacement characters inherit the first positions of the match.
                table_parts.append(matched[:len(new_text)])
            else:
                # Surplus replacement characters point at the match's last character.
                padding = np.full(len(new_text) - len(matched), matched[-1], dtype=matched.dtype)
                table_parts.append(np.concatenate([matched, padding]))
            cursor = end

        text_parts.append(self.content[cursor:])
        table_parts.append(self.pos_table[cursor:])
        self.content = ''.join(text_parts)
        self.pos_table = np.concatenate(table_parts)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
from django.test import TestCase | ||
|
||
from oldp.apps.nlp.preprocessing import HtmlConcealer | ||
|
||
|
||
class PreprocessingTestCase(TestCase):
    """Tests for HtmlConcealer: text extraction and position mapping."""

    def test_html_concealing(self):
        """Tags are removed, the entity is decoded and each newline becomes a space."""
        # The umlaut is written as an entity so that decoding is exercised.
        html = '<h2>Tenor</h2>\n\n<ul class="ol"><li><p>1. Unter Ab&auml;nderung des Beschlusses der Kammer'
        concealer = HtmlConcealer(html)
        concealer.conceal()
        # The two newlines turn into two spaces between 'Tenor' and '1.'.
        self.assertEqual('Tenor  1. Unter Abänderung des Beschlusses der Kammer', concealer.get_content())

    def test_html_concealing_pos_table(self):
        """A span in the concealed text maps back to its offsets in the HTML source."""
        html = '<h2>Tenor</h2>\n\n<ul class="ol"><li><p>1. Unter Ab&auml;nderung des Beschlusses der Kammer'
        concealer = HtmlConcealer(html)
        concealer.conceal()
        concealed_word = concealer.get_content()[16:26]
        # 15 characters in the source: the entity '&auml;' takes six of them.
        html_word = html[47:62]
        self.assertEqual(concealed_word, 'Abänderung')
        self.assertEqual(html_word, 'Ab&auml;nderung')
        self.assertEqual(concealer.concealed_to_html_pos(16, 26), (47, 62))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,3 +9,4 @@ nltk==3.2.2 | |
spacy==2.0.16 | ||
cssselect==1.0.0 | ||
lxml==4.2.5 | ||
numpy==1.15.3 |