This repository has been archived by the owner on Aug 13, 2021. It is now read-only.

[347] Turn off translate (#350)
* Turn off translate

* Remove dangling print statements
Joel Klinger committed Dec 22, 2020
1 parent 8d1c29e commit 56961f8
Showing 4 changed files with 112 additions and 90 deletions.
59 changes: 39 additions & 20 deletions nesta/core/luigihacks/elasticsearchplus.py
@@ -29,6 +29,7 @@
LANGS_TAG = "terms_iso2lang_entity"
PUNCTUATION = re.compile(r'[a-zA-Z\d\s:]').sub('', string.printable)


def sentence_chunks(text, chunksize=2000, delim='. '):
"""Split a string into chunks, but breaking only on
the specified delimiter.
@@ -53,17 +54,21 @@ def sentence_chunks(text, chunksize=2000, delim='. '):

class MLStripper(HTMLParser):
"""Taken from https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python. Tested in _sanitize_html."""

def __init__(self):
self.reset()
self.strict = False
self.convert_charrefs= True
self.convert_charrefs = True
self.fed = []
super().__init__()

def handle_data(self, d):
self.fed.append(d)

def get_data(self):
return ''.join(self.fed)


def strip_tags(html):
"""Taken from https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python. Tested in _sanitize_html"""
s = MLStripper()
@@ -86,7 +91,8 @@ def translate(text, translator, chunksize=2000):
{text, langs} ({str, set}): Translated text and set of
detected languages.
"""
chunks = [strip_tags(t) for t in sentence_chunks(text, chunksize=chunksize)]
chunks = [strip_tags(t)
for t in sentence_chunks(text, chunksize=chunksize)]
texts, langs = [], set()
for t in translator.translate(chunks, dest='en'):
texts.append(t.text.capitalize()) # GT uncapitalizes chunks
@@ -133,10 +139,10 @@ def _ngram_and_tokenize(row, ngrammer, ngram_fields):
processed_tokens = ngrammer.process_document(text)
tokens += [t.replace('_', ' ')
for tokens in processed_tokens
for t in tokens]
for t in tokens]
_row['terms_tokens_entity'] = tokens
return _row


def _sanitize_html(row):
"""Strips out any html encoding. Note: nothing clever is done
@@ -154,6 +160,7 @@ def _sanitize_html(row):
_row[k] = strip_tags(v)
return _row


def _clean_bad_unicode_conversion(row):
"""Removes sequences of ??? from strings, which normally
occur due to bad unicode conversion. Note this is a hack:
@@ -172,12 +179,13 @@ def _clean_bad_unicode_conversion(row):
elif "??" not in v:
continue
while "???" in v:
v = v.replace("???","")
v = v.replace("???", "")
while "??" in v:
v = v.replace("??","")
v = v.replace("??", "")
_row[k] = v
return _row


def _nullify_pairs(row, null_pairs={}):
"""Nullify any value if it's 'parent' is also null.
For example for null_pairs={'parent': 'child'}
@@ -201,6 +209,7 @@ def _nullify_pairs(row, null_pairs={}):
_row[child] = None
return _row


def _remove_padding(row):
"""Remove padding from text or list text
@@ -218,6 +227,7 @@ def _remove_padding(row):
for item in v]
return _row


def _caps_to_camel_case_by_value(v):
if type(v) is not str:
return v
@@ -227,6 +237,7 @@ def _caps_to_camel_case_by_value(v):
return v
return v.lower().title()


def _caps_to_camel_case(row):
"""Convert CAPITAL TERMS to Camel Case
@@ -275,11 +286,13 @@ def _clean_up_lists(row, do_sort=True):
_row[k] = v
return _row


def _add_entity_type(row, entity_type):
_row = deepcopy(row)
_row['type_of_entity'] = entity_type
return _row


def _null_empty_str(row):
"""Nullify values if they are empty strings.
@@ -326,6 +339,7 @@ def _coordinates_as_floats(row):
_row[k] = __floatify_coord(v)
return _row


@lru_cache()
def _country_lookup():
"""Extract country/nationality --> iso2 code lookup
@@ -334,7 +348,7 @@ def _country_lookup():
Returns:
lookup (dict): country/nationality --> iso2 code lookup.
"""
df = pd.read_csv(COUNTRY_LOOKUP, encoding='latin', na_filter = False)
df = pd.read_csv(COUNTRY_LOOKUP, encoding='latin', na_filter=False)
lookup = defaultdict(list)
for _, row in df.iterrows():
iso2 = row.pop("ISO 3166 Code")
@@ -344,6 +358,7 @@ def _country_detection(row, country_tag=COUNTRY_TAG):
lookup[v].append(iso2)
return lookup


def _country_detection(row, country_tag=COUNTRY_TAG):
"""Append a list of countries detected from keywords
discovered in all text fields. The new field name
@@ -391,6 +406,7 @@ def _guess_delimiter(item, threshold=0.25):
if score < threshold:
return p


def _listify_terms(row, delimiters=None):
"""Split any 'terms' fields by a guessed delimiter if the
field is a string.
@@ -410,7 +426,8 @@ def _listify_terms(row, delimiters=None):
if _type is list:
continue
elif _type is not str:
raise TypeError(f"Type for '{k}' is '{_type}' but expected 'str' or 'list'.")
raise TypeError(
f"Type for '{k}' is '{_type}' but expected 'str' or 'list'.")
# Now determine the delimiter
if delimiters is None:
delimiter = _guess_delimiter(v)
@@ -521,6 +538,7 @@ class ElasticsearchPlus(Elasticsearch):
do_sort (bool): Sort all lists?
{args, kwargs}: (kw)args for the core :obj:`Elasticsearch` API.
"""

def __init__(self, entity_type,
aws_auth_region,
no_commit=False,
@@ -578,28 +596,29 @@ def __init__(self, entity_type,

# Convert items which SHOULD be lists to lists
if listify_terms:
self.transforms.append(lambda row: _listify_terms(row, terms_delimiters))
self.transforms.append(
lambda row: _listify_terms(row, terms_delimiters))

# Convert upper case text to camel case
if caps_to_camel_case:
self.transforms.append(_caps_to_camel_case)

# Translate any text to english
if auto_translate:
# URLs to load balance Google Translate
urls = list(f"translate.google.{ext}"
for ext in ('com', 'co.uk', 'co.kr', 'at',
'ru', 'fr', 'de', 'ch', 'es'))
self.transforms.append(lambda row: _auto_translate(row, translator=None,
service_urls=urls,
**auto_translate_kwargs))
# # Translate any text to english
# if auto_translate:
# # URLs to load balance Google Translate
# urls = list(f"translate.googleapis.{ext}"
# for ext in ('com', 'co.uk', 'co.kr', 'at',
# 'ru', 'fr', 'de', 'ch', 'es'))
# self.transforms.append(lambda row: _auto_translate(row, translator=None,
# service_urls=urls,
# **auto_translate_kwargs))

# Extract any ngrams and split into tokens
if len(ngram_fields) > 0:
# Setup ngrammer
if 'MYSQLDBCONF' not in os.environ:
if 'MYSQLDBCONF' not in os.environ:
os.environ['MYSQLDBCONF'] = 'mysqldb.config'
ngrammer = Ngrammer(database="production")
ngrammer = Ngrammer(database="production")
self.transforms.append(lambda row: _ngram_and_tokenize(row, ngrammer,
ngram_fields))

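For context on the file above: ElasticsearchPlus builds up self.transforms as a list of row-processing callables at construction time and applies them, in order, to every document it indexes (the apply site is outside this diff). The commit keeps that pipeline intact and simply stops registering the _auto_translate step. A minimal sketch of the pattern follows; the class and transform names here are illustrative stand-ins, not the real ones wired in above.

from copy import deepcopy

class TransformPipeline:
    """Sketch of the transform-list pattern used by ElasticsearchPlus."""

    def __init__(self):
        self.transforms = []  # each entry is a callable: row (dict) -> row (dict)

    def add(self, func):
        self.transforms.append(func)

    def apply(self, row):
        _row = deepcopy(row)  # never mutate the caller's row
        for transform in self.transforms:
            _row = transform(_row)  # apply steps in registration order
        return _row

pipeline = TransformPipeline()
pipeline.add(lambda row: {k: v.strip() if isinstance(v, str) else v
                          for k, v in row.items()})  # stand-in for _remove_padding
print(pipeline.apply({'title': '  padded title  ', 'count': 3}))
# {'title': 'padded title', 'count': 3}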
126 changes: 63 additions & 63 deletions nesta/core/luigihacks/tests/test_elasticsearchplus.py
@@ -100,70 +100,70 @@ def test_sentence_chunks():
chunksize=i)) == text


def test_auto_translate_true_short(row):
"""The translator shouldn't be applied for short pieces of text"""
time.sleep(2)
_row = _auto_translate(row, TRANSLATOR, 1000)
assert not _row.pop(TRANS_TAG)
assert len(_row.pop(LANGS_TAG)) == 0
assert row['korean'] == _row['korean']
assert row['mixed_lang'] == _row['mixed_lang']
assert row == _row

def test_auto_translate_true_long_small_chunks(row):
time.sleep(2)
_row_1 = _auto_translate(row, TRANSLATOR, 10, chunksize=1)
time.sleep(2)
_row_2 = _auto_translate(row, TRANSLATOR, 10, chunksize=10000)
assert _row_1.pop('mixed_lang') != _row_2.pop('mixed_lang')
# def test_auto_translate_true_short(row):
# """The translator shouldn't be applied for short pieces of text"""
# time.sleep(2)
# _row = _auto_translate(row, TRANSLATOR, 1000)
# assert not _row.pop(TRANS_TAG)
# assert len(_row.pop(LANGS_TAG)) == 0
# assert row['korean'] == _row['korean']
# assert row['mixed_lang'] == _row['mixed_lang']
# assert row == _row

# def test_auto_translate_true_long_small_chunks(row):
# time.sleep(2)
# _row_1 = _auto_translate(row, TRANSLATOR, 10, chunksize=1)
# time.sleep(2)
# _row_2 = _auto_translate(row, TRANSLATOR, 10, chunksize=10000)
# assert _row_1.pop('mixed_lang') != _row_2.pop('mixed_lang')

# Test the translation itself
# Constraints rather than fixed assertions since
# translate algorithm may change over time
# so the two chunk sizes aren't guaranteed to give the same results
k1 = _row_1.pop('korean').upper()
k2 = _row_2.pop('korean').upper()
assert len(k1) > 10
assert len(k2) > 10
assert (len(k1) - len(k2))/len(k1) < 0.95
assert (len(set(k1)) - len(set(k2)))/len(set(k1)) < 0.95
assert sum((Counter(k1) - Counter(k2)).values())/len(k1+k2) < 0.95
assert sum((Counter(k2) - Counter(k1)).values())/len(k1+k2) < 0.95

# Confirm that the languages are the same
langs_1 = _row_1.pop(LANGS_TAG)
langs_2 = _row_2.pop(LANGS_TAG)
assert len(langs_1) == len(langs_2)
assert set(langs_1) == set(langs_2)

# Confirm that nothing else has changed
assert _row_1 == _row_2

def test_auto_translate_true_long(row):
time.sleep(2)
_row = _auto_translate(row, TRANSLATOR, 10)
assert row.pop('korean') != _row['korean']
assert row.pop('mixed_lang') != _row['mixed_lang']
assert _row.pop(TRANS_TAG)
trans_korean = _row.pop('korean')
assert all(term in trans_korean.lower()
for term in ('brown', 'fox', 'jump',
'over', 'lazy', 'dog'))
trans_mixed = _row.pop('mixed_lang')
assert all(term in trans_mixed.lower()
for term in ('brown', 'fox',
'something', 'english'))
assert set(_row.pop(LANGS_TAG)) == {'ko', 'en'}
assert row == _row

def test_auto_translate_false(row):
row.pop('korean')
row.pop('mixed_lang')
time.sleep(2)
_row = _auto_translate(row, TRANSLATOR)
assert not _row.pop(TRANS_TAG)
assert _row.pop(LANGS_TAG) == ['en']
assert row == _row
# # Test the translation itself
# # Constraints rather than fixed assertions since
# # translate algorithm may change over time
# # so the two chunk sizes aren't guaranteed to give the same results
# k1 = _row_1.pop('korean').upper()
# k2 = _row_2.pop('korean').upper()
# assert len(k1) > 10
# assert len(k2) > 10
# assert (len(k1) - len(k2))/len(k1) < 0.95
# assert (len(set(k1)) - len(set(k2)))/len(set(k1)) < 0.95
# assert sum((Counter(k1) - Counter(k2)).values())/len(k1+k2) < 0.95
# assert sum((Counter(k2) - Counter(k1)).values())/len(k1+k2) < 0.95

# # Confirm that the languages are the same
# langs_1 = _row_1.pop(LANGS_TAG)
# langs_2 = _row_2.pop(LANGS_TAG)
# assert len(langs_1) == len(langs_2)
# assert set(langs_1) == set(langs_2)

# # Confirm that nothing else has changed
# assert _row_1 == _row_2

# def test_auto_translate_true_long(row):
# time.sleep(2)
# _row = _auto_translate(row, TRANSLATOR, 10)
# assert row.pop('korean') != _row['korean']
# assert row.pop('mixed_lang') != _row['mixed_lang']
# assert _row.pop(TRANS_TAG)
# trans_korean = _row.pop('korean')
# assert all(term in trans_korean.lower()
# for term in ('brown', 'fox', 'jump',
# 'over', 'lazy', 'dog'))
# trans_mixed = _row.pop('mixed_lang')
# assert all(term in trans_mixed.lower()
# for term in ('brown', 'fox',
# 'something', 'english'))
# assert set(_row.pop(LANGS_TAG)) == {'ko', 'en'}
# assert row == _row

# def test_auto_translate_false(row):
# row.pop('korean')
# row.pop('mixed_lang')
# time.sleep(2)
# _row = _auto_translate(row, TRANSLATOR)
# assert not _row.pop(TRANS_TAG)
# assert _row.pop(LANGS_TAG) == ['en']
# assert row == _row

def test_sanitize_html(row):
_row = _sanitize_html(row)
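The surviving test_sentence_chunks (only partially visible above) asserts a round-trip property: re-joining the chunks reproduces the original text for any chunk size. A self-contained sketch of that property follows; this chunker is illustrative only and is not the library's implementation.

def sentence_chunks_sketch(text, chunksize=2000, delim='. '):
    """Greedy chunker: break only on the delimiter, never mid-sentence."""
    chunks, current = [], ''
    for sentence in text.split(delim):
        candidate = current + delim + sentence if current else sentence
        if len(candidate) > chunksize and current:
            chunks.append(current)  # flush the completed chunk
            current = sentence
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks

text = 'First sentence. Second sentence. Third one here. And a fourth.'
for i in (1, 10, 25, 1000):
    # Joining on the delimiter reconstructs the input for any chunk size
    assert '. '.join(sentence_chunks_sketch(text, chunksize=i)) == text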
7 changes: 5 additions & 2 deletions nesta/packages/nlp_utils/ngrammer.py
@@ -27,6 +27,7 @@ class Ngrammer:
variable 'MYSQLDBCONF'
database (str): Database name
"""

def __init__(self, config_filepath=None, database="dev"):
if config_filepath is not None:
os.environ["MYSQLDBCONF"] = config_filepath
@@ -67,7 +68,8 @@ def find_and_replace(self, sentence, size):
return True
return False

def process_document(self, raw_text, remove_stops=True):
def process_document(self, raw_text, remove_stops=True,
keep_quasi_numeric=True):
"""Tokenize and insert n-grams into documents.
Args:
@@ -77,7 +79,8 @@ def process_document(self, raw_text, remove_stops=True):
processed_doc (list): Iterable ready for word embedding
"""
# Tokenize and clean up the text first
text = tokenize_document(raw_text)
text = tokenize_document(
raw_text, keep_quasi_numeric=keep_quasi_numeric)
# Replace large n-grams first, then small n-grams
for size in sorted(self.ngrams, reverse=True):
for sentence in text:
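The ngrammer.py change threads a new keep_quasi_numeric flag from process_document through to tokenize_document. A hedged usage sketch follows; the config path, database name, and sample text are illustrative, and the assumption is that setting the flag to False drops number-like tokens.

import os
from nesta.packages.nlp_utils.ngrammer import Ngrammer

# Ngrammer reads its MySQL config from the MYSQLDBCONF environment variable
# (mirroring the default set in elasticsearchplus.py above).
os.environ.setdefault('MYSQLDBCONF', 'mysqldb.config')
ngrammer = Ngrammer(database="production")

doc = "Graphene transistors were demonstrated at 300 GHz in 2 separate studies."
with_numbers = ngrammer.process_document(doc)  # keep_quasi_numeric defaults to True
without_numbers = ngrammer.process_document(doc, keep_quasi_numeric=False)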
