From af8438b0dd414968c6054647fa4c5d49d370d526 Mon Sep 17 00:00:00 2001 From: William Storey Date: Fri, 15 Mar 2024 17:39:38 +0000 Subject: [PATCH] Replace fewer TLDs when normalizing --- HISTORY.rst | 7 ++---- minfraud/request.py | 52 +++++++++++++++++++++++++++++++++++++++++-- tests/test_request.py | 10 ++++++--- 3 files changed, 59 insertions(+), 10 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 1985004..457babd 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -26,11 +26,8 @@ History * Duplicate ``.com`` s are now removed from email domain names when ``hash_email`` is used. For example, ``example.com.com`` will become ``example.com``. -* Extraneous characters after ``.com`` are now removed from email domain - names when ``hash_email`` is used. For example, ``example.comfoo`` will - become ``example.com``. -* Certain ``.com`` typos are now normalized to ``.com`` when ``hash_email`` is - used. For example, ``example.cam`` will become ``example.com``. +* Certain TLD typos are now normalized when ``hash_email`` is used. For + example, ``example.comcom`` will become ``example.com``. * Additional ``gmail.com`` domain names with leading digits are now normalized when ``hash_email`` is used. For example, ``100gmail.com`` will become ``gmail.com``. diff --git a/minfraud/request.py b/minfraud/request.py index bc36e17..767aae1 100644 --- a/minfraud/request.py +++ b/minfraud/request.py @@ -30,6 +30,50 @@ "putlook.com": "outlook.com", } +_TYPO_TLDS = { + "comm": "com", + "commm": "com", + "commmm": "com", + "comn": "com", + "cbm": "com", + "ccm": "com", + "cdm": "com", + "cem": "com", + "cfm": "com", + "cgm": "com", + "chm": "com", + "cim": "com", + "cjm": "com", + "ckm": "com", + "clm": "com", + "cmm": "com", + "cnm": "com", + "cpm": "com", + "cqm": "com", + "crm": "com", + "csm": "com", + "ctm": "com", + "cum": "com", + "cvm": "com", + "cwm": "com", + "cxm": "com", + "cym": "com", + "czm": "com", + "col": "com", + "con": "com", + "dom": "com", + "don": "com", + "som": "com", + "son": "com", + "vom": "com", + "von": "com", + "xom": "com", + "xon": "com", + "clam": "com", + "colm": "com", + "comcom": "com", +} + _EQUIVALENT_DOMAINS = { "googlemail.com": "gmail.com", "pm.me": "protonmail.com", @@ -296,10 +340,14 @@ def _clean_domain(domain): domain = domain.strip().rstrip(".").encode("idna").decode("ASCII") domain = re.sub(r"(?:\.com){2,}$", ".com", domain) - domain = re.sub(r"\.com[^.]+$", ".com", domain) - domain = re.sub(r"(?:\.(?:com|c[a-z]{1,2}m|co[ln]|[dsvx]o[mn]|))$", ".com", domain) domain = re.sub(r"^\d+(?:gmail?\.com)$", "gmail.com", domain) + idx = domain.rfind(".") + if idx != -1: + tld = domain[idx + 1 :] # noqa + if tld in _TYPO_TLDS: + domain = domain[:idx] + "." + _TYPO_TLDS.get(tld) + domain = _TYPO_DOMAINS.get(domain, domain) domain = _EQUIVALENT_DOMAINS.get(domain, domain) diff --git a/tests/test_request.py b/tests/test_request.py index 9807553..12ef245 100644 --- a/tests/test_request.py +++ b/tests/test_request.py @@ -210,7 +210,10 @@ def test_clean_email(): {"input": "Test+@maxmind.com", "output": "test@maxmind.com"}, {"input": "+@maxmind.com", "output": "+@maxmind.com"}, {"input": " Test@maxmind.com", "output": "test@maxmind.com"}, - {"input": "Test@maxmind.com|abc124472372", "output": "test@maxmind.com"}, + { + "input": "Test@maxmind.com|abc124472372", + "output": "test@maxmind.com|abc124472372", + }, {"input": "Test+foo@yahoo.com", "output": "test+foo@yahoo.com"}, {"input": "Test-foo@yahoo.com", "output": "test@yahoo.com"}, {"input": "Test-foo-foo2@yahoo.com", "output": "test@yahoo.com"}, @@ -222,9 +225,10 @@ def test_clean_email(): {"input": "alias@user.fastmail.com", "output": "user@fastmail.com"}, {"input": "foo-bar@ymail.com", "output": "foo@ymail.com"}, {"input": "foo@example.com.com", "output": "foo@example.com"}, - {"input": "foo@example.comfoo", "output": "foo@example.com"}, - {"input": "foo@example.cam", "output": "foo@example.com"}, + {"input": "foo@example.comfoo", "output": "foo@example.comfoo"}, + {"input": "foo@example.cam", "output": "foo@example.cam"}, {"input": "foo@10000gmail.com", "output": "foo@gmail.com"}, + {"input": "foo@example.comcom", "output": "foo@example.com"}, ] for test in tests: