From ba6e89313e921a3fd81ab9c702da099242f817b6 Mon Sep 17 00:00:00 2001 From: Marsel Mavletkulov Date: Mon, 12 Feb 2024 16:36:48 -0500 Subject: [PATCH 1/3] Add additional email normalization --- HISTORY.rst | 25 +++++ minfraud/request.py | 247 +++++++++++++++++++++++++++++++++++++++--- tests/test_request.py | 41 ++++++- 3 files changed, 296 insertions(+), 17 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 1ca5631..6909dac 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -9,6 +9,31 @@ History * Added the following new values to the ``/payment/processor`` validation: * ``pxp_financial`` * ``trustpay`` +* Equivalent domain names are now normalized when ``hash_address`` is used. + For example, ``googlemail.com`` will become ``gmail.com``. +* Periods are now removed from ``gmail.com`` email address local parts when + ``hash_address`` is used. For example, ``f.o.o@gmail.com`` will become + ``foo@gmail.com``. +* Fastmail alias subdomain email addresses are now normalized when + ``hash_address`` is used. For example, ``alias@user.fastmail.com`` will + become ``user@fastmail.com``. +* Additional ``yahoo.com`` email addresses now have aliases removed from + their local part when ``hash_address`` is used. For example, + ``foo-bar@yahoo.com`` will become ``foo@yahoo.com`` for additional + ``yahoo.com`` domains. +* Duplicate ``.com``s are now removed from email domain names when + ``hash_address`` is used. For example, ``example.com.com`` will become + ``example.com``. +* Extraneous characters after ``.com`` are now removed from email domain + names when ``hash_address`` is used. For example, ``example.comfoo`` will + become ``example.com``. +* Certain ``.com`` typos are now normalized to ``.com`` when ``hash_address`` is + used. For example, ``example.cam`` will become ``example.com``. +* Additional ``gmail.com`` domain names with leading digits are now + normalized when ``hash_address`` is used. For example, ``100gmail.com`` will + become ``gmail.com``. +* Additional ``gmail.com`` typos are now normalized when ``hash_address`` is + used. For example, ``gmali.com`` will become ``gmail.com``. 2.9.0 (2023-12-05) ++++++++++++++++++ diff --git a/minfraud/request.py b/minfraud/request.py index e0d12df..c36442a 100644 --- a/minfraud/request.py +++ b/minfraud/request.py @@ -5,6 +5,7 @@ """ +import re import warnings import hashlib from typing import Any, Dict @@ -15,17 +16,207 @@ _TYPO_DOMAINS = { # gmail.com - "35gmai.com": "gmail.com", - "636gmail.com": "gmail.com", + "gmai.com": "gmail.com", "gamil.com": "gmail.com", - "gmail.comu": "gmail.com", + "gmali.com": "gmail.com", "gmial.com": "gmail.com", "gmil.com": "gmail.com", + "gmaill.com": "gmail.com", + "gmailm.com": "gmail.com", + "gmailo.com": "gmail.com", + "gmailyhoo.com": "gmail.com", "yahoogmail.com": "gmail.com", # outlook.com "putlook.com": "outlook.com", } +_EQUIVALENT_DOMAINS = { + "googlemail.com": "gmail.com", + "pm.me": "protonmail.com", + "proton.me": "protonmail.com", + "yandex.by": "yandex.ru", + "yandex.com": "yandex.ru", + "yandex.kz": "yandex.ru", + "yandex.ua": "yandex.ru", + "ya.ru": "yandex.ru", +} + +_FASTMAIL_DOMAINS = { + "123mail.org", + "150mail.com", + "150ml.com", + "16mail.com", + "2-mail.com", + "4email.net", + "50mail.com", + "airpost.net", + "allmail.net", + "bestmail.us", + "cluemail.com", + "elitemail.org", + "emailcorner.net", + "emailengine.net", + "emailengine.org", + "emailgroups.net", + "emailplus.org", + "emailuser.net", + "eml.cc", + "f-m.fm", + "fast-email.com", + "fast-mail.org", + "fastem.com", + "fastemail.us", + "fastemailer.com", + "fastest.cc", + "fastimap.com", + "fastmail.cn", + "fastmail.co.uk", + "fastmail.com", + "fastmail.com.au", + "fastmail.de", + "fastmail.es", + "fastmail.fm", + "fastmail.fr", + "fastmail.im", + "fastmail.in", + "fastmail.jp", + "fastmail.mx", + "fastmail.net", + "fastmail.nl", + "fastmail.org", + "fastmail.se", + "fastmail.to", + "fastmail.tw", + "fastmail.uk", + "fastmail.us", + "fastmailbox.net", + "fastmessaging.com", + "fea.st", + "fmail.co.uk", + "fmailbox.com", + "fmgirl.com", + "fmguy.com", + "ftml.net", + "h-mail.us", + "hailmail.net", + "imap-mail.com", + "imap.cc", + "imapmail.org", + "inoutbox.com", + "internet-e-mail.com", + "internet-mail.org", + "internetemails.net", + "internetmailing.net", + "jetemail.net", + "justemail.net", + "letterboxes.org", + "mail-central.com", + "mail-page.com", + "mailandftp.com", + "mailas.com", + "mailbolt.com", + "mailc.net", + "mailcan.com", + "mailforce.net", + "mailftp.com", + "mailhaven.com", + "mailingaddress.org", + "mailite.com", + "mailmight.com", + "mailnew.com", + "mailsent.net", + "mailservice.ms", + "mailup.net", + "mailworks.org", + "ml1.net", + "mm.st", + "myfastmail.com", + "mymacmail.com", + "nospammail.net", + "ownmail.net", + "petml.com", + "postinbox.com", + "postpro.net", + "proinbox.com", + "promessage.com", + "realemail.net", + "reallyfast.biz", + "reallyfast.info", + "rushpost.com", + "sent.as", + "sent.at", + "sent.com", + "speedpost.net", + "speedymail.org", + "ssl-mail.com", + "swift-mail.com", + "the-fastest.net", + "the-quickest.com", + "theinternetemail.com", + "veryfast.biz", + "veryspeedy.net", + "warpmail.net", + "xsmail.com", + "yepmail.net", + "your-mail.com", +} + +_YAHOO_DOMAINS = { + "y7mail.com", + "yahoo.at", + "yahoo.be", + "yahoo.bg", + "yahoo.ca", + "yahoo.cl", + "yahoo.co.id", + "yahoo.co.il", + "yahoo.co.in", + "yahoo.co.kr", + "yahoo.co.nz", + "yahoo.co.th", + "yahoo.co.uk", + "yahoo.co.za", + "yahoo.com", + "yahoo.com.ar", + "yahoo.com.au", + "yahoo.com.br", + "yahoo.com.co", + "yahoo.com.hk", + "yahoo.com.hr", + "yahoo.com.mx", + "yahoo.com.my", + "yahoo.com.pe", + "yahoo.com.ph", + "yahoo.com.sg", + "yahoo.com.tr", + "yahoo.com.tw", + "yahoo.com.ua", + "yahoo.com.ve", + "yahoo.com.vn", + "yahoo.cz", + "yahoo.de", + "yahoo.dk", + "yahoo.ee", + "yahoo.es", + "yahoo.fi", + "yahoo.fr", + "yahoo.gr", + "yahoo.hu", + "yahoo.ie", + "yahoo.in", + "yahoo.it", + "yahoo.lt", + "yahoo.lv", + "yahoo.nl", + "yahoo.no", + "yahoo.pl", + "yahoo.pt", + "yahoo.ro", + "yahoo.se", + "yahoo.sk", + "ymail.com", +} + def prepare_report(request: Dict[str, Any], validate: bool): """Validate and prepare minFraud report""" @@ -91,29 +282,42 @@ def maybe_hash_email(transaction): if address is None: return - address = address.lower().strip() - - at_idx = address.rfind("@") - if at_idx == -1: + address, domain = _clean_email(address) + if not address: return - domain = _clean_domain(address[at_idx + 1 :]) # noqa - local_part = address[:at_idx] - if domain != "" and "domain" not in email: email["domain"] = domain - email["address"] = _hash_email(local_part, domain) + email["address"] = hashlib.md5(address.encode("UTF-8")).hexdigest() def _clean_domain(domain): domain = domain.strip().rstrip(".").encode("idna").decode("ASCII") - return _TYPO_DOMAINS.get(domain, domain) + domain = re.sub(r"(?:\.com){2,}$", ".com", domain) + domain = re.sub(r"\.com[^.]+$", ".com", domain) + domain = re.sub(r"(?:\.(?:com|c[a-z]{1,2}m|co[ln]|[dsvx]o[mn]|))$", ".com", domain) + domain = re.sub(r"^\d+(?:gmail?\.com)$", "gmail.com", domain) -def _hash_email(local_part, domain): - # Strip off aliased part of email address - if domain == "yahoo.com": + domain = _TYPO_DOMAINS.get(domain, domain) + domain = _EQUIVALENT_DOMAINS.get(domain, domain) + + return domain + + +def _clean_email(address): + address = address.lower().strip() + + at_idx = address.rfind("@") + if at_idx == -1: + return None, None + + domain = _clean_domain(address[at_idx + 1 :]) # noqa + local_part = address[:at_idx] + + # Strip off aliased part of email address. + if domain in _YAHOO_DOMAINS: divider = "-" else: divider = "+" @@ -122,4 +326,15 @@ def _hash_email(local_part, domain): if alias_idx > 0: local_part = local_part[:alias_idx] - return hashlib.md5(f"{local_part}@{domain}".encode("UTF-8")).hexdigest() + if domain == "gmail.com": + local_part = local_part.replace(".", "") + + domain_parts = domain.split(".") + if len(domain_parts) > 2: + possible_domain = ".".join(domain_parts[1:]) + if possible_domain in _FASTMAIL_DOMAINS: + domain = possible_domain + if local_part != "": + local_part = domain_parts[0] + + return f"{local_part}@{domain}", domain diff --git a/tests/test_request.py b/tests/test_request.py index b55ee69..9807553 100644 --- a/tests/test_request.py +++ b/tests/test_request.py @@ -1,6 +1,10 @@ import unittest -from minfraud.request import maybe_hash_email, clean_credit_card +from minfraud.request import ( + maybe_hash_email, + clean_credit_card, + _clean_email, +) class TestRequest(unittest.TestCase): @@ -191,3 +195,38 @@ def test_clean_credit_card(self): clean_credit_card(transaction) self.assertEqual(test["expected"], transaction) + + +def test_clean_email(): + tests = [ + {"input": "", "output": None}, + {"input": "fasfs", "output": None}, + {"input": "test@gmail", "output": "test@gmail"}, + {"input": "e4d909c290d0fb1ca068ffaddf22cbd0", "output": None}, + {"input": "Test@maxmind", "output": "test@maxmind"}, + {"input": "Test@maxmind.com", "output": "test@maxmind.com"}, + {"input": "Test+007@maxmind.com", "output": "test@maxmind.com"}, + {"input": "Test+007+008@maxmind.com", "output": "test@maxmind.com"}, + {"input": "Test+@maxmind.com", "output": "test@maxmind.com"}, + {"input": "+@maxmind.com", "output": "+@maxmind.com"}, + {"input": " Test@maxmind.com", "output": "test@maxmind.com"}, + {"input": "Test@maxmind.com|abc124472372", "output": "test@maxmind.com"}, + {"input": "Test+foo@yahoo.com", "output": "test+foo@yahoo.com"}, + {"input": "Test-foo@yahoo.com", "output": "test@yahoo.com"}, + {"input": "Test-foo-foo2@yahoo.com", "output": "test@yahoo.com"}, + {"input": "Test-foo@gmail.com", "output": "test-foo@gmail.com"}, + {"input": "gamil.com@gamil.com", "output": "gamilcom@gmail.com"}, + {"input": "Test+alias@bücher.com", "output": "test@xn--bcher-kva.com"}, + {"input": "foo@googlemail.com", "output": "foo@gmail.com"}, + {"input": "foo.bar@gmail.com", "output": "foobar@gmail.com"}, + {"input": "alias@user.fastmail.com", "output": "user@fastmail.com"}, + {"input": "foo-bar@ymail.com", "output": "foo@ymail.com"}, + {"input": "foo@example.com.com", "output": "foo@example.com"}, + {"input": "foo@example.comfoo", "output": "foo@example.com"}, + {"input": "foo@example.cam", "output": "foo@example.com"}, + {"input": "foo@10000gmail.com", "output": "foo@gmail.com"}, + ] + + for test in tests: + got, _ = _clean_email(test["input"]) + assert test["output"] == got From d39000c692659da21fac9c2e433cce1fbbfc874b Mon Sep 17 00:00:00 2001 From: Marsel Mavletkulov Date: Tue, 13 Feb 2024 15:16:48 -0500 Subject: [PATCH 2/3] Fix sublists indentation --- HISTORY.rst | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 6909dac..d1dbb7f 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -7,8 +7,10 @@ History ++++++++++++++++++ * Added the following new values to the ``/payment/processor`` validation: + * ``pxp_financial`` * ``trustpay`` + * Equivalent domain names are now normalized when ``hash_address`` is used. For example, ``googlemail.com`` will become ``gmail.com``. * Periods are now removed from ``gmail.com`` email address local parts when @@ -21,7 +23,7 @@ History their local part when ``hash_address`` is used. For example, ``foo-bar@yahoo.com`` will become ``foo@yahoo.com`` for additional ``yahoo.com`` domains. -* Duplicate ``.com``s are now removed from email domain names when +* Duplicate ``.com`` s are now removed from email domain names when ``hash_address`` is used. For example, ``example.com.com`` will become ``example.com``. * Extraneous characters after ``.com`` are now removed from email domain @@ -51,6 +53,7 @@ History * IMPORTANT: Python 3.7 or greater is required. If you are using an older version, please use an earlier release. * Added the following new values to the ``/payment/processor`` validation: + * ``google_pay`` * ``placetopay`` * ``shopify_payments`` @@ -85,6 +88,7 @@ History ``response.ip_address.traits.mobile_network_code``. We expect this data to be available by late January, 2022. * Added the following new values to the ``/payment/processor`` validation: + * ``boacompra`` * ``boku`` * ``coregateway`` @@ -96,6 +100,7 @@ History * ``payvision`` * ``trustly`` * ``windcave`` + * The ``/credit_card/last_4_digits`` input has been deprecated in favor of ``/credit_card/last_digits`` and will be removed in a future release. ``last_digits``/``last_4_digits`` also now supports two digit values in @@ -106,6 +111,7 @@ History an ``issuer_id_number`` that contains an eight digit IIN, and if the credit card brand is not one of the following, you should send the last two digits for ``last_digits``: + * ``Discover`` * ``JCB`` * ``Mastercard`` @@ -122,6 +128,7 @@ History ++++++++++++++++++ * Added the following new values to the ``/payment/processor`` validation: + * ``cardknox`` * ``creditguard`` * ``credorax`` @@ -129,6 +136,7 @@ History * ``dlocal`` * ``onpay`` * ``safecharge`` + * Added ``rule_label`` to minFraud output ``/disposition``. * Added ``was_3d_secure_successful`` to ``/credit_card`` validation @@ -157,7 +165,6 @@ History Factors responses. This is available at ``.ip_address.risk_reasons``. It is an array of ``IPRiskReason`` objects. - 2.2.0 (2020-10-13) ++++++++++++++++++ @@ -214,13 +221,16 @@ History +++++++++++++++++++ * Added the following new values to the ``/payment/processor`` validation: + * ``cashfree`` * ``first_atlantic_commerce`` * ``komoju`` * ``paytm`` * ``razorpay`` * ``systempay`` + * Added support for the following new subscores in Factors responses: + * ``device``: the risk associated with the device * ``email_local_part``: the risk associated with the email address local part * ``shipping_address``: the risk associated with the shipping address @@ -252,6 +262,7 @@ History This may be accessed via ``response.email.domain.first_seen`` on the minFraud Insights and Factors response objects. * Added the following new values to the ``/payment/processor`` validation: + * ``cardpay`` * ``epx`` @@ -270,6 +281,7 @@ History to 9,999,999,999,999. Previously, larger numbers were allowed. * Python 3.3 and 3.4 are no longer supported. * Added the following new values to the ``/payment/processor`` validation: + * ``affirm`` * ``afterpay`` * ``cetelem`` @@ -287,6 +299,7 @@ History * ``paysafecard`` * ``smartdebit`` * ``synapsefi`` + * Deprecated the ``email_tenure`` and ``ip_tenure`` attributes of ``minfraud.models.Subscores``. * Deprecated the ``is_high_risk`` attribute of @@ -299,11 +312,13 @@ History * Renamed MaxMind user ID to account ID in the code and added support for the new ``ACCOUNT_ID_REQUIRED`` error code. * Added the following new values to the ``/payment/processor`` validation: + * ``ccavenue`` * ``ct_payments`` * ``dalenys`` * ``oney`` * ``posconnect`` + * Added support for the ``/device/local_time`` output. * Added support for the ``/credit_card/is_virtual`` output. * Added ``payout_change`` to the ``/event/type`` input validation. @@ -316,6 +331,7 @@ History ``geoip2.record.RepresentedCountry``. This attribute is ``True`` if the country is a member state of the European Union. * Added the following new values to the ``/payment/processor`` validation: + * ``cybersource`` * ``transact_pro`` * ``wirecard`` @@ -324,11 +340,13 @@ History ++++++++++++++++++ * Added the following new values to the ``/payment/processor`` validation: + * ``bpoint`` * ``checkout_com`` * ``emerchantpay`` * ``heartland`` * ``payway`` + * Updated ``geoip2`` dependency to add support for GeoIP2 Precision Insights anonymizer fields. @@ -338,6 +356,7 @@ History * Added support for custom inputs. You may set up custom inputs from your account portal. * Added the following new values to the ``/payment/processor`` validation: + * ``american_express_payment_gateway`` * ``bluesnap`` * ``commdoo`` @@ -354,6 +373,7 @@ History * ``vantiv`` * ``vericheck`` * ``vpos`` + * Added the following new input values: ``/device/session_age`` and ``/device/session_id``. * Added support for the ``/email/first_seen`` output. @@ -425,11 +445,13 @@ History ++++++++++++++++++ * Added support for new minFraud Insights outputs. These are: - * ``/credit_card/brand`` - * ``/credit_card/type`` - * ``/device/id`` - * ``/email/is_free`` - * ``/email/is_high_risk`` + + * ``/credit_card/brand`` + * ``/credit_card/type`` + * ``/device/id`` + * ``/email/is_free`` + * ``/email/is_high_risk`` + * ``input`` on the ``Warning`` response model has been replaced with ``input_pointer``. The latter is a JSON pointer to the input that caused the warning. From 45fb77698adf020541fea82e824ef9241d4cec2f Mon Sep 17 00:00:00 2001 From: Marsel Mavletkulov Date: Fri, 16 Feb 2024 11:53:59 -0500 Subject: [PATCH 3/3] Check address for null --- HISTORY.rst | 18 +++++++++--------- minfraud/request.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index d1dbb7f..1985004 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -11,30 +11,30 @@ History * ``pxp_financial`` * ``trustpay`` -* Equivalent domain names are now normalized when ``hash_address`` is used. +* Equivalent domain names are now normalized when ``hash_email`` is used. For example, ``googlemail.com`` will become ``gmail.com``. * Periods are now removed from ``gmail.com`` email address local parts when - ``hash_address`` is used. For example, ``f.o.o@gmail.com`` will become + ``hash_email`` is used. For example, ``f.o.o@gmail.com`` will become ``foo@gmail.com``. * Fastmail alias subdomain email addresses are now normalized when - ``hash_address`` is used. For example, ``alias@user.fastmail.com`` will + ``hash_email`` is used. For example, ``alias@user.fastmail.com`` will become ``user@fastmail.com``. * Additional ``yahoo.com`` email addresses now have aliases removed from - their local part when ``hash_address`` is used. For example, + their local part when ``hash_email`` is used. For example, ``foo-bar@yahoo.com`` will become ``foo@yahoo.com`` for additional ``yahoo.com`` domains. * Duplicate ``.com`` s are now removed from email domain names when - ``hash_address`` is used. For example, ``example.com.com`` will become + ``hash_email`` is used. For example, ``example.com.com`` will become ``example.com``. * Extraneous characters after ``.com`` are now removed from email domain - names when ``hash_address`` is used. For example, ``example.comfoo`` will + names when ``hash_email`` is used. For example, ``example.comfoo`` will become ``example.com``. -* Certain ``.com`` typos are now normalized to ``.com`` when ``hash_address`` is +* Certain ``.com`` typos are now normalized to ``.com`` when ``hash_email`` is used. For example, ``example.cam`` will become ``example.com``. * Additional ``gmail.com`` domain names with leading digits are now - normalized when ``hash_address`` is used. For example, ``100gmail.com`` will + normalized when ``hash_email`` is used. For example, ``100gmail.com`` will become ``gmail.com``. -* Additional ``gmail.com`` typos are now normalized when ``hash_address`` is +* Additional ``gmail.com`` typos are now normalized when ``hash_email`` is used. For example, ``gmali.com`` will become ``gmail.com``. 2.9.0 (2023-12-05) diff --git a/minfraud/request.py b/minfraud/request.py index c36442a..bc36e17 100644 --- a/minfraud/request.py +++ b/minfraud/request.py @@ -283,7 +283,7 @@ def maybe_hash_email(transaction): return address, domain = _clean_email(address) - if not address: + if address is None: return if domain != "" and "domain" not in email: