From 15662b0947c3ab34d754133266c189cd7f44c378 Mon Sep 17 00:00:00 2001 From: Manabu Niseki Date: Fri, 22 Oct 2021 05:51:32 +0900 Subject: [PATCH] refactor: do not use re --- app/services/extractor.py | 20 +++++++++++--------- tests/services/test_extractor.py | 8 ++++++++ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/app/services/extractor.py b/app/services/extractor.py index 8c25cc9..84d910f 100644 --- a/app/services/extractor.py +++ b/app/services/extractor.py @@ -1,27 +1,29 @@ +import urllib.parse from typing import List import html2text from bs4 import BeautifulSoup from ioc_finder import parse_urls -import re -import urllib.parse def is_html(content_type: str) -> bool: return "text/html" in content_type -def desafelink_url(url: str): +def unpack_safelink_url(url: str) -> str: # convert a Microsoft safelink back to a normal URL - match = re.search(r"https?://[^/]+\.safelinks\.protection\.outlook\.com/\?url=([^&]+)", url) - if match: - url = urllib.parse.unquote(match.group(1)) + parsed = urllib.parse.urlparse(url) + if parsed.netloc.endswith(".safelinks.protection.outlook.com"): + parsed_query = urllib.parse.parse_qs(parsed.query) + safelink_urls = parsed_query.get("url") + if safelink_urls is not None: + return urllib.parse.unquote(safelink_urls[0]) return url -def desafelink_urls(urls: List[str]) -> List[str]: - return [desafelink_url(url) for url in urls] +def unpack_safelink_urls(urls: List[str]) -> List[str]: + return [unpack_safelink_url(url) for url in urls] def normalize_url(url: str): @@ -61,4 +63,4 @@ def parse_urls_from_body(content: str, content_type: str) -> List[str]: ) urls.extend(parse_urls(content, parse_urls_without_scheme=False)) - return desafelink_urls(normalize_urls(urls)) + return normalize_urls(unpack_safelink_urls(urls)) diff --git a/tests/services/test_extractor.py b/tests/services/test_extractor.py index 25f1578..72137cc 100644 --- a/tests/services/test_extractor.py +++ b/tests/services/test_extractor.py @@ -25,3 +25,11 @@ def test_parse_urls_from_body_with_text(): ) assert len(urls) == 1 assert "http://example.com" in urls + + +def test_parse_urls_with_safelinks(): + urls = parse_urls_from_body( + "https://eur03.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.google.com%2F", + "text/plain", + ) + assert "https://www.google.com/" in urls