Skip to content

Commit

Permalink
refactor: do not use re
Browse files Browse the repository at this point in the history
  • Loading branch information
ninoseki committed Oct 21, 2021
1 parent b6eee86 commit 15662b0
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 9 deletions.
20 changes: 11 additions & 9 deletions app/services/extractor.py
@@ -1,27 +1,29 @@
import urllib.parse
from typing import List

import html2text
from bs4 import BeautifulSoup
from ioc_finder import parse_urls
import re
import urllib.parse


def is_html(content_type: str) -> bool:
    """Return True when *content_type* denotes an HTML document.

    The check is a substring match, so values carrying parameters
    (for example ``"text/html; charset=utf-8"``) are accepted as well.
    """
    return "text/html" in content_type


def unpack_safelink_url(url: str) -> str:
    """Convert a Microsoft safelink back to the URL it wraps.

    A URL is treated as a safelink when its network location ends with
    ``.safelinks.protection.outlook.com``; the wrapped target is read from
    the ``url`` query parameter. Any other input, including a safelink
    without a ``url`` parameter or a string that cannot be parsed as a URL,
    is returned unchanged.
    """
    try:
        parsed = urllib.parse.urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. unbalanced IPv6
        # brackets); such a string cannot be a safelink, so pass it through.
        return url

    if not parsed.netloc.endswith(".safelinks.protection.outlook.com"):
        return url

    safelink_urls = urllib.parse.parse_qs(parsed.query).get("url")
    if not safelink_urls:
        return url

    # parse_qs has already percent-decoded the value. Decoding it a second
    # time would corrupt targets that legitimately contain escaped "%"
    # sequences, so the first value is returned as-is.
    return safelink_urls[0]


def unpack_safelink_urls(urls: List[str]) -> List[str]:
    """Apply ``unpack_safelink_url`` to every URL, preserving order."""
    return [unpack_safelink_url(url) for url in urls]


def normalize_url(url: str):
Expand Down Expand Up @@ -61,4 +63,4 @@ def parse_urls_from_body(content: str, content_type: str) -> List[str]:
)

urls.extend(parse_urls(content, parse_urls_without_scheme=False))
return desafelink_urls(normalize_urls(urls))
return normalize_urls(unpack_safelink_urls(urls))
8 changes: 8 additions & 0 deletions tests/services/test_extractor.py
Expand Up @@ -25,3 +25,11 @@ def test_parse_urls_from_body_with_text():
)
assert len(urls) == 1
assert "http://example.com" in urls


def test_parse_urls_with_safelinks():
    """A safelink found in a plain-text body is reported as its wrapped URL."""
    urls = parse_urls_from_body(
        "https://eur03.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.google.com%2F",
        "text/plain",
    )
    assert "https://www.google.com/" in urls

0 comments on commit 15662b0

Please sign in to comment.