Skip to content

Commit

Permalink
Introduced tldextract and the basic if/else structure I'm thinking ab…
Browse files Browse the repository at this point in the history
…out for the blacklists
  • Loading branch information
palewire committed Jul 28, 2014
1 parent 76641ac commit bfd40f8
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 3 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ python-coveralls==2.4.2
requests==2.3.0
sh==1.09
six==1.7.3
tldextract==1.4
tox==1.7.2
virtualenv==1.11.6
wsgiref==0.1.2
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

install_requires = [
'six==1.7.2',
'tldextract==1.4',
]

setup(
Expand Down
36 changes: 35 additions & 1 deletion storysniffer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import tldextract
try:
from urlparse import urlparse
except ImportError:
Expand All @@ -19,12 +22,32 @@
re.IGNORECASE
)

# A list of URL paths that probably won't link to new stories
# A list of URL parts that probably won't link to new stories
# Registered domain names; guess() rejects a URL whose tldextract
# `domain` part appears here.
DOMAIN_BLACKLIST = (
    'google',
    'twitter',
    'facebook',
    'doubleclick',
)

# Subdomains; guess() rejects a URL whose tldextract `subdomain` part
# is exactly one of these values.
SUBDOMAIN_BLACKLIST = (
    'careers',
    'mail',
)

# Public suffixes; guess() rejects a URL whose tldextract `suffix` part
# appears here.
TLD_BLACKLIST = (
    'xxx',
)

# URL paths; guess() rejects a URL whose parsed path is empty or just
# the root slash.
PATH_BLACKLIST = (
    '',
    '/',
)

# Placeholder that is currently empty.
# NOTE(review): the name looks like a typo for EXT_BLACKLIST, and it is
# not referenced by the code visible in this change -- confirm before
# renaming.
EXT_BLACKIST = ()


def guess(url):
"""
Expand All @@ -35,10 +58,21 @@ def guess(url):
if not URL_REGEX.search(url):
raise ValueError("Provided url does not match acceptable URL patterns")

# Parse the url into parts so we can inspect them
urlparts = urlparse(url)
tldparts = tldextract.extract(url)

if urlparts.path in PATH_BLACKLIST:
return False

if tldparts.domain in DOMAIN_BLACKLIST:
return False

if tldparts.subdomain in SUBDOMAIN_BLACKLIST:
return False

if tldparts.suffix in TLD_BLACKLIST:
return False

# If you've made it this far, return True
return True
8 changes: 6 additions & 2 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@ def test_busted(self):
storysniffer.guess(self.busted)

def test_guess(self):
self.assertTrue(storysniffer.guess(self.yes))
self.assertFalse(storysniffer.guess(self.no))
func = storysniffer.guess
self.assertTrue(func(self.yes))
self.assertFalse(func(self.no))
self.assertFalse(func("http://www.facebook.com/foobar/"))
self.assertFalse(func("http://careers.cnn.com/foobar/"))
self.assertFalse(func("http://www.news.xxx/foobar/"))


if __name__ == '__main__':
Expand Down

0 comments on commit bfd40f8

Please sign in to comment.