hash only the domain of the site; include tldextract for this

1 parent ec78950, commit 152e9396131726667ded879ae7163b628712d8ec, @mihneadb committed Sep 18, 2012
1 requirements/pure.txt
@@ -7,3 +7,4 @@ zope.interface==4.0.1
 argparse==1.2.1
 cssutils==0.9.10b1
 django-inmemorystorage==0.1.1
+tldextract==1.1.2
12 spade/scraper/spiders/general_spider.py
@@ -16,11 +16,21 @@
 from hashlib import sha256
 from urlparse import urljoin, urlparse
 import os
+import tldextract
 # Django model
 from spade import model
+# Small helper function for finding only the domain of a URL
+def get_domain(url):
+    try:
+        data = tldextract.extract(url)
+        return '%s.%s' % (data.domain, data.tld)
+    except Exception:
+        return url
+
+
 class GeneralSpider(BaseSpider):
     """
     A generic spider
@@ -77,7 +87,7 @@ def parse(self, response):
         sitescan, ss_created = model.SiteScan.objects.get_or_create(
             batch=self.batch,
-            site_url_hash=sha256(response.url).hexdigest(),
+            site_url_hash=sha256(get_domain(response.url)).hexdigest(),
             defaults={'site_url': response.url})
         if not ss_created:
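
The effect of the one-line change in parse() is that every page under the same registered domain now collapses to a single SiteScan row, instead of one row per full URL. A minimal standalone sketch of the behavior (Python 2, mirroring the committed helper; the URLs are illustrative):

    from hashlib import sha256
    import tldextract

    def get_domain(url):
        try:
            data = tldextract.extract(url)
            return '%s.%s' % (data.domain, data.tld)
        except Exception:
            return url

    # Different subdomains and paths, same registered domain...
    a = get_domain('http://www.theregister.co.uk/2012/09/18/story')
    b = get_domain('http://forums.theregister.co.uk/')
    print a, b  # theregister.co.uk theregister.co.uk

    # ...so both URLs now produce the same site_url_hash.
    assert sha256(a).hexdigest() == sha256(b).hexdigest()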
11,988 vendor/tldextract/.tld_set
11,988 additions, 0 deletions not shown because the diff is too large.
8,586 vendor/tldextract/.tld_set_snapshot
8,586 additions, 0 deletions not shown because the diff is too large.
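
Both data files are pickled rule sets derived from the Public Suffix List: .tld_set is the runtime cache that tldextract.py reads and rewrites (see _get_tld_extractor below), and .tld_set_snapshot is the bundled fallback it loads when no cache exists and fetching is disabled or fails.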
3 vendor/tldextract/__init__.py
@@ -0,0 +1,3 @@
+from tldextract import extract, TLDExtract
+
+__version__ = "1.1.2"
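
The package __init__ re-exports the public API, so the vendored copy is used exactly like the PyPI release. A quick sketch, assuming vendor/ is on sys.path (how spade arranges that is not shown in this commit):

    import tldextract

    ext = tldextract.extract('http://forums.bbc.co.uk/')
    print ext.subdomain, ext.domain, ext.tld  # forums bbc co.uk
    print '%s.%s' % (ext.domain, ext.tld)     # bbc.co.uk, the string get_domain() hashes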
0 vendor/tldextract/tests/__init__.py
New empty file (it makes tests/ an importable package).
102 vendor/tldextract/tests/all.py
@@ -0,0 +1,102 @@
+import doctest
+import logging
+import os
+import sys
+import unittest
+
+import tldextract
+from tldextract import extract
+
+class IntegrationTest(unittest.TestCase):
+    def test_log_snapshot_diff(self):
+        logging.basicConfig(level=logging.DEBUG)
+
+        extractor = tldextract.TLDExtract()
+        try:
+            os.remove(extractor.cache_file)
+        except (IOError, OSError):
+            pass
+
+        # TODO: if .tld_set_snapshot is up to date, this won't trigger a diff
+        extractor('ignore.com')
+
+class ExtractTest(unittest.TestCase):
+    def assertExtract(self, expected_subdomain, expected_domain, expected_tld, url, fns=(extract,)):
+        for fn in fns:
+            ext = fn(url)
+            self.assertEquals(expected_subdomain, ext.subdomain)
+            self.assertEquals(expected_domain, ext.domain)
+            self.assertEquals(expected_tld, ext.tld)
+
+    def test_american(self):
+        self.assertExtract('www', 'google', 'com', 'http://www.google.com')
+
+    def test_british(self):
+        self.assertExtract("www", "theregister", "co.uk", "http://www.theregister.co.uk")
+
+    def test_no_subdomain(self):
+        self.assertExtract("", "gmail", "com", "http://gmail.com")
+
+    def test_nested_subdomain(self):
+        self.assertExtract("media.forums", "theregister", "co.uk", "http://media.forums.theregister.co.uk")
+
+    def test_odd_but_possible(self):
+        self.assertExtract('www', 'www', 'com', 'http://www.www.com')
+        self.assertExtract('', 'www', 'com', 'http://www.com')
+
+    def test_local_host(self):
+        self.assertExtract('', 'wiki', '', 'http://wiki/')
+        self.assertExtract('wiki', 'bizarre', '', 'http://wiki.bizarre')
+
+    def test_qualified_local_host(self):
+        self.assertExtract('', 'wiki', 'info', 'http://wiki.info/')
+        self.assertExtract('wiki', 'information', '', 'http://wiki.information/')
+
+    def test_ip(self):
+        self.assertExtract('', '216.22.0.192', '', 'http://216.22.0.192/')
+        self.assertExtract('216.22', 'project', 'coop', 'http://216.22.project.coop/')
+
+    def test_empty(self):
+        self.assertExtract('', '', '', 'http://')
+
+    def test_scheme(self):
+        self.assertExtract('mail', 'google', 'com', 'https://mail.google.com/mail')
+        self.assertExtract('mail', 'google', 'com', 'ssh://mail.google.com/mail')
+        self.assertExtract('mail', 'google', 'com', '//mail.google.com/mail')
+        self.assertExtract('mail', 'google', 'com', 'mail.google.com/mail', fns=(extract,))
+
+    def test_port(self):
+        self.assertExtract('www', 'github', 'com', 'git+ssh://www.github.com:8443/')
+
+    def test_username(self):
+        self.assertExtract('1337', 'warez', 'com', 'ftp://johndoe:5cr1p7k1dd13@1337.warez.com:2501')
+
+    def test_query(self):
+        self.assertExtract('', 'google', 'com', 'http://google.com?q=cats')
+
+    def test_regex_order(self):
+        self.assertExtract('www', 'parliament', 'uk', 'http://www.parliament.uk')
+        self.assertExtract('www', 'parliament', 'co.uk', 'http://www.parliament.co.uk')
+
+    def test_unhandled_by_iana(self):
+        self.assertExtract('www', 'cgs', 'act.edu.au', 'http://www.cgs.act.edu.au/')
+        self.assertExtract('www', 'google', 'com.au', 'http://www.google.com.au/')
+
+    def test_tld_is_a_website_too(self):
+        self.assertExtract('www', 'metp', 'net.cn', 'http://www.metp.net.cn')
+        #self.assertExtract('www', 'net', 'cn', 'http://www.net.cn') # This is unhandled by the PSL. Or is it?
+
+def test_suite():
+    return unittest.TestSuite([
+        doctest.DocTestSuite(tldextract.tldextract),
+        unittest.TestLoader().loadTestsFromTestCase(IntegrationTest),
+        unittest.TestLoader().loadTestsFromTestCase(ExtractTest),
+    ])
+
+def run_tests(stream=sys.stderr):
+    suite = test_suite()
+    unittest.TextTestRunner(stream).run(suite)
+
+if __name__ == "__main__":
+    run_tests()
+
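
The suite combines the module's doctests with the unit tests above. One way to drive it directly (again assuming vendor/ is importable; this commit does not wire these tests into spade's own test runner):

    from tldextract.tests.all import run_tests

    run_tests()  # prints unittest results to stderr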
230 vendor/tldextract/tldextract.py
@@ -0,0 +1,230 @@
+# -*- coding: utf-8 -*-
+"""`tldextract` accurately separates the gTLD or ccTLD (generic or country code
+top-level domain) from the registered domain and subdomains of a URL.
+
+    >>> import tldextract
+    >>> tldextract.extract('http://forums.news.cnn.com/')
+    ExtractResult(subdomain='forums.news', domain='cnn', tld='com')
+    >>> tldextract.extract('http://forums.bbc.co.uk/') # United Kingdom
+    ExtractResult(subdomain='forums', domain='bbc', tld='co.uk')
+    >>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan
+    ExtractResult(subdomain='www', domain='worldbank', tld='org.kg')
+
+`ExtractResult` is a namedtuple, so it's simple to access the parts you want.
+
+    >>> ext = tldextract.extract('http://forums.bbc.co.uk')
+    >>> ext.domain
+    'bbc'
+    >>> '.'.join(ext[:2]) # rejoin subdomain and domain
+    'forums.bbc'
+"""
+
+from __future__ import with_statement
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+import errno
+from functools import wraps
+import logging
+from operator import itemgetter
+import os
+import sys
+
+try:
+    import pkg_resources
+except ImportError:
+    class pkg_resources(object):
+        """Fake pkg_resources interface which falls back to getting resources
+        inside `tldextract`'s directory.
+        """
+        @classmethod
+        def resource_stream(cls, package, resource_name):
+            moddir = os.path.dirname(__file__)
+            f = os.path.join(moddir, resource_name)
+            return open(f)
+
+import re
+import socket
+import urllib2
+import urlparse
+
+LOG = logging.getLogger("tldextract")
+
+SCHEME_RE = re.compile(r'^([' + urlparse.scheme_chars + ']+:)?//')
+IP_RE = re.compile(r'^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$')
+
+class ExtractResult(tuple):
+    'ExtractResult(subdomain, domain, tld)'
+    __slots__ = ()
+    _fields = ('subdomain', 'domain', 'tld')
+
+    def __new__(_cls, subdomain, domain, tld):
+        'Create new instance of ExtractResult(subdomain, domain, tld)'
+        return tuple.__new__(_cls, (subdomain, domain, tld))
+
+    @classmethod
+    def _make(cls, iterable, new=tuple.__new__, len=len):
+        'Make a new ExtractResult object from a sequence or iterable'
+        result = new(cls, iterable)
+        if len(result) != 3:
+            raise TypeError('Expected 3 arguments, got %d' % len(result))
+        return result
+
+    def __repr__(self):
+        'Return a nicely formatted representation string'
+        return 'ExtractResult(subdomain=%r, domain=%r, tld=%r)' % self
+
+    def _asdict(self):
+        'Return a new dict which maps field names to their values'
+        return dict(zip(self._fields, self))
+
+    def _replace(_self, **kwds):
+        'Return a new ExtractResult object replacing specified fields with new values'
+        result = _self._make(map(kwds.pop, ('subdomain', 'domain', 'tld'), _self))
+        if kwds:
+            raise ValueError('Got unexpected field names: %r' % kwds.keys())
+        return result
+
+    def __getnewargs__(self):
+        'Return self as a plain tuple. Used by copy and pickle.'
+        return tuple(self)
+
+    subdomain = property(itemgetter(0), doc='Alias for field number 0')
+    domain = property(itemgetter(1), doc='Alias for field number 1')
+    tld = property(itemgetter(2), doc='Alias for field number 2')
+
+class TLDExtract(object):
+    def __init__(self, fetch=True, cache_file=''):
+        """
+        Constructs a callable for extracting subdomain, domain, and TLD
+        components from a URL.
+
+        If fetch is True (the default) and no cached TLD set is found, this
+        extractor will fetch TLD sources live over HTTP on first use. Set to
+        False to not make HTTP requests. Either way, if the TLD set can't be
+        read, the module will fall back to the included TLD set snapshot.
+
+        Specifying cache_file will override the location of the TLD set.
+        Defaults to /path/to/tldextract/.tld_set.
+        """
+        self.fetch = fetch
+        self.cache_file = cache_file or os.path.join(os.path.dirname(__file__), '.tld_set')
+        self._extractor = None
+
+    def __call__(self, url):
+        """
+        Takes a string URL and splits it into its subdomain, domain, and
+        gTLD/ccTLD component.
+
+        >>> extract = TLDExtract()
+        >>> extract('http://forums.news.cnn.com/')
+        ExtractResult(subdomain='forums.news', domain='cnn', tld='com')
+        >>> extract('http://forums.bbc.co.uk/')
+        ExtractResult(subdomain='forums', domain='bbc', tld='co.uk')
+        """
+        netloc = SCHEME_RE.sub("", url).partition("/")[0].partition("?")[0]
+        return self._extract(netloc)
+
+    def _extract(self, netloc):
+        netloc = netloc.split("@")[-1].partition(':')[0]
+        registered_domain, tld = self._get_tld_extractor().extract(netloc)
+        if not tld and netloc and netloc[0].isdigit():
+            try:
+                is_ip = socket.inet_aton(netloc)
+                return ExtractResult('', netloc, '')
+            except AttributeError:
+                if IP_RE.match(netloc):
+                    return ExtractResult('', netloc, '')
+            except socket.error:
+                pass
+
+        subdomain, _, domain = registered_domain.rpartition('.')
+        return ExtractResult(subdomain, domain, tld)
+
+    def _get_tld_extractor(self):
+        if self._extractor:
+            return self._extractor
+
+        cached_file = self.cache_file
+        try:
+            with open(cached_file) as f:
+                self._extractor = _PublicSuffixListTLDExtractor(pickle.load(f))
+                return self._extractor
+        except IOError, ioe:
+            file_not_found = ioe.errno == errno.ENOENT
+            if not file_not_found:
+                LOG.error("error reading TLD cache file %s: %s", cached_file, ioe)
+        except Exception, ex:
+            LOG.error("error reading TLD cache file %s: %s", cached_file, ex)
+
+        tlds = frozenset()
+        if self.fetch:
+            tld_sources = (_PublicSuffixListSource,)
+            tlds = frozenset(tld for tld_source in tld_sources for tld in tld_source())
+
+        if not tlds:
+            with pkg_resources.resource_stream(__name__, '.tld_set_snapshot') as snapshot_file:
+                self._extractor = _PublicSuffixListTLDExtractor(pickle.load(snapshot_file))
+                return self._extractor
+
+        LOG.info("computed TLDs: [%s, ...]", ', '.join(list(tlds)[:10]))
+        if LOG.isEnabledFor(logging.DEBUG):
+            import difflib
+            with pkg_resources.resource_stream(__name__, '.tld_set_snapshot') as snapshot_file:
+                snapshot = sorted(pickle.load(snapshot_file))
+            new = sorted(tlds)
+            for line in difflib.unified_diff(snapshot, new, fromfile=".tld_set_snapshot", tofile=cached_file):
+                print >> sys.stderr, line.encode('utf-8')
+
+        try:
+            with open(cached_file, 'wb') as f:
+                pickle.dump(tlds, f)
+        except IOError, e:
+            LOG.warn("unable to cache TLDs in file %s: %s", cached_file, e)
+
+        self._extractor = _PublicSuffixListTLDExtractor(tlds)
+        return self._extractor
+
+TLD_EXTRACTOR = TLDExtract()
+
+@wraps(TLD_EXTRACTOR.__call__)
+def extract(url):
+    return TLD_EXTRACTOR(url)
+
+def _fetch_page(url):
+    try:
+        return unicode(urllib2.urlopen(url).read(), 'utf-8')
+    except urllib2.URLError, e:
+        LOG.error(e)
+        return u''
+
+def _PublicSuffixListSource():
+    page = _fetch_page('http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1')
+
+    tld_finder = re.compile(r'^(?P<tld>[.*!]*\w[\S]*)', re.UNICODE | re.MULTILINE)
+    tlds = [m.group('tld') for m in tld_finder.finditer(page)]
+    return tlds
+
+class _PublicSuffixListTLDExtractor(object):
+    def __init__(self, tlds):
+        self.tlds = tlds
+
+    def extract(self, netloc):
+        spl = netloc.split('.')
+        for i in range(len(spl)):
+            maybe_tld = '.'.join(spl[i:])
+            exception_tld = '!' + maybe_tld
+            if exception_tld in self.tlds:
+                return '.'.join(spl[:i+1]), '.'.join(spl[i+1:])
+
+            wildcard_tld = '*.' + '.'.join(spl[i+1:])
+            if wildcard_tld in self.tlds or maybe_tld in self.tlds:
+                return '.'.join(spl[:i]), maybe_tld
+
+        return netloc, ''
+
+if __name__ == "__main__":
+    url = sys.argv[1]
+    print ' '.join(extract(url))
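
The loop in _PublicSuffixListTLDExtractor.extract is a compact implementation of Public Suffix List matching: for each candidate suffix of the host, an exception rule ('!') wins first, then a wildcard rule ('*.') or an exact rule; the first hit splits the host into registered domain and TLD. A sketch against a tiny stand-in rule set (illustrative only; the real set is built from the full Public Suffix List):

    from tldextract.tldextract import _PublicSuffixListTLDExtractor

    extractor = _PublicSuffixListTLDExtractor(
        frozenset(['uk', 'co.uk', '*.ck', '!www.ck']))

    print extractor.extract('forums.bbc.co.uk')  # ('forums.bbc', 'co.uk')  exact rule
    print extractor.extract('other.ck')          # ('', 'other.ck')         wildcard rule
    print extractor.extract('www.ck')            # ('www', 'ck')            exception rule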
