Permalink
Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
181 lines (158 sloc) 5.56 KB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2010 Watts Lab, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
EFFECTIVE_TLD_NAMES = 'http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1'
def _normalize(s):
from exceptions import UnicodeEncodeError
s = s.strip().strip('.').split()[0].lower()
try:
return s.decode('utf-8')
except UnicodeEncodeError, ex:
return s
class PrefixNode:
'''
An internal class, each prefix in the tree represents a part of a domain name. (e.g. '.com')
The naming is a little confusing, a PrefixTree is the data structure, but the nodes are
used to store the suffixes of partial domain names.
'''
#passing a rule lets you build on entire branch in one call
def __init__(self, prefix, rule=None):
if prefix is not None:
self.is_exception = prefix.startswith('!')
if self.is_exception:
prefix = prefix[1:]
self.prefix = prefix
self.children = dict()
if rule is not None:
self.add(rule)
def __repr__(self):
return 'PrefixNode(\'%s\')' % (self.prefix,)
def add(self, rule):
if len(rule) == 0:
return
prefix = rule[0]
if prefix.startswith('!') and '*' in self.children:
#if this is an exception to a wildcard it should be a child of that wildcard
self.children['*'].add(rule)
return
if prefix in self.children:
self.children[prefix].add(rule[1:])
else:
if prefix.startswith('!'):
prefix = prefix[1:]
self.children[prefix] = PrefixNode( prefix, rule= rule[1:] )
# when given a rule to match, splits the tuple into a matching section and the domain name
# e.g. match( ('com', 'example', 'www') ) -> ( ['.com'], 'example' )
# e.g. match( ('com', 'eff-tld', 'site') ) -> ( ['.eff-tld', '.com'], 'site' )
def match(self, rule):
#print ' %s is matching %s' % (self.prefix, rule)
if len(rule) == 0:
#when a tld is also a hostname
return ( [], [])
if self.prefix == '*':
return self._match_as_wildcard(rule)
prefix = rule[0]
if prefix not in self.children:
if '*' in self.children:
return self.children['*'].match(rule)
return ( [], prefix )
else:
match = self.children[prefix].match( rule[1:] )
child_matched = match[0]
child_matched.append('.' + prefix)
return ( child_matched, match[1] )
def _match_as_wildcard(self, rule):
#print ' %s: matching %s as wildcard. My children are: %s' % (self.prefix, rule, self.children)
prefix = rule[0]
#if prefix matches no exception
if prefix not in self.children:
if len(rule) > 1:
return ( ['.' + prefix], rule[1])
return ( ['.' + prefix], None)
else:
return ( [], prefix)
class PrefixTree(PrefixNode):
'''
Helper to provide a nicer interface for dealing with the tree
'''
#seq is a sequence of tuples representing rules
#of the form ('com'), ('uk', 'co'), etc.
def __init__(self, seq):
PrefixNode.__init__(self, None)
for rule in seq:
self.add(rule)
def __repr__(self):
return '<PrefixTree (%s children)>' % (len(self.children),)
def match(self, s):
rule = tuple(reversed(s.strip().split('.')))
match = PrefixNode.match(self, rule)
#print 'Tree matching %s, match was %s' % (s, match)
if len(match[0]) == 0:
return None
return (''.join(match[0]), match[1])
def domain(self, s):
if s is None:
return None
s = _normalize(s)
match = self.match(s)
try:
return match[1] + match[0]
except:
return None
def domain(self, s):
if s is None:
return None
s = _normalize(s)
match = self.match(s)
try:
return match[1] + match[0]
except:
return None
def _tokenize(lines):
rules = []
for s in lines:
if s and not s.isspace() and not s.startswith('//'):
rule = tuple(reversed(_normalize(s).split('.')))
rules.append(rule)
return rules
def _is_ip(address):
parts = address.split(".")
if len(parts) != 4:
return False
for item in parts:
if not item.isdigit() or not 0 <= int(item) <= 255:
return False
return True
suffixtree = None
'''
Call this first to initialize the suffix tree
'''
def init_suffix_tree(tld_file):
fp = open(tld_file)
suffix_lines = fp.readlines()
suffix_rules = _tokenize(suffix_lines)
fp.close()
global suffixtree
suffixtree = PrefixTree(suffix_rules)
def get_root_domain(domain):
if not domain:
return None
domain = _normalize(domain)
if _is_ip(domain):
return domain
return suffixtree.domain(domain)
if __name__ == '__main__':
import doctest
doctest.testmod()