Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Comparing changes

Choose two branches to see what's changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
...
Checking mergeability… Don't worry, you can still create the pull request.
  • 19 commits
  • 12 files changed
  • 0 commit comments
  • 6 contributors
Commits on Jan 16, 2010
@wilson wilson Taught Domainatrix how to scan text for URLs e3532f1
Commits on Jan 20, 2010
@menno menno Add support for parsing urls with wildcards 90958dc
Commits on Jan 28, 2010
@menno menno Merge branch 'wildcards' into menno-master d01cb85
@wouter wouter Update TLD list to 9b2de76c4d43 d828aa5
@wouter wouter Rewrite TLD detection, add support for TLD exception rules, add extra checks for invalid URLs afc7111
@wouter wouter Bump version, update gemspec 4f6777d
Commits on Jan 29, 2010
@wouter wouter Add better support for wildcard URLs, add to_s for Url, better errorhandling 470a1e3
@wouter wouter Bump version to 0.0.9 1d5eb2a
Commits on Mar 04, 2010
@menno menno Raise Error on missing host 30984eb
Commits on Mar 30, 2010
@menno menno Add the fragment to the path 913c22e
Commits on Aug 16, 2012
@marceldegraaf marceldegraaf Make the specs work on Ruby 1.9.3 16970d5
@marceldegraaf marceldegraaf Add support for parsing ip addresses 6747690
@marceldegraaf marceldegraaf Return empty string instead of nil 0751b13
@marceldegraaf marceldegraaf Add ip_address accessor to find out if a parsed URL was an IP address or not 40e53e8
@marceldegraaf marceldegraaf Merge pull request #1 from menno/ips
Add support for parsing IP addresses
20def50
Commits on Oct 25, 2012
@marceldegraaf marceldegraaf In Ruby 1.9+, force UTF-8 encoding 1ac535e
@marceldegraaf marceldegraaf Open the domains file in UTF-8 mode instead of forcing UTF-8 on each line ef303e5
@marceldegraaf marceldegraaf Make sure Ruby < 1.9 still works 30be2da
Commits on Nov 15, 2012
@wouter wouter IP address should contain 4 parts with 1 to 3 decimals each fa77f0d
View
1  .gitignore
@@ -0,0 +1 @@
+.rvmrc
View
1  .rbenv-version
@@ -0,0 +1 @@
+1.9.3-p194
View
12 domainatrix.gemspec
@@ -2,12 +2,12 @@
Gem::Specification.new do |s|
s.name = %q{domainatrix}
- s.version = "0.0.7"
+ s.version = "0.0.9"
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
- s.authors = ["Paul Dix"]
- s.date = %q{2009-12-10}
- s.email = %q{paul@pauldix.net}
+ s.authors = ["Paul Dix", "Menno van der Sman", "Wouter Broekhof"]
+ s.date = %q{2010-01-28}
+ s.email = %q{info@wakoopa.com}
s.files = [
"lib/domainatrix.rb",
"lib/effective_tld_names.dat",
@@ -20,7 +20,7 @@ Gem::Specification.new do |s|
"spec/domainatrix/domain_parser_spec.rb",
"spec/domainatrix/url_spec.rb"]
s.has_rdoc = true
- s.homepage = %q{http://github.com/pauldix/domainatrix}
+ s.homepage = %q{http://github.com/menno/domainatrix}
s.require_paths = ["lib"]
s.rubygems_version = %q{1.3.5}
s.summary = %q{A cruel mistress that uses the public suffix domain list to dominate URLs by canonicalizing, finding the public suffix, and breaking them into their domain parts.}
@@ -35,4 +35,4 @@ Gem::Specification.new do |s|
end
else
end
-end
+end
View
17 lib/domainatrix.rb
@@ -4,11 +4,24 @@
require 'domainatrix/domain_parser.rb'
require 'domainatrix/url.rb'
+begin
+ require 'uri'
+rescue LoadError
+end
+
module Domainatrix
- VERSION = "0.0.7"
+ VERSION = "0.0.8"
def self.parse(url)
@domain_parser ||= DomainParser.new("#{File.dirname(__FILE__)}/effective_tld_names.dat")
Url.new(@domain_parser.parse(url))
end
-end
+
+ def self.scan(text, &block)
+ @schemes ||= %w(http https)
+
+ urls = URI.extract(text, @schemes).map { |url| parse(url) }
+ urls.map!(&block) if block
+ urls
+ end
+end
View
130 lib/domainatrix/domain_parser.rb
@@ -1,8 +1,14 @@
+require 'rbconfig'
+
module Domainatrix
+ class Error < RuntimeError; end
+ class ParseError < Error; end
+
class DomainParser
include Addressable
-
+
attr_reader :public_suffixes
+ VALID_SCHEMA = /^http[s]{0,1}$/
def initialize(file_name)
@public_suffixes = {}
@@ -10,9 +16,15 @@ def initialize(file_name)
end
def read_dat_file(file_name)
- File.readlines(file_name).each do |line|
+ if RbConfig::CONFIG['MINOR'].to_i < 9
+ lines = File.readlines(file_name)
+ else
+ lines = File.readlines(file_name, $/, :mode => 'r:utf-8')
+ end
+
+ lines.each do |line|
line = line.strip
- unless (line =~ /\/\//) || line.empty?
+ unless (line =~ /^\/\//) || line.empty?
parts = line.split(".").reverse
sub_hash = @public_suffixes
@@ -24,12 +36,20 @@ def read_dat_file(file_name)
end
def parse(url)
- uri = URI.parse(url)
- if uri.query
- path = "#{uri.path}?#{uri.query}"
- else
- path = uri.path
+ uri = begin
+ Addressable::URI.parse(url)
+ rescue Addressable::URI::InvalidURIError
+ nil
end
+
+ raise ParseError, "URL is not parsable by Addressable::URI" if not uri
+ raise ParseError, "URL does not have a valid scheme" unless uri.scheme =~ VALID_SCHEMA
+ raise ParseError, "URL does not have a valid host" if uri.host.nil?
+
+ path = uri.path
+ path << "?#{uri.query}" if uri.query
+ path << "##{uri.fragment}" if uri.fragment
+
parse_domains_from_host(uri.host).merge({
:scheme => uri.scheme,
:host => uri.host,
@@ -38,33 +58,79 @@ def parse(url)
})
end
+ def split_domain(parts, tld_size)
+ if parts.size == 1 and tld_size == 0
+ subdomain = ''
+ domain = '*'
+ tld = ''
+ else
+ # parts are host split on . reversed, eg com.pauldix.www
+ domain_parts = parts.reverse
+ if domain_parts.size - tld_size <= 0
+ raise ParseError, "Invalid TLD size found for #{domain_parts.join('.')}: #{tld_size}"
+ end
+
+ tld = domain_parts.slice!(-tld_size, tld_size).join('.')
+ domain = domain_parts.pop
+ subdomain = domain_parts.join('.')
+ end
+
+ [subdomain, domain, tld]
+ end
+
def parse_domains_from_host(host)
parts = host.split(".").reverse
- public_suffix = []
- domain = ""
- subdomains = []
- sub_hash = @public_suffixes
- parts.each_index do |i|
- part = parts[i]
-
- sub_parts = sub_hash[part]
- sub_hash = sub_parts
- if sub_parts.has_key? "*"
- public_suffix << part
- public_suffix << parts[i+1]
- domain = parts[i+2]
- subdomains = parts.slice(i+3, parts.size)
- break
- elsif sub_parts.empty? || !sub_parts.has_key?(parts[i+1])
- public_suffix << part
- domain = parts[i+1]
- subdomains = parts.slice(i+2, parts.size)
- break
- else
- public_suffix << part
+ ip_address = false
+
+ if host == '*'
+ tld_size = 0
+ elsif !parts.map { |part| part.match(/^\d{1,3}$/) }.include?(nil)
+ # host is an ip address
+ ip_address = true
+ else
+ main_tld = parts.first
+
+ tld_size = 1
+ raise ParseError, "Invalid URL" if parts.size < 2
+
+ if main_tld != '*'
+ raise ParseError, "Invalid characters for TLD" unless main_tld =~ /^[a-z]{2,}/
+ if not current_suffixes = @public_suffixes[main_tld]
+ raise ParseError, "Invalid main TLD: #{main_tld}"
+ end
+
+ parts.each_with_index do |part, i|
+ if current_suffixes.empty?
+ # no extra rules found (eg domain.net)
+ break
+ else
+ if current_suffixes.has_key?("!#{parts[i+1]}")
+ # exception tld domain found (eg metro.tokyo.jp)
+ break
+ elsif current_suffixes.has_key?(parts[i+1])
+ # valid extra domain level found (eg co.uk)
+ tld_size += 1
+ current_suffixes = current_suffixes[parts[i+1]]
+ elsif current_suffixes.has_key?('*')
+ # wildcard domain level (eg *.jp)
+ tld_size += 1
+ break
+ else
+ # no extra rules found (eg domain.net)
+ break
+ end
+ end
+ end
end
end
- {:public_suffix => public_suffix.reverse.join("."), :domain => domain, :subdomain => subdomains.reverse.join(".")}
+
+ if ip_address
+ subdomain, domain, tld = '', host, ''
+ else
+ subdomain, domain, tld = split_domain(parts, tld_size)
+ end
+
+ {:public_suffix => tld, :domain => domain, :subdomain => subdomain, :ip_address => ip_address}
end
end
-end
+end
View
13 lib/domainatrix/url.rb
@@ -1,6 +1,6 @@
module Domainatrix
class Url
- attr_reader :public_suffix, :domain, :subdomain, :path, :url, :scheme, :host
+ attr_accessor :public_suffix, :domain, :subdomain, :path, :url, :scheme, :host, :ip_address
def initialize(attrs = {})
@scheme = attrs[:scheme]
@@ -10,6 +10,7 @@ def initialize(attrs = {})
@domain = attrs[:domain]
@subdomain = attrs[:subdomain]
@path = attrs[:path]
+ @ip_address = attrs[:ip_address]
end
def canonical(options = {})
@@ -23,5 +24,15 @@ def canonical(options = {})
url
end
+
+ def to_s
+ scheme = (@scheme.nil?) ? '' : "#{@scheme}://"
+ parts = []
+ parts << @subdomain if @subdomain and !@subdomain.empty?
+ parts << @domain if @domain and !@domain.empty?
+ parts << @public_suffix if @public_suffix and !@public_suffix.empty?
+
+ "#{scheme}#{parts.join('.')}#{@path}"
+ end
end
end
View
85 lib/effective_tld_names.dat
@@ -178,13 +178,14 @@ com.ai
net.ai
org.ai
-// al : http://www.inima.al/Domains.html
+// al : http://www.ert.gov.al/ert_alb/faq_det.html?Id=31
al
-gov.al
-edu.al
-org.al
com.al
+edu.al
+gov.al
+mil.al
net.al
+org.al
// am : http://en.wikipedia.org/wiki/.am
am
@@ -226,6 +227,7 @@ aq
e164.arpa
in-addr.arpa
ip6.arpa
+iris.arpa
uri.arpa
urn.arpa
@@ -239,11 +241,15 @@ asia
// at : http://en.wikipedia.org/wiki/.at
// Confirmed by registry <it@nic.at> 2008-06-17
at
-gv.at
ac.at
co.at
+gv.at
or.at
+// http://www.info.at/
+biz.at
+info.at
+
// priv.at : http://www.nic.priv.at/
// Submitted by registry <lendl@nic.at> 2008-06-09
priv.at
@@ -305,11 +311,14 @@ rs.ba
// bb : http://en.wikipedia.org/wiki/.bb
bb
+biz.bb
com.bb
edu.bb
gov.bb
+info.bb
net.bb
org.bb
+store.bb
// bd : http://en.wikipedia.org/wiki/.bd
*.bd
@@ -381,8 +390,10 @@ org.bi
biz
// bj : http://en.wikipedia.org/wiki/.bj
-// list of 2nd level tlds ?
bj
+asso.bj
+barreau.bj
+gouv.bj
// bm : http://www.bermudanic.bm/dnr-text.txt
bm
@@ -507,6 +518,9 @@ mil.by
// www.yahoo.com.by, for example), so we list it here for safety's sake.
com.by
+// http://hoster.by/
+of.by
+
// bz : http://en.wikipedia.org/wiki/.bz
// http://www.belizenic.bz/
bz
@@ -678,6 +692,9 @@ us.com
uy.com
za.com
+// Requested by Yngve Pettersen <yngve@opera.com> 2009-11-26
+operaunite.com
+
// coop : http://en.wikipedia.org/wiki/.coop
coop
@@ -807,6 +824,8 @@ fi
// completely removed.
// TODO: Check for updates (expected to be phased out around Q1/2009)
aland.fi
+// iki.fi : Submitted by Hannu Aronsson <haa@iki.fi> 2009-11-05
+iki.fi
// fj : http://en.wikipedia.org/wiki/.fj
*.fj
@@ -2408,7 +2427,22 @@ zoology.museum
иком.museum
// mv : http://en.wikipedia.org/wiki/.mv
-*.mv
+// "mv" included because, contra Wikipedia, google.mv exists.
+mv
+aero.mv
+biz.mv
+com.mv
+coop.mv
+edu.mv
+gov.mv
+info.mv
+int.mv
+mil.mv
+museum.mv
+name.mv
+net.mv
+org.mv
+pro.mv
// mw : http://www.registrar.mw/
mw
@@ -2420,6 +2454,7 @@ coop.mw
edu.mw
gov.mw
int.mw
+museum.mw
net.mw
org.mw
@@ -2471,6 +2506,7 @@ name
// nc : http://www.cctld.nc/
nc
+asso.nc
// ne : http://en.wikipedia.org/wiki/.ne
ne
@@ -3435,7 +3471,6 @@ sr.gov.pl
po.gov.pl
pa.gov.pl
// other functional domains
-med.pl
ngo.pl
irc.pl
usenet.pl
@@ -3525,6 +3560,7 @@ rybnik.pl
rzeszow.pl
sanok.pl
sejny.pl
+siedlce.pl
slask.pl
slupsk.pl
sosnowiec.pl
@@ -3563,6 +3599,7 @@ zgorzelec.pl
gda.pl
gdansk.pl
gdynia.pl
+med.pl
sopot.pl
// other geographical domains
gliwice.pl
@@ -3944,23 +3981,31 @@ sm
// sn : http://en.wikipedia.org/wiki/.sn
sn
+art.sn
+com.sn
+edu.sn
+gouv.sn
+org.sn
+perso.sn
+univ.sn
// sr : http://en.wikipedia.org/wiki/.sr
sr
// st : http://www.nic.st/html/policyrules/
st
-gov.st
-saotome.st
-principe.st
+co.st
+com.st
consulado.st
-org.st
edu.st
+embaixada.st
+gov.st
+mil.st
net.st
-com.st
+org.st
+principe.st
+saotome.st
store.st
-mil.st
-co.st
// su : http://en.wikipedia.org/wiki/.su
su
@@ -4018,17 +4063,19 @@ or.th
tj
ac.tj
biz.tj
-com.tj
co.tj
+com.tj
edu.tj
+go.tj
+gov.tj
int.tj
+mil.tj
name.tj
net.tj
+nic.tj
org.tj
+test.tj
web.tj
-gov.tj
-go.tj
-mil.tj
// tk : http://en.wikipedia.org/wiki/.tk
tk
View
63 spec/domainatrix/domain_parser_spec.rb
@@ -36,12 +36,13 @@
it "includes the scheme" do
@domain_parser.parse("http://www.pauldix.net")[:scheme].should == "http"
end
-
+
it "includes the full host" do
- @domain_parser.parse("http://www.pauldix.net")[:host].should == "www.pauldix.net"
+ @domain_parser.parse("http://www.pauldix.net")[:host].should == "www.pauldix.net"
end
-
+
it "parses out the path" do
+ @domain_parser.parse("http://pauldix.net/foo.html?asdf=foo#bar")[:path].should == "/foo.html?asdf=foo#bar"
@domain_parser.parse("http://pauldix.net/foo.html?asdf=foo")[:path].should == "/foo.html?asdf=foo"
@domain_parser.parse("http://pauldix.net?asdf=foo")[:path].should == "?asdf=foo"
@domain_parser.parse("http://pauldix.net")[:path].should == ""
@@ -67,5 +68,59 @@
@domain_parser.parse("http://foo.pauldix.net")[:subdomain].should == "foo"
@domain_parser.parse("http://bar.foo.pauldix.co.uk")[:subdomain].should == "bar.foo"
end
+
+ it "should accept wildcards" do
+ @domain_parser.parse("http://*.pauldix.net")[:subdomain].should == "*"
+ @domain_parser.parse("http://pauldix.*")[:public_suffix].should == "*"
+ @domain_parser.parse("http://pauldix.net/*")[:path].should == "/*"
+
+ combined = @domain_parser.parse("http://*.pauldix.*/*")
+ combined[:subdomain].should == "*"
+ combined[:domain].should == "pauldix"
+ combined[:public_suffix].should == "*"
+ combined[:path].should == "/*"
+ end
+
+ it "should parse a URL if it has a wildcard exception" do
+ @domain_parser.parse("http://metro.tokyo.jp")[:domain].should == "metro"
+ end
+
+ it "should throw an exception if the tld is not valid" do
+ lambda { @domain_parser.parse("http://pauldix.nett") }.should raise_error(Domainatrix::ParseError)
+ end
+
+ it "should throw an exception if the domain doesn't contain a valid host" do
+ lambda { @domain_parser.parse("http://co.jp") }.should raise_error(Domainatrix::ParseError)
+ end
+
+ it "should throw an exception if the domain doesn't have a valid scheme" do
+ lambda { @domain_parser.parse("pauldix.net") }.should raise_error(Domainatrix::ParseError)
+ end
+
+ it "should throw an exception if the domain contains an invalid character" do
+ lambda { @domain_parser.parse("http://pauldix,net") }.should raise_error(Domainatrix::ParseError)
+ end
+
+ it "should thrown an exception if the url is malformed" do
+ lambda { @domain_parser.parse("http:/") }.should raise_error(Domainatrix::ParseError)
+ end
+
+ it "parses an ip address" do
+ @domain_parser.parse("http://123.123.123.123/foo/bar")[:domain].should == "123.123.123.123"
+ @domain_parser.parse("http://123.123.123.123/foo/bar")[:path].should == "/foo/bar"
+ @domain_parser.parse("http://123.123.123.123/foo/bar")[:ip_address].should == true
+ end
+
+ it "parses a host with numeric domain" do
+ @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:subdomain].should == "123.123"
+ @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:domain].should == "123"
+ @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:public_suffix].should == "co.uk"
+ @domain_parser.parse("http://123.123.123.co.uk/foo/bar")[:ip_address].should == false
+ end
+
+ it "should not parse an invalip ip address" do
+ lambda { @domain_parser.parse("http://12345") }.should raise_error(Domainatrix::ParseError)
+ end
+
end
-end
+end
View
10 spec/domainatrix/url_spec.rb
@@ -21,6 +21,10 @@
Domainatrix::Url.new(:path => "/asdf.html").path.should == "/asdf.html"
end
+ it "reports if it is an ip address" do
+ Domainatrix::Url.new(:ip_address => true).ip_address.should == true
+ end
+
it "canonicalizes the url" do
Domainatrix::Url.new(:domain => "pauldix", :public_suffix => "net").canonical.should == "net.pauldix"
Domainatrix::Url.new(:subdomain => "foo", :domain => "pauldix", :public_suffix => "net").canonical.should == "net.pauldix.foo"
@@ -38,4 +42,10 @@
it "canonicalizes the url without the path" do
Domainatrix::Url.new(:subdomain => "foo", :domain => "pauldix", :public_suffix => "net").canonical(:include_path => false).should == "net.pauldix.foo"
end
+
+ it "converts the url to a string" do
+ Domainatrix::Url.new(:scheme => "http", :subdomain => "www", :domain => "pauldix", :public_suffix => "net", :path => "/some/path").to_s.should == "http://www.pauldix.net/some/path"
+ Domainatrix::Url.new(:subdomain => "www", :domain => "pauldix", :public_suffix => "net", :path => "/some/path").to_s.should == "www.pauldix.net/some/path"
+ Domainatrix::Url.new(:domain => "pauldix", :public_suffix => "co.uk").to_s.should == "pauldix.co.uk"
+ end
end
View
52 spec/domainatrix_spec.rb
@@ -1,16 +1,48 @@
require File.dirname(__FILE__) + '/spec_helper'
-describe "domainatrix" do
- it "should parse into a url object" do
- Domainatrix.parse("http://pauldix.net").should be_a Domainatrix::Url
+describe Domainatrix do
+ describe ".parse" do
+ it "should convert a string into a url object" do
+ Domainatrix.parse("http://pauldix.net").should be_a Domainatrix::Url
+ end
+
+ it "should canonicalize" do
+ Domainatrix.parse("http://pauldix.net").canonical.should == "net.pauldix"
+ Domainatrix.parse("http://pauldix.net/foo.html").canonical.should == "net.pauldix/foo.html"
+ Domainatrix.parse("http://pauldix.net/foo.html?asdf=bar").canonical.should == "net.pauldix/foo.html?asdf=bar"
+ Domainatrix.parse("http://foo.pauldix.net").canonical.should == "net.pauldix.foo"
+ Domainatrix.parse("http://foo.bar.pauldix.net").canonical.should == "net.pauldix.bar.foo"
+ Domainatrix.parse("http://pauldix.co.uk").canonical.should == "uk.co.pauldix"
+ end
end
- it "should canonicalize" do
- Domainatrix.parse("http://pauldix.net").canonical.should == "net.pauldix"
- Domainatrix.parse("http://pauldix.net/foo.html").canonical.should == "net.pauldix/foo.html"
- Domainatrix.parse("http://pauldix.net/foo.html?asdf=bar").canonical.should == "net.pauldix/foo.html?asdf=bar"
- Domainatrix.parse("http://foo.pauldix.net").canonical.should == "net.pauldix.foo"
- Domainatrix.parse("http://foo.bar.pauldix.net").canonical.should == "net.pauldix.bar.foo"
- Domainatrix.parse("http://pauldix.co.uk").canonical.should == "uk.co.pauldix"
+ describe ".scan" do
+ it "parses the url found in a string" do
+ input = "HAHA. This is why Conan should stay: http://losangeles.craigslist.org/sfv/clt/1551463643.html"
+ url = Domainatrix.scan(input).first
+ url.canonical.should == "org.craigslist.losangeles/sfv/clt/1551463643.html"
+ end
+
+ it "finds multiple urls in a string" do
+ input = <<-TEXT
+ http://google.com
+ and then http://yahoo.com
+ TEXT
+ google, yahoo = Domainatrix.scan(input)
+ google.domain.should == "google"
+ yahoo.domain.should == "yahoo"
+ end
+
+ it "returns a map of results when given a block" do
+ input = "http://a.com https://b.com"
+ domains = Domainatrix.scan(input) do |url|
+ url.domain
+ end
+ domains.should == %w(a b)
+ end
+
+ it "returns an empty array when no urls are found" do
+ Domainatrix.scan("Nope").should == []
+ end
end
end
View
1  spec/spec.opts
@@ -1,2 +1,3 @@
--diff
--color
+--backtrace
View
12 spec/spec_helper.rb
@@ -1,5 +1,9 @@
require "rubygems"
-require "spec"
+
+begin
+ require "spec"
+rescue LoadError
+end
# gem install redgreen for colored test output
begin require "redgreen" unless ENV['TM_CURRENT_LINE']; rescue LoadError; end
@@ -7,4 +11,8 @@
path = File.expand_path(File.dirname(__FILE__) + "/../lib/")
$LOAD_PATH.unshift(path) unless $LOAD_PATH.include?(path)
-require "lib/domainatrix"
+begin
+ require "lib/domainatrix"
+rescue LoadError
+ require 'domainatrix'
+end

No commit comments for this range

Something went wrong with that request. Please try again.