Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Properly escaping DBpedia URIs in dump

  • Loading branch information...
commit 934231212040bee29de39ce67cbd95f09025ea93 1 parent e79af99
Yves Raimond moustaki authored
2  Gemfile
View
@@ -24,7 +24,7 @@ group :test do
gem 'rspec', '>=2.7.0'
gem 'mocha'
gem 'fakeweb'
- gem 'rcov'
+ gem 'simplecov'
gem 'rack-test', :require => 'rack/test'
gem 'rdf-rdfa', :require => 'rdf/rdfa'
gem 'equivalent-xml'
8 Gemfile.lock
View
@@ -15,6 +15,7 @@ GEM
metaclass (0.0.1)
mocha (0.11.4)
metaclass (~> 0.0.1)
+ multi_json (1.3.6)
nokogiri (1.5.2)
rack (1.4.1)
rack-protection (1.2.0)
@@ -22,7 +23,6 @@ GEM
rack-test (0.6.1)
rack (>= 1.0)
rake (0.9.2.2)
- rcov (1.0.0)
rdf (0.3.5.2)
addressable (>= 2.2.6)
rdf-json (0.3.0)
@@ -54,6 +54,10 @@ GEM
rspec-mocks (2.10.1)
shotgun (0.9)
rack (>= 1.0)
+ simplecov (0.6.4)
+ multi_json (~> 1.0)
+ simplecov-html (~> 0.5.3)
+ simplecov-html (0.5.3)
sinatra (1.3.2)
rack (~> 1.3, >= 1.3.6)
rack-protection (~> 1.2)
@@ -75,7 +79,6 @@ DEPENDENCIES
nokogiri
rack-test
rake
- rcov
rdf (>= 0.3.1)
rdf-json
rdf-rdfa
@@ -85,5 +88,6 @@ DEPENDENCIES
rdiscount
rspec (>= 2.7.0)
shotgun
+ simplecov
sinatra
sinatra-content-for
6 lib/wikipedia_api.rb
View
@@ -26,6 +26,7 @@ def initialize(pageid, title)
HTTP_TIMEOUT = 5
NBSP = Nokogiri::HTML("&nbsp;").text
UNSAFE_REGEXP = Regexp.new('[^-_\.!~*\'()a-zA-Z0-9;/:@&=$,]', false, 'N').freeze
+ DBPEDIA_UNSAFE_REGEXP = Regexp.new('[^a-zA-Z0-9\.\-*/:_,&]', false, 'N').freeze
def self.escape_query(str)
URI::escape(str, UNSAFE_REGEXP)
@@ -35,6 +36,11 @@ def self.escape_title(title)
URI::escape(title.gsub(' ','_'), ' ?#%"+=')
end
+ def self.title_to_dbpedia_key(title)
+ # From http://dbpedia.org/URIencoding
+ URI::escape(title.gsub(' ', '_').squeeze('_'), DBPEDIA_UNSAFE_REGEXP)
+ end
+
def self.clean_displaytitle(hash)
if hash['displaytitle']
hash['displaytitle'] = Nokogiri::HTML(hash['displaytitle']).text
2  scripts/generate_sameas.rb
View
@@ -87,7 +87,7 @@ def characters(string)
end
def dbpedia_uri(title)
- escaped = WikipediaApi.escape_title(title)
+ escaped = WikipediaApi.title_to_dbpedia_key(title)
RDF::URI("http://dbpedia.org/resource/#{escaped}")
end
8 spec/wikipedia_api_spec.rb
View
@@ -1,7 +1,15 @@
+# encoding: utf-8
require 'spec_helper'
require 'wikipedia_api'
describe WikipediaApi do
+ context "escaping a page title to a DBpedia key" do
+ it "should apply the encoding rules from dbpedia.org" do
+ WikipediaApi.title_to_dbpedia_key('Mozambique (Portugal)').should == 'Mozambique_%28Portugal%29'
+ WikipediaApi.title_to_dbpedia_key('S/2012_P_1').should == 'S/2012_P_1'
+ end
+ end
+
context "escaping a page title" do
it "should convert 'AC/DC' to 'AC/DC'" do
WikipediaApi.escape_title('AC/DC').should == 'AC/DC'
Please sign in to comment.
Something went wrong with that request. Please try again.