Skip to content

Commit

Permalink
Basic Wikidata Entity
Browse files Browse the repository at this point in the history
  • Loading branch information
zverok committed Feb 2, 2016
1 parent 5294669 commit cccd672
Show file tree
Hide file tree
Showing 9 changed files with 4,786 additions and 1 deletion.
6 changes: 5 additions & 1 deletion Gemfile
Expand Up @@ -5,10 +5,14 @@ source 'https://rubygems.org'
gemspec

# Gems that are only installed for the :development Bundler group.
group :development do
# NOTE(review): progress_bar is declared here from a git fork and appears again
# further down (once live, once commented out). This looks like the old and new
# lines of a diff rendered together — confirm that only one live declaration
# remains in the real Gemfile.
gem 'progress_bar', git: 'git://github.com/zverok/progress_bar'

gem 'nokogiri'
gem 'addressable'
gem 'naught'
gem 'faraday'
gem 'faraday_middleware'
gem 'progress_bar'
#gem 'progress_bar'
gem 'rake'
gem 'rubygems-tasks'
end
1 change: 1 addition & 0 deletions data/wikidata-predicates.json

Large diffs are not rendered by default.

80 changes: 80 additions & 0 deletions lib/reality/wikidata.rb
@@ -0,0 +1,80 @@
module Reality
module Wikidata
# Reference to another Wikidata entity: an id plus an optional label.
class Link
  attr_reader :id, :label

  # id    - identifier of the referenced entity, stored as given.
  # label - optional label for it; nil when not supplied.
  def initialize(id, label = nil)
    @id = id
    @label = label
  end
end

# A Wikidata entity: an id plus its statements, kept as a hash of
# predicate id => Array of parsed values.
class Entity
  XSD_DECIMAL = 'http://www.w3.org/2001/XMLSchema#decimal'.freeze
  WKT_LITERAL = 'http://www.opengis.net/ont/geosparql#wktLiteral'.freeze

  # "Point(a b)", case-insensitive; each number may carry a sign and an exponent.
  WKT_POINT = /\A\s*point\s*\(\s*([-+]?[\d.]+(?:e[-+]?\d+)?)\s+([-+]?[\d.]+(?:e[-+]?\d+)?)\s*\)\s*\z/i

  # Decimal lexical form that has a fractional part.
  FRACTIONAL = /\A\s*[-+]?\d*\.\d+\s*\z/

  class << self
    # Builds entities from the JSON text of a SPARQL query response.
    #
    # sparql_json - String; parsed JSON must contain ['results']['bindings'].
    # subject, predicate, object, object_label - names of the binding
    #   variables that hold each part of a row.
    #
    # Returns an Array of Entity, one per distinct subject id, in order of
    # first appearance.
    def from_sparql(sparql_json, subject: 'subject', predicate: 'predicate', object: 'object', object_label: 'object_label')
      bindings = JSON.parse(sparql_json)['results']['bindings']

      triples = bindings.map do |row|
        [
          row[subject]['value'].sub('http://www.wikidata.org/entity/', ''),
          row[predicate]['value'].sub('http://www.wikidata.org/prop/direct/', ''),
          # A row may come without a label binding; use nil instead of
          # crashing on the missing key.
          row[object].merge('label' => (row[object_label] || {})['value'])
        ]
      end

      triples.group_by(&:first).map { |id, rows| new(id, hash_from_predicates(rows)) }
    end

    # Turns [subject, predicate, object] rows into
    # { predicate => [parsed value, ...] }, keeping row order.
    def hash_from_predicates(rows)
      rows.each_with_object({}) do |(_subject, pred, obj), result|
        (result[pred] ||= []) << parse_value(obj)
      end
    end

    # Parses one SPARQL binding hash according to its 'type'.
    #
    # Raises ArgumentError for any type other than 'literal' and 'uri'.
    def parse_value(hash)
      case hash['type']
      when 'literal'
        parse_literal(hash)
      when 'uri'
        parse_uri(hash)
      else
        fail ArgumentError, "Unidentifieble datatype: #{hash['type']}"
      end
    end

    # Returns a Link (carrying the binding's 'label') for Wikidata entity
    # URIs, and the raw value for every other URI.
    def parse_uri(hash)
      entity = %r{https?://www\.wikidata\.org/entity/([^/]+)$}.match(hash['value'])
      entity ? Link.new(entity[1], hash['label']) : hash['value']
    end

    # Parses a literal binding according to its 'datatype'; literals with
    # any other (or no) datatype are returned as their raw value.
    #
    # Raises ArgumentError when a WKT literal is not a parseable point.
    def parse_literal(hash)
      case hash['datatype']
      when XSD_DECIMAL
        parse_decimal(hash['value'])
      when WKT_LITERAL
        parse_wkt_point(hash['value'])
      else
        hash['value']
      end
    end

    private

    # Values with a fractional part become Float instead of being truncated;
    # everything else keeps the Integer conversion.
    def parse_decimal(text)
      text =~ FRACTIONAL ? text.to_f : text.to_i
    end

    # TODO: WTF
    # NOTE(review): the first number is passed as latitude and the second as
    # longitude; confirm that this matches the order the endpoint emits.
    def parse_wkt_point(text)
      point = WKT_POINT.match(text)
      fail ArgumentError, "Unparseable WKT: #{text}" unless point

      Geo::Coord.new(point[1].to_f, point[2].to_f)
    end
  end

  attr_reader :id

  # id         - entity identifier.
  # predicates - Hash of predicate id => Array of values.
  def initialize(id, predicates)
    @id = id
    @predicates = predicates
  end

  # Returns the values stored for +pred+, or nil when there are none.
  def [](pred)
    @predicates[pred]
  end
end
end
end
23 changes: 23 additions & 0 deletions script/extract_wikidata_properties.rb
@@ -0,0 +1,23 @@
#!/usr/bin/env ruby
require 'bundler/setup'
require 'open-uri'
require 'progress_bar/core_ext/enumerable_with_progress'
require 'nokogiri'
require 'json'
require_relative 'lib/nokogiri_more'

# Downloads +url+ and returns it parsed as an HTML document.
# Goes through URI#open (provided by open-uri) because Kernel#open no longer
# opens URLs on Ruby 3+; the block form closes the downloaded stream.
fetch_html = lambda do |url|
  URI.parse(url).open { |io| Nokogiri::HTML(io.read) }
end

start = fetch_html.call('https://www.wikidata.org/wiki/Wikidata:List_of_properties')

# Follow every link in the "By number" row of the index page and, for each
# linked page, build a hash from its table rows:
# second cell text => first cell text. Rows lacking either cell are dropped.
# The per-page hashes are then merged into one.
res = start.
  search('th:contains("By number")').first.parent.search('td > a').
  with_progress.map { |a|
    fetch_html.call('https://www.wikidata.org' + a.href).
      search('tr').
      map { |tr| tr.search('td').map(&:text) }.
      map { |tds| [tds[1], tds[0]] }.
      reject { |key, value| key.nil? || value.nil? }.
      to_h
  }.inject(&:merge)

File.write 'data/wikidata-predicates.json', res.to_json
175 changes: 175 additions & 0 deletions script/lib/nokogiri_more.rb
@@ -0,0 +1,175 @@
require 'naught'
require 'addressable/uri'

module Nokogiri
module More
# Guard helper for methods that only make sense on certain nodes.
module NodeOnlyFor
  # Returns the result of matches?(selector) when it is truthy.
  #
  # Raises ArgumentError when the receiver does not match +selector+.
  def only_for!(selector)
    matched = matches?(selector)
    fail(ArgumentError, "Doesn't works for nodes other than '#{selector}'") unless matched

    matched
  end
end

# Adds #href: the node's 'href' attribute passed through the owning
# document's #absolute.
module NodeHref
  include NodeOnlyFor

  # Returns document.absolute applied to this node's 'href' attribute.
  def href
    #only_for!('a[href]')
    link = self['href']
    document.absolute(link)
  end
end

# URI helpers for a document, based on its #url.
module DocumentURI
  # Returns the document's url parsed by Addressable::URI, or nil when the
  # document has no url.
  def uri
    return nil unless url

    Addressable::URI.parse(url)
  end

  # Resolves +link+ against the document's uri and returns it as a String.
  # Without a document uri the link is only round-tripped through the
  # parser (double check it's really a link).
  def absolute(link)
    base = uri
    resolved = base ? base + link : Addressable::URI.parse(link)
    resolved.to_s
  end
end

# Splits a node's children into consecutive groups that follow a sequence of
# selectors (for example runs of dt followed by runs of dd).
module NodeChildrenGroups
  include NodeOnlyFor

  # Keeps only the children matching any of +selectors+ and cuts them into
  # groups. Each group is an Array of Arrays, one inner Array per selector
  # position reached; a position with no nodes stays empty.
  #
  # Returns an Array of groups ([] when nothing matches).
  def children_groups(*selectors)
    groups = []
    flat = children.select { |node| selectors.any? { |s| node.matches?(s) } }
    groups << make_group(flat, selectors) until flat.empty?
    groups
  end

  # Groups the dt/dd children of a dl node.
  #
  # Raises ArgumentError (through only_for!) for any node other than 'dl'.
  def each_term
    only_for!('dl')
    children_groups('dt', 'dd')
  end

  private

  # Consumes nodes from the front of +flat+ (mutating it) to build one group.
  # A node matching the current selector joins the current run; a node
  # matching a later selector starts the run for that selector, with empty
  # runs for every selector skipped in between; a node matching none of the
  # remaining selectors ends the group.
  #
  # Advancing over any later selector, not just the next one, guarantees
  # that the first node of a fresh group is always consumed. Without that,
  # a head node matching only the third-or-later selector was never shifted
  # and children_groups looped forever.
  def make_group(flat, selectors)
    sel = selectors.dup
    group = [[]]
    until flat.empty?
      idx = sel.index { |s| flat.first.matches?(s) }
      break if idx.nil?

      sel.shift(idx)
      idx.times { group << [] }
      group.last << flat.shift
    end
    group
  end
end

# Text helper for nodes.
module NodeText
  # Returns the node's text with leading and trailing whitespace removed.
  def text_
    raw = text
    raw.strip
  end
end

# Null-object class built with the Naught gem; the `?` finders in this file
# return an instance of it when no node is found.
NodeNaught = Naught.build do |config|
# NOTE(review): presumably makes every unknown message return the null object
# itself, so call chains on a missing node keep going — confirm in Naught docs.
config.black_hole

# Overrides tap to return self without yielding, so a block given to tap is
# not run for a missing node.
def tap # so you can just at?(selector).tap{|node| - and never be here, if it's not found
self
end
end

# Raised by the bang finders (at!, at_css!, at_xpath!, find_child!) when
# nothing matches the selector.
class NodeNotFound < RuntimeError; end

# Strict finders: same lookups as at / at_css / at_xpath / find_child, but
# they raise NodeNotFound instead of returning nil when nothing is found.
module NodeBangMethods
  # Returns at(selector); raises NodeNotFound when it finds nothing.
  def at!(selector)
    bang!(at(selector), selector)
  end

  # Returns at_css(selector); raises NodeNotFound when it finds nothing.
  def at_css!(selector)
    bang!(at_css(selector), selector)
  end

  # Returns at_xpath(selector); raises NodeNotFound when it finds nothing.
  def at_xpath!(selector)
    bang!(at_xpath(selector), selector)
  end

  # Returns find_child(selector); raises NodeNotFound when it finds nothing.
  def find_child!(selector)
    bang!(find_child(selector), selector)
  end

  private

  # Returns +node+ when it is truthy, otherwise raises through no_node!.
  def bang!(node, selector)
    node || no_node!(selector)
  end

  # Raises NodeNotFound naming the receiver and the selector.
  def no_node!(selector)
    # This module is also mixed into node sets, which may not define #name.
    # Fall back to the class name so the failure is still NodeNotFound and
    # not a NoMethodError raised while building the message.
    owner = respond_to?(:name, true) ? name : self.class.name

    #case Nokogiri::More::Config.bang_mode
    #when :fail
    fail NodeNotFound, "#{owner} have no node at #{selector}"
    #when :naught
    #NodeNaught.new
    #when :log
    #NodeNaught.new
    #end
  end
end

# Lenient finders: same lookups as at / at_css / at_xpath / find_child, but
# they return a fresh NodeNaught instead of nil when nothing is found.
module NodeQuestMethods
  # Returns at(selector), or a new NodeNaught when that is nil/false.
  def at?(selector)
    node = at(selector)
    return node if node

    NodeNaught.new
  end

  # Returns at_css(selector), or a new NodeNaught when that is nil/false.
  def at_css?(selector)
    node = at_css(selector)
    return node if node

    NodeNaught.new
  end

  # Returns at_xpath(selector), or a new NodeNaught when that is nil/false.
  def at_xpath?(selector)
    node = at_xpath(selector)
    return node if node

    NodeNaught.new
  end

  # Returns find_child(selector), or a new NodeNaught when that is nil/false.
  def find_child?(selector)
    node = find_child(selector)
    return node if node

    NodeNaught.new
  end
end

# Selector lookups restricted to the receiver's children.
module NodeFindChildren
  # Returns the first of the children that pass filter(selector), as given
  # by #first on the filtered result.
  def find_child(selector)
    matching = children.filter(selector)
    matching.first
  end

  # Returns the result of children.filter(selector).
  def find_children(selector)
    kids = children
    kids.filter(selector)
  end
end
end

# now let's do evil
# Makes `include` callable with an explicit receiver on every class.
# NOTE(review): Module#include is public since Ruby 2.1, so this is likely a
# no-op on current Rubies — confirm before removing.
class ::Class
  public :include
end

# Mixins to install into each Nokogiri class, listed in inclusion order.
{
  Nokogiri::XML::Document => [
    More::DocumentURI
  ],
  Nokogiri::XML::Node => [
    More::NodeText,
    More::NodeHref,
    More::NodeChildrenGroups,
    More::NodeBangMethods,
    More::NodeQuestMethods,
    More::NodeFindChildren
  ],
  Nokogiri::XML::NodeSet => [
    More::NodeBangMethods,
    More::NodeQuestMethods,
    More::NodeFindChildren
  ]
}.each do |target, mixins|
  mixins.each { |mixin| target.include mixin }
end
end

0 comments on commit cccd672

Please sign in to comment.