Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
4,786 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
module Reality | ||
module Wikidata | ||
# Reference to another Wikidata entity: its identifier plus an optional
# human-readable label.
class Link
  attr_reader :id, :label

  # id    - entity identifier as given by the caller.
  # label - optional label; nil when not supplied.
  def initialize(id, label = nil)
    @id = id
    @label = label
  end
end
|
||
# One subject from a SPARQL result set together with all of its
# predicate => [values] pairs.
class Entity
  DECIMAL_TYPE = 'http://www.w3.org/2001/XMLSchema#decimal'.freeze
  WKT_TYPE = 'http://www.opengis.net/ont/geosparql#wktLiteral'.freeze

  # Whole-string match for a Wikidata entity URI; captures the entity id.
  ENTITY_URI = %r{\Ahttps?://www\.wikidata\.org/entity/([^/]+)\z}
  # Decimal lexical form that has no fractional part.
  INTEGER_LITERAL = /\A\s*[-+]?\d+\s*\z/
  # "Point(a b)" with optionally signed (and optionally exponent-form) numbers.
  WKT_NUMBER = /[-+]?[\d.]+(?:e[-+]?\d+)?/i
  WKT_POINT = /\A\s*point\s*\(\s*(#{WKT_NUMBER})\s+(#{WKT_NUMBER})\s*\)\s*\z/i

  class << self
    # Builds entities from a SPARQL JSON results document.
    #
    # sparql_json  - String with the JSON document (needs 'results'/'bindings').
    # subject:, predicate:, object:, object_label: - names of the result
    #   variables that hold the respective parts of every row.
    #
    # Returns an Array of Entity, one per distinct subject, in order of first
    # appearance.
    def from_sparql(sparql_json, subject: 'subject', predicate: 'predicate', object: 'object', object_label: 'object_label')
      triples = JSON.parse(sparql_json)['results']['bindings'].map do |row|
        # Unbound variables are simply absent from a binding, so the label
        # may be missing; treat that as "no label" instead of crashing.
        label = (row[object_label] || {})['value']
        [
          row[subject]['value'].sub('http://www.wikidata.org/entity/', ''),
          row[predicate]['value'].sub('http://www.wikidata.org/prop/direct/', ''),
          row[object].merge('label' => label)
        ]
      end

      triples.group_by(&:first).map { |id, rows| new(id, hash_from_predicates(rows)) }
    end

    # Turns [subject, predicate, object_hash] rows into
    # {predicate => [parsed values...]}, keeping the row order.
    def hash_from_predicates(rows)
      rows.each_with_object({}) do |(_subject, predicate, object), result|
        (result[predicate] ||= []) << parse_value(object)
      end
    end

    # Converts one SPARQL JSON term into a Ruby value.
    #
    # Raises ArgumentError for term types other than 'literal' and 'uri'.
    def parse_value(hash)
      case hash['type']
      when 'literal'
        parse_literal(hash)
      when 'uri'
        parse_uri(hash)
      else
        fail ArgumentError, "Unidentifieble datatype: #{hash['type']}"
      end
    end

    # Wikidata entity URIs become Link objects (id + label); any other URI is
    # returned as the plain string.
    def parse_uri(hash)
      match = ENTITY_URI.match(hash['value'])
      match ? Link.new(match[1], hash['label']) : hash['value']
    end

    # Converts a literal term according to its 'datatype':
    # * xsd:decimal -> Integer when the text has no fractional part,
    #   Float otherwise (previously everything was truncated with to_i);
    # * geosparql wktLiteral -> Geo::Coord, ArgumentError if not a point;
    # * anything else -> the raw string value.
    def parse_literal(hash)
      value = hash['value']

      case hash['datatype']
      when DECIMAL_TYPE
        value =~ INTEGER_LITERAL ? value.to_i : value.to_f
      when WKT_TYPE
        point = WKT_POINT.match(value) or fail ArgumentError, "Unparseable WKT: #{value}"
        # NOTE(review): the first number is passed as latitude and the second
        # as longitude, exactly as before. GeoSPARQL WKT is commonly written
        # "Point(longitude latitude)" - confirm against the endpoint's output.
        Geo::Coord.new(point[1].to_f, point[2].to_f)
      else
        value
      end
    end
  end

  attr_reader :id

  # id         - entity identifier.
  # predicates - Hash of predicate id => Array of values.
  def initialize(id, predicates)
    @id = id
    @predicates = predicates
  end

  # Returns the Array of values stored for +pred+, or nil when there is none.
  def [](pred)
    @predicates[pred]
  end
end
end | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#!/usr/bin/env ruby | ||
require 'bundler/setup' | ||
require 'open-uri' | ||
require 'progress_bar/core_ext/enumerable_with_progress' | ||
require 'nokogiri' | ||
require 'json' | ||
require_relative 'lib/nokogiri_more' | ||
|
||
# Scrapes Wikidata's "List of properties" index, follows every link found in
# the "By number" row and collects the first two table columns of each linked
# page into one Hash (second column => first column), which is then dumped to
# data/wikidata-predicates.json.
# NOTE(review): presumably the columns are property label and property id -
# confirm against the live page layout.
wikidata = 'https://www.wikidata.org'

# URI#read comes from open-uri and works on every Ruby version, unlike
# Kernel#open with a URL, which stopped working in Ruby 3.0.
fetch_html = ->(url) { Nokogiri::HTML(URI.parse(url).read) }

start = fetch_html.call("#{wikidata}/wiki/Wikidata:List_of_properties")
res = start.
  search('th:contains("By number")').first.parent.search('td > a').
  with_progress.map{|a|
    fetch_html.call(wikidata + a.href).
      search('tr').
      map{|tr| tr.search('td').map(&:text)}.
      reject{|tds| tds[0].nil? || tds[1].nil?}. # rows without two cells (e.g. headers)
      map{|tds| [tds[1], tds[0]]}.
      to_h
  }.inject({}, :merge) # seeded with {} so an empty link list yields {} rather than nil

File.write 'data/wikidata-predicates.json', res.to_json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
require 'naught' | ||
require 'addressable/uri' | ||
|
||
module Nokogiri | ||
module More | ||
# Guard helper for node methods that only make sense on particular elements.
module NodeOnlyFor
  # Returns the (truthy) result of matches?(selector); raises ArgumentError
  # when the receiver does not match the selector.
  def only_for!(selector)
    matched = matches?(selector)
    return matched if matched

    fail(ArgumentError, "Doesn't works for nodes other than '#{selector}'")
  end
end
|
||
# Adds #href to nodes: the node's 'href' attribute resolved through the
# owning document's #absolute.
module NodeHref
  include NodeOnlyFor

  # Returns whatever document.absolute makes of this node's 'href' attribute.
  # The 'a[href]' guard is intentionally left disabled, as in the original:
  #only_for!('a[href]')
  def href
    raw_link = self['href']
    document.absolute(raw_link)
  end
end
|
||
# URL helpers for documents; relies on the including object's #url.
module DocumentURI
  # Parsed form of #url, or nil when the document has no URL.
  def uri
    return nil unless url

    Addressable::URI.parse(url)
  end

  # Resolves +link+ against the document URI and returns it as a String.
  # Without a document URL the link is only passed through the URI parser
  # (which double-checks that it really is a link) and returned unchanged
  # in meaning.
  def absolute(link)
    base = uri
    return Addressable::URI.parse(link).to_s unless base

    (base + link).to_s
  end
end
|
||
# Splits a node's children into consecutive groups following a fixed
# sequence of selectors (for example runs of <dt> followed by runs of <dd>).
module NodeChildrenGroups
  include NodeOnlyFor

  # Groups the children that match any of +selectors+.
  #
  # Children are consumed in document order. Inside one group the selectors
  # are walked left to right: a run of nodes matching the current selector is
  # collected, then the walk moves on to a later selector. A node matching an
  # earlier (already passed) selector starts a new group.
  #
  # Returns an Array of groups; each group is an Array of Arrays whose
  # positions line up with +selectors+ (skipped selectors give an empty
  # Array, selectors never reached are absent). For 'dt', 'dd' and children
  # dt dt dd dt dd the result is [[[dt, dt], [dd]], [[dt], [dd]]].
  def children_groups(*selectors)
    flat = children.select { |node| selectors.any? { |s| node.matches?(s) } }
    groups = []
    groups << make_group(flat, selectors) until flat.empty?
    groups
  end

  # Term/definition groups of a <dl>; raises ArgumentError on other nodes.
  def each_term
    only_for!('dl')
    children_groups('dt', 'dd')
  end

  private

  # Removes the nodes of one group from the front of +flat+ and returns them.
  #
  # The original only looked one selector ahead, so with three or more
  # selectors a node matching only a later one was never consumed and
  # children_groups looped forever. Now any later selector may be jumped to,
  # with an empty Array recorded for every selector skipped on the way.
  def make_group(flat, selectors)
    sel = selectors.dup
    group = [[]]

    until flat.empty?
      node = flat.first

      unless node.matches?(sel.first)
        # How many selectors sit between the current one and the match.
        skipped = sel.drop(1).index { |s| node.matches?(s) }
        break unless skipped # belongs to an earlier selector: next group

        (skipped + 1).times do
          sel.shift
          group << []
        end
      end

      group.last << flat.shift
    end

    group
  end
end
|
||
# Whitespace-trimmed text access for nodes.
module NodeText
  # Returns #text with leading and trailing whitespace removed.
  def text_
    raw = text
    raw.strip
  end
end
|
||
# Null-object class used as the "nothing found" result of the question-mark
# finders (at?, at_css?, ...).
# NOTE(review): `black_hole` is Naught's option for null objects whose calls
# can be chained freely - confirm exact semantics against the naught docs.
NodeNaught = Naught.build do |config|
  config.black_hole

  # Returns self without ever yielding, so in
  #   at?(selector).tap{|node| ... }
  # the block is simply never entered when nothing was found.
  def tap
    self
  end
end
|
||
# Raised by the bang finders (at!, at_css!, ...) when nothing matches.
NodeNotFound = Class.new(RuntimeError)
|
||
# Strict variants of the finder methods: each returns the found node or
# raises NodeNotFound instead of returning nil.
module NodeBangMethods
  # Like #at, but raises NodeNotFound when nothing matches.
  def at!(selector)
    bang!(at(selector), selector)
  end

  # Like #at_css, but raises NodeNotFound when nothing matches.
  def at_css!(selector)
    bang!(at_css(selector), selector)
  end

  # Like #at_xpath, but raises NodeNotFound when nothing matches.
  def at_xpath!(selector)
    bang!(at_xpath(selector), selector)
  end

  # Like #find_child, but raises NodeNotFound when nothing matches.
  def find_child!(selector)
    bang!(find_child(selector), selector)
  end

  private

  # Passes a found node through; a nil/false result becomes an error.
  def bang!(node, selector)
    node || no_node!(selector)
  end

  # Raises NodeNotFound naming the receiver and the selector.
  #
  # This module is mixed into node sets as well as nodes. Receivers without
  # a #name are described by their class, so the caller still gets
  # NodeNotFound rather than a NoMethodError from building the message.
  #
  # TODO: an earlier sketch planned a configurable mode
  # (Nokogiri::More::Config.bang_mode with :fail / :naught / :log, the latter
  # two returning NodeNaught.new); only the :fail behaviour is implemented.
  def no_node!(selector)
    owner = respond_to?(:name) ? name : self.class
    fail NodeNotFound, "#{owner} have no node at #{selector}"
  end
end
|
||
# Lenient variants of the finder methods. Defines at?, at_css?, at_xpath?
# and find_child?: each calls the finder of the same name without the
# question mark and returns its result, or a fresh NodeNaught when the
# finder came back with nil/false.
module NodeQuestMethods
  %i[at at_css at_xpath find_child].each do |finder|
    define_method("#{finder}?") do |selector|
      send(finder, selector) || NodeNaught.new
    end
  end
end
|
||
# Selector-based lookup restricted to the receiver's #children.
module NodeFindChildren
  # First child accepted by children.filter(selector); nil when there is none.
  def find_child(selector)
    matching = children.filter(selector)
    matching.first
  end

  # Everything children.filter(selector) returns.
  def find_children(selector)
    kids = children
    kids.filter(selector)
  end
end
end | ||
|
||
# Mix the extensions into Nokogiri's classes.
#
# The previous version reopened ::Class to make `include` public for every
# class in the process. `send(:include, ...)` has the same effect on just
# these targets, on any Ruby version, without touching a core class.
# The mixin order per target is the same as before.
{
  Nokogiri::XML::Document => [
    More::DocumentURI
  ],
  Nokogiri::XML::Node => [
    More::NodeText,
    More::NodeHref,
    More::NodeChildrenGroups,
    More::NodeBangMethods,
    More::NodeQuestMethods,
    More::NodeFindChildren
  ],
  Nokogiri::XML::NodeSet => [
    More::NodeBangMethods,
    More::NodeQuestMethods,
    More::NodeFindChildren
  ]
}.each do |target, mixins|
  mixins.each { |mixin| target.send(:include, mixin) }
end
end | ||
|
Oops, something went wrong.