Skip to content


Subversion checkout URL

You can clone with
Download ZIP
Browse files

better README. two-column layout. script-py =>

  • Loading branch information...
commit 84bee1ba6b0f7b8b8450535f4eb11fb4c6add0f5 1 parent 8129d1f
@mromanello authored
Showing with 42 additions and 9 deletions.
  1. +31 −4
  2. +11 −5
@@ -1,11 +1,38 @@
+# GAME: is X the sameAs Y? #
## README ##
-## NOTES ##
+The idea of this script is quite simple: it is designed as a game in which the user is asked to assess whether
+the match suggested by the software between a Smith-Perseus URI and a DBpedia URI is
+likely to be correct.
+How does the suggestion mechanism work? Smith's dictionary has an entry for each entity, as does Wiki-/DBpedia.
+## USAGE ##
+To start the game in iterative mode (you'll be asked at every "round" if you want to quit) just type
+ python
+The code comes with a list of IDs, each corresponding to an entry in Smith's dictionary. You may want to launch the game for one particular entity.
+In that case, try:
+ python --id sosicles-1
+ python --id sosicles-1
There are cases where the wikipedia entry is taken (almost) entirely from the Smith's entry from Perseus (as a result, the LSI score is 1 or close to 1).
Some interesting examples:
-* sosicles-1
-* albinus-24
-* lysianassa-1
+ python --id albinus-24
+ python --id lysianassa-1
+## TODO ##
@@ -4,6 +4,11 @@
import random
from xml.etree.ElementTree import ElementTree,fromstring
from lxml import etree
+ from clint.textui import puts, colored
+ from clint.textui import columns
+except ImportError:
+ print "clint library not found: try sudo easy_install clint"
from nltk.corpus import stopwords # NLTK is needed for the English stopword list
from gensim import corpora, models, similarities # gensim provides an handy implementation of, among other things, LSI and topic models
@@ -192,15 +197,15 @@ def match_entity(id):
for n,r in enumerate(results):
logger.debug("##%i## (%s) %s"%(n,r[1],r[0]))
- print "\n%s\n"%desc
- print "Highest ranked (TFIDF): \"%s\" with TFIDF value %s\n"%(results[0][0][1],results[0][1])
+ puts(columns(["\n%s\n"%desc.encode("utf-8")),60],
+ [colored.magenta("Highest ranked (TFIDF): \"%s\" with TFIDF value %s\n"%(results[0][0][1],results[0][1].encode("utf-8"))),None]))
print "%s sameAs %s?\n"%(test_url,results[0][2])
return True
- print "\n%s\n"%desc
- print "\n%s\n"%documents[0][1]
+ puts(columns(["\n%s\n"%desc.encode("utf-8")),60],
+ [colored.magenta("\n%s\n"%documents[0][1].encode("utf-8")),None]))
print "%s sameAs %s?\n"%(test_url,documents[0][0])
return True
@@ -254,7 +259,8 @@ def main():
want_continue = False
print "Keep going!"
- except:
+ except Exception as inst:
+ print inst
print "There was a problem. Trying with the next one."
# initialise the logger
Please sign in to comment.
Something went wrong with that request. Please try again.