Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

added another sample RDF file; fixed a glitch in auto tagging

  • Loading branch information...
commit daefb0d3a10a4aabcba9941c7d7156204c61d9ba 1 parent 4283f4a
@mark-watson authored
View
18 Makefile
@@ -1,6 +1,22 @@
+all: jar_to_local_mvn jarwithdata_to_local_mvn
+
jar:
+ rm -f -r out/production/Java_practical_semantic_web/data
+ cd out/production/Java_practical_semantic_web; ls; pwd
+ cd out/production/Java_practical_semantic_web; jar cvf ../../../knowledgebooks-0.2.0.jar com org
+
+jarwithdata:
+ mkdir -p out/production/Java_practical_semantic_web/data
+ cp data/propername.ser out/production/Java_practical_semantic_web/data
+ cp data/tags.xml out/production/Java_practical_semantic_web/data
cd out/production/Java_practical_semantic_web; ls; pwd
- cd out/production/Java_practical_semantic_web; jar cvf ../../../knowledgebooks.jar com org
+ cd out/production/Java_practical_semantic_web; jar cvf ../../../knowledgebooks-with-data-0.2.0.jar com org data
+
+jar_to_local_mvn: jar
+ mvn install:install-file -Dfile=knowledgebooks-0.2.0.jar -DgroupId=self -DartifactId=knowledgebooks -Dversion=0.2.0 -Dpackaging=jar -DgeneratePom=true
+
+jarwithdata_to_local_mvn: jarwithdata
+ mvn install:install-file -Dfile=knowledgebooks-with-data-0.2.0.jar -DgroupId=self -DartifactId=knowledgebooks-with-data -Dversion=0.2.0 -Dpackaging=jar -DgeneratePom=true
clean:
rm -r -f out
View
37 data/news.n3
@@ -0,0 +1,37 @@
+@prefix kb: <http://knowledgebooks.com/ontology#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+
+kb:containsCity rdfs:subPropertyOf kb:containsPlace .
+
+kb:containsCountry rdfs:subPropertyOf kb:containsPlace .
+
+kb:containsState rdfs:subPropertyOf kb:containsPlace .
+
+<http://news.yahoo.com/s/nm/20080616/ts_nm/usa_flooding_dc_16 /> kb:containsCity "Burlington" , "Denver" , "St. Paul" , "Chicago" , "Quincy" , "CHICAGO" , "Iowa City" ;
+ kb:containsRegion "U.S. Midwest" , "Midwest" ;
+ kb:containsCountry "United States" , "Japan" ;
+ kb:containsState "Minnesota" , "Illinois" , "Mississippi" , "Iowa" ;
+ kb:containsOrganization "National Guard" , "U.S. Department of Agriculture" , "White House" , "Chicago Board of Trade" , "Department of Transportation" ;
+ kb:containsPerson "Dena Gray-Fisher" , "Donald Miller" , "Glenn Hollander" , "Rich Feltes" , "George W. Bush" ;
+ kb:containsIndustryTerm "food inflation" , "food" , "finance ministers" , "oil" .
+
+<http://news.yahoo.com/s/nm/20080616/ts_nm/usa_politics_dc_2 /> kb:containsCity "Washington" , "FLINT" , "Baghdad" , "Arlington" , "Flint" ;
+ kb:containsCountry "United States" , "Afghanistan" , "Iraq" ;
+ kb:containsState "Illinois" , "Virginia" , "Arizona" , "Michigan" ;
+ kb:containsOrganization "White House" , "Obama administration" , "Iraqi government" ;
+ kb:containsPerson "David Petraeus" , "John McCain" , "Hoshiyar Zebari" , "Barack Obama" , "George W. Bush" , "Carly Fiorina" ;
+ kb:containsIndustryTerm "oil prices" .
+
+<http://news.yahoo.com/s/nm/20080616/ts_nm/worldleaders_trust_dc_1 /> kb:containsCity "WASHINGTON" ;
+ kb:containsCountry "United States" , "Pakistan" , "Islamic Republic of Iran" ;
+ kb:containsState "Maryland" ;
+ kb:containsOrganization "University of Maryland" , "United Nations" ;
+ kb:containsPerson "Ban Ki-moon" , "Gordon Brown" , "Hu Jintao" , "George W. Bush" , "Pervez Musharraf" , "Vladimir Putin" , "Steven Kull" , "Mahmoud Ahmadinejad" .
+
+<http://news.yahoo.com/s/nm/20080616/bs_nm/global_economy_dc_4 /> kb:containsCity "Sao Paulo" , "Kuala Lumpur" ;
+ kb:containsRegion "Midwest" ;
+ kb:containsCountry "United States" , "Britain" , "Saudi Arabia" , "Spain" , "Italy" , "India" , "France" , "Canada" , "Russia" , "Germany" , "China" , "Japan" , "South Korea" ;
+ kb:containsOrganization "Federal Reserve Bank" , "European Union" , "European Central Bank" , "European Commission" ;
+ kb:containsPerson "Lee Myung-bak" , "Rajat Nag" , "Luiz Inacio Lula da Silva" , "Jeffrey Lacker" ;
+ kb:containsCompany "Development Bank Managing" , "Reuters" , "Richmond Federal Reserve Bank" ;
+ kb:containsIndustryTerm "central bank" , "food" , "energy costs" , "finance ministers" , "crude oil prices" , "oil prices" , "oil shock" , "food prices" , "Finance ministers" , "Oil prices" , "oil" .
View
246 data/tags.xml
@@ -1123,252 +1123,6 @@
<term name="manipul" score="3" />
<term name="extend" score="3" />
</topic>
- <topic name="computers_programming">
- <term name="integ" score="128" />
- <term name="object" score="120" />
- <term name="script" score="106" />
- <term name="float" score="79" />
- <term name="vector" score="78" />
- <term name="string" score="73" />
- <term name="kei" score="56" />
- <term name="attach" score="49" />
- <term name="rotat" score="47" />
- <term name="state" score="43" />
- <term name="sound" score="42" />
- <term name="event" score="36" />
- <term name="type" score="30" />
- <term name="avatar" score="29" />
- <term name="inventori" score="27" />
- <term name="openid" score="26" />
- <term name="textur" score="26" />
- <term name="permiss" score="23" />
- <term name="control" score="23" />
- <term name="channel" score="22" />
- <term name="lsl" score="22" />
- <term name="rot" score="21" />
- <term name="test" score="21" />
- <term name="posit" score="21" />
- <term name="face" score="20" />
- <term name="default" score="19" />
- <term name="scale" score="18" />
- <term name="land" score="17" />
- <term name="anim" score="17" />
- <term name="entri" score="16" />
- <term name="librari" score="16" />
- <term name="rai" score="16" />
- <term name="pass" score="16" />
- <term name="target" score="16" />
- <term name="rais" score="16" />
- <term name="lllist" score="15" />
- <term name="true" score="15" />
- <term name="collis" score="15" />
- <term name="physic" score="15" />
- <term name="plai" score="14" />
- <term name="move" score="13" />
- <term name="local" score="12" />
- <term name="volum" score="12" />
- <term name="agent" score="12" />
- <term name="trigger" score="12" />
- <term name="block" score="12" />
- <term name="statu" score="12" />
- <term name="offset" score="12" />
- <term name="variabl" score="12" />
- <term name="constant" score="12" />
- <term name="hit" score="12" />
- <term name="particl" score="11" />
- <term name="listen" score="11" />
- <term name="loop" score="11" />
- <term name="cocoa" score="10" />
- <term name="llrot" score="10" />
- <term name="touch" score="10" />
- <term name="total" score="10" />
- <term name="chat" score="10" />
- <term name="via" score="10" />
- <term name="side" score="10" />
- <term name="within" score="10" />
- <term name="po" score="10" />
- <term name="arithmet" score="9" />
- <term name="sphere" score="9" />
- <term name="cannon" score="9" />
- <term name="direct" score="9" />
- <term name="val" score="9" />
- <term name="owner" score="9" />
- <term name="stop" score="9" />
- <term name="foo" score="9" />
- <term name="torqu" score="9" />
- <term name="ground" score="8" />
- <term name="lltriggersound" score="8" />
- <term name="light" score="8" />
- <term name="entir" score="8" />
- <term name="game" score="8" />
- <term name="handler" score="8" />
- <term name="request" score="8" />
- <term name="bouml" score="8" />
- <term name="axi" score="8" />
- <term name="llloopsound" score="8" />
- <term name="master" score="8" />
- <term name="poof" score="8" />
- <term name="chapter" score="7" />
- <term name="llplaysound" score="7" />
- <term name="smoke" score="7" />
- <term name="simul" score="7" />
- <term name="caus" score="7" />
- <term name="messag" score="7" />
- <term name="rang" score="7" />
- <term name="detect" score="7" />
- <term name="sub" score="6" />
- <term name="angl" score="6" />
- <term name="llsai" score="6" />
- <term name="num" score="6" />
- <term name="hand" score="6" />
- <term name="sens" score="6" />
- <term name="middot" score="6" />
- <term name="amount" score="6" />
- <term name="whether" score="6" />
- <term name="cannonbal" score="6" />
- <term name="fly" score="6" />
- <term name="damp" score="6" />
- <term name="below" score="6" />
- <term name="delet" score="6" />
- <term name="account" score="6" />
- <term name="render" score="6" />
- <term name="collid" score="6" />
- <term name="pixel" score="5" />
- <term name="veloc" score="5" />
- <term name="walk" score="5" />
- <term name="easili" score="5" />
- <term name="exit" score="5" />
- <term name="small" score="5" />
- <term name="explos" score="5" />
- <term name="manual" score="5" />
- <term name="count" score="5" />
- <term name="llmakeexplos" score="5" />
- <term name="reflect" score="5" />
- <term name="llmovetotarget" score="5" />
- <term name="gstartposit" score="5" />
- <term name="enter" score="5" />
- <term name="virusfre" score="5" />
- <term name="trace" score="5" />
- <term name="aim" score="5" />
- <term name="valid" score="5" />
- <term name="alreadi" score="5" />
- <term name="either" score="5" />
- <term name="die" score="5" />
- <term name="danc" score="5" />
- <term name="convert" score="5" />
- <term name="togeth" score="5" />
- <term name="remov" score="5" />
- <term name="explod" score="5" />
- <term name="sort" score="5" />
- <term name="hover" score="5" />
- <term name="llsettext" score="5" />
- <term name="spin" score="5" />
- <term name="uuid" score="5" />
- <term name="fire" score="5" />
- <term name="charact" score="5" />
- <term name="cast" score="5" />
- <term name="llstopsound" score="5" />
- <term name="llgetpo" score="5" />
- <term name="strength" score="5" />
- <term name="root" score="5" />
- <term name="interact" score="5" />
- <term name="hello" score="4" />
- <term name="lifetim" score="4" />
- <term name="repres" score="4" />
- <term name="accept" score="4" />
- <term name="timer" score="4" />
- <term name="sensor" score="4" />
- <term name="callback" score="4" />
- <term name="llsetstatu" score="4" />
- <term name="jump" score="4" />
- <term name="whenev" score="4" />
- <term name="bounc" score="4" />
- <term name="execut" score="4" />
- <term name="spinstat" score="4" />
- <term name="shot" score="4" />
- <term name="debit" score="4" />
- <term name="rez" score="4" />
- <term name="os" score="4" />
- <term name="mac" score="4" />
- <term name="alpha" score="4" />
- <term name="explor" score="4" />
- <term name="except" score="4" />
- <term name="radian" score="4" />
- <term name="brush" score="4" />
- <term name="statement" score="4" />
- <term name="backward" score="4" />
- <term name="llsettimerev" score="4" />
- <term name="multipl" score="4" />
- <term name="forward" score="4" />
- <term name="arc" score="4" />
- <term name="random" score="4" />
- <term name="declar" score="4" />
- <term name="lldetect" score="4" />
- <term name="linden" score="4" />
- <term name="impuls" score="4" />
- <term name="stride" score="4" />
- <term name="introduct" score="4" />
- <term name="hold" score="4" />
- <term name="appli" score="4" />
- <term name="password" score="4" />
- <term name="otherwis" score="4" />
- <term name="command" score="4" />
- <term name="boom" score="4" />
- <term name="gather" score="4" />
- <term name="sync" score="4" />
- <term name="lllisten" score="4" />
- <term name="stuff" score="4" />
- <term name="transit" score="4" />
- <term name="aculo" score="4" />
- <term name="lldie" score="4" />
- <term name="nilsson" score="4" />
- <term name="separ" score="4" />
- <term name="shape" score="4" />
- <term name="edg" score="4" />
- <term name="theori" score="4" />
- <term name="handl" score="4" />
- <term name="coordin" score="4" />
- <term name="send" score="4" />
- <term name="doubl" score="4" />
- <term name="camera" score="4" />
- <term name="child" score="4" />
- <term name="magic" score="3" />
- <term name="fix" score="3" />
- <term name="llpasscollis" score="3" />
- <term name="throw" score="3" />
- <term name="cancel" score="3" />
- <term name="subject" score="3" />
- <term name="decid" score="3" />
- <term name="abov" score="3" />
- <term name="along" score="3" />
- <term name="specifi" score="3" />
- <term name="integr" score="3" />
- <term name="fwd" score="3" />
- <term name="gscale" score="3" />
- <term name="empti" score="3" />
- <term name="critic" score="3" />
- <term name="punch" score="3" />
- <term name="address" score="3" />
- <term name="ascii" score="3" />
- <term name="mylist" score="3" />
- <term name="environ" score="3" />
- <term name="appl" score="3" />
- <term name="stanford" score="3" />
- <term name="door" score="3" />
- <term name="llrezobject" score="3" />
- <term name="vec" score="3" />
- <term name="captur" score="3" />
- <term name="theta" score="3" />
- <term name="sent" score="3" />
- <term name="cool" score="3" />
- <term name="meter" score="3" />
- <term name="aren" score="3" />
- <term name="privat" score="3" />
- <term name="goal" score="3" />
- <term name="llsleep" score="3" />
- <term name="tau" score="3" />
- <term name="ball" score="3" />
- </topic>
<topic name="religion_hinduism">
<term name="hindu" score="58" />
<term name="hinduism" score="45" />
View
25 src/com/knowledgebooks/nlp/AutoTagger.java
@@ -1,16 +1,16 @@
package com.knowledgebooks.nlp;
+import com.knowledgebooks.nlp.util.NameValue;
import com.knowledgebooks.public_domain.Stemmer;
-
-import java.io.FileInputStream;
-import java.util.*;
-import javax.xml.parsers.*;
-
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import com.knowledgebooks.nlp.util.NameValue;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.*;
/**
* Associate pre-trained classification categories (tags) with input text: assigns
@@ -41,7 +41,18 @@
DefaultHandler handler = new TagsSAXHandler();
SAXParserFactory factory = SAXParserFactory.newInstance(); // Use the default non-validating parser
try {
- FileInputStream xml_input_stream = new FileInputStream(System.getProperty("user.dir") + "/" + "data/tags.xml");
+ System.err.println("Loading tag.xml for auto classification...");
+ InputStream xml_input_stream = handler.getClass().getClassLoader().getResourceAsStream("data/tags.xml");
+ //if (xml_input_stream == null) xml_input_stream = handler.getClass().getClassLoader().getResourceAsStream("data/tags.xml");
+ //if (xml_input_stream == null) xml_input_stream = handler.getClass().getClassLoader().getResourceAsStream("com/knowledgebooks/nlp/data/tags.xml");
+ //System.err.println("1. xml_input_stream = " + xml_input_stream);
+ if (xml_input_stream == null) {
+ xml_input_stream = new FileInputStream(System.getProperty("user.dir") + "/" + "data/tags.xml");
+ //System.err.println("2. xml_input_stream = " + xml_input_stream);
+ }
+
+
+ //FileInputStream xml_input_stream = new FileInputStream(System.getProperty("user.dir") + "/" + "data/tags.xml");
SAXParser saxParser = factory.newSAXParser();
saxParser.parse(xml_input_stream, handler);
} catch (Throwable t) {
View
4 src/com/knowledgebooks/nlp/ExtractNames.java
@@ -128,8 +128,8 @@ public boolean isPlaceName(List<String> words, int startIndex, int numWords) {
* @return
*/
public boolean isPlaceName(String name) {
- if (placeNameHash.get(name) != null)
- System.out.println("* place name: " + name + ", placeNameHash.get(name): " + placeNameHash.get(name));
+ //if (placeNameHash.get(name) != null)
+ // System.out.println("* place name: " + name + ", placeNameHash.get(name): " + placeNameHash.get(name));
return placeNameHash.get(name) != null;
}
View
14 src/com/knowledgebooks/nlp/KeyPhraseExtractionAndSummary.java
@@ -32,13 +32,13 @@
public class KeyPhraseExtractionAndSummary {
private PhraseList pl = new PhraseList();
// (defun get-key-summarization (word-vector key-word-rankings &aux x y z v (ret '()))
- private String words;
+ private Document document;
/**
* @param text
*/
public KeyPhraseExtractionAndSummary(String text) {
- Document document = new Document(text);
+ document = new Document(text);
Stemmer stemmer = new Stemmer();
List<String> stems = new ArrayList<String>(document.getNumWords());
@@ -135,18 +135,12 @@ public String getSummary() {
// System.out.println(" score:" + pl.getScore(i)+", phrase: "+pl.getPhrase(i));
//}
String ret = "";
- System.out.println("GETTING SUMMARY: pl.size()=" + pl.size());
- if (pl.size() == 0) ret = "";
- else if (pl.size() == 1) ret = pl.getPhrase(0);
+ //System.out.println("GETTING SUMMARY: pl.size()=" + pl.size());
+ if (pl.size() == 1) ret = pl.getPhrase(0);
/*else if (pl.getScore(0) > (2 * pl.getScore(1))) ret = pl.getPhrase(0);
else if (pl.getPhrase(0).length() > 80) ret = pl.getPhrase(0);*/
else ret = pl.getPhrase(0) + " " + pl.getPhrase(1);
ret = ret.trim();
- if (ret.length() < 78) {
- int len = words.length();
- if (len > 60) len = 60;
- ret = words.substring(0, len) + "...";
- }
return ret;
}
View
5 test/TestExtractNames.java
@@ -1,4 +1,5 @@
import com.knowledgebooks.nlp.ExtractNames;
+import com.knowledgebooks.nlp.KeyPhraseExtractionAndSummary;
import com.knowledgebooks.nlp.util.ScoredList;
/**
@@ -59,6 +60,10 @@ static public void main(String[] args) {
ret1[0].getValuesAsString());
System.out.println("Place names: " +
ret1[1].getValuesAsString());
+
+ // also text summarization:
+ KeyPhraseExtractionAndSummary kp = new KeyPhraseExtractionAndSummary("President Jane Smith spoke to Congress about tax and military appropriations. The subject of the economy was key. Then she left for Mexico.");
+ System.out.println("\n\nTesting summary:\n" + kp.getSummary());
}
}
}
Please sign in to comment.
Something went wrong with that request. Please try again.