NLP4J
Natural Language Processing Library for Java
NLP4J Components
Core data, utilities, CSV/JSON/plain-text parsers, etc.: nlp4j-core
English language NLP: nlp4j-stanford
Japanese language NLP: nlp4j-kuromoji, nlp4j-cabocha, nlp4j-mecab, nlp4j-yahoojp, nlp4j-sudachi
Wikipedia dump file parser, MediaWiki API client: nlp4j-wiki
Data crawling: Twitter, web crawler, Wikipedia dump
Document search: Apache Solr, Azure
NLP4J Maven for English NLP
<!-- for English NLP -->
<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-stanford -->
<dependency>
    <groupId>org.nlp4j</groupId>
    <artifactId>nlp4j-stanford</artifactId>
    <version>1.3.5.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>4.4.0</version>
    <!-- "provided" scope: the CoreNLP jars must be on the runtime classpath -->
    <scope>provided</scope>
</dependency>
<!-- same artifact with the "models" classifier for the English models -->
<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>4.4.0</version>
    <classifier>models</classifier>
    <scope>provided</scope>
</dependency>
NLP4J Code for Simple English Morphological Analysis
String text = "I eat sushi with chopsticks.";
// Build a part-of-speech annotator that reads the "text" attribute of a document
DocumentAnnotator ann = new DocumentAnnotatorBuilder<>(StanfordPosAnnotator.class)
        .set("target", "text")
        .build();
Document doc = new DocumentBuilder().text(text).build();
ann.annotate(doc);
// Each keyword carries character offsets, a POS facet, and a lemma
doc.getKeywords().forEach(kwd -> {
    System.out.println(kwd.getBegin() + "," + kwd.getEnd() + "," + kwd.getFacet() + "," + kwd.getLex());
});
// Expected output:
// 0,1,word.PRP,I
// 2,5,word.VBP,eat
// 6,11,word.NN,sushi
// 12,16,word.IN,with
// 17,27,word.NNS,chopstick
// 27,28,word..,.
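Because the POS tag is carried in the keyword facet (word.NN, word.VBP, ...), the result can be filtered directly. A minimal sketch, using only the getKeywords(), getFacet(), and getLex() calls shown above, that keeps just the nouns:
// Keep only nouns (facets word.NN and word.NNS in the run above)
doc.getKeywords().forEach(kwd -> {
    if (kwd.getFacet().startsWith("word.NN")) {
        System.out.println(kwd.getLex());
    }
});
// sushi
// chopstick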
NLP4J Code for Simple English Syntax Analysis
StanfordPosDependencyAnnotator ann = new StanfordPosDependencyAnnotator();
ann.setProperty("target", "text");
Document doc = new DefaultDocument();
doc.putAttribute("text", "I eat sushi with chopsticks.");
ann.annotate(doc);
doc.getKeywords().forEach(kwd -> {
    if (kwd instanceof KeywordWithDependency) {
        KeywordWithDependency kd = (KeywordWithDependency) kwd;
        // Print the dependency tree as XML
        System.out.println(kd.toStringAsXml());
        // Print the root word and its direct children
        System.out.println("root: " + kd.getLex());
        kd.getChildren().forEach(child -> {
            System.out.println("children: " + child.getLex());
        });
    }
});
// Expected output:
// <?xml version="1.0" encoding="UTF-8"?>
// <w begin="2" depth="0" end="5" facet="VBP" id="0" lex="eat" relation="root" sequence="0" str="eat">
// <w begin="0" depth="1" end="1" facet="PRP" id="1" lex="I" relation="nsubj" sequence="1" str="I"/>
// <w begin="6" depth="1" end="11" facet="NN" id="2" lex="sushi" relation="obj" sequence="2" str="sushi"/>
// <w begin="17" depth="1" end="27" facet="NNS" id="3" lex="chopstick" relation="obl" sequence="3" str="chopsticks">
// <w begin="12" depth="2" end="16" facet="IN" id="4" lex="with" relation="case" sequence="4" str="with"/>
// </w>
// <w begin="27" depth="1" end="28" facet="." id="5" lex="." relation="punct" sequence="5" str="."/>
// </w>
// root: eat
// children: I
// children: sushi
// children: chopstick
// children: .
NLP4J Code for Simple English Syntax Analysis (2)
import nlp4j.Document;
import nlp4j.Keyword;
import nlp4j.KeywordWithDependency;
import nlp4j.impl.DefaultDocument;
import nlp4j.stanford.StanfordPosDependencyAnnotator;

public class HelloNLP4J {

    public static void main(String[] args) throws Exception {
        Document doc = new DefaultDocument();
        doc.putAttribute("text", "I eat sushi with chopsticks.");
        StanfordPosDependencyAnnotator ann = new StanfordPosDependencyAnnotator();
        ann.setProperty("target", "text");
        ann.annotate(doc);
        for (Keyword kwd : doc.getKeywords()) {
            if (kwd instanceof KeywordWithDependency) {
                KeywordWithDependency kd = (KeywordWithDependency) kwd;
                // Print the dependency tree as XML
                System.out.println(kd.toStringAsXml());
                // Walk the tree and print each dependency edge
                print(kd);
            }
        }
    }

    private static void print(KeywordWithDependency kd) {
        kd.getChildren().forEach(kwd -> {
            System.out.println(kd.getLex() + " -> (" + kwd.getRelation() + ") " + kwd.getLex());
            print(kwd);
        });
    }
}
// Expected output:
// <?xml version="1.0" encoding="UTF-8"?>
// <w begin="2" depth="0" end="5" facet="VBP" id="0" lex="eat" relation="root" sequence="0" str="eat">
// <w begin="0" depth="1" end="1" facet="PRP" id="1" lex="I" relation="nsubj" sequence="1" str="I"/>
// <w begin="6" depth="1" end="11" facet="NN" id="2" lex="sushi" relation="obj" sequence="2" str="sushi"/>
// <w begin="17" depth="1" end="27" facet="NNS" id="3" lex="chopstick" relation="obl" sequence="3" str="chopsticks">
// <w begin="12" depth="2" end="16" facet="IN" id="4" lex="with" relation="case" sequence="4" str="with"/>
// </w>
// <w begin="27" depth="1" end="28" facet="." id="5" lex="." relation="punct" sequence="5" str="."/>
// </w>
//
// eat -> (nsubj) I
// eat -> (obj) sushi
// eat -> (obl) chopstick
// chopstick -> (case) with
// eat -> (punct) .
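The relation labels also make it easy to pull out grammatical roles directly. A hedged variant of the loop body above: instead of calling print(kd), read the subject and object off the root's children, using only the getChildren(), getRelation(), and getLex() methods demonstrated in this example:
// Pick out subject and object among the root's direct children
kd.getChildren().forEach(child -> {
    if ("nsubj".equals(child.getRelation())) {
        System.out.println("subject: " + child.getLex());
    } else if ("obj".equals(child.getRelation())) {
        System.out.println("object: " + child.getLex());
    }
});
// subject: I
// object: sushi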
NLP4J Code for Stanford NLP Open IE (Information Extraction): Triples and Clauses
StanfordOpenIEAnnotator ann = new StanfordOpenIEAnnotator();
ann.setProperty("target", "text");
Document doc = new DefaultDocument();
doc.putAttribute("text", //
        "Mount Fuji, located on the island of Honshu, " //
        + "is the highest mountain in Japan.");
ann.annotate(doc);
// Each extraction is a keyword whose facet marks it as a triple or a clause
doc.getKeywords().forEach(kwd -> System.out.println(kwd.getFacet() + "," + kwd.getLex()));
// Expected output:
// pattern.oie.triple,mount fuji , is highest mountain in , japan
// pattern.oie.triple,mount fuji , is mountain in , japan
// pattern.oie.triple,mount fuji , is , mountain
// pattern.oie.triple,mount fuji , is , highest mountain
// pattern.oie.triple,mount fuji , located on , island honshu
// pattern.oie.triple,highest mountain , is in , japan
// pattern.oie.triple,mount fuji , located on , island
// pattern.oie.clause,Mount Fuji located on the island of Honshu is the highest mountain in Japan
// pattern.oie.clause,Mount Fuji located on the island of Honshu
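Triples and clauses arrive as keywords with different facets (pattern.oie.triple vs. pattern.oie.clause), so the two kinds of extraction can be separated with a plain facet check, again using only the calls shown above:
// Keep only the triple extractions, dropping the clauses
doc.getKeywords().forEach(kwd -> {
    if ("pattern.oie.triple".equals(kwd.getFacet())) {
        System.out.println(kwd.getLex());
    }
});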
NLP4J Maven for Reading Wikipedia Dump
<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-wiki -->
<dependency>
    <groupId>org.nlp4j</groupId>
    <artifactId>nlp4j-wiki</artifactId>
    <version>1.1.0.0</version>
</dependency>
NLP4J Code for Reading Wikipedia Dump
String itemString = "Nintendo";
String dir = "/usr/local/wiki/enwiki/20230101/";
// Index file
File indexFile = new File(dir + "enwiki-20230101-pages-articles-multistream-index.txt.bz2");
// Dump file
File dumpFile = new File(dir + "enwiki-20230101-pages-articles-multistream.xml.bz2");
try (WikiDumpReader dumpReader = new WikiDumpReader(dumpFile, indexFile)) {
    // Look up the article by title
    WikiPage page = dumpReader.getItem(itemString);
    // Plain text of the article body
    System.out.println(page.getRootNodePlainText());
    // Expected output:
    // is a Japanese multinational video game company headquartered
    // in Kyoto, Japan. It develops video games and video game consoles ...
    // Raw wiki markup of the article
    System.out.println("<text>\n" + page.getText() + "\n</text>");
    // Expected output:
    // {{Short description|Japanese video game company}} <!-- popup
    // [[File:Nintendo.svg]] --> {{Pp-vandalism|small=yes}} {{Use dmy
    // dates|date=October 2022}} {{Use American English|date=November 2020}}
    // {{Infobox company | name = Nintendo Co., Ltd. | logo = Nintendo.svg |
    // logo_alt = Logo in white on red background since 2016 | logo_caption = Logo
    // in white on red background since 2016 | image =
    // Nintendo_Headquarters_-_panoramio.jpg ...
}
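The extracted plain text can be fed straight back into the annotators from the earlier sections. A minimal sketch to place inside the try block above, assuming nlp4j-stanford is also on the classpath and that StanfordPosAnnotator accepts the same "target" property via setProperty as StanfordPosDependencyAnnotator does in the syntax examples:
// Annotate the Wikipedia article text with part-of-speech tags
Document wikiDoc = new DefaultDocument();
wikiDoc.putAttribute("text", page.getRootNodePlainText());
StanfordPosAnnotator posAnn = new StanfordPosAnnotator();
posAnn.setProperty("target", "text");
posAnn.annotate(wikiDoc);
wikiDoc.getKeywords().forEach(kwd -> System.out.println(kwd.getFacet() + "," + kwd.getLex()));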
See also
Natural Language Processing with Groovy, OpenNLP, CoreNLP, Nlp4j, Datumbox, Smile, Spark NLP, DJL and TensorFlow
https://groovy.apache.org/blog/natural-language-processing-with-groovy