NLP4J
Natural Language Processing Library for Java
NLP4J Components
Core data, utilities, CSV/JSON/plain-text parsers, etc.: nlp4j-core
English language NLP: nlp4j-stanford
Japanese language NLP: nlp4j-kuromoji, nlp4j-cabocha, nlp4j-mecab, nlp4j-yahoojp, nlp4j-sudachi
Wikipedia dump file parser, MediaWiki API client: nlp4j-wiki
Data crawling: Twitter, web crawler, Wikipedia dump
Document search: Apache Solr, Azure
NLP4J Maven for English NLP
<!-- for English NLP -->
<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-stanford -->
<dependency>
    <groupId>org.nlp4j</groupId>
    <artifactId>nlp4j-stanford</artifactId>
    <version>1.3.5.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>4.4.0</version>
    <!-- "provided" scope: the CoreNLP jars must be on the runtime classpath -->
    <scope>provided</scope>
</dependency>
<!-- same artifact with the "models" classifier for the English models -->
<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>4.4.0</version>
    <classifier>models</classifier>
    <scope>provided</scope>
</dependency>
NLP4J Code for Simple English Morphological Analysis
String text = "I eat sushi with chopsticks.";
// Build a part-of-speech annotator that reads the "text" attribute of a document
DocumentAnnotator ann = new DocumentAnnotatorBuilder<>(StanfordPosAnnotator.class)
        .set("target", "text")
        .build();
Document doc = new DocumentBuilder().text(text).build();
ann.annotate(doc);
// Each keyword carries character offsets, a POS facet, and a lemma
doc.getKeywords().forEach(kwd -> {
    System.out.println(kwd.getBegin() + "," + kwd.getEnd() + "," + kwd.getFacet() + "," + kwd.getLex());
});
// Expected output:
// 0,1,word.PRP,I
// 2,5,word.VBP,eat
// 6,11,word.NN,sushi
// 12,16,word.IN,with
// 17,27,word.NNS,chopstick
// 27,28,word..,.
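Because the POS tag is carried in the keyword facet (word.NN, word.VBP, ...), the result can be filtered directly. A minimal sketch, using only the getKeywords(), getFacet(), and getLex() calls shown above, that keeps just the nouns:
// Keep only nouns (facets word.NN and word.NNS in the run above)
doc.getKeywords().forEach(kwd -> {
    if (kwd.getFacet().startsWith("word.NN")) {
        System.out.println(kwd.getLex());
    }
});
// sushi
// chopstick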
NLP4J Code for Simple English Syntax Analysis
StanfordPosDependencyAnnotator ann = new StanfordPosDependencyAnnotator();
ann.setProperty("target", "text");
Document doc = new DefaultDocument();
doc.putAttribute("text", "I eat sushi with chopsticks.");
ann.annotate(doc);
doc.getKeywords().forEach(kwd -> {
    if (kwd instanceof KeywordWithDependency) {
        KeywordWithDependency kd = (KeywordWithDependency) kwd;
        // Print the dependency tree as XML
        System.out.println(kd.toStringAsXml());
        // Print the root word and its direct children
        System.out.println("root: " + kd.getLex());
        kd.getChildren().forEach(child -> {
            System.out.println("children: " + child.getLex());
        });
    }
});
// Expected output:
// <?xml version="1.0" encoding="UTF-8"?>
// <w begin="2" depth="0" end="5" facet="VBP" id="0" lex="eat" relation="root" sequence="0" str="eat">
// <w begin="0" depth="1" end="1" facet="PRP" id="1" lex="I" relation="nsubj" sequence="1" str="I"/>
// <w begin="6" depth="1" end="11" facet="NN" id="2" lex="sushi" relation="obj" sequence="2" str="sushi"/>
// <w begin="17" depth="1" end="27" facet="NNS" id="3" lex="chopstick" relation="obl" sequence="3" str="chopsticks">
// <w begin="12" depth="2" end="16" facet="IN" id="4" lex="with" relation="case" sequence="4" str="with"/>
// </w>
// <w begin="27" depth="1" end="28" facet="." id="5" lex="." relation="punct" sequence="5" str="."/>
// </w>
// root: eat
// children: I
// children: sushi
// children: chopstick
// children: .
NLP4J Code for Simple English Syntax Analysis (2)
import nlp4j.Document;
import nlp4j.Keyword;
import nlp4j.KeywordWithDependency;
import nlp4j.impl.DefaultDocument;
import nlp4j.stanford.StanfordPosDependencyAnnotator;

public class HelloNLP4J {

    public static void main(String[] args) throws Exception {
        Document doc = new DefaultDocument();
        doc.putAttribute("text", "I eat sushi with chopsticks.");
        StanfordPosDependencyAnnotator ann = new StanfordPosDependencyAnnotator();
        ann.setProperty("target", "text");
        ann.annotate(doc);
        for (Keyword kwd : doc.getKeywords()) {
            if (kwd instanceof KeywordWithDependency) {
                KeywordWithDependency kd = (KeywordWithDependency) kwd;
                // Print the dependency tree as XML
                System.out.println(kd.toStringAsXml());
                // Walk the tree and print each dependency edge
                print(kd);
            }
        }
    }

    private static void print(KeywordWithDependency kd) {
        kd.getChildren().forEach(kwd -> {
            System.out.println(kd.getLex() + " -> (" + kwd.getRelation() + ") " + kwd.getLex());
            print(kwd);
        });
    }
}
// Expected output:
// <?xml version="1.0" encoding="UTF-8"?>
// <w begin="2" depth="0" end="5" facet="VBP" id="0" lex="eat" relation="root" sequence="0" str="eat">
// <w begin="0" depth="1" end="1" facet="PRP" id="1" lex="I" relation="nsubj" sequence="1" str="I"/>
// <w begin="6" depth="1" end="11" facet="NN" id="2" lex="sushi" relation="obj" sequence="2" str="sushi"/>
// <w begin="17" depth="1" end="27" facet="NNS" id="3" lex="chopstick" relation="obl" sequence="3" str="chopsticks">
// <w begin="12" depth="2" end="16" facet="IN" id="4" lex="with" relation="case" sequence="4" str="with"/>
// </w>
// <w begin="27" depth="1" end="28" facet="." id="5" lex="." relation="punct" sequence="5" str="."/>
// </w>
//
// eat -> (nsubj) I
// eat -> (obj) sushi
// eat -> (obl) chopstick
// chopstick -> (case) with
// eat -> (punct) .
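The relation labels also make it easy to pull out grammatical roles directly. A hedged variant of the loop body above: instead of calling print(kd), read the subject and object off the root's children, using only the getChildren(), getRelation(), and getLex() methods demonstrated in this example:
// Pick out subject and object among the root's direct children
kd.getChildren().forEach(child -> {
    if ("nsubj".equals(child.getRelation())) {
        System.out.println("subject: " + child.getLex());
    } else if ("obj".equals(child.getRelation())) {
        System.out.println("object: " + child.getLex());
    }
});
// subject: I
// object: sushi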
NLP4J Code for Stanford NLP Open IE (Information Extraction): Triples and Clauses
StanfordOpenIEAnnotator ann = new StanfordOpenIEAnnotator();
ann.setProperty("target", "text");
Document doc = new DefaultDocument();
doc.putAttribute("text", //
        "Mount Fuji, located on the island of Honshu, " //
        + "is the highest mountain in Japan.");
ann.annotate(doc);
// Each extraction is a keyword whose facet marks it as a triple or a clause
doc.getKeywords().forEach(kwd -> System.out.println(kwd.getFacet() + "," + kwd.getLex()));
// Expected output:
// pattern.oie.triple,mount fuji , is highest mountain in , japan
// pattern.oie.triple,mount fuji , is mountain in , japan
// pattern.oie.triple,mount fuji , is , mountain
// pattern.oie.triple,mount fuji , is , highest mountain
// pattern.oie.triple,mount fuji , located on , island honshu
// pattern.oie.triple,highest mountain , is in , japan
// pattern.oie.triple,mount fuji , located on , island
// pattern.oie.clause,Mount Fuji located on the island of Honshu is the highest mountain in Japan
// pattern.oie.clause,Mount Fuji located on the island of Honshu
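Triples and clauses arrive as keywords with different facets (pattern.oie.triple vs. pattern.oie.clause), so the two kinds of extraction can be separated with a plain facet check, again using only the calls shown above:
// Keep only the triple extractions, dropping the clauses
doc.getKeywords().forEach(kwd -> {
    if ("pattern.oie.triple".equals(kwd.getFacet())) {
        System.out.println(kwd.getLex());
    }
});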
NLP4J Maven for Reading Wikipedia Dump
<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-wiki -->
<dependency>
    <groupId>org.nlp4j</groupId>
    <artifactId>nlp4j-wiki</artifactId>
    <version>1.1.0.0</version>
</dependency>
NLP4J Code for Reading Wikipedia Dump
String itemString = "Nintendo";
String dir = "/usr/local/wiki/enwiki/20230101/";
// Index file
File indexFile = new File(dir + "enwiki-20230101-pages-articles-multistream-index.txt.bz2");
// Dump file
File dumpFile = new File(dir + "enwiki-20230101-pages-articles-multistream.xml.bz2");
try (WikiDumpReader dumpReader = new WikiDumpReader(dumpFile, indexFile)) {
    // Look up the article by title
    WikiPage page = dumpReader.getItem(itemString);
    // Plain text of the article body
    System.out.println(page.getRootNodePlainText());
    // Expected output:
    // is a Japanese multinational video game company headquartered
    // in Kyoto, Japan. It develops video games and video game consoles ...
    // Raw wiki markup of the article
    System.out.println("<text>\n" + page.getText() + "\n</text>");
    // Expected output:
    // {{Short description|Japanese video game company}} <!-- popup
    // [[File:Nintendo.svg]] --> {{Pp-vandalism|small=yes}} {{Use dmy
    // dates|date=October 2022}} {{Use American English|date=November 2020}}
    // {{Infobox company | name = Nintendo Co., Ltd. | logo = Nintendo.svg |
    // logo_alt = Logo in white on red background since 2016 | logo_caption = Logo
    // in white on red background since 2016 | image =
    // Nintendo_Headquarters_-_panoramio.jpg ...
}
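The extracted plain text can be fed straight back into the annotators from the earlier sections. A minimal sketch to place inside the try block above, assuming nlp4j-stanford is also on the classpath and that StanfordPosAnnotator accepts the same "target" property via setProperty as StanfordPosDependencyAnnotator does in the syntax examples:
// Annotate the Wikipedia article text with part-of-speech tags
Document wikiDoc = new DefaultDocument();
wikiDoc.putAttribute("text", page.getRootNodePlainText());
StanfordPosAnnotator posAnn = new StanfordPosAnnotator();
posAnn.setProperty("target", "text");
posAnn.annotate(wikiDoc);
wikiDoc.getKeywords().forEach(kwd -> System.out.println(kwd.getFacet() + "," + kwd.getLex()));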
See also
Natural Language Processing with Groovy, OpenNLP, CoreNLP, Nlp4j, Datumbox, Smile, Spark NLP, DJL and TensorFlow
https://groovy.apache.org/blog/natural-language-processing-with-groovy