NLP4J

Natural Language Processing Library for Java


NLP4J Components

Core data model, utilities, CSV/JSON/plain-text parsers, etc.: nlp4j-core
English NLP: nlp4j-stanford
Japanese NLP: nlp4j-kuromoji, nlp4j-cabocha, nlp4j-mecab, nlp4j-yahoojp, nlp4j-sudachi
Wikipedia dump file parser and MediaWiki API client: nlp4j-wiki
Data crawling: Twitter, web crawler, Wikipedia dump
Document search: Apache Solr, Azure

NLP4J Maven for English NLP

<!-- for English NLP -->
<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-stanford -->
<dependency>
    <groupId>org.nlp4j</groupId>
    <artifactId>nlp4j-stanford</artifactId>
    <version>1.3.5.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>4.4.0</version>
    <scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp (models) -->
<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>4.4.0</version>
    <classifier>models</classifier>
    <scope>provided</scope>
</dependency>

NLP4J Code for Simple English Morphological Analysis

String text = "I eat sushi with chopsticks.";
// Build a part-of-speech annotator that reads the "text" attribute of a document
DocumentAnnotator ann = (new DocumentAnnotatorBuilder<>(StanfordPosAnnotator.class)).set("target", "text")
		.build();
// Wrap the input text in a Document and run the annotator
Document doc = (new DocumentBuilder()).text(text).build();
ann.annotate(doc);
// Each keyword carries its character offsets, facet (part-of-speech tag), and lemma
doc.getKeywords().forEach(kwd -> {
	System.out.println(kwd.getBegin() + "," + kwd.getEnd() + "," + kwd.getFacet() + "," + kwd.getLex());
});

// Expected output:
// 0,1,word.PRP,I
// 2,5,word.VBP,eat
// 6,11,word.NN,sushi
// 12,16,word.IN,with
// 17,27,word.NNS,chopstick
// 27,28,word..,.
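
The keyword list can be post-processed like any other Java collection. A small follow-on sketch, continuing from the doc built above; it assumes that the part-of-speech facets keep the word. prefix shown in the expected output:

// Sketch: print only the noun lemmas (facets "word.NN", "word.NNS", ...)
doc.getKeywords().forEach(kwd -> {
	if (kwd.getFacet().startsWith("word.NN")) {
		System.out.println(kwd.getLex());
	}
});

// With the sentence above this would print:
// sushi
// chopstick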

NLP4J Code for Simple English Syntax Analysis

StanfordPosDependencyAnnotator ann = new StanfordPosDependencyAnnotator();
ann.setProperty("target", "text");

Document doc = new DefaultDocument();
doc.putAttribute("text", "I eat sushi with chopsticks.");

ann.annotate(doc);

doc.getKeywords().forEach(kwd -> {
	if (kwd instanceof KeywordWithDependency) {
		KeywordWithDependency kd = (KeywordWithDependency) kwd;
		System.out.println(kd.toStringAsXml()); // print as xml
		System.out.println("I: " + kwd.getLex());
		kd.getChildren().forEach(child -> {
			System.out.println("children: " + child.getLex());
		});
	}
});

// Expected output
// <?xml version="1.0" encoding="UTF-8"?>
// <w begin="2" depth="0" end="5" facet="VBP" id="0" lex="eat" relation="root" sequence="0" str="eat">
//     <w begin="0" depth="1" end="1" facet="PRP" id="1" lex="I" relation="nsubj" sequence="1" str="I"/>
//     <w begin="6" depth="1" end="11" facet="NN" id="2" lex="sushi" relation="obj" sequence="2" str="sushi"/>
//     <w begin="17" depth="1" end="27" facet="NNS" id="3" lex="chopstick" relation="obl" sequence="3" str="chopsticks">
//         <w begin="12" depth="2" end="16" facet="IN" id="4" lex="with" relation="case" sequence="4" str="with"/>
//     </w>
//     <w begin="27" depth="1" end="28" facet="." id="5" lex="." relation="punct" sequence="5" str="."/>
// </w>
// I: eat
// children: I
// children: sushi
// children: chopstick
// children: .

NLP4J Code for Simple English Syntax Analysis (2)

public static void main(String[] args) throws Exception {
	Document doc = new DefaultDocument();
	doc.putAttribute("text", "I eat sushi with chopsticks.");
	StanfordPosDependencyAnnotator ann = new StanfordPosDependencyAnnotator();
	ann.setProperty("target", "text");
	ann.annotate(doc);
	for (Keyword kwd : doc.getKeywords()) {
		if (kwd instanceof KeywordWithDependency) {
			KeywordWithDependency kd = (KeywordWithDependency) kwd;
			// Print the dependency tree as XML
			System.out.println(kd.toStringAsXml());
			print(kd);
		}
	}
}

private static void print(KeywordWithDependency kd) {
	kd.getChildren().forEach(kwd -> {
		System.out.println(kd.getLex() + " -> (" + kwd.getRelation() + ") " + kwd.getLex());
		print(kwd);
	});
}

// Expected output:
// <?xml version="1.0" encoding="UTF-8"?>
// <w begin="2" depth="0" end="5" facet="VBP" id="0" lex="eat" relation="root" sequence="0" str="eat">
//     <w begin="0" depth="1" end="1" facet="PRP" id="1" lex="I" relation="nsubj" sequence="1" str="I"/>
//     <w begin="6" depth="1" end="11" facet="NN" id="2" lex="sushi" relation="obj" sequence="2" str="sushi"/>
//     <w begin="17" depth="1" end="27" facet="NNS" id="3" lex="chopstick" relation="obl" sequence="3" str="chopsticks">
//         <w begin="12" depth="2" end="16" facet="IN" id="4" lex="with" relation="case" sequence="4" str="with"/>
//     </w>
//     <w begin="27" depth="1" end="28" facet="." id="5" lex="." relation="punct" sequence="5" str="."/>
// </w>
//
// eat -> (nsubj) I
// eat -> (obj) sushi
// eat -> (obl) chopstick
// chopstick -> (case) with
// eat -> (punct) .

NLP4J Code for Stanford NLP Open IE (Information Extraction), Triples, and Clauses

StanfordOpenIEAnnotator ann = new StanfordOpenIEAnnotator();
ann.setProperty("target", "text");

Document doc = new DefaultDocument();
doc.putAttribute("text", //
		"Mount Fuji, located on the island of Honshu, " //
				+ "is the highest mountain in Japan. ");

ann.annotate(doc);
doc.getKeywords().forEach(kwd -> System.out.println(kwd.getFacet() + "," + kwd.getLex()));

// Expected Output
// pattern.oie.triple,mount fuji , is highest mountain in , japan
// pattern.oie.triple,mount fuji , is mountain in , japan
// pattern.oie.triple,mount fuji , is , mountain
// pattern.oie.triple,mount fuji , is , highest mountain
// pattern.oie.triple,mount fuji , located on , island honshu
// pattern.oie.triple,highest mountain , is in , japan
// pattern.oie.triple,mount fuji , located on , island
// pattern.oie.clause,Mount Fuji located on the island of Honshu is the highest mountain in Japan
// pattern.oie.clause,Mount Fuji located on the island of Honshu
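
Each triple keyword packs subject, relation, and object into a single lex value. A small follow-on sketch, continuing from the doc above; the " , " separator is an assumption based on the expected output, not a documented format:

// Sketch: split each Open IE triple into subject / relation / object
doc.getKeywords().forEach(kwd -> {
	if ("pattern.oie.triple".equals(kwd.getFacet())) {
		// assumption: lex joins the three parts with " , " as in the output above
		String[] spo = kwd.getLex().split(" , ");
		if (spo.length == 3) {
			System.out.println("subject=" + spo[0] + " / relation=" + spo[1] + " / object=" + spo[2]);
		}
	}
});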

NLP4J Maven for Reading Wikipedia Dump

<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-wiki -->
<dependency>
    <groupId>org.nlp4j</groupId>
    <artifactId>nlp4j-wiki</artifactId>
    <version>1.1.0.0</version>
</dependency>

NLP4J Code for Reading Wikipedia Dump

String itemString = "Nintendo";
String dir = "/usr/local/wiki/enwiki/20230101/";
// Index File
File indexFile = new File(dir + "enwiki-20230101-pages-articles-multistream-index.txt.bz2");
// Dump File
File dumpFile = new File(dir + "enwiki-20230101-pages-articles-multistream.xml.bz2");

try (WikiDumpReader dumpReader = new WikiDumpReader(dumpFile, indexFile)) {

	WikiPage page = dumpReader.getItem(itemString);
	System.out.println(page.getRootNodePlainText());
// Expected output:
// is a Japanese multinational video game company headquartered
// in Kyoto, Japan. It develops video games and video game consoles ...

	System.out.println("<text>\n" + page.getText() + "\n</text>");
// {{Short description|Japanese video game company}} <!-- popup
//  [[File:Nintendo.svg]] --> {{Pp-vandalism|small=yes}} {{Use dmy
//  dates|date=October 2022}} {{Use American English|date=November 2020}}
//  {{Infobox company | name = Nintendo Co., Ltd. | logo = Nintendo.svg |
//  logo_alt = Logo in white on red background since 2016 | logo_caption = Logo
//  in white on red background since 2016 | image =
//  Nintendo_Headquarters_-_panoramio.jpg ... 

}
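
A single reader can also serve several lookups before it is closed. A minimal sketch reusing the same dump and index files; the titles are only examples, and the null check is an assumption about what getItem returns for a title that is not in the index:

try (WikiDumpReader reader = new WikiDumpReader(dumpFile, indexFile)) {
	for (String title : new String[] { "Nintendo", "Kyoto" }) {
		WikiPage p = reader.getItem(title);
		if (p != null) { // assumption: null for titles missing from the index
			System.out.println(title + ": " + p.getRootNodePlainText());
		}
	}
}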

See also

Natural Language Processing with Groovy, OpenNLP, CoreNLP, Nlp4j, Datumbox, Smile, Spark NLP, DJL and TensorFlow

https://groovy.apache.org/blog/natural-language-processing-with-groovy

Author

Hiroki Oya (Twitter, LinkedIn)
