-
Notifications
You must be signed in to change notification settings - Fork 2
/
DocumentCategorizerNGramFeatureExample.java
74 lines (63 loc) · 3.55 KB
/
DocumentCategorizerNGramFeatureExample.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizer;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.DocumentSampleStream;
import opennlp.tools.doccat.FeatureGenerator;
import opennlp.tools.doccat.NGramFeatureGenerator;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
/**
* oepnnlp version 1.7.2
* Usage of NGram features for Document Classification in OpenNLP
* @author www.tutorialkart.com
*/
public class DocumentCategorizerNGramFeatureExample {
public static void main(String[] args) {
try {
// read the training data
InputStreamFactory dataIn = new MarkableFileInputStreamFactory(new File("train"+File.separator+"en-movie-category.train"));
ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
// define the training parameters
TrainingParameters params = new TrainingParameters();
params.put(TrainingParameters.ITERATIONS_PARAM, 10+"");
params.put(TrainingParameters.CUTOFF_PARAM, 0+"");
// feature generators - N-gram feature generators
FeatureGenerator[] featureGenerators = { new NGramFeatureGenerator(1,1),
new NGramFeatureGenerator(2,3) };
DoccatFactory factory = new DoccatFactory(featureGenerators);
// create a model from traning data
DoccatModel model = DocumentCategorizerME.train("en", sampleStream, params, factory);
System.out.println("\nModel is successfully trained.");
// save the model to local
BufferedOutputStream modelOut = new BufferedOutputStream(new FileOutputStream("model"+File.separator+"en-movie-classifier-maxent.bin"));
model.serialize(modelOut);
System.out.println("\nTrained Model is saved locally at : "+"model"+File.separator+"en-movie-classifier-maxent.bin");
// test the model file by subjecting it to prediction
DocumentCategorizer doccat = new DocumentCategorizerME(model);
String[] docWords = "Afterwards Stuart and Charlie notice Kate in the photos Stuart took at Leopolds ball and realise that her destiny must be to go back and be with Leopold That night while Kate is accepting her promotion at a company banquet he and Charlie race to meet her and show her the pictures Kate initially rejects their overtures and goes on to give her acceptance speech but it is there that she sees Stuarts picture and realises that she truly wants to be with Leopold".replaceAll("[^A-Za-z]", " ").split(" ");
double[] aProbs = doccat.categorize(docWords);
// print the probabilities of the categories
System.out.println("\n---------------------------------\nCategory : Probability\n---------------------------------");
for(int i=0;i<doccat.getNumberOfCategories();i++){
System.out.println(doccat.getCategory(i)+" : "+aProbs[i]);
}
System.out.println("---------------------------------");
System.out.println("\n"+doccat.getBestCategory(aProbs)+" : is the predicted category for the given sentence.");
}
catch (IOException e) {
System.out.println("An exception in reading the training file. Please check.");
e.printStackTrace();
}
}
}