-
Notifications
You must be signed in to change notification settings - Fork 3
/
nlp.js
101 lines (74 loc) · 3.24 KB
/
nlp.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
// @flow
/*jslint node: true */
var fs = require('fs'); // for reading text files
// Sample data for these examples. fs.readFileSync() is called without an
// encoding, so its result is turned into a string by concatenating it onto
// ' ' (every sample therefore starts with a single space).
function loadSampleText(path) {
  var contents = fs.readFileSync(path);
  return ' ' + contents;
}

var economy = loadSampleText('data/texts/economy.txt');
var politics = loadSampleText('data/texts/politics.txt');
var sports = loadSampleText('data/texts/sports.txt');
// Load the 'natural' NLP library and create the word tokenizer used below.
var natural = require('natural');
var tokenizer = new natural.WordTokenizer();

// Tokenize the politics sample text and print whatever tokenize() returns.
console.log("\n-- Tokenized sample text in politics.txt:");
var politicsTokens = tokenizer.tokenize(politics);
console.log(politicsTokens);
// Porter stemmer demo, part 1: stem one word through the stemmer object.
var porter = natural.PorterStemmer;
console.log("\n-- Use Porter Stemmer on a single word:");
console.log(porter.stem("dogs"));

// Part 2: attach() adds methods to strings, which is what allows
// tokenizeAndStem() to be called directly on the two strings below.
porter.attach();
console.log("\n-- Use Porter Stemmer text in file sports.txt:");
[sports, "dog dogs Dog dogged"].forEach(function (text) {
  console.log(text.tokenizeAndStem());
});
// Bayes classifier demo: register one labelled document per topic
// (economy, politics, sports — in that order), then train.
var classifier = new natural.BayesClassifier();
[
  [economy, 'economy'],
  [politics, 'politics'],
  [sports, 'sports']
].forEach(function (sample) {
  var text = sample[0];
  var label = sample[1];
  classifier.addDocument(text, label);
});
classifier.train();

// Run a few test sentences through the trained classifier and print
// what classify() returns for each one.
console.log("\n-- Bayesian classifier test results:");
[
  'The President and Congress went on vacation.',
  'Tax rates might be effected by quantitative easing.',
  'I like baseball more than football.'
].forEach(function (sentence) {
  console.log(classifier.classify(sentence));
});
// N-gram demo: print the bigrams, then the trigrams, that natural.NGrams
// returns for the sports sample text.
var NGrams = natural.NGrams;

console.log("\n-- 2grams in text from file sports.txt:");
var sportsBigrams = NGrams.bigrams(sports);
console.log(sportsBigrams);

console.log("\n-- 3grams in text from file sports.txt:");
var sportsTrigrams = NGrams.trigrams(sports);
console.log(sportsTrigrams);
// TF-IDF setup: create the index and add the three sample texts in the
// order economy, politics, sports. The second addDocument() argument is
// presumably a per-document key — confirm against natural's TfIdf docs.
var TfIdf = natural.TfIdf;
var tfidf = new TfIdf();
[
  [economy, 'economy'],
  [politics, 'politics'],
  [sports, 'sports']
].forEach(function (doc) {
  tfidf.addDocument(doc[0], doc[1]);
});
/**
 * Prints the TF-IDF report for one query word: a heading, the word itself,
 * and then one line per document with the index and measure that
 * tfidf.tfidfs() hands to its callback.
 *
 * Replaces five copy-pasted stanzas that differed only in the query word;
 * the printed output is unchanged.
 *
 * @param {string} word - the term to score against the documents in `tfidf`.
 */
function reportTfidf(word) {
  console.log('\n-- tfidf for word "' + word + '" in three test documents:');
  console.log(word + ':');
  tfidf.tfidfs(word, function (i, measure) {
    console.log('document #' + i + ' is ' + measure);
  });
}

// Query words, reported in the same order as before.
['economy', 'politics', 'sports', 'Congress', 'taxes'].forEach(function (word) {
  reportTfidf(word);
});
// WordNet demo: the data directory comes from the WORDNET_DATA environment
// variable; it is logged and then handed to the WordNet constructor as-is.
var wordnet_data_path = process.env.WORDNET_DATA;
console.log("Wordnet data path: " + wordnet_data_path);
var wordnet = new natural.WordNet(wordnet_data_path);

// Maps the one-letter codes read from result.pos to readable names
// (both 'a' and 's' are shown as 'adjective').
var pos_map = {
  v: 'verb',
  n: 'noun',
  a: 'adjective',
  s: 'adjective',
  r: 'adverb'
};

// Prints a single lookup result. The heading line is emitted once per
// result, not once per lookup.
function printBankResult(result) {
  console.log('\n-- Wordnet data for "bank":');
  var partOfSpeech = pos_map[result.pos];
  console.log(' part of speech: ' + partOfSpeech);
  console.log(' lemma: ' + result.lemma);
  console.log(' synonyms: ' + result.synonyms);
  console.log(' gloss: ' + result.gloss);
}

// Look up "bank" and print every result passed to the callback.
wordnet.lookup('bank', function (results) {
  results.forEach(function (result) {
    printBankResult(result);
  });
});