Skip to content

Commit

Permalink
Port over some code from Lingua::EN::Tagger to extract maximal noun phrases from POS-tagged tokens.
Browse files Browse the repository at this point in the history
  • Loading branch information
schuyler committed Aug 8, 2012
1 parent 837f04e commit 789e338
Showing 1 changed file with 64 additions and 0 deletions.
64 changes: 64 additions & 0 deletions app/scripts/mnp.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/* compute maximal noun phrases from tagged text
the relevant parts are from Lingua-EN-Tagger/Tagger.pm */

/* Part-of-speech tag patterns, keyed by the $NAME placeholders used in the
   noun-phrase template below. Each value is a regex fragment matching a tag. */
// NOTE(review): names such as 'DET', 'PPR' and 'PP' appear to come from the
// Lingua::EN::Tagger tag set -- confirm the tagger in ./pos emits the same
// names (e.g. 'DET' rather than 'DT') before relying on them.
var tags = {
    NUM  : 'CD',
    GER  : 'VBG',
    ADJ  : 'JJ[RS]*',
    PART : 'VBN',
    NN   : 'NN[SP]*',
    NNP  : 'NNP',
    PREP : 'IN',
    DET  : 'DET',
    PAREN: '[LR]RB',
    QUOT : 'PPR',
    SEN  : 'PP'
};

/* Readable template for a maximal noun phrase. JavaScript has no /x flag, so
   the template is kept as one string per line and the "#" comments and all
   layout whitespace are stripped by hand before the RegExp is compiled. */
var max_noun_regex = [
    "# optional number, gerund - adjective - participle",
    "(?:$NUM)?(?:$GER|$ADJ|$PART)*",
    "# Followed by one or more nouns",
    "(?:$NN)+",
    "(?:",
    "    # Optional preposition, determinant, cardinal",
    "    (?:$PREP)*(?:$DET)?(?:$NUM)?",
    "    # Optional gerund or adjective or participle",
    "    (?:$GER|$ADJ|$PART)*",
    "    # one or more nouns",
    "    (?:$NN)+",
    ")*"
].join("\n");

// Remove comments and layout whitespace BEFORE expanding the placeholders, so
// nothing inside an expanded fragment can be stripped by accident.
max_noun_regex = max_noun_regex.replace(/#.*$/gm, "").replace(/\s+/g, "");

// Expand each $NAME into a unit matching one "<index>:<tag>" item of the
// space-joined structure string, plus any whitespace that follows it. The
// word boundaries stop a pattern from matching only part of an index or tag.
max_noun_regex = max_noun_regex.replace(/\$([A-Z]+)/g, function (_, arg) {
    if (!Object.prototype.hasOwnProperty.call(tags, arg)) {
        throw new Error("Unknown tag placeholder: $" + arg);
    }
    return "(?:\\b\\d+:(?:" + tags[arg] + ")\\b\\s*)";
});

// Global flag: String#match then returns every phrase in the structure string.
max_noun_regex = new RegExp(max_noun_regex, "g");

/**
 * Extract maximal noun phrases from a list of POS-tagged tokens.
 *
 * @param {Array} tokens - array of [token, pos] pairs.
 * @returns {Array} one entry per phrase matched by max_noun_regex; each entry
 *   is the array of the original [token, pos] pairs that make up the phrase,
 *   in order. Empty when nothing matches.
 */
function get_noun_phrases(tokens) {
    // Encode the tag sequence as "0:TAG 1:TAG ..." so that every regex match
    // carries the indices of the tokens it covers.
    var structure = tokens.map(function (pair, i) {
        return i + ":" + pair[1];
    }).join(" ");

    // String#match yields null (not an empty array) when nothing matches.
    var matches = structure.match(max_noun_regex) || [];

    return matches.map(function (match) {
        // A match is a whitespace-separated run of "index:tag" items; split it
        // into items rather than iterating the string character by character.
        return match.split(/\s+/)
            .filter(function (item) { return /^\d+:/.test(item); })
            .map(function (item) { return tokens[parseInt(item, 10)]; });
    });
}

// Command-line driver: lex and POS-tag the text given as the first
// command-line argument, then print the tagged tokens.
var Pos = require("./pos");
var input = process.argv[2];

var lexer = new Pos.Lexer();
var lex = lexer.lex(input);

var tagger = new Pos.Tagger();
var tokens = tagger.tag(lex);

console.log(tokens);

0 comments on commit 789e338

Please sign in to comment.