Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Port over some code from Lingua::EN::Tagger to extract maximal noun phrases from POS-tagged tokens.
  • Loading branch information...
commit 789e338b546f8668f004737cb7f2b68b4fc6787d 1 parent 837f04e
Schuyler Erle authored August 07, 2012

Showing 1 changed file with 64 additions and 0 deletions. Show diff stats Hide diff stats

  1. 64  app/scripts/mnp.js
64  app/scripts/mnp.js
... ...
@@ -0,0 +1,64 @@
  1
+/* compute maximal noun phrases from tagged text
  2
+   the relevant parts are from Lingua-EN-Tagger/Tagger.pm */
  3
+ 
  4
/* Part-of-speech tag patterns, keyed by the placeholder names used in the
   maximal-noun-phrase template below. Each value is a regex fragment that
   matches one tag.
   NOTE(review): these names follow Lingua::EN::Tagger (e.g. 'DET', 'PP',
   'PPR'); confirm they match the tags actually produced by ./pos. */
var tags = {
    NUM  : 'CD',        // cardinal number
    GER  : 'VBG',       // gerund
    ADJ  : 'JJ[RS]*',   // adjective
    PART : 'VBN',       // participle
    NN   : 'NN[SP]*',   // noun
    NNP  : 'NNP',
    PREP : 'IN',        // preposition
    DET  : 'DET',       // determiner
    PAREN: '[LR]RB',
    QUOT : 'PPR',
    SEN  : 'PP'
};

/* Template for a maximal noun phrase. JavaScript regular expressions have no
   extended ("x") mode, so the pattern is assembled from whitespace-free
   pieces and the explanatory notes live in real comments rather than inside
   the pattern string. */
var max_noun_regex = [
    // optional number, then any gerunds / adjectives / participles
    "(?:$NUM)?(?:$GER|$ADJ|$PART)*",
    // followed by one or more nouns
    "(?:$NN)+",
    "(?:",
        // optional prepositions, determiner, cardinal
        "(?:$PREP)*(?:$DET)?(?:$NUM)?",
        // optional gerunds / adjectives / participles
        "(?:$GER|$ADJ|$PART)*",
        // one or more nouns
        "(?:$NN)+",
    ")*"
].join("");

/* Expand each $NAME placeholder into a pattern for one "index:TAG" token:
   - \b keeps a match from starting in the middle of a multi-digit index,
   - (?!\S) makes sure the whole tag was consumed (so 'PP' cannot match 'PPR'),
   - \s* swallows the separator that follows the token. */
max_noun_regex = max_noun_regex.replace(/\$([A-Z]+)/g, function (_, name) {
    return "\\b\\d+:(?:" + tags[name] + ")(?!\\S)\\s*";
});
max_noun_regex = new RegExp(max_noun_regex, "g");
  37
+
  38
/**
 * Extract maximal noun phrases from a list of tagged tokens.
 *
 * The tags are serialised as a space-separated string of "index:TAG" items,
 * the module-level max_noun_regex is run over that string, and every
 * "index:TAG" item of every match is mapped back to the entry of `tokens`
 * at that index.
 *
 * @param {Array} tokens - array of [token, pos] pairs.
 * @returns {Array} one array per matched phrase, each holding the matching
 *     entries of `tokens` in order; empty when nothing matches.
 */
function get_noun_phrases(tokens) {
    var structure = tokens.map(function (pair, i) {
        // pair = [token, pos]
        return i + ":" + pair[1];
    }).join(" ");

    // String.match returns null (not an empty array) when there is no match.
    var matches = structure.match(max_noun_regex) || [];

    var mnps = [];
    matches.forEach(function (match) {
        var items = match.trim();
        if (items === "") {
            return; // nothing to map back for an empty match
        }
        // A match is a run of "index:TAG" items; split it into items rather
        // than iterating over the string itself.
        mnps.push(items.split(/\s+/).map(function (item) {
            return tokens[parseInt(item.split(":")[0], 10)];
        }));
    });
    return mnps;
}
  57
+
  58
// Command-line driver: lex and tag the text passed as the first argument,
// then print the tagged tokens.
var Pos = require("./pos");
var input = process.argv[2];

var lex = new Pos.Lexer().lex(input);
var tokens = new Pos.Tagger().tag(lex);

console.log(tokens);
  64
+

0 notes on commit 789e338

Please sign in to comment.
Something went wrong with that request. Please try again.