Achieves near complete test coverage for part of speech tagger module

nhunzaker · Jan 20, 2012 · 45955e4 · 45955e4
1 parent 1a2216e
commit 45955e4
Show file tree

Hide file tree

Showing 4 changed files with 404 additions and 73 deletions.
diff --git a/actions/what.js b/actions/what.js
@@ -8,7 +8,9 @@ var lang    = require("../brain/language")
 ;
 
 module.exports = function what (a) {
-
+
+    if (a.owner === a.subject) a.subject = "definition";
+
     var nodebot   = this
     ,   owner     = a.owner
     ,   subject   = a.subject || "definition"
@@ -55,8 +57,9 @@ module.exports = function what (a) {
             return nodebot.request();
         }
 
-        nodebot.lexico[nowner] = nodebot.lexicon[owner] || {};
+        nodebot.lexicon[owner] = nodebot.lexicon[owner] || {};
         nodebot.lexicon[owner][subject] = text;
+
         nodebot.say("Great, now I know!");
 
         return nodebot.request();

diff --git a/brain/language/tagger.js b/brain/language/tagger.js
@@ -1,4 +1,14 @@
 // Tagger.js
+//
+// Breaks up speech into components and assists with
+// classifying things such as the subject, ownership,
+// and action for a statement
+//
+// Note: I am not a linguist, this is the result of
+// blood, sweat, and tears!
+//
+// Please help me make this better:
+// https://github.com/nhunzaker/nodebot
 // -------------------------------------------------- //
 
 var lev = require("levenshtein")
@@ -21,7 +31,7 @@ var closest = module.exports.closest = function(string, words) {
     }
 
     words.forEach(function(word) {
-
+        
         var distance = lev(string, word);
 
         if (distance < shortest) {
@@ -36,39 +46,48 @@ var closest = module.exports.closest = function(string, words) {
 
 // Checks if a string is fileish
 var isFile = module.exports.isFile = function(string) {
+    string = string || "";
     return (string.replace(/\s/g, "").match(fileEx) !== null);
 };
 
 
 // Returns the part of speech for a particular word
 var getType = module.exports.getType = function (string) {
-    return tagger.tag(lexer.lex(string))[0][1];
+
+    if (string) {
+        return tagger.tag(lexer.lex(string))[0][1];
+    } else {
+        return undefined;
+    }
+
 };
 
 
 // Finds all words between the last of the first and last
 // of two types
-var getBetween = module.exports.getBetween = function(lex, type1, type2) {
+var getBetween = module.exports.getBetween = function(lex, type1, type2, form) {
 
     var tagged = tagger.tag(lex)
-    , filter1 = filter2 = [];
+    , filter1 = filter2 = start = end = [];
+
+    form = form || "outside"
 
     type1 = (typeof type1 === 'string') ? [type1] : type1;
     type2 = (typeof type2 === 'string') ? [type2] : type2;
 
     filter1 =  tagged.filter(function(i) { return type1.indexOf(i[1]) !== -1 }) || [];
     filter2 =  tagged.filter(function(i) { return type2.indexOf(i[1]) !== -1  }) || [];
-
-    var start  = (filter1[0]) ? filter1[0][0] : undefined
-    ,   end    = (filter2.slice(-1)[0]) ? filter2.slice(-1)[0][0] : undefined;
-
 
-    if (start || end) {
-        return lex.slice(lex.indexOf(start) + 1, lex.indexOf(end) + 1);
+    if (form === "outside") {
+        start = (filter1[0]) ? filter1[0][0] : undefined
     } else {
-        return [];
+        start = (filter1.slice(-1)[0]) ? filter1.slice(-1)[0][0] : undefined
     }
 
+    end = (filter2.slice(-1)[0]) ? filter2.slice(-1)[0][0] : undefined;
+
+
+    return (start || end) ? lex.slice(lex.indexOf(start) + 1, lex.indexOf(end) + 1) : [];
 
 };
 
@@ -90,19 +109,24 @@ var getTypes = module.exports.getTypes = function (array, string, strict) {
 };
 
 
-var classify = module.exports.classify = function(speech) {
+var classify = module.exports.classify = function(speech, debug) {
 
     var text   = speech || process.argv.slice(2).join(" ")
     ,   words  = lexer.lex(text)
     ,   tagged = tagger.tag(words)
     ,   action = subject = owner = false;
 
+    if (debug) {
+        console.log(tagged);
+    }
+
+
     // Classify!
     // -------------------------------------------------- //
 
     var verbs       = getTypes(tagged, "VB")
     ,   nouns       = getTypes(tagged, "NN")
-    ,   pronouns    = getTypes(tagged, "P")
+    ,   pronouns    = getTypes(tagged, "PRP") // finds all posessive pronouns
     ,   actions     = getTypes(tagged, "W")
     ,   adverbs     = getTypes(tagged, "R")
     ,   adjectives  = getTypes(tagged, "JJ")
@@ -141,116 +165,144 @@ var classify = module.exports.classify = function(speech) {
 
     var posession = tagged.filter(function(i) { return i[1] === "PRP$" || i[1] === "PRP"; });
 
-    // If there is posession, then use it
-    if (posession.length > 0) { 
+    // If there is possession and we have an action, then
+    // the owner is the possessive word
+    if (posession.length > 0 && action) {
+
         owner = posession[0][0];
+
+        // More bulletproofing: if the owner word appears earlier
+        // in the sentence than the action, then we need to ignore
+        // all of the verbs/possessives before the action
+        //
+        // ex: "Do you know what the current directory is?"
+        if (words.indexOf(owner) < words.indexOf(action)) {
+            owner = getBetween(words, ["DT"], "NN").join(" ");
+        }
+
     }
 
     // No ? Let's try between a preposition and 
     // determiners/nouns
     else if (determiners.length > 0 && preps.length > 0) {
-
-        owner = getBetween(words, "IN", ["DT", "NN"]);
+        
+        owner = getBetween(words, ["IN"], ["DT", "NN", "."]);
 
         // Strip accidental determinates
         if (getType(owner[0]) === "DT") owner = owner.slice(1);
-
-        owner = owner.join(" ");
+
+        // Strip accidental punctuation
+        if (getType(owner.slice(-1)[0]) === ".") owner = owner.slice(0, -1);
+
+        owner = owner.join(" ").trim();
     }
 
+    // At this point, we can really only guess that
+    // the owner is between the verb and the end of the
+    // statement
     else if (verbs.length > 0) {
-
+        
         owner = getBetween(words, ["VBZ", "VBP"], ".").slice(0, -1)
 
         // Strip accidental determinates
         if (getType(owner[0]) === "DT") owner = owner.slice(1);
-
-        owner = owner.join(" ");
+
+        // Strip accidental punctuation
+        if (getType(owner[0]) === ".") owner = owner.slice(1);
+
+        owner = owner.join(" ").trim();
+
     }
 
 
     // SUBJECT
-    // Answers : "What should the nodebot's action target?"
+    // Answers : "What is this statement about?"
     // -------------------------------------------------- //
 
-    // If ownership, then the and the next word is a noun then
-    // the subject is the noun
-    if (owner) {
-        subject = getBetween(words, ["DT", "PRP$"], ["IN", "."]).slice(0, -1).join(" ");
+    // If there is a file within the statement, it's probably
+    // the subject
+    if (speech.match(fileEx) !== null) {
+        subject = speech.match(fileEx)[0].trim();
     } 
-    // If there are no nouns and there is an owner, the subject is the owner
-    else if (nouns.length === 0 && owner) {
-        subject = owner;
-    }
 
-    // Okay, if that isn't true and we have prepositions
-    // then the subject will be the words following
+    // If there is a website within the statement, it's probably
+    // the subject
+    else if (websites.length > 0) {
+        subject = websites[0].trim()
+    }
 
-    // Start with the word after the preposition
-    // end with the next adjective or prep we see
+    // If ownership and there are prepositions, scan for words between
+    // prepositions, determiners, and possessive words and
+    // prepositions, nouns, and punctuation
+    else if (owner && preps.length > 0) {
+
+        debug && console.log("fire");
+
+        // To account for more than one preposition, we need to be able to filter between
+        // either the inside or outside preposition
+        if (preps.length === 1) {
+            subject = getBetween(words, ["IN", "DT", "PRP$"], ["IN", "NN", "."], "outside");
+        } else {
+            subject = getBetween(words, ["IN", "DT", "PRP$"], ["IN", "NN", "."], "inside");
+        }
 
-    else if (preps.length > 0) {
-        subject = getBetween("IN", ["IN", "VBZ", "."]);
-    } 
+        // Autocorrect for trailing punctuation
+        if (getType(subject.slice(-1)[0]) === ".") {
+            subject = subject.slice(0, -1);
+        }
 
-    // Cute, at this point we check for determiners 
-    // (the, some...)
+        // Autocorrect for trailing ownership
+        if (subject.slice(-1)[0] === owner) {
+            subject = subject.slice(0, -1);
+        }
+
+        // Autocorrect for trailing prepositions
+        if (getType(subject.slice(-1)[0]) === "IN") {
+            subject = subject.slice(0, -1);
+        }
 
-    else if (determiners.length > 0) {
-        var det = determiners.slice(-1)[0];
-        subject = words[words.indexOf(det) + 1];
-    } 
-
-    // Now let's check if the first verb is
-    // present-tense, then it's probably between
-    // the first verb and the action
-
-    else if (getType(verbs[0]) === "VBZ") {
+        subject = subject.join(" ").trim();
 
-        var start = words.indexOf(verbs[0]) + 1
-        ,   end   = words.indexOf(action);
-
-        subject = words.slice(start, end).join(" ");
-
     } 
 
 
-    // Autocorrect for files
-    // we didn't accidently add whitespace
-    subject = (isFile(subject)) ? subject.replace(/\s/g, "").match(fileEx)[0] : subject
-    owner = (isFile(owner)) ? owner.replace(/\s/g, "").match(fileEx)[0] : owner
-
+    // Okay, last chance. If there *is* ownership, and there are no prepositions,
+    // then the subject is between the last determiner/verb/possessive and the last noun
+    // (*phew...*)
+    else if (owner && preps.length === 0) {
+        subject = getBetween(words, ["DT", "VBP", "PRP$"], "NN", "inside").join(" ");
+    }
+
     // Now that everything is properly classified,
     // let's filter the ownership
 
     switch(owner) {
-
+
+        // Reverse user possession
     case "me": case "my": case "i": case "I":
         owner = "user";
         break;
-
+
+        // Reverse nodebot possession
     case "your": case "you":
         owner = "nodebot";
         break;
 
-    case undefined: case "it": case "its": 
+        // Tweak other non-specific possession cases to the last
+        // recorded context
+    case "": case "it": case "its": 
     case "they": case "their": case "he": case "she": 
     case "his": case "hers":
-        owner = this.memory.context;
+        owner = Nodebot.memory.context;
         break;
     }
 
 
-    // If the subject is the same as the owner, make a last
-    // minute correction
-
-    if (subject === owner) subject = "definition";
-
     // Return what we find
     // -------------------------------------------------- //
 
     var ret = {
-        action  : action,
+        action  : (action) ? action.toLowerCase() : undefined,
         owner   : owner,
         subject : subject,
         tokens  : words

diff --git a/nodebot.js b/nodebot.js
@@ -37,6 +37,10 @@ require("./brain/interaction")(Nodebot);
 var command = process.argv.slice(2).join(" ").trim();
 
 // Take the proper initial action
-(command !== "") ? Nodebot.analyze(command) : Nodebot.request();
+
+if (!module.parent) {
+    (command !== "") ? Nodebot.analyze(command) : Nodebot.request();    
+}
+