Skip to content

Commit

Permalink
Achieves near complete test coverage for part of speech tagger module
Browse files Browse the repository at this point in the history
  • Loading branch information
nhunzaker committed Jan 20, 2012
1 parent 1a2216e commit 45955e4
Show file tree
Hide file tree
Showing 4 changed files with 404 additions and 73 deletions.
7 changes: 5 additions & 2 deletions actions/what.js
Expand Up @@ -8,7 +8,9 @@ var lang = require("../brain/language")
;

module.exports = function what (a) {


if (a.owner === a.subject) a.subject = "definition";

var nodebot = this
, owner = a.owner
, subject = a.subject || "definition"
Expand Down Expand Up @@ -55,8 +57,9 @@ module.exports = function what (a) {
return nodebot.request();
}

nodebot.lexico[nowner] = nodebot.lexicon[owner] || {};
nodebot.lexicon[owner] = nodebot.lexicon[owner] || {};
nodebot.lexicon[owner][subject] = text;

nodebot.say("Great, now I know!");

return nodebot.request();
Expand Down
192 changes: 122 additions & 70 deletions brain/language/tagger.js
@@ -1,4 +1,14 @@
// Tagger.js
//
// Breaks up speech into components and assists with
// classifying things such as the subject, ownership,
// and action for a statement
//
// Note: I am not a linguist, this is the result of
// blood, sweat, and tears!
//
// Please help me make this better:
// https://github.com/nhunzaker/nodebot
// -------------------------------------------------- //

var lev = require("levenshtein")
Expand All @@ -21,7 +31,7 @@ var closest = module.exports.closest = function(string, words) {
}

words.forEach(function(word) {

var distance = lev(string, word);

if (distance < shortest) {
Expand All @@ -36,39 +46,48 @@ var closest = module.exports.closest = function(string, words) {

// Reports whether a string matches the fileEx pattern once all
// whitespace has been stripped out. Falsy input is treated as "".
var isFile = module.exports.isFile = function(string) {
    var compact = (string || "").replace(/\s/g, "");
    return compact.match(fileEx) !== null;
};


// Returns the part of speech for a particular word.
//
// string - the text to tag; only the tag of its first token is read
//
// Returns the tag of the first lexed token, or undefined when
// string is falsy (empty, null or undefined).
var getType = module.exports.getType = function (string) {

    // Guard first: with nothing to lex there is no first token to read,
    // and an unguarded lookup would make this check unreachable
    if (!string) {
        return undefined;
    }

    return tagger.tag(lexer.lex(string))[0][1];

};


// Finds all words between a word of one type and the last
// word of another type.
//
// lex   - array of tokens to search (also handed to tagger.tag)
// type1 - tag, or array of tags, marking where the range begins
// type2 - tag, or array of tags, marking where the range ends
// form  - "outside" (the default) begins after the first type1 match;
//         any other value begins after the last type1 match
//
// Returns the tokens following the start word up to and including the
// end word, or [] when neither boundary was found. Both boundary words
// are located with lex.indexOf, i.e. by their first occurrence in lex.
var getBetween = module.exports.getBetween = function(lex, type1, type2, form) {

    // Declare every local explicitly: chaining the initialisers
    // (a = b = c = []) would leak all but the first name as globals
    var tagged = tagger.tag(lex)
      , filter1
      , filter2
      , start
      , end;

    form = form || "outside";

    type1 = (typeof type1 === 'string') ? [type1] : type1;
    type2 = (typeof type2 === 'string') ? [type2] : type2;

    filter1 = tagged.filter(function(i) { return type1.indexOf(i[1]) !== -1; });
    filter2 = tagged.filter(function(i) { return type2.indexOf(i[1]) !== -1; });

    if (form === "outside") {
        start = (filter1[0]) ? filter1[0][0] : undefined;
    } else {
        start = (filter1.slice(-1)[0]) ? filter1.slice(-1)[0][0] : undefined;
    }

    end = (filter2.slice(-1)[0]) ? filter2.slice(-1)[0][0] : undefined;

    // indexOf returns -1 for a boundary that was not found, so a missing
    // start slices from index 0 and a missing end yields an empty range
    return (start || end) ? lex.slice(lex.indexOf(start) + 1, lex.indexOf(end) + 1) : [];

};

Expand All @@ -90,19 +109,24 @@ var getTypes = module.exports.getTypes = function (array, string, strict) {
};


var classify = module.exports.classify = function(speech) {
var classify = module.exports.classify = function(speech, debug) {

var text = speech || process.argv.slice(2).join(" ")
, words = lexer.lex(text)
, tagged = tagger.tag(words)
, action = subject = owner = false;

if (debug) {
console.log(tagged);
}


// Classify!
// -------------------------------------------------- //

var verbs = getTypes(tagged, "VB")
, nouns = getTypes(tagged, "NN")
, pronouns = getTypes(tagged, "P")
, pronouns = getTypes(tagged, "PRP") // finds all posessive pronouns
, actions = getTypes(tagged, "W")
, adverbs = getTypes(tagged, "R")
, adjectives = getTypes(tagged, "JJ")
Expand Down Expand Up @@ -141,116 +165,144 @@ var classify = module.exports.classify = function(speech) {

var posession = tagged.filter(function(i) { return i[1] === "PRP$" || i[1] === "PRP"; });

// If there is posession, then use it
if (posession.length > 0) {
// If there is possession and we have an action, then
// the owner is the possessive word
if (posession.length > 0 && action) {

owner = posession[0][0];

// More bulletproofing: if the owner word appears earlier
// in the sentence than the action, then we need to ignore
// all of the verbs/possessives before the action
//
// ex: "Do you know what the current directory is?"
if (words.indexOf(owner) < words.indexOf(action)) {
owner = getBetween(words, ["DT"], "NN").join(" ");
}

}

// No ? Let's try between a preposition and
// determiners/nouns
else if (determiners.length > 0 && preps.length > 0) {

owner = getBetween(words, "IN", ["DT", "NN"]);
owner = getBetween(words, ["IN"], ["DT", "NN", "."]);

// Strip accidental determinates
if (getType(owner[0]) === "DT") owner = owner.slice(1);

owner = owner.join(" ");

// Strip accidental punctuation
if (getType(owner.slice(-1)[0]) === ".") owner = owner.slice(0, -1);

owner = owner.join(" ").trim();
}

// At this point, we can really only guess that
// the owner is between the verb and the end of the
// statement
else if (verbs.length > 0) {

owner = getBetween(words, ["VBZ", "VBP"], ".").slice(0, -1)

// Strip accidental determinates
if (getType(owner[0]) === "DT") owner = owner.slice(1);

owner = owner.join(" ");

// Strip accidental punctuation
if (getType(owner[0]) === ".") owner = owner.slice(1);

owner = owner.join(" ").trim();

}


// SUBJECT
// Answers : "What should the nodebot's action target?"
// Answers : "What is this statement about?"
// -------------------------------------------------- //

// If ownership, then the and the next word is a noun then
// the subject is the noun
if (owner) {
subject = getBetween(words, ["DT", "PRP$"], ["IN", "."]).slice(0, -1).join(" ");
// If there is a file within the statement, it's probably
// the subject
if (speech.match(fileEx) !== null) {
subject = speech.match(fileEx)[0].trim();
}
// If there are no nouns and there is an owner, the subject is the owner
else if (nouns.length === 0 && owner) {
subject = owner;
}

// Okay, if that isn't true and we have prepositions
// then the subject will be the words following
// If there is a website within the statement, it's probably
// the subject
else if (websites.length > 0) {
subject = websites[0].trim()
}

// Start with the word after the preposition
// end with the next adjective or prep we see
// If ownership and there are prepositions, scan for words between
// prepositions, determiners, and possessive words and
// prepositions, nouns, and punctuation
else if (owner && preps.length > 0) {

debug && console.log("fire");

// To account for more than one preposition, we need to be able to filter between
// either the inside or outside preposition
if (preps.length === 1) {
subject = getBetween(words, ["IN", "DT", "PRP$"], ["IN", "NN", "."], "outside");
} else {
subject = getBetween(words, ["IN", "DT", "PRP$"], ["IN", "NN", "."], "inside");
}

else if (preps.length > 0) {
subject = getBetween("IN", ["IN", "VBZ", "."]);
}
// Autocorrect for trailing punctuation
if (getType(subject.slice(-1)[0]) === ".") {
subject = subject.slice(0, -1);
}

// Cute, at this point we check for determiners
// (the, some...)
// Autocorrect for trailing ownership
if (subject.slice(-1)[0] === owner) {
subject = subject.slice(0, -1);
}

// Autocorrect for trailing prepositions
if (getType(subject.slice(-1)[0]) === "IN") {
subject = subject.slice(0, -1);
}

else if (determiners.length > 0) {
var det = determiners.slice(-1)[0];
subject = words[words.indexOf(det) + 1];
}

// Now let's check if the first verb is
// present-tense, then it's probably between
// the first verb and the action

else if (getType(verbs[0]) === "VBZ") {
subject = subject.join(" ").trim();

var start = words.indexOf(verbs[0]) + 1
, end = words.indexOf(action);

subject = words.slice(start, end).join(" ");

}


// Autocorrect for files
// we didn't accidently add whitespace
subject = (isFile(subject)) ? subject.replace(/\s/g, "").match(fileEx)[0] : subject
owner = (isFile(owner)) ? owner.replace(/\s/g, "").match(fileEx)[0] : owner

// Okay, last chance. If there *is* ownership, and there are no prepositions
// then the subject is inside the owner/determinate/verb and the last noun
// (*phew...*)
else if (owner && preps.length === 0) {
subject = getBetween(words, ["DT", "VBP", "PRP$"], "NN", "inside").join(" ");
}

// Now that everything is properly classified,
// let's filter the ownership

switch(owner) {


// Reverse user possession
case "me": case "my": case "i": case "I":
owner = "user";
break;


// Reverse nodebot possession
case "your": case "you":
owner = "nodebot";
break;

case undefined: case "it": case "its":
// Tweak other non-specific possession cases to the last
// recorded context
case "": case "it": case "its":
case "they": case "their": case "he": case "she":
case "his": case "hers":
owner = this.memory.context;
owner = Nodebot.memory.context;
break;
}


// If the subject is the same as the owner, make a last
// minute correction

if (subject === owner) subject = "definition";

// Return what we find
// -------------------------------------------------- //

var ret = {
action : action,
action : (action) ? action.toLowerCase() : undefined,
owner : owner,
subject : subject,
tokens : words
Expand Down
6 changes: 5 additions & 1 deletion nodebot.js
Expand Up @@ -37,6 +37,10 @@ require("./brain/interaction")(Nodebot);
var command = process.argv.slice(2).join(" ").trim();

// Take the proper initial action
(command !== "") ? Nodebot.analyze(command) : Nodebot.request();

if (!module.parent) {
(command !== "") ? Nodebot.analyze(command) : Nodebot.request();
}



0 comments on commit 45955e4

Please sign in to comment.