Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NLP-time implementation: language normalization #75

Merged
merged 7 commits into from
Mar 2, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 9 additions & 0 deletions History.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
### Issues under work:
- 71: "3.99" is interpreted as 4.39am: justified
- 66, 64, 28, 21, 18, 16, 15, 11, 5, 4: normalization: natural language for numbers and time
- 70, 66, 63, 62, 21, 18, 15: support arithmetic, in <n><T>, with <op=and> etc.
- 27, 26, 13: default, e.g. tonight, tomorrow at 11 -> at 11am, at midnight etc
- 55, 52, 25, 2: causality, before, after, ago, from now, <bar> ops
- 32: ranges
- fix decimal number parsing


0.2.2 / 2016-01-25
==================
Expand Down
68 changes: 68 additions & 0 deletions lib/maps.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
{
"op": {
"plus": ["plus", "+", "add", "and"],
"minus": ["minus", "subtract"],
"times": ["times", "multiply"],
"divide": ["divide"]
},
"c": {
"c": ["every", "per", "repeat"]
},
"r": {
"to": ["to", "thru", "through", "til", "till", "until", "-", "~"]
},
"n": {
"0.25": ["quarter"],
"0.5": ["half", "1/2", "half a", "half an"],
"0": ["zero"],
"1": ["one", "a", "an", "first"],
"2": ["two"],
"3": ["three", "third"],
"4": ["four", "fourth"],
"5": ["five", "fifth"],
"6": ["six", "sixth"],
"7": ["seven", "seventh"],
"8": ["eight", "eighth"],
"9": ["nine", "ninth"],
"10": ["ten", "tenth"],
"11": ["eleven", "eleventh"],
"12": ["twelve", "twelfth", "twelveth"],
"13": ["thirteen", "thirteenth"],
"14": ["fourteen", "fourteenth"],
"15": ["fifteen", "fifteenth"],
"16": ["sixteen", "sixteenth"],
"17": ["seventeen", "seventeenth"],
"18": ["eighteen", "eighteenth"],
"19": ["nineteen", "nineteenth"],
"20": ["twenty", "twentieth"],
"30": ["thirty", "thirtieth"],
"40": ["forty", "fortieth", "fourty", "fourtieth"],
"50": ["fifty", "fiftieth"],
"60": ["sixty", "sixtieth"],
"70": ["seventy", "seventieth"],
"80": ["eighty", "eightieth"],
"90": ["ninety", "ninetieth"],
"100": ["hundred", "hundreds", "hundredth"],
"1000": ["thousand", "thousands", "thousandth", "k", "K"]
},
"t": {
"th": ["st", "nd", "rd", "th", "st day", "nd day", "rd day", "th day"],
"2 week": ["fortnight", "next fortnight", "a fortnight"]
},
"dt": {
"ms": ["ms", "millisecond", "milliseconds"],
"second": ["s", "sec", "secs", "second", "seconds"],
"minute": ["m", "min", "mins", "minute", "minutes"],
"hour": ["h", "hr", "hrs", "hour", "hours"],
"day": ["d", "day", "days", "dai"],
"week": ["w", "wk", "wks", "week", "weeks"],
"month": ["M", "MM", "mo", "moon", "moons", "month", "months"],
"year": ["y", "yr", "yrs", "year", "years"],
"tomorrow": ["tmr", "tom", "tomorrow"],
"yesterday": ["ytd", "yesterday"]
},
"f": {
"1": ["once"],
"2": ["twice"]
}
}
257 changes: 257 additions & 0 deletions lib/norm.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
// Module to normalize a string before parsing: map inflection words into their lemmas, using maps.json

// dependencies
var _ = require('lodash');
var util = require('./util')
var maps = require('./maps.json')

// Regexes for "subnormal" date/time forms (slash dates, slash-date ranges and hh:mm times),
// consumed by parseSubnormal.
// Every pattern carries the `g` flag, so `exec` is stateful: it resumes from the pattern's
// `lastIndex` and resets it to 0 only after a failed match.
// NOTE(review): parseSubnormal appears to rely on that state to avoid re-matching its own
// hh:mm output — confirm before removing the flag.
var re = {
// 12/20 - 12/21, 2012/12 - 2013/12
// Groups: (1)/(2) left pair, (3)/(4) right pair. The leading negative lookahead rejects a
// position where the right-hand pair is followed by another '/'.
MMsDDdMMsDD: /(?!\d{1,4}\/\d{1,4}\s*-\s*\d{1,4}\/\d{1,4}\/)(\d{1,4})\/(\d{1,4})\s*-\s*(\d{1,4})\/(\d{1,4})/g,
// 12/22 - 23, 2012/10 - 12
// Groups: (1)/(2) left pair, (3) the lone right-hand number. The lookahead rejects a position
// where the right-hand number is followed by '/'.
MMsDDdDD: /(?!\d{1,4}\/\d{1,4}\s*-\s*\d{1,4}\/)(\d{1,4})\/(\d{1,4})\s*-\s*(\d{1,4})/g,
// 12/24, 2012/12
// Groups: (1)/(2) the pair. The lookahead rejects a position where the pair is followed by '/'.
MMsDD: /(?!\d{1,4}\/\d{1,4}\/)(\d{1,4})\/(\d{1,4})/g,
// 05:30pm, 0530pm, 1730, 1730pm, 1730[re:h], remove the [re:h]
// Groups: (1) one or two hour digits, either at the start of the string or preceded by
// whitespace (the whitespace is part of the capture); (2) two minute digits after an optional
// ':'; (3) an optional trailing run of non-space characters after optional whitespace.
hhcmm: /(\s+\d{1,2}|^\d{1,2})\:?(\d{2})\s*(\S+)*/g
}

/**
 * Normalize an input string into a space-separated string of symbols.
 * Steps:
 *  1. Separate digits from trailing letters ("7h" -> "7 h") and collapse whitespace.
 *  2. Run parseNormalSubnormal to rewrite normal and subnormal date/time forms.
 *  3. Lemmatize the remaining tokens with lemma(), looking at one and two tokens at a time.
 * @param  {string} str The input string.
 * @return {string} Normalized string
 */
function norm(str) {
  // put a space between a number and the letters glued to it, then tidy whitespace
  var spaced = (' ' + str).replace(/\s+(\d+)([a-zA-Z]+)/g, ' $1 $2');
  var cleaned = spaced.replace(/\s+/g, ' ').replace(/^\s/, '').replace(/^\s/, '');
  var tokens = parseNormalSubnormal(cleaned).split(' ');

  // 3. parse english forms
  var symbols = [];
  var i = 0;
  while (i < tokens.length) {
    var oneGram = tokens[i];
    var twoGram = oneGram + ' ' + (tokens[i + 1] || '');
    var oneLemma = lemma(oneGram);
    var twoLemma = lemma(twoGram);
    if (twoLemma !== twoGram && oneLemma !== oneGram) {
      // both the single token and the pair lemmatize: prefer the pair and
      // consume the following token as well
      symbols.push(twoLemma);
      i += 2;
    } else {
      symbols.push(oneLemma);
      i += 1;
    }
  }
  return symbols.join(' ');
}
// var str = "an seven hour 7h 05 October 2011 14:48 UTC 08/11 2020 2012/12 event is at tonight 12/20- 23 12/22 - 12/23 12/22 - 12/25 05:30h 17:30h 1730 1730pm 5pm 1st"
// var str = '2nd day of January 12:30'
// console.log(norm(str))


/**
 * Return the lemma of a word string: the key in ./maps.json whose inflection list contains the word.
 * Symbol groups in maps.json are searched in order and the first hit wins.
 * NLP Lemmatization refers here: http://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html. Inflections = all possible alternative words of a lemma.
 * @param  {string} str the word (or two-word phrase) to lemmatize.
 * @return {string} the lemma value, or the original string when no inflection list contains it
 * @example
 * lemma('zero')
 * // => '0'
 * lemma('unknown')
 * // => 'unknown'
 */
function lemma(str) {
  var symbolNames = _.keys(maps);
  for (var i = 0; i < symbolNames.length; i++) {
    var value = _.findKey(maps[symbolNames[i]], function(inflections) {
      return _.includes(inflections, str);
    });
    if (value) {
      return value;
    }
  }
  // not lemmatized: hand back the original
  return str;
}

/**
 * Run 1. parseNormal then 2. parseSubnormal, return the parsed string with T-format tokens.
 * Phrases found by parseNormal are injected only when notSubnormal accepts them; the
 * rest are left in place for parseSubnormal to handle.
 * @private
 * @param  {string} str The input string
 * @return {string} Parsed string
 */
function parseNormalSubnormal(str) {
  // strictly-normal phrases only; subnormal ones are filtered out here
  var normalPhrases = _.filter(parseNormal(str).dateStrArr, notSubnormal);
  // inject the normal phrases first, then rewrite the subnormal forms
  return parseSubnormal(injectNormal(str, normalPhrases));
}

/**
 * 1. Parse normal forms, i.e. phrases that `new Date(phrase)` can parse.
 * Tokens are accumulated into a head string until the head parses as a date. The head is then
 * extended with following tokens for as long as it stays parseable, and finally trimmed from the
 * front for as long as the parsed date is unchanged, leaving just the date phrase.
 * A phrase is recorded only if it is longer than 4 characters and contains no decimal number
 * (`\d+\.\d+`). After a successful parse the head is reset; after a failed one it keeps growing.
 * @private
 * @param  {string} str The input string.
 * @return {{dateArr: string[], dateStrArr: string[]}} `dateArr` holds the ISO strings of the
 *   recorded dates and `dateStrArr` the matching phrases consumed from the input, index-aligned.
 */
function parseNormal(str) {
  // ISO string for a Date-parseable string, or null when it does not parse.
  // (toISOString throws a RangeError on an invalid Date, so test the time value instead.)
  function isoOrNull(candidate) {
    var date = new Date(candidate);
    return isNaN(date.getTime()) ? null : date.toISOString();
  }

  // array of parsed dates and the strings consumed for them
  var dateArr = [],
    dateStrArr = [];
  // ensure single spacing, then tokenize by space
  var strArr = str.replace(/\s+/g, ' ').split(/\s+/g);
  var head = '';

  while (strArr.length > 0) {
    head = (head + ' ' + strArr.shift()).trim();
    var normalDate = isoOrNull(head);
    if (normalDate === null) {
      // not parseable yet: keep accumulating tokens into head
      continue;
    }

    // Extend head: keep appending tokens while the result still parses, giving the
    // longest parseable string ...<date>. Stop when tokens run out so that an
    // `undefined` token is never concatenated into the probe string.
    while (strArr.length > 0) {
      var advanceDate = isoOrNull(head + ' ' + strArr[0]);
      if (advanceDate === null) {
        break;
      }
      normalDate = advanceDate;
      head = head + ' ' + strArr.shift();
    }

    // Shrink head: drop leading tokens as long as doing so does not change the parsed date
    while (true) {
      var shorter = head.replace(/^\s*\S+\s*/, '');
      if (isoOrNull(shorter) !== normalDate) {
        // the front token matters (or nothing parseable is left): keep head as is
        break;
      }
      head = shorter;
    }

    // only consider a valid parse if the parsed str is long enough and not a decimal number
    if (head.length > 4 && !/\d+\.\d+/.test(head)) {
      dateArr.push(normalDate);
      dateStrArr.push(head);
    }
    head = '';
  }
  return { dateArr: dateArr, dateStrArr: dateStrArr };
}

/**
 * 2. Parse subnormal forms after parseNormal. Repeatedly rewrites the first subnormal form found
 * in the string until none is left. The patterns in `re` are tried in this order:
 *   - `12/20 - 12/21`  -> `t:<yMd>,dt: - t:<yMd>,dt: `
 *   - `12/22 - 23`     -> same shape; the right side reuses the left side's first number
 *   - `12/24`          -> `t:<yMd>,dt: `
 *   - `0530pm`, `1730` -> ` hh:mm<rest>`
 * where `<yMd>` is produced by yMdParse.
 * NOTE(review): the `re` patterns are `g`-flagged, so `exec` resumes from each pattern's
 * `lastIndex`. The hh:mm rewrite emits text the same pattern would match again from index 0,
 * so termination appears to depend on that state — re-check before changing the flags.
 * @private
 * @param  {string} str The input string.
 * @return {string} The parsed string.
 */
function parseSubnormal(str) {
  var m, res;
  if (m = re.MMsDDdMMsDD.exec(str)) {
    // 12/20 - 12/21
    res = 't:' + yMdParse(m[1], m[2]) + ',dt: - t:' + yMdParse(m[3], m[4]) + ',dt: ';
  } else if (m = re.MMsDDdDD.exec(str)) {
    // 12/22 - 23
    res = 't:' + yMdParse(m[1], m[2]) + ',dt: - t:' + yMdParse(m[1], m[3]) + ',dt: ';
  } else if (m = re.MMsDD.exec(str)) {
    // 12/24, 2012/12
    // (previously built from an undefined variable, yielding 't:undefined,dt: ')
    res = 't:' + yMdParse(m[1], m[2]) + ',dt: ';
  } else if (m = re.hhcmm.exec(str)) {
    // 05:30pm, 0530pm, 1730, 1730pm, 1730[re:h], remove the [re:h]
    res = ' ' + _.trim(m[1]) + ':' + m[2] + (m[3] || '');
  } else {
    // exit recursion if hits here
    return str;
  }
  // recurse down till no more substitution (CFG is not cyclic, so ok).
  // A replacer function keeps any '$' in the user's text from being read as a replacement pattern.
  return parseSubnormal(str.replace(m[0], function() {
    return res;
  }));
}

//////////////////////
// Helper functions //
//////////////////////

/**
 * Try to parse two tokens for T form into MM/dd, or MM/yyyy if either token has length 4.
 * A token of length 4 is taken as the year; of the remaining tokens the first is the
 * month and the second (if any) the day.
 * @private
 * @param  {string} token1
 * @param  {string} token2
 * @return {string} in the form <y><M><d>, e.g. '12M20d' or '2012y12M'
 */
function yMdParse(token1, token2) {
  var years = [];
  var others = [];
  [token1, token2].forEach(function(token) {
    // 4 characters => year, anything else => month/day
    (token.length == 4 ? years : others).push(token);
  });

  var out = '';
  if (years[0]) {
    out += years[0] + 'y';
  }
  out += others[0] + 'M';
  if (others[1]) {
    out += others[1] + 'd';
  }
  return out;
}
/**
 * Check if the dateStr is strictly normal and not subnormal. Used to extract parseSubnormal overrides.
 * The string is run through parseSubnormal, the first T-format token (or T-format range) is
 * removed, and the string counts as normal if any word characters remain.
 * @private
 * @param  {string} dateStr
 * @return {Boolean} true when something other than a single T-format token/range is left
 */
function notSubnormal(dateStr) {
  // one T token, optionally followed by "- <T token>" for a range
  var tPattern = /t\:\S*,dt\:\S*(\s*-\s*t\:\S*,dt\:\S*)?/;
  var leftover = parseSubnormal(dateStr).replace(tPattern, '');
  return /\w+/.test(leftover);
}

/**
 * Given a string and array of its parsed phrases, convert each phrase into an ISO UTC string and
 * then into T format via util.ISOtoT, and substitute it for the first occurrence of the phrase in
 * the string. Phrases are processed in array order, each on the result of the previous one.
 * @private
 * @param  {string} str       The original string.
 * @param  {Array} parsedArr The parsed phrases from the string.
 * @return {string}           The string with parsed phrases replaced in T format.
 *
 * @example
 * injectNormal('05 October 2011 14:48 UTC 08/11 2020', [ '05 October 2011 14:48 UTC', '08/11 2020' ])
 * // => 't:2011y10M05d14h48m00.000s,dt: t:2020y08M11d04h00m00.000s,dt: '
 */
function injectNormal(str, parsedArr) {
  return _.reduce(parsedArr, function(injected, phrase) {
    var iso = new Date(phrase).toISOString();
    return injected.replace(phrase, util.ISOtoT(iso));
  }, str);
}

module.exports = norm
7 changes: 5 additions & 2 deletions lib/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

var date = require('./date');
var debug = require('debug')('date:parser');
var norm = require('./norm')

/**
* Days
Expand All @@ -22,7 +23,7 @@ var rMeridiem = /^(\d{1,2})([:.](\d{1,2}))?([:.](\d{1,2}))?\s*([ap]m)/;
var rHourMinute = /^(\d{1,2})([:.](\d{1,2}))([:.](\d{1,2}))?/;
var rAtHour = /^at\s?(\d{1,2})$/;
var rDays = /\b(sun(day)?|mon(day)?|tues(day)?|wed(nesday)?|thur(sday|s)?|fri(day)?|sat(urday)?)s?\b/;
var rMonths = /^((\d{1,2})(st|nd|rd|th))\sof\s(january|february|march|april|may|june|july|august|september|october|november|december)/i;
var rMonths = /^((\d{1,2})\s*(st|nd|rd|th))\s(of\s)?(january|february|march|april|may|june|july|august|september|october|november|december)/i;
var rPast = /\b(last|yesterday|ago)\b/;
var rDayMod = /\b(morning|noon|afternoon|night|evening|midnight)\b/;
var rAgo = /^(\d*)\s?\b(second|minute|hour|day|week|month|year)[s]?\b\s?ago$/;
Expand All @@ -42,6 +43,8 @@ module.exports = parser;
*/

function parser(str, offset) {
// normalize string before parsing, using maps.json
str = norm(str)
if(!(this instanceof parser)) return new parser(str, offset);
if(typeof offset == 'string') offset = parser(offset);
var d = offset || new Date;
Expand Down Expand Up @@ -253,7 +256,7 @@ parser.prototype.monthByName = function() {
var captures;
if (captures = rMonths.exec(this.str)) {
var day = captures[2]
var month = captures[4];
var month = captures[5];
this.date.date.setMonth((months.indexOf(month)));
if (day) this.date.date.setDate(parseInt(day));
this.skip(captures);
Expand Down