Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[* DateTimeV2] Split extraction of "[day] [calendar date]" into multiple entities if day doesn't match date (#2709) #2752

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -126,17 +126,17 @@ public static class DateTimeDefinitions
public const string DatePreposition = @"\b(on|in)";
public static readonly string DateExtractorYearTermRegex = $@"(\s+|\s*[/\\.,-]\s*|\s+of\s+){DateYearRegex}";
public static readonly string DayPrefix = $@"\b({WeekDayRegex}|{SpecialDayRegex})\b";
public static readonly string DateExtractor1 = $@"\b({DayPrefix}\s*[,-]?\s*)?(({MonthRegex}[\.]?\s*[/\\.,-]?\s*{DayRegex})|(\({MonthRegex}\s*[-./]\s*{DayRegex}\)))(\s*\(\s*{DayPrefix}\s*\))?({DateExtractorYearTermRegex}\b)?";
public static readonly string DateExtractor3 = $@"\b({DayPrefix}(\s+|\s*,\s*))?({DayRegex}[\.]?(\s+|\s*[-,/]\s*|\s+of\s+){MonthRegex}[\.]?((\s+in)?{DateExtractorYearTermRegex})?|{BaseDateTime.FourDigitYearRegex}\s*[-./]?\s*(the\s+)?(?<day>(?:3[0-1]|[1-2]\d|0?[1-9])(?:th|nd|rd|st)?)[\.]?(\s+|\s*[-,/]\s*|\s+of\s+){MonthRegex}[\.]?)\b";
public static readonly string DateExtractor1 = $@"\b(?<dayprefix>{DayPrefix}\s*[,-]?\s*)?(({MonthRegex}[\.]?\s*[/\\.,-]?\s*{DayRegex})|(\({MonthRegex}\s*[-./]\s*{DayRegex}\)))(?<dayprefix>\s*\(\s*{DayPrefix}\s*\))?({DateExtractorYearTermRegex}\b)?";
public static readonly string DateExtractor3 = $@"\b(?<dayprefix>{DayPrefix}(\s+|\s*,\s*))?({DayRegex}[\.]?(\s+|\s*[-,/]\s*|\s+of\s+){MonthRegex}[\.]?((\s+in)?{DateExtractorYearTermRegex})?|{BaseDateTime.FourDigitYearRegex}\s*[-./]?\s*(the\s+)?(?<day>(?:3[0-1]|[1-2]\d|0?[1-9])(?:th|nd|rd|st)?)[\.]?(\s+|\s*[-,/]\s*|\s+of\s+){MonthRegex}[\.]?)\b";
public static readonly string DateExtractor4 = $@"\b{MonthNumRegex}\s*[/\\\-]\s*{DayRegex}[\.]?\s*[/\\\-]\s*{DateYearRegex}";
public static readonly string DateExtractor5 = $@"\b({DayPrefix}(\s*,)?\s+)?{DayRegex}\s*[/\\\-\.]\s*({MonthNumRegex}|{MonthRegex})\s*[/\\\-\.]\s*{DateYearRegex}(?!\s*[/\\\-\.]\s*\d+)";
public static readonly string DateExtractor6 = $@"(?<={DatePreposition}\s+)({StrictRelativeRegex}\s+)?({DayPrefix}\s+)?{MonthNumRegex}[\-\.]{DayRegex}(?![%]){BaseDateTime.CheckDecimalRegex}\b";
public static readonly string DateExtractor7L = $@"\b({DayPrefix}(\s*,)?\s+)?{MonthNumRegex}\s*/\s*{DayRegex}{DateExtractorYearTermRegex}(?![%])\b";
public static readonly string DateExtractor7S = $@"\b({DayPrefix}(\s*,)?\s+)?{MonthNumRegex}\s*/\s*{DayRegex}(?![%]){BaseDateTime.CheckDecimalRegex}\b";
public static readonly string DateExtractor8 = $@"(?<={DatePreposition}\s+)({StrictRelativeRegex}\s+)?({DayPrefix}\s+)?{DayRegex}[\\\-]{MonthNumRegex}(?![%]){BaseDateTime.CheckDecimalRegex}\b";
public static readonly string DateExtractor9L = $@"\b({DayPrefix}(\s*,)?\s+)?{DayRegex}\s*/\s*{MonthNumRegex}{DateExtractorYearTermRegex}(?![%])\b";
public static readonly string DateExtractor9S = $@"\b({DayPrefix}(\s*,)?\s+)?{DayRegex}\s*/\s*{MonthNumRegex}{BaseDateTime.CheckDecimalRegex}(?![%])\b";
public static readonly string DateExtractorA = $@"\b({DayPrefix}(\s*,)?\s+)?(({BaseDateTime.FourDigitYearRegex}\s*[/\\\-\.]\s*({MonthNumRegex}|{MonthRegex})\s*[/\\\-\.]\s*{DayRegex})|({MonthRegex}\s*[/\\\-\.]\s*{BaseDateTime.FourDigitYearRegex}\s*[/\\\-\.]\s*(the\s+)?(?<day>(?:3[0-1]|[1-2]\d|0?[1-9])(?:th|nd|rd|st)?))|({DayRegex}\s*[/\\\-\.]\s*{BaseDateTime.FourDigitYearRegex}\s*[/\\\-\.]\s*{MonthRegex}))";
public static readonly string DateExtractor5 = $@"\b(?<dayprefix>{DayPrefix}(\s*,)?\s+)?{DayRegex}\s*[/\\\-\.]\s*({MonthNumRegex}|{MonthRegex})\s*[/\\\-\.]\s*{DateYearRegex}(?!\s*[/\\\-\.]\s*\d+)";
public static readonly string DateExtractor6 = $@"(?<={DatePreposition}\s+)({StrictRelativeRegex}\s+)?(?<dayprefix>{DayPrefix}\s+)?{MonthNumRegex}[\-\.]{DayRegex}(?![%]){BaseDateTime.CheckDecimalRegex}\b";
public static readonly string DateExtractor7L = $@"\b(?<dayprefix>{DayPrefix}(\s*,)?\s+)?{MonthNumRegex}\s*/\s*{DayRegex}{DateExtractorYearTermRegex}(?![%])\b";
public static readonly string DateExtractor7S = $@"\b(?<dayprefix>{DayPrefix}(\s*,)?\s+)?{MonthNumRegex}\s*/\s*{DayRegex}(?![%]){BaseDateTime.CheckDecimalRegex}\b";
public static readonly string DateExtractor8 = $@"(?<={DatePreposition}\s+)({StrictRelativeRegex}\s+)?(?<dayprefix>{DayPrefix}\s+)?{DayRegex}[\\\-]{MonthNumRegex}(?![%]){BaseDateTime.CheckDecimalRegex}\b";
public static readonly string DateExtractor9L = $@"\b(?<dayprefix>{DayPrefix}(\s*,)?\s+)?{DayRegex}\s*/\s*{MonthNumRegex}{DateExtractorYearTermRegex}(?![%])\b";
public static readonly string DateExtractor9S = $@"\b(?<dayprefix>{DayPrefix}(\s*,)?\s+)?{DayRegex}\s*/\s*{MonthNumRegex}{BaseDateTime.CheckDecimalRegex}(?![%])\b";
public static readonly string DateExtractorA = $@"\b(?<dayprefix>{DayPrefix}(\s*,)?\s+)?(({BaseDateTime.FourDigitYearRegex}\s*[/\\\-\.]\s*({MonthNumRegex}|{MonthRegex})\s*[/\\\-\.]\s*{DayRegex})|({MonthRegex}\s*[/\\\-\.]\s*{BaseDateTime.FourDigitYearRegex}\s*[/\\\-\.]\s*(the\s+)?(?<day>(?:3[0-1]|[1-2]\d|0?[1-9])(?:th|nd|rd|st)?))|({DayRegex}\s*[/\\\-\.]\s*{BaseDateTime.FourDigitYearRegex}\s*[/\\\-\.]\s*{MonthRegex}))";
public static readonly string OfMonth = $@"^\s*(day\s+)?of\s*{MonthRegex}";
public static readonly string MonthEnd = $@"{MonthRegex}\s*(the)?\s*$";
public static readonly string WeekDayEnd = $@"(this\s+)?{WeekDayRegex}\s*,?\s*$";
Expand Down
5 changes: 5 additions & 0 deletions .NET/Microsoft.Recognizers.Text.DateTime/Constants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,11 @@ public static class Constants
public const string MinuteGroupName = "min";
public const string HourGroupName = "hour";
public const string YearGroupName = "year";
public const string MonthGroupName = "month";
public const string DayGroupName = "day";
public const string WeekdayGroupName = "weekday";
public const string DayPrefixGroupName = "dayprefix";
public const string DayOfMonthGroupName = "DayOfMonth";
public const string TimeOfDayGroupName = "timeOfDay";
public const string BusinessDayGroupName = "business";
public const string LeftAmPmGroupName = "leftDesc";
Expand Down
147 changes: 133 additions & 14 deletions .NET/Microsoft.Recognizers.Text.DateTime/Extractors/BaseDateExtractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ private List<ExtractResult> ExtractImpl(string text, DateObject reference)
tokens.AddRange(NumberWithMonth(text, reference));
tokens.AddRange(ExtractRelativeDurationDate(text, tokens, reference));

tokens = TruncateInconsistentDates(tokens);

var results = Token.MergeAllTokens(tokens, text, ExtractorName);

return results;
Expand Down Expand Up @@ -179,6 +181,8 @@ private List<Token> BasicRegexMatch(string text)
results.Add(new Token(match.Index, match.Index + match.Length));
}

// Check if prefix weekday and date agree
results = ValidateWeekdayPrefix(match, results);
}
}
}
Expand All @@ -191,11 +195,11 @@ private List<Token> BasicRegexMatch(string text)
private bool ValidateMatch(Match match, string text)
{
// If the match doesn't contain a "year" part, it is not ambiguous and is a valid match
var isValidMatch = !match.Groups["year"].Success;
var isValidMatch = !match.Groups[Constants.YearGroupName].Success;

if (!isValidMatch)
{
var yearGroup = match.Groups["year"];
var yearGroup = match.Groups[Constants.YearGroupName];

// If the "year" part is not at the end of the match, it's a valid match
if (yearGroup.Index + yearGroup.Length != match.Index + match.Length)
Expand All @@ -222,11 +226,11 @@ private bool ValidateMatch(Match match, string text)
}

// Expressions with mixed separators are not considered valid dates e.g. "30/4.85" (unless one is a comma "30/4, 2016")
if (match.Groups["day"].Success && match.Groups["month"].Success)
if (match.Groups[Constants.DayGroupName].Success && match.Groups[Constants.MonthGroupName].Success)
{
var noDateText = match.Value.Replace(match.Groups["year"].Value, string.Empty)
.Replace(match.Groups["month"].Value, string.Empty)
.Replace(match.Groups["day"].Value, string.Empty);
var noDateText = match.Value.Replace(match.Groups[Constants.YearGroupName].Value, string.Empty)
.Replace(match.Groups[Constants.MonthGroupName].Value, string.Empty)
.Replace(match.Groups[Constants.DayGroupName].Value, string.Empty);
var separators = new List<char> { '/', '\\', '-', '.' };

if (separators.Count(separator => noDateText.Contains(separator)) > 1)
Expand Down Expand Up @@ -331,7 +335,7 @@ private List<Token> NumberWithMonth(string text, DateObject reference)
var endIndex = match.Index + match.Length + (result.Length ?? 0);

ExtendWithWeekdayAndYear(
ref startIndex, ref endIndex, Config.MonthOfYear.GetValueOrDefault(match.Groups["month"].Value, reference.Month),
ref startIndex, ref endIndex, Config.MonthOfYear.GetValueOrDefault(match.Groups[Constants.MonthGroupName].Value, reference.Month),
num, text, reference);

ret.Add(new Token(startIndex, endIndex));
Expand All @@ -345,7 +349,7 @@ private List<Token> NumberWithMonth(string text, DateObject reference)
{
if (matchCase.Success)
{
var ordinalNum = matchCase.Groups["DayOfMonth"].Value;
var ordinalNum = matchCase.Groups[Constants.DayOfMonthGroupName].Value;
if (ordinalNum == result.Text)
{
var endLength = 0;
Expand All @@ -372,16 +376,16 @@ private List<Token> NumberWithMonth(string text, DateObject reference)
{
if (matchCase.Success)
{
var ordinalNum = matchCase.Groups["DayOfMonth"].Value;
if (ordinalNum == result.Text && matchCase.Groups["DayOfMonth"].Index == result.Start)
var ordinalNum = matchCase.Groups[Constants.DayOfMonthGroupName].Value;
if (ordinalNum == result.Text && matchCase.Groups[Constants.DayOfMonthGroupName].Index == result.Start)
{
// Get the day of the week for the ordinal number, which is regarded as a day of the reference month
var date = DateObject.MinValue.SafeCreateFromValue(reference.Year, reference.Month, num);
var numWeekDayInt = (int)date.DayOfWeek;

// Get week day from text directly, compare it with the weekday generated above
// to see whether they refer to the same week day
var extractedWeekDayStr = matchCase.Groups["weekday"].Value;
var extractedWeekDayStr = matchCase.Groups[Constants.WeekdayGroupName].Value;

if (!date.Equals(DateObject.MinValue) &&
numWeekDayInt == Config.DayOfWeek[extractedWeekDayStr])
Expand Down Expand Up @@ -455,7 +459,7 @@ private List<Token> NumberWithMonth(string text, DateObject reference)
if (beginMatch.Success && num >= 1 && num <= 5
&& result.Type.Equals(Number.Constants.SYS_NUM_ORDINAL, StringComparison.Ordinal))
{
var weekDayStr = beginMatch.Groups["weekday"].Value;
var weekDayStr = beginMatch.Groups[Constants.WeekdayGroupName].Value;
if (this.Config.DayOfWeek.ContainsKey(weekDayStr))
{
var spaceLen = suffixStr.Length - suffixStr.Trim().Length;
Expand All @@ -476,7 +480,7 @@ private List<Token> NumberWithMonth(string text, DateObject reference)
var endIndex = (result.Start + result.Length ?? 0) + match.Length;

ExtendWithWeekdayAndYear(ref startIndex, ref endIndex,
Config.MonthOfYear.GetValueOrDefault(match.Groups["month"].Value, reference.Month),
Config.MonthOfYear.GetValueOrDefault(match.Groups[Constants.MonthGroupName].Value, reference.Month),
num, text, reference);

ret.Add(new Token(startIndex, endIndex));
Expand Down Expand Up @@ -520,7 +524,7 @@ private void ExtendWithWeekdayAndYear(ref int startIndex, ref int endIndex, int
{
// Get weekday from context directly, compare it with the weekday extraction above
// to see whether they reference the same weekday
var extractedWeekDayStr = matchWeekDay.Groups["weekday"].Value;
var extractedWeekDayStr = matchWeekDay.Groups[Constants.WeekdayGroupName].Value;
var numWeekDayStr = date.DayOfWeek.ToString().ToLowerInvariant();

if (Config.DayOfWeek.TryGetValue(numWeekDayStr, out var weekDay1) &&
Expand Down Expand Up @@ -692,5 +696,120 @@ private int GetYearIndex(string affix, ref int year, out bool success, bool inPr

return index;
}

// Drops the leading part of any token that strictly contains a split index.
// Split indices are read from the Metadata.SplitIndex of every token in the list;
// tokens without metadata and a split index of 0 both mean "no split requested".
// A truncated token starts at the first matching split index, keeps its original
// end, and is created without metadata. All other tokens are passed through as-is.
// The returned list has the same length and order as the input list.
private List<Token> TruncateInconsistentDates(List<Token> tokens)
{
    // Collect every non-zero split index recorded on any token.
    var cutPoints = tokens
        .Select(t => t.Metadata?.SplitIndex ?? 0)
        .Where(cut => cut != 0)
        .ToList();

    var adjusted = new List<Token>(tokens.Count);

    foreach (var current in tokens)
    {
        // Only a split index strictly inside (Start, End) truncates the token;
        // the first such index in list order wins.
        var hit = cutPoints.FindIndex(cut => cut > current.Start && cut < current.End);

        adjusted.Add(hit >= 0 ? new Token(cutPoints[hit], current.End) : current);
    }

    return adjusted;
}

// Checks whether the weekday prefix of a matched date agrees with the calendar date.
//
// match:   a date regex match; only inspected when both the "weekday" and the
//          "dayprefix" groups succeeded.
// results: tokens collected so far; the last entry is treated as the token that
//          belongs to this match.
//
// When the weekday written in the text differs from the weekday of the parsed date,
// the last token is replaced by a copy carrying Metadata.SplitIndex, set to the end
// of the prefix group (or 0 when the day group does not start at or after the prefix).
// The list is returned unchanged when there is nothing to validate, when the span was
// already recorded by an earlier token, when no date could be parsed from the match,
// or when the weekday text is not a key of Config.DayOfWeek.
private List<Token> ValidateWeekdayPrefix(Match match, List<Token> results)
{
    var prefixGroup = match.Groups[Constants.DayPrefixGroupName];

    // Nothing to annotate without a token, and nothing to compare without both groups.
    if (results.Count == 0 ||
        !match.Groups[Constants.WeekdayGroupName].Success ||
        !prefixGroup.Success)
    {
        return results;
    }

    // If a span has already been checked, skip. The last token is excluded from
    // the scan because it is the one being validated now.
    var matchEnd = match.Index + match.Length;
    for (int i = 0; i < results.Count - 1; i++)
    {
        if (match.Index == results[i].Start && matchEnd == results[i].End)
        {
            return results;
        }
    }

    var date = ParseDate(match);
    if (date <= DateObject.MinValue)
    {
        // ParseDate yields MinValue when year, month or day could not be resolved.
        return results;
    }

    // Weekday implied by the calendar date.
    var numWeekDayInt = (int)date.DayOfWeek;

    // Weekday written in the text. TryGetValue (instead of the indexer) avoids a
    // KeyNotFoundException when the captured text is not a dictionary key; in that
    // case the weekday cannot be compared and the token is left untouched.
    var extractedWeekDayStr = match.Groups[Constants.WeekdayGroupName].Value;
    if (!Config.DayOfWeek.TryGetValue(extractedWeekDayStr, out var extractedWeekDayInt) ||
        numWeekDayInt == extractedWeekDayInt)
    {
        return results;
    }

    // Weekdays disagree: record where the prefix ends so it can be cut off later.
    // A split is only requested when the day group starts at or after the prefix end.
    var endPrefix = prefixGroup.Index + prefixGroup.Length;
    var splitIndex = match.Groups[Constants.DayGroupName].Index >= endPrefix ? endPrefix : 0;

    var lastIndex = results.Count - 1;
    var metadata = new Metadata { SplitIndex = splitIndex };
    results[lastIndex] = new Token(results[lastIndex].Start, results[lastIndex].End, metadata);

    return results;
}

// Builds a date from the "year", "month" and "day" groups of a date match.
// The month is read as a number first and otherwise looked up in Config.MonthOfYear.
// The day is read as a number first and otherwise resolved through the ordinal
// extractor (falling back to the integer extractor) and the number parser.
// Returns DateObject.MinValue when the year or month cannot be resolved or when the
// day is outside 1..31; otherwise returns whatever SafeCreateFromValue produces
// (presumably MinValue for impossible dates — confirm against its implementation).
private DateObject ParseDate(Match match)
{
    var yearText = match.Groups[Constants.YearGroupName].Value;
    var hasYear = int.TryParse(yearText, out var year);

    // Numeric month first, then month name via the configured dictionary.
    var monthText = match.Groups[Constants.MonthGroupName].Value;
    var hasMonth = int.TryParse(monthText, out var month) ||
                   Config.MonthOfYear.TryGetValue(monthText, out month);

    // Without both a year and a month no date is created.
    if (!hasYear || !hasMonth)
    {
        return DateObject.MinValue;
    }

    var dayText = match.Groups[Constants.DayGroupName].Value;
    if (!int.TryParse(dayText, out var day))
    {
        // Not a plain number: try ordinals, and integers only if no ordinal was found.
        var candidates = Config.OrdinalExtractor.Extract(dayText);
        if (candidates.Count == 0)
        {
            candidates.AddRange(Config.IntegerExtractor.Extract(dayText));
        }

        // Take the first candidate whose parsed value converts to an int.
        foreach (var candidate in candidates)
        {
            var parsedValue = Config.NumberParser.Parse(candidate).Value ?? 0;
            if (int.TryParse(parsedValue.ToString(), out day))
            {
                break;
            }
        }
    }

    return day > 0 && day <= 31
        ? DateObject.MinValue.SafeCreateFromValue(year, month, day)
        : DateObject.MinValue;
}
}
}
3 changes: 3 additions & 0 deletions .NET/Microsoft.Recognizers.Text/Extractors/Metadata.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ public class Metadata
// For cases where a language has variations in handling decimal separators
public bool TreatAsInteger { get; set; } = false;

// Used to split dates when the weekday does not agree with the day
public int SplitIndex { get; set; } = 0;

public Metadata Clone()
{
return (Metadata)MemberwiseClone();
Expand Down