diff --git a/WhoisNormalization.Tests/NormalizationUtilsTests.cs b/WhoisNormalization.Tests/NormalizationUtilsTests.cs index b7f2188..f27ed9e 100644 --- a/WhoisNormalization.Tests/NormalizationUtilsTests.cs +++ b/WhoisNormalization.Tests/NormalizationUtilsTests.cs @@ -26,14 +26,25 @@ namespace Microsoft.Geolocation.Whois.Normalization.Tests public class NormalizationUtilsTests { [TestMethod] - public void TestFindOldestDate() + public void TestFindOldestDateOptimized() { - Assert.IsNull(NormalizationUtils.FindOldestDate(null), "The extracted date should be null because the input text is null"); - Assert.IsNull(NormalizationUtils.FindOldestDate(" kenken@sfc.wide.ad.jp 2000021 kenken@sfc.wide.ad.jp 2000210 kenken@sfc.wide.ad.jp 2000-0-10 "), "The extracted date should be null because the input text is invalid"); + Assert.IsNull(NormalizationUtils.FindOldestDateOptimized(null), "The extracted date should be null because the input text is null"); + Assert.IsNull(NormalizationUtils.FindOldestDateOptimized(" kenken@sfc.wide.ad.jp 2000021 kenken@sfc.wide.ad.jp 2000210 kenken@sfc.wide.ad.jp 2000-0-10 "), "The extracted date should be null because the input text is invalid"); - Assert.AreEqual("2000-02-10", NormalizationUtils.FindOldestDate(" kenken@sfc.wide.ad.jp 20000210 "), "The extracted date should be 2000-02-10"); - Assert.AreEqual("2005-02-05", NormalizationUtils.FindOldestDate("ripe-dbm@ripe.net 20010724 hostmaster@ripe.net 20011024 hostmaster@ripe.net 20020805 ripe-dbm@ripe.net 20040503 ripe-dbm@ripe.net 20041229 hostmaster@afrinic.net 20050205"), "The extracted date should be 2005-02-05"); - Assert.AreEqual("1987-07-08", NormalizationUtils.FindOldestDate("1987-07-08"), "The extracted date should be 1987-07-08"); + Assert.AreEqual("2000-02-10", NormalizationUtils.FindOldestDateOptimized(" kenken@sfc.wide.ad.jp 20000210 "), "The extracted date should be 2000-02-10"); + Assert.AreEqual("2005-02-05", NormalizationUtils.FindOldestDateOptimized("ripe-dbm@ripe.net 20010724 hostmaster@ripe.net 20011024 hostmaster@ripe.net 20020805 ripe-dbm@ripe.net 20040503 ripe-dbm@ripe.net 20041229 hostmaster@afrinic.net 20050205"), "The extracted date should be 2005-02-05"); + Assert.AreEqual("1987-07-08", NormalizationUtils.FindOldestDateOptimized("1987-07-08"), "The extracted date should be 1987-07-08"); + } + + [TestMethod] + public void TestFindOldestDateSlow() + { + Assert.IsNull(NormalizationUtils.FindOldestDateSlow(null), "The extracted date should be null because the input text is null"); + Assert.IsNull(NormalizationUtils.FindOldestDateSlow(" kenken@sfc.wide.ad.jp 2000021 kenken@sfc.wide.ad.jp 2000210 kenken@sfc.wide.ad.jp 2000-0-10 "), "The extracted date should be null because the input text is invalid"); + + Assert.AreEqual("2000-02-10", NormalizationUtils.FindOldestDateSlow(" kenken@sfc.wide.ad.jp 20000210 "), "The extracted date should be 2000-02-10"); + Assert.AreEqual("2005-02-05", NormalizationUtils.FindOldestDateSlow("ripe-dbm@ripe.net 20010724 hostmaster@ripe.net 20011024 hostmaster@ripe.net 20020805 ripe-dbm@ripe.net 20040503 ripe-dbm@ripe.net 20041229 hostmaster@afrinic.net 20050205"), "The extracted date should be 2005-02-05"); + Assert.AreEqual("1987-07-08", NormalizationUtils.FindOldestDateSlow("1987-07-08"), "The extracted date should be 1987-07-08"); } } } diff --git a/WhoisNormalization/NormalizationUtils.cs b/WhoisNormalization/NormalizationUtils.cs index 6163a8d..d4753cc 100644 --- a/WhoisNormalization/NormalizationUtils.cs +++ b/WhoisNormalization/NormalizationUtils.cs @@ -7,14 +7,19 @@ namespace Microsoft.Geolocation.Whois.Normalization { using System; - using System.Linq; using System.Collections.Generic; + using System.Globalization; + using System.Linq; using System.Text; using Parsers; - using System.Globalization; public static class NormalizationUtils { + private static string[] dateFormats = { "yyyy-MM-dd", "yyyyMMdd" }; + private static string dateOutputFormat = "yyyy-MM-dd"; + private static char[] dateWordsSplitChars = new char[] { ' ', '\t', '\r', '\n' }; + private static CultureInfo dateCultureInfo = new CultureInfo("en-US"); + private static HashSet updatedFields = new HashSet(StringComparer.OrdinalIgnoreCase) { "Updated", @@ -104,8 +109,8 @@ public static void ExtractCommonRecordMetadata(RawWhoisSection section, string i { target.Id = id; target.Name = FindFirstMatchingFieldValueInRecords(section, nameFieldNames); - target.Created = FindOldestDate(FindFirstMatchingFieldValueInRecords(section, createdFields)); - target.Updated = FindOldestDate(FindFirstMatchingFieldValueInRecords(section, updatedFields)); + target.Created = FindOldestDateOptimized(FindFirstMatchingFieldValueInRecords(section, createdFields)); + target.Updated = FindOldestDateOptimized(FindFirstMatchingFieldValueInRecords(section, updatedFields)); target.UpdatedBy = FindFirstMatchingFieldValueInRecords(section, updatedByFields); target.Description = FindFirstMatchingFieldValueInRecords(section, descriptionFields); target.Comment = FindFirstMatchingFieldValueInRecords(section, commentFields); @@ -130,26 +135,87 @@ public static void AddToBuilderWithComma(StringBuilder builder, string text) } } - public static string FindOldestDate(string text) + public static DateTime? ExtractDateExact(string text) { if (text == null) { return null; } - var words = new List(text.Split(new char[] { ' ' })); + var dateNoDash = ExtractDateExactNoDash(text); + var dateDash = ExtractDateExactDash(text); + + if (dateNoDash != null) + { + return dateNoDash; + } + + if (dateDash != null) + { + return dateDash; + } + + return null; + } + + public static string FindOldestDateOptimized(string text) + { + if (text == null) + { + return null; + } + + var words = new List(text.Split(dateWordsSplitChars)); words = words.Select(word => word.Trim()).ToList(); words = words.Where(word => word.Length > 0).ToList(); DateTime? oldestParsedDate = null; - string[] dateFormats = { "yyyy-dd-MM", "yyyyddMM" }; + foreach (var word in words) + { + DateTime? currentParsedDate = ExtractDateExact(word); + + if (currentParsedDate != null) + { + if (oldestParsedDate == null) + { + oldestParsedDate = currentParsedDate; + } + else if (oldestParsedDate < currentParsedDate) + { + oldestParsedDate = currentParsedDate; + } + } + } + + if (oldestParsedDate.HasValue) + { + return oldestParsedDate.Value.ToString(dateOutputFormat); + } + else + { + return null; + } + } + + public static string FindOldestDateSlow(string text) + { + if (text == null) + { + return null; + } + + var words = new List(text.Split(dateWordsSplitChars)); + words = words.Select(word => word.Trim()).ToList(); + words = words.Where(word => word.Length > 0).ToList(); + + DateTime? oldestParsedDate = null; foreach (var word in words) { DateTime currentParsedDate; - if (DateTime.TryParseExact(s: word, formats: dateFormats, provider: new CultureInfo("en-US"), style: DateTimeStyles.None, result: out currentParsedDate)) + if (DateTime.TryParseExact(s: word, formats: dateFormats, provider: dateCultureInfo, style: DateTimeStyles.None, result: out currentParsedDate)) { if (oldestParsedDate == null) { @@ -164,12 +230,123 @@ public static string FindOldestDate(string text) if (oldestParsedDate.HasValue) { - return oldestParsedDate.Value.ToString("yyyy-dd-MM"); + return oldestParsedDate.Value.ToString(dateOutputFormat); } else { return null; } } + + private static DateTime? ExtractDateExactNoDash(string text) + { + // Example: 20101112 + if (text.Length != 8) + { + return null; + } + + foreach (var c in text) + { + if (!char.IsNumber(c)) + { + return null; + } + } + + if (text[0] == '0') + { + return null; + } + + var rawYear = text.Substring(0, 4); + var rawMonth = text.Substring(4, 2); + var rawDay = text.Substring(6, 2); + + int year; + int month; + int day; + + if (!int.TryParse(rawYear, out year)) + { + return null; + } + + if (!int.TryParse(rawMonth, out month)) + { + return null; + } + + if (!int.TryParse(rawDay, out day)) + { + return null; + } + + return new DateTime(year, month, day); + } + + private static DateTime? ExtractDateExactDash(string text) + { + // Example: 2010-11-12 + if (text.Length != 10) + { + return null; + } + + for (var i = 0; i < text.Length; i++) + { + var c = text[i]; + + switch (i) + { + case 4: // First dash + case 7: // Second dash + if (c != '-') + { + return null; + } + + break; + default: + + if (!char.IsNumber(c)) + { + return null; + } + + break; + } + } + + if (text[0] == '0') + { + return null; + } + + var rawYear = text.Substring(0, 4); + var rawMonth = text.Substring(5, 2); + var rawDay = text.Substring(8, 2); + + int year; + int month; + int day; + + if (!int.TryParse(rawYear, out year)) + { + return null; + } + + if (!int.TryParse(rawMonth, out month)) + { + return null; + } + + if (!int.TryParse(rawDay, out day)) + { + return null; + } + + return new DateTime(year, month, day); + } } } diff --git a/WhoisNormalization/NormalizedLocation.cs b/WhoisNormalization/NormalizedLocation.cs index cf5b2f7..d06d397 100644 --- a/WhoisNormalization/NormalizedLocation.cs +++ b/WhoisNormalization/NormalizedLocation.cs @@ -13,13 +13,6 @@ namespace Microsoft.Geolocation.Whois.Normalization public class NormalizedLocation { - static NormalizedLocation() - { - allBlacklistedValues = new HashSet(StringComparer.OrdinalIgnoreCase); - allBlacklistedValues.UnionWith(blacklistedValuesSimilarToCountries); - allBlacklistedValues.UnionWith(blacklistedValuesExceptSimilarToCountries); - } - private static HashSet allBlacklistedValues; private static HashSet blacklistedValuesExceptSimilarToCountries = new HashSet(StringComparer.OrdinalIgnoreCase) @@ -156,6 +149,13 @@ static NormalizedLocation() "Customer Country Code" }; + static NormalizedLocation() + { + allBlacklistedValues = new HashSet(StringComparer.OrdinalIgnoreCase); + allBlacklistedValues.UnionWith(blacklistedValuesSimilarToCountries); + allBlacklistedValues.UnionWith(blacklistedValuesExceptSimilarToCountries); + } + public string Address { get; set; } public string Street { get; set; } diff --git a/WhoisTsvExport/ApnicTsvWriter.cs b/WhoisTsvExport/ApnicTsvWriter.cs index b93074b..e0f65fb 100644 --- a/WhoisTsvExport/ApnicTsvWriter.cs +++ b/WhoisTsvExport/ApnicTsvWriter.cs @@ -73,7 +73,7 @@ protected new void NetworksWithLocationsToTsv(WhoisParser parser, string inputFi outputFile.WriteLine(networkTsv); } } - // TODO: Else log + //// TODO: Else log } } } diff --git a/WhoisTsvExport/RWhoisTsvWriter.cs b/WhoisTsvExport/RWhoisTsvWriter.cs index 52bc978..2fbec47 100644 --- a/WhoisTsvExport/RWhoisTsvWriter.cs +++ b/WhoisTsvExport/RWhoisTsvWriter.cs @@ -75,7 +75,7 @@ public void NetworksWithLocationsToSeparateTsv(string inputFolderPath, string ou var networkTsv = network.ToLocationTsv(); outputFile.WriteLine(networkTsv); } - // TODO: else log + //// TODO: else log } } } @@ -104,7 +104,7 @@ public void NetworksWithLocationsToTsv(string inputFolderPath, string outputFile var networkTsv = network.ToLocationTsv(); outputFile.WriteLine(networkTsv); } - // TODO: else log + //// TODO: else log } } } @@ -165,7 +165,7 @@ public void NetworksLocationPropertyCountsToTsv(string inputFolderPath, string p stringsCount[value] = currentCount; } } - // TODO: else log + //// TODO: else log } } diff --git a/WhoisTsvExport/TsvWriter.cs b/WhoisTsvExport/TsvWriter.cs index 37f5392..1ae3366 100644 --- a/WhoisTsvExport/TsvWriter.cs +++ b/WhoisTsvExport/TsvWriter.cs @@ -74,7 +74,7 @@ protected void NetworksWithLocationsToTsv(WhoisParser parser, string inputFilePa var networkTsv = network.ToLocationTsv(); outputFile.WriteLine(networkTsv); } - // TODO: Else log + //// TODO: Else log } } } @@ -131,7 +131,7 @@ protected void NetworksLocationPropertyCountsToTsv(WhoisParser parser, string in stringsCount[value] = currentCount; } } - // TODO: Else log + //// TODO: Else log } using (var outputFile = new StreamWriter(outputFilePath)) diff --git a/nuget/WhoisParsers.nuspec b/nuget/WhoisParsers.nuspec index 0465e99..24149c6 100644 --- a/nuget/WhoisParsers.nuspec +++ b/nuget/WhoisParsers.nuspec @@ -2,7 +2,7 @@ WhoisParsers - 0.1.7 + 0.1.8 Whois and RWhois Parsers and Crawlers Ovidiu Dan Ovidiu Dan @@ -11,7 +11,7 @@ false Download and parse Whois records from bulk whois database dumps of IANA organizations (ARIN, AFRINIC, APNIC, LACNIC, RIPE ). Crawl and parse RWhois records from RFC 2167 ARIN Referral Whois Servers. This allows you to: 1) Download and parse Whois records from bulk whois database dumps of IANA organizations (ARIN, AFRINIC, APNIC, LACNIC, RIPE ) and 2) Crawl and parse RWhois records from RFC 2167 ARIN Referral Whois Servers. It also provides utilities to increment IP addresses and to output Whois databases in TSV format. - Added date parsing for Updated and Created fields + Created an optimized version of FindOldestDate Copyright Microsoft whois rwhois parser parsers crawling arin afrinic apnic lacnic ripe iana bulk database databases rfc 2167 referral servers download unpack decompress tsv