/**
 * Helper class to work with non-negative numbers and their decimal string
 * representation.
 *
 * @author nicl
 */
public class Numbers {

    /** Entries in {@link #islArr}: two zero entries followed by 10^1 .. 10^9. */
    private static final int INT_TABLE_SIZE = 11;

    /** Entries in {@link #lslArr}: two zero entries followed by 10^1 .. 10^18. */
    private static final int LONG_TABLE_SIZE = 20;

    /**
     * Integer array for integer string length binary search.
     * Index {@code k >= 1} holds the smallest non-negative value that is
     * {@code k} digits long; index 0 is an unused zero entry.
     */
    protected static int[] islArr;

    /**
     * Long array for long string length binary search.
     * Same layout as {@link #islArr}.
     */
    protected static long[] lslArr;

    /*
     * Initialize the internal comparison arrays for binary search.
     * Entries 0 and 1 stay zero, entry k (k >= 2) is 10^(k-1).
     */
    static {
        islArr = new int[INT_TABLE_SIZE];
        int intPower = 1;
        for (int k = 2; k < INT_TABLE_SIZE; ++k) {
            // Ends at 10^9, the largest power of ten that fits in an int.
            intPower *= 10;
            islArr[k] = intPower;
        }
        lslArr = new long[LONG_TABLE_SIZE];
        long longPower = 1L;
        for (int k = 2; k < LONG_TABLE_SIZE; ++k) {
            // Ends at 10^18, the largest power of ten that fits in a long.
            longPower *= 10L;
            lslArr[k] = longPower;
        }
    }

    /**
     * Find the string length of the integer without leading zeroes.
     * Only non-negative values are supported; a negative value yields 1.
     * @param i integer value
     * @return string length of the integer
     */
    public static int intStrLen(int i) {
        int min = 1;
        int max = INT_TABLE_SIZE - 1;
        // Find the largest index whose table entry is <= i.
        while (min < max) {
            // Upper midpoint, so that "min = idx" always makes progress.
            int idx = min + ((max - min + 1) >> 1);
            if (i < islArr[idx]) {
                max = idx - 1;
            }
            else {
                min = idx;
            }
        }
        return min;
    }

    /**
     * Find the string length of the long without leading zeroes.
     * Only non-negative values are supported; a negative value yields 1.
     * @param l long value
     * @return string length of the long
     */
    public static int longStrLen(long l) {
        int min = 1;
        int max = LONG_TABLE_SIZE - 1;
        // Find the largest index whose table entry is <= l.
        while (min < max) {
            // Upper midpoint, so that "min = idx" always makes progress.
            int idx = min + ((max - min + 1) >> 1);
            if (l < lslArr[idx]) {
                max = idx - 1;
            }
            else {
                min = idx;
            }
        }
        return min;
    }

    /**
     * Returns the number of trailing zeroes in an integer, or zero if the number is 0.
     * @param i number to count trailing zeroes in
     * @return the number of trailing zeroes in an integer, or zero if the number is 0
     */
    public static int intTrailingZeros(int i) {
        int cnt = 0;
        if (i != 0) {
            while (i % 10 == 0) {
                i = i / 10;
                ++cnt;
            }
        }
        return cnt;
    }

    /**
     * Divu10 from Hackers Delight.
     * The argument is treated as an unsigned 32-bit value, so unsigned
     * shifts are used throughout; for non-negative arguments the result is
     * identical to {@code n / 10}.
     * @param n integer to divide by 10 (interpreted as unsigned)
     * @return integer divided by 10
     */
    public static int divu10hd(int n) {
        int q;
        int r;
        q = (n >>> 1) + (n >>> 2);      // q = n/2 + n/4 = 3n/4
        q = q + (q >>> 4);              // q = 51n/64
        q = q + (q >>> 8);              // q = 13107n/16384
        q = q + (q >>> 16);             // q is now roughly 0.8n
        q = q >>> 3;                    // q is now roughly n/10, possibly one too small
        r = n - (((q << 2) + q) << 1);  // remainder estimate: r = n - 10q
        return q + ((r > 9) ? 1 : 0);   // adjust answer by error term
    }

    /**
     * Divu10 from GNU GCC.
     * The argument is treated as an unsigned 32-bit value; it is masked
     * instead of sign-extended before the multiplication so that values
     * with the top bit set are divided correctly as well.
     * @param n integer to divide by 10 (interpreted as unsigned)
     * @return integer divided by 10
     */
    public static int divu10gcc(int n) {
        return (int)(((n & 0xffffffffL) * 0xcccccccdL) >>> 35);
    }

}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.jwat.common; + +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class TestNumbers { + + @Test + public void test_numbers() { + //System.out.println(Integer.MAX_VALUE); + //System.out.println(Long.MAX_VALUE); + int[] islArrCases = new int[] { + 0, + 0, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000 + }; + // 2147483647 + Assert.assertEquals(islArrCases.length, Numbers.islArr.length); + for (int i=0; i 0) { + iv += islCases[i][1]; + // Debug + //System.out.println(iv); + Assert.assertEquals(i, Numbers.intStrLen(iv)); + --it; + } + } + long[] lslArrCases = new long[] { + 0, + 0, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000L, + 100000000000L, + 1000000000000L, + 10000000000000L, + 100000000000000L, + 1000000000000000L, + 10000000000000000L, + 100000000000000000L, + 1000000000000000000L + }; + // 9223372036854775807 + Assert.assertEquals(lslArrCases.length, Numbers.lslArr.length); + for (int i=0; i 0) { + lv += lslArrCases[i]; + // Debug + //System.out.println(lv); + Assert.assertEquals(i, Numbers.longStrLen(lv)); + --it; + } + } + /* + Assert.assertEquals("1".length(), Numbers.intStrLen(1)); + Assert.assertEquals(Numbers.intStrLen(10) + " 10"); + Assert.assertEquals(Numbers.intStrLen(100) + " 100"); + Assert.assertEquals(Numbers.intStrLen(1000) + " 1000"); + Assert.assertEquals(Numbers.intStrLen(10000) + " 
10000"); + Assert.assertEquals(Numbers.intStrLen(100000) + " 100000"); + Assert.assertEquals(Numbers.intStrLen(1000000) + " 1000000"); + Assert.assertEquals(Numbers.intStrLen(10000000) + " 10000000"); + Assert.assertEquals(Numbers.intStrLen(100000000) + " 100000000"); + Assert.assertEquals(Numbers.intStrLen(1000000000) + " 1000000000"); + + System.out.println(Numbers.intStrLen(1) + " 1"); + System.out.println(Numbers.intStrLen(10) + " 10"); + System.out.println(Numbers.intStrLen(100) + " 100"); + System.out.println(Numbers.intStrLen(1000) + " 1000"); + System.out.println(Numbers.intStrLen(10000) + " 10000"); + System.out.println(Numbers.intStrLen(100000) + " 100000"); + System.out.println(Numbers.intStrLen(1000000) + " 1000000"); + System.out.println(Numbers.intStrLen(10000000) + " 10000000"); + System.out.println(Numbers.intStrLen(100000000) + " 100000000"); + System.out.println(Numbers.intStrLen(1000000000) + " 1000000000"); + */ + + System.out.println(Numbers.intTrailingZeros(1000000000)); + System.out.println(Numbers.intTrailingZeros(100000000)); + System.out.println(Numbers.intTrailingZeros(10000000)); + System.out.println(Numbers.intTrailingZeros(1000000)); + System.out.println(Numbers.intTrailingZeros(100000)); + System.out.println(Numbers.intTrailingZeros(10000)); + System.out.println(Numbers.intTrailingZeros(1000)); + System.out.println(Numbers.intTrailingZeros(100)); + System.out.println(Numbers.intTrailingZeros(10)); + System.out.println(Numbers.intTrailingZeros(1)); + System.out.println(Numbers.intTrailingZeros(0)); + } + + @Test + public void test_number_string_length() { + Assert.assertEquals(11, Numbers.islArr.length); + Assert.assertEquals(20, Numbers.lslArr.length); + for (int i=0; i<10; ++i) { + } + } + + @Test + @Ignore + public void test_divu10_hack() { + int q, r; + int res; + for (int n=0; n<100000000; ++n) { + q = (n >> 1) + (n >> 2); // q=n/2+n/4 = 3n/4 + q = q + (q >> 4); // q=3n/4+(3n/4)/16 = 3n/4+3n/64 = 51n/64 + q = q + (q >> 8); // 
q=51n/64+(51n/64)/256 = 51n/64 + 51n/16384 = 13107n/16384 + q = q + (q >> 16); // q= 13107n/16384+(13107n/16384)/65536=13107n/16348+13107n/1073741824=858993458n/1073741824 + // note: q is now roughly 0.8n + q = q >> 3; // q=n/8 = (about 0.1n or n/10) + r = n - (((q << 2) + q) << 1); // rounding: r= n-2*(n/10*4+n/10)=n-2*5n/10=n-10n/10 + res = q + ((r > 9) ? 1 : 0); // adjust answer by error term + if (res != n / 10) { + Assert.fail("Precision fail!"); + } + if (Numbers.divu10gcc(n) != n / 10) { + Assert.fail("Precision fail!"); + } + } + + long ctm1 = System.currentTimeMillis(); + int r2 = 0; + for (int n=0; n<100000000; ++n) { + q = n / 10; + r2 += q; + } + long ctm2 = System.currentTimeMillis(); + int r1 = 0; + for (int n=0; n<100000000; ++n) { + q = (n >> 1) + (n >> 2); // q=n/2+n/4 = 3n/4 + q = q + (q >> 4); // q=3n/4+(3n/4)/16 = 3n/4+3n/64 = 51n/64 + q = q + (q >> 8); // q=51n/64+(51n/64)/256 = 51n/64 + 51n/16384 = 13107n/16384 + q = q + (q >> 16); // q= 13107n/16384+(13107n/16384)/65536=13107n/16348+13107n/1073741824=858993458n/1073741824 + // note: q is now roughly 0.8n + q = q >> 3; // q=n/8 = (about 0.1n or n/10) + r = n - (((q << 2) + q) << 1); // rounding: r= n-2*(n/10*4+n/10)=n-2*5n/10=n-10n/10 + r1 += q + ((r > 9) ? 
1 : 0); // adjust answer by error term + } + long ctm3 = System.currentTimeMillis(); + int r3 = 0; + for (int n=0; n<100000000; ++n) { + q = Numbers.divu10hd(n); + r3 += q; + } + long ctm4 = System.currentTimeMillis(); + int r4 = 0; + for (int n=0; n<100000000; ++n) { + q = Numbers.divu10gcc(n); + r4 += q; + } + long ctm5 = System.currentTimeMillis(); + System.out.println(ctm2 - ctm1); + System.out.println(ctm3 - ctm2); + System.out.println(ctm4 - ctm3); + System.out.println(ctm5 - ctm4); + System.out.println(r1); + System.out.println(r2); + System.out.println(r3); + System.out.println(r4); + + //ctm = System.currentTimeMillis(); + int n = 12340000; + q = (n >> 1) + (n >> 2); // q=n/2+n/4 = 3n/4 + q = q + (q >> 4); // q=3n/4+(3n/4)/16 = 3n/4+3n/64 = 51n/64 + q = q + (q >> 8); // q=51n/64+(51n/64)/256 = 51n/64 + 51n/16384 = 13107n/16384 + q = q + (q >> 16); // q= 13107n/16384+(13107n/16384)/65536=13107n/16348+13107n/1073741824=858993458n/1073741824 + // note: q is now roughly 0.8n + q = q >> 3; // q=n/8 = (about 0.1n or n/10) + r = n - (((q << 2) + q) << 1); // rounding: r= n-2*(n/10*4+n/10)=n-2*5n/10=n-10n/10 + res = q + ((r > 9) ? 1 : 0); // adjust answer by error term + System.out.println(res); + } + +} diff --git a/jwat-warc/src/main/java/org/jwat/warc/WarcDate.java b/jwat-warc/src/main/java/org/jwat/warc/WarcDate.java new file mode 100644 index 0000000..96e01cb --- /dev/null +++ b/jwat-warc/src/main/java/org/jwat/warc/WarcDate.java @@ -0,0 +1,596 @@ +/** + * Java Web Archive Toolkit - Software to read and validate ARC, WARC + * and GZip files. (http://jwat.org/) + * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/**
 * Separate class to handle WARC dates now that they can have varying precision levels.
 * This class maintains the UTC date time internally along with the individual fields.
 * The old implementation just used the Date object directly, leaving the timezone
 * handling to the user.
 *
 * Examples:
 *
 *   WARC-Date: 2016-01-11T23:24:25.412030Z
 *   WARC-Date: 2016-01-11T23:24:25Z
 *   WARC-Date: 2016-01
 *
 * Supported subset of the W3CDTF format (the only time zone designator
 * accepted by the parser is 'Z'):
 *   Year:
 *     YYYY (eg 1997)
 *   Year and month:
 *     YYYY-MM (eg 1997-07)
 *   Complete date:
 *     YYYY-MM-DD (eg 1997-07-16)
 *   Complete date plus hours and minutes:
 *     YYYY-MM-DDThh:mmZ (eg 1997-07-16T19:20Z)
 *   Complete date plus hours, minutes and seconds:
 *     YYYY-MM-DDThh:mm:ssZ (eg 1997-07-16T19:20:30Z)
 *   Complete date plus hours, minutes, seconds and a decimal fraction of a second
 *     YYYY-MM-DDThh:mm:ss.sZ (eg 1997-07-16T19:20:30.45Z)
 *
 * @author nicl
 */
public class WarcDate {

    /** UTC TimeZone object for reuse. */
    protected static TimeZone UTC_TIMEZONE = TimeZone.getTimeZone("UTC");

    /**
     * Lookup table mapping a byte value (0-255) to its decimal digit value,
     * or -1 when the byte is not an ASCII digit.
     */
    protected static int[] asciiInt;

    /*
     * Initialize the byte to digit lookup table.
     */
    static {
        asciiInt = new int[256];
        for (int i=0; i<256; ++i) {
            asciiInt[i] = -1;
        }
        for (int c='0'; c<='9'; ++c) {
            asciiInt[c] = c - '0';
        }
    }

    /** Year level precision. */
    public static final int P_YEAR = 0;
    /** Month level precision. */
    public static final int P_MONTH = 1;
    /** Day of month level precision. */
    public static final int P_DAYOFMONTH = 2;
    /** Minute level precision. */
    public static final int P_MINUTE = 3;
    /** Second level precision. */
    public static final int P_SECOND = 4;
    /** Nano/fraction level precision. */
    public static final int P_FRACTION = 5;

    /** Object used to convert dates, handle timezones and retrieve fields. */
    public LocalDateTime ldt;

    /** Precision level of the warc date (year to fraction). */
    public int precision;

    /** Year. */
    public int year;
    /** Month (1-12). */
    public int month;
    /** Day of month (1-31). */
    public int dayOfMonth;
    /** Hour (0-23). */
    public int hour;
    /** Minute (0-59). */
    public int minute;
    /** Second (0-59). */
    public int second;
    /** Nano of second (0-999.999.999).*/
    public int nanoOfSecond;
    /** Bigger fraction in case someone needs more than nano precision. */
    public long fraction;
    /** Length of the fraction string representation. */
    public int fractionLen;

    /**
     * Construct WarcDate with year level precision.
     * @param year year
     */
    public WarcDate(int year) {
        precision = P_YEAR;
        this.ldt = LocalDateTime.of(year, 1, 1, 0, 0);
        this.year = year;
    }

    /**
     * Construct WarcDate with month level precision.
     * @param year year
     * @param month integer between 1 and 12
     */
    public WarcDate(int year, int month) {
        precision = P_MONTH;
        this.ldt = LocalDateTime.of(year, month, 1, 0, 0);
        this.year = year;
        this.month = month;
    }

    /**
     * Create WarcDate with day of month level precision.
     * @param year year
     * @param month integer between 1 and 12
     * @param dayOfMonth integer between 1 and 31
     */
    public WarcDate(int year, int month, int dayOfMonth) {
        precision = P_DAYOFMONTH;
        this.ldt = LocalDateTime.of(year, month, dayOfMonth, 0, 0);
        this.year = year;
        this.month = month;
        this.dayOfMonth = dayOfMonth;
    }

    /**
     * Create WarcDate with minute level precision.
     * @param year year
     * @param month integer between 1 and 12
     * @param dayOfMonth integer between 1 and 31
     * @param hour integer between 0 and 23
     * @param minute integer between 0 and 59
     */
    public WarcDate(int year, int month, int dayOfMonth, int hour, int minute) {
        precision = P_MINUTE;
        this.ldt = LocalDateTime.of(year, month, dayOfMonth, hour, minute);
        this.year = year;
        this.month = month;
        this.dayOfMonth = dayOfMonth;
        this.hour = hour;
        this.minute = minute;
    }

    /**
     * Create WarcDate with second level precision.
     * @param year year
     * @param month integer between 1 and 12
     * @param dayOfMonth integer between 1 and 31
     * @param hour integer between 0 and 23
     * @param minute integer between 0 and 59
     * @param second integer between 0 and 59
     */
    public WarcDate(int year, int month, int dayOfMonth, int hour, int minute, int second) {
        precision = P_SECOND;
        this.ldt = LocalDateTime.of(year, month, dayOfMonth, hour, minute, second);
        this.year = year;
        this.month = month;
        this.dayOfMonth = dayOfMonth;
        this.hour = hour;
        this.minute = minute;
        this.second = second;
    }

    /**
     * Create WarcDate with nano of second level precision.
     * Nano of second is used with the internal LocalDateTime object.
     * The fraction used for the string representation is the nano of second
     * value with trailing zeroes removed (a single "0" when it is zero).
     * @param year year
     * @param month integer between 1 and 12
     * @param dayOfMonth integer between 1 and 31
     * @param hour integer between 0 and 23
     * @param minute integer between 0 and 59
     * @param second integer between 0 and 59
     * @param nanoOfSecond integer between 0 and 999.999.999.
     */
    public WarcDate(int year, int month, int dayOfMonth, int hour, int minute, int second, int nanoOfSecond) {
        precision = P_FRACTION;
        this.ldt = LocalDateTime.of(year, month, dayOfMonth, hour, minute, second, nanoOfSecond);
        this.year = year;
        this.month = month;
        this.dayOfMonth = dayOfMonth;
        this.hour = hour;
        this.minute = minute;
        this.second = second;
        this.nanoOfSecond = nanoOfSecond;
        this.fraction = nanoOfSecond;
        if (nanoOfSecond == 0) {
            this.fractionLen = 1;
        }
        else {
            // Nine digits of nanoseconds, minus the trailing zeroes.
            this.fractionLen = 9;
            while (this.fraction % 10 == 0) {
                this.fraction = this.fraction / 10;
                --this.fractionLen;
            }
        }
    }

    /**
     * Create WarcDate with fraction level precision.
     * Nano of second is used with the internal LocalDateTime object.
     * Even though nano of second is currently the limit in Java this implementation can
     * parse, store and output the fraction part as a long value.
     * @param year year
     * @param month integer between 1 and 12
     * @param dayOfMonth integer between 1 and 31
     * @param hour integer between 0 and 23
     * @param minute integer between 0 and 59
     * @param second integer between 0 and 59
     * @param nanoOfSecond integer between 0 and 999999999.
     * @param fraction long fraction between 0 and 9223372036854775807.
     * @param fractionLen fraction length (at least 1) to output when converting to a string representation
     * @throws IllegalArgumentException if fraction is negative or fractionLen is less than 1
     */
    public WarcDate(int year, int month, int dayOfMonth, int hour, int minute, int second, int nanoOfSecond, long fraction, int fractionLen) {
        if (fraction < 0) {
            throw new IllegalArgumentException("Negative fraction: " + fraction);
        }
        if (fractionLen < 1) {
            throw new IllegalArgumentException("Fraction length must be at least 1: " + fractionLen);
        }
        precision = P_FRACTION;
        this.ldt = LocalDateTime.of(year, month, dayOfMonth, hour, minute, second, nanoOfSecond);
        this.year = year;
        this.month = month;
        this.dayOfMonth = dayOfMonth;
        this.hour = hour;
        this.minute = minute;
        this.second = second;
        this.nanoOfSecond = nanoOfSecond;
        this.fraction = fraction;
        this.fractionLen = fractionLen;
    }

    /**
     * Create WarcDate with nano of second level precision from a LocalDateTime,
     * which is expected to hold a UTC date time.
     * The fraction is always output as all nine nanosecond digits.
     * @param ldt object to keep internally and read the field values from
     */
    public WarcDate(LocalDateTime ldt) {
        precision = P_FRACTION;
        this.ldt = ldt;
        this.year = ldt.getYear();
        this.month = ldt.getMonthValue();
        this.dayOfMonth = ldt.getDayOfMonth();
        this.hour = ldt.getHour();
        this.minute = ldt.getMinute();
        this.second = ldt.getSecond();
        this.nanoOfSecond = ldt.getNano();
        this.fraction = nanoOfSecond;
        this.fractionLen = 9;
    }

    /**
     * Returns a WarcDate representing the current UTC date time.
     * @return a WarcDate representing the current UTC date time
     */
    public static WarcDate now() {
        return new WarcDate(LocalDateTime.now(UTC_TIMEZONE.toZoneId()));
    }

    /**
     * Returns a WarcDate object for the instant represented by a Date.
     * The instant is converted to its UTC date time fields.
     * @param date date representing an instant
     * @return WarcDate object
     */
    public static WarcDate fromLocalDate(Date date) {
        return new WarcDate(LocalDateTime.ofInstant(date.toInstant(), UTC_TIMEZONE.toZoneId()));
    }

    /**
     * Returns a Date for the instant obtained by interpreting the internal date time as UTC.
     * @return a Date for the instant obtained by interpreting the internal date time as UTC
     */
    public Date getDateLocal() {
        return Date.from((ldt.atZone(UTC_TIMEZONE.toZoneId()).toInstant()));
    }

    /**
     * Returns a WarcDate object whose fields are the date time fields of the
     * Date as seen in the system default timezone. No timezone adjustment is
     * made; the caller is expected to pass a Date whose local fields already
     * hold the UTC values.
     * @param date date whose system default timezone fields hold UTC values
     * @return WarcDate object
     */
    public static WarcDate fromUTCDate(Date date) {
        return new WarcDate(LocalDateTime.ofInstant(date.toInstant(), ZoneId.systemDefault()));
    }

    /**
     * Returns a Date whose date time fields in the system default timezone
     * equal the internal (UTC) date time fields. No timezone adjustment is made.
     * @return a Date whose system default timezone fields equal the internal fields
     */
    public Date getDateUTC() {
        return Date.from((ldt.atZone(ZoneId.systemDefault()).toInstant()));
    }

    /**
     * Attempt to parse a WARC date string in a subset of the W3CDTF format.
     * @param datestring WARC date string
     * @return WarcDate object or null if the string is null or not a valid date
     */
    public static WarcDate getWarcDate(String datestring) {
        if (datestring == null) {
            return null;
        }
        // Valid dates are pure ASCII. An explicit charset keeps the result
        // independent of the platform default; other characters become '?'.
        byte[] bytes = datestring.getBytes(java.nio.charset.StandardCharsets.US_ASCII);
        return getWarcDate(bytes, 0, bytes.length);
    }

    /**
     * Attempt to parse a WARC date string from a byte array in a subset of the W3CDTF format.
     * @param bytes WARC date string as a byte array
     * @param pos start position in the byte array
     * @param limit limit in the byte array (index after the last byte of the date)
     * @return WarcDate object or null if the bytes are not a valid date, either
     * syntactically or because a field value is out of range
     */
    public static WarcDate getWarcDate(byte[] bytes, int pos, int limit) {
        try {
            return parseWarcDate(bytes, pos, limit);
        }
        catch (java.time.DateTimeException e) {
            // Syntactically well-formed but impossible date, e.g. month 13.
            return null;
        }
    }

    /**
     * Parse the date; the total length decides which precision level is expected.
     * @param bytes WARC date string as a byte array
     * @param pos start position in the byte array
     * @param limit limit in the byte array
     * @return WarcDate object or null on a syntax error
     */
    private static WarcDate parseWarcDate(byte[] bytes, int pos, int limit) {
        int len = limit - pos;
        // YYYY
        if (len < 4) {
            return null;
        }
        int year = parseDigits(bytes, pos, 4);
        if (year < 0) {
            return null;
        }
        if (len == 4) {
            return new WarcDate(year);
        }
        // YYYY-MM
        if (len < 7) {
            return null;
        }
        int month = parseDigits(bytes, pos + 5, 2);
        if (month < 0 || bytes[pos + 4] != '-') {
            return null;
        }
        if (len == 7) {
            return new WarcDate(year, month);
        }
        // YYYY-MM-DD
        if (len < 10) {
            return null;
        }
        int dayOfMonth = parseDigits(bytes, pos + 8, 2);
        if (dayOfMonth < 0 || bytes[pos + 7] != '-') {
            return null;
        }
        if (len == 10) {
            return new WarcDate(year, month, dayOfMonth);
        }
        // YYYY-MM-DDThh:mmZ
        if (len < 17) {
            return null;
        }
        int hour = parseDigits(bytes, pos + 11, 2);
        int minute = parseDigits(bytes, pos + 14, 2);
        if (hour < 0 || minute < 0 || (bytes[pos + 10] != 'T' && bytes[pos + 10] != 't') || bytes[pos + 13] != ':') {
            return null;
        }
        if (len == 17) {
            return isUtcDesignator(bytes[pos + 16]) ? new WarcDate(year, month, dayOfMonth, hour, minute) : null;
        }
        // YYYY-MM-DDThh:mm:ssZ
        if (len < 20) {
            return null;
        }
        int second = parseDigits(bytes, pos + 17, 2);
        if (second < 0 || bytes[pos + 16] != ':') {
            return null;
        }
        if (len == 20) {
            return isUtcDesignator(bytes[pos + 19]) ? new WarcDate(year, month, dayOfMonth, hour, minute, second) : null;
        }
        // YYYY-MM-DDThh:mm:ss.s+Z
        if (len < 22) {
            return null;
        }
        if (bytes[pos + 19] != '.' || !isUtcDesignator(bytes[limit - 1])) {
            return null;
        }
        int fractionStart = pos + 20;
        int fractionEnd = limit - 1;
        long fraction = 0;
        int nanoOfSecond = 0;
        for (int idx = fractionStart; idx < fractionEnd; ++idx) {
            int digit = asciiInt[bytes[idx] & 255];
            if (digit < 0) {
                return null;
            }
            if (fraction > (Long.MAX_VALUE - digit) / 10) {
                // The fraction does not fit in a long.
                return null;
            }
            fraction = fraction * 10 + digit;
            if (idx - fractionStart < 9) {
                // Only the first nine digits contribute to the nanoseconds.
                nanoOfSecond = nanoOfSecond * 10 + digit;
            }
        }
        int fractionStrLen = fractionEnd - fractionStart;
        for (int n = fractionStrLen; n < 9; ++n) {
            // Scale a shorter fraction up to nanoseconds.
            nanoOfSecond = nanoOfSecond * 10;
        }
        return new WarcDate(year, month, dayOfMonth, hour, minute, second, nanoOfSecond, fraction, fractionStrLen);
    }

    /**
     * Parse a fixed number of ASCII digits.
     * @param bytes byte array to read from
     * @param offset index of the first digit
     * @param count number of digits to read
     * @return the parsed value or -1 if any of the bytes is not an ASCII digit
     */
    private static int parseDigits(byte[] bytes, int offset, int count) {
        int value = 0;
        for (int i = 0; i < count; ++i) {
            int digit = asciiInt[bytes[offset + i] & 255];
            if (digit < 0) {
                return -1;
            }
            value = value * 10 + digit;
        }
        return value;
    }

    /**
     * Check for the UTC designator.
     * @param b byte to check
     * @return true if the byte is 'Z' or 'z'
     */
    private static boolean isUtcDesignator(byte b) {
        return b == 'Z' || b == 'z';
    }

    /**
     * Append the least significant decimal digits of a value, zero padded.
     * @param sb string builder to append to
     * @param value value to output
     * @param width exact number of digits to append
     */
    private static void appendDigits(StringBuilder sb, long value, int width) {
        int start = sb.length();
        sb.setLength(start + width);
        for (int idx = start + width - 1; idx >= start; --idx) {
            sb.setCharAt(idx, (char)('0' + value % 10));
            value = value / 10;
        }
    }

    /**
     * Returns the string representation matching the precision level.
     * The year is output as its four least significant digits and minute
     * level precision and above is terminated with 'Z'. A fraction length
     * below 1 is output as a single digit.
     * @return the WARC date string
     * @throws IllegalStateException if the precision level is invalid
     */
    @Override
    public String toString() {
        if (precision < P_YEAR || precision > P_FRACTION) {
            throw new IllegalStateException("Invalid precision level.");
        }
        StringBuilder sb = new StringBuilder(40);
        appendDigits(sb, year, 4);
        if (precision >= P_MONTH) {
            sb.append('-');
            appendDigits(sb, month, 2);
        }
        if (precision >= P_DAYOFMONTH) {
            sb.append('-');
            appendDigits(sb, dayOfMonth, 2);
        }
        if (precision >= P_MINUTE) {
            sb.append('T');
            appendDigits(sb, hour, 2);
            sb.append(':');
            appendDigits(sb, minute, 2);
        }
        if (precision >= P_SECOND) {
            sb.append(':');
            appendDigits(sb, second, 2);
        }
        if (precision >= P_FRACTION) {
            sb.append('.');
            appendDigits(sb, fraction, Math.max(fractionLen, 1));
        }
        if (precision >= P_MINUTE) {
            sb.append('Z');
        }
        return sb.toString();
    }

    @Override
    public int hashCode() {
        return Objects.hash(dayOfMonth, fraction, fractionLen, hour, ldt, minute, month, nanoOfSecond, precision, second, year);
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }
        WarcDate other = (WarcDate) obj;
        return dayOfMonth == other.dayOfMonth && fraction == other.fraction && fractionLen == other.fractionLen
                && hour == other.hour && Objects.equals(ldt, other.ldt) && minute == other.minute
                && month == other.month && nanoOfSecond == other.nanoOfSecond && precision == other.precision
                && second == other.second && year == other.year;
    }

}
*/ diff --git a/jwat-warc/src/test/java/org/jwat/warc/TestWarcDate.java b/jwat-warc/src/test/java/org/jwat/warc/TestWarcDate.java new file mode 100644 index 0000000..9447568 --- /dev/null +++ b/jwat-warc/src/test/java/org/jwat/warc/TestWarcDate.java @@ -0,0 +1,292 @@ +/** + * Java Web Archive Toolkit - Software to read and validate ARC, WARC + * and GZip files. (http://jwat.org/) + * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.jwat.warc; + +import java.util.Date; + +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class TestWarcDate { + + @Test + public void test_warcdate_w3cdtf() { + Object[][] cases; + WarcDate wd; + + cases = new Object[][] { + {"4"}, + {"a"}, + {"43"}, + {"a3"}, + {"ab"}, + {"4b"}, + {"432"}, + {"a32"}, + {"4b2"}, + {"43c"}, + {"abc"}, + {"a321"}, + {"4b21"}, + {"43c1"}, + {"432d"}, + {"a33d"}, + {"4bc1"}, + {"abcd"}, + {"1972-1"}, + {"1972-a2"}, + {"1972-1b"}, + {"1972:12"}, + {"1984-34-3"}, + {"1984-34-a4"}, + {"1984-34-3b"}, + {"1984-34:34"}, + {"1992-56-78T"}, + {"1992-56-78T1"}, + {"1992-56-78T12"}, + {"1992-56-78T12:"}, + {"1992-56-78T12:3"}, + {"1992-56-78T12:34"}, + {"1992-56-78T12:34S"}, + {"1992-56-78S12:34Z"}, + {"1992-56-78Ta2:34Z"}, + {"1992-56-78T1b:34Z"}, + {"1992-56-78T12:c4Z"}, + {"1992-56-78T12:3dZ"}, + {"1992-56-78Ta2:3dZ"}, + {"1992-56-78T1b:c4Z"}, + {"1992-56-78Tab:cdZ"}, + {"1992-56-78T12-34Z"}, + {"1992-56-78T12:34:"}, + {"1992-56-78T12:34:5"}, + {"1992-56-78T12:34:56"}, + {"1992-56-78T12:34:5b"}, + {"1992-56-78T12:34:a6"}, + {"1992-56-78T12:34:ab"}, + {"1992-56-78T12:34:5bZ"}, + {"1992-56-78T12:34:a6Z"}, + {"1992-56-78T12:34:abZ"}, + {"1992-56-78T12:34-56Z"}, + {"1992-56-78T12:34:56S"}, + {"1992-56-78T12:34:56.Z"}, + {"1991-07-12T14:41:00.aZ"}, + {"1991-07-12T14:41:00.1bZ"}, + {"1991-07-12T14:41:00.12cZ"}, + {"1991-07-12T14:41:00.123dZ"}, + {"1991-07-12T14:41:00.12cdZ"}, + {"1991-07-12T14:41:00.1bcdZ"}, + {"1991-07-12T14:41:00.abcdZ"}, + {"1991-07-12T14:41:00.abc4Z"}, + {"1991-07-12T14:41:00.abc4Z"}, + {"1991-07-12T14:41:00.a234Z"}, + {"1991-07-12T14:41:00.1234567890S"}, + }; + for (int i=0; i