From 6996761e42e6f09685269eee0b656c2686cbc2d6 Mon Sep 17 00:00:00 2001 From: csrster Date: Fri, 6 Nov 2020 11:37:31 +0100 Subject: [PATCH] Tidied up logic in client and tests --- .../common/utils/warc/WarcRecordClient.java | 70 ++++++------ .../utils/warc/WarcRecordClientTest.java | 3 +- .../utils/warc/WarcRecordClientTester.java | 101 ++++++------------ 3 files changed, 61 insertions(+), 113 deletions(-) diff --git a/common/common-core/src/main/java/dk/netarkivet/common/utils/warc/WarcRecordClient.java b/common/common-core/src/main/java/dk/netarkivet/common/utils/warc/WarcRecordClient.java index 5b75562a0c..c1eb9ccf0f 100644 --- a/common/common-core/src/main/java/dk/netarkivet/common/utils/warc/WarcRecordClient.java +++ b/common/common-core/src/main/java/dk/netarkivet/common/utils/warc/WarcRecordClient.java @@ -1,38 +1,28 @@ package dk.netarkivet.common.utils.warc; -import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URI; -import java.net.URISyntaxException; import java.nio.file.Paths; import dk.netarkivet.common.CommonSettings; import dk.netarkivet.common.distribute.arcrepository.BitarchiveRecord; -import dk.netarkivet.common.distribute.arcrepository.bitrepository.Bitrepository; import dk.netarkivet.common.exceptions.ArgumentNotValid; -import dk.netarkivet.common.exceptions.IOFailure; import dk.netarkivet.common.utils.Settings; -import org.apache.commons.httpclient.methods.ExpectContinueMethod; import org.apache.http.*; -import org.apache.http.client.ClientProtocolException; -import org.apache.http.client.ResponseHandler; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; -import org.apache.http.util.EntityUtils; import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveReaderFactory; import org.archive.io.ArchiveRecord; -import org.archive.io.warc.WARCReaderFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import dk.netarkivet.common.utils.FileUtils; public class WarcRecordClient { /** @@ -56,7 +46,6 @@ public URI getBaseUri() { private static final String USER_AGENT = "Mozilla/5.0"; private static int timeout = MILLISECONDS_PER_SECOND; private long offset; - boolean atFirst = true; public long getOffset() { return offset; @@ -89,7 +78,7 @@ public WarcRecordClient(URI baseUri) { * @throws IOException if reading file fails * @throws UnsupportedOperationException is used if method is not implemented */ - public BitarchiveRecord getWarc(URI uri, long offset) throws Exception { + private BitarchiveRecord fetchBitarchiveRecord(URI uri, long offset) throws Exception { RequestConfig.Builder requestBuilder = RequestConfig.custom(); requestBuilder.setConnectTimeout(timeout); requestBuilder.setConnectionRequestTimeout(timeout); @@ -101,29 +90,30 @@ public BitarchiveRecord getWarc(URI uri, long offset) throws Exception { .addHeader("Range", "bytes=" + offset + "-") .build(); log.debug("Executing request " + request.getRequestLine()); - try { - CloseableHttpClient closableHttpClient = HttpClients.custom().setConnectionManager(cm).build(); - try (CloseableHttpResponse httpResponse = closableHttpClient.execute(request)) { - log.debug("httpResponse status: " + httpResponse.getStatusLine().toString()); - if (httpResponse.getStatusLine().getStatusCode() != 200) { - log.error("Http request error " + httpResponse.getStatusLine().getStatusCode()); - return null; - } - HttpEntity entity = httpResponse.getEntity(); - if (entity != null) { - InputStream iStr = entity.getContent(); - ArchiveReader archiveReader = WARCReaderFactory.get("fake.warc", iStr, atFirst); - ArchiveRecord archiveRecord = archiveReader.get(); - BitarchiveRecord reply = new BitarchiveRecord(archiveRecord, fileName); - log.debug("reply: " + reply.toString()); - return reply; - } else { - log.warn("Received null response entity for request for {}, {}", uri, offset); - return null; - } + boolean atFirst = (offset == 0L); + CloseableHttpClient closableHttpClient = HttpClients.custom().setConnectionManager(cm).build(); + try (CloseableHttpResponse httpResponse = closableHttpClient.execute(request)) { + log.debug("httpResponse status: " + httpResponse.getStatusLine().toString()); + if (httpResponse.getStatusLine().getStatusCode() != 200) { + log.error("Http request error " + httpResponse.getStatusLine().getStatusCode()); + return null; + } + HttpEntity entity = httpResponse.getEntity(); + if (entity != null) { + InputStream iStr = entity.getContent(); + //Note that data that comes back from WarcRecordService has been decompressed so to get the + //right arc/warc parser from the ArchiveReaderFactory we have to give it the name of the + //uncompressed file. + final String inflatedName = fileName.replace(".gz", ""); + ArchiveReader archiveReader = ArchiveReaderFactory.get(inflatedName, iStr, atFirst); + ArchiveRecord archiveRecord = archiveReader.get(); + BitarchiveRecord reply = new BitarchiveRecord(archiveRecord, fileName); + log.debug("reply: " + reply.toString()); + return reply; + } else { + log.warn("Received null response entity for request for {}, {}", uri, offset); + return null; } - } catch (Exception e) { - throw e; } } @@ -131,12 +121,12 @@ public BitarchiveRecord getWarc(URI uri, long offset) throws Exception { * Retrieves a single BitarchiveRecord from the repository from a given file and offset. If the operation fails for * any reason, this method returns null. * - * @param arcfileName Name of the arcfile to retrieve. + * @param arcfileName Name of the arcfile/warcfile to retrieve. * @param index offset to fetch specific record from warc or arc file */ - public BitarchiveRecord get(String arcfileName, long index) { + public BitarchiveRecord getBitarchiveRecord(String arcfileName, long index) { - BitarchiveRecord warcInstance = null; + BitarchiveRecord bitarchiveRecord = null; try { ArgumentNotValid.checkNotNullOrEmpty(arcfileName, "arcfile"); ArgumentNotValid.checkNotNegative(index, "index"); @@ -146,11 +136,11 @@ public BitarchiveRecord get(String arcfileName, long index) { String strUri = this.getBaseUri().toString() + "/" + arcfileName; URI uri = new URI(strUri); - warcInstance = this.getWarc(uri, index); + bitarchiveRecord = this.fetchBitarchiveRecord(uri, index); } catch (Exception e) { log.error("Failed to retrieve record at offset {} from file {}.", index, arcfileName, e); } - return warcInstance; + return bitarchiveRecord; } diff --git a/common/common-core/src/test/java/dk/netarkivet/common/utils/warc/WarcRecordClientTest.java b/common/common-core/src/test/java/dk/netarkivet/common/utils/warc/WarcRecordClientTest.java index bac8739511..53b1edbeb1 100644 --- a/common/common-core/src/test/java/dk/netarkivet/common/utils/warc/WarcRecordClientTest.java +++ b/common/common-core/src/test/java/dk/netarkivet/common/utils/warc/WarcRecordClientTest.java @@ -13,7 +13,6 @@ import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; -import java.sql.SQLOutput; import static org.junit.Assert.*; @@ -68,7 +67,7 @@ public void get() throws IOException, URISyntaxException { //setting to NetarchiveSuite. WarcRecordClient warcRecordClient = new WarcRecordClient(baseUri); - BitarchiveRecord bitarchiveRecord = warcRecordClient.get("10-4-20161218234343407-00000-kb-test-har-003.kb.dk.warc.gz", 3442L); + BitarchiveRecord bitarchiveRecord = warcRecordClient.getBitarchiveRecord("10-4-20161218234343407-00000-kb-test-har-003.kb.dk.warc.gz", 3442L); assertNotNull("Should have non null BitarchiveRecord", bitarchiveRecord); assertTrue("Expect a non-zero length bitarchiveRecord", IOUtils.toByteArray(bitarchiveRecord.getData()).length > 100); System.out.println("\n\n" + IOUtils.toString(bitarchiveRecord.getData())); diff --git a/common/common-core/src/test/java/dk/netarkivet/common/utils/warc/WarcRecordClientTester.java b/common/common-core/src/test/java/dk/netarkivet/common/utils/warc/WarcRecordClientTester.java index 8b7a48f7a4..248f1b3630 100644 --- a/common/common-core/src/test/java/dk/netarkivet/common/utils/warc/WarcRecordClientTester.java +++ b/common/common-core/src/test/java/dk/netarkivet/common/utils/warc/WarcRecordClientTester.java @@ -2,9 +2,7 @@ import java.io.ByteArrayOutputStream; import java.io.File; -import java.io.FileDescriptor; import java.io.FileInputStream; -import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; @@ -12,11 +10,8 @@ import org.apache.commons.io.IOUtils; import org.archive.io.ArchiveReader; import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReaderFactory; import org.archive.io.arc.ARCRecord; -import org.archive.io.warc.WARCReader; import org.archive.io.warc.WARCReaderFactory; -import org.archive.io.warc.WARCRecord; import org.junit.Test; import dk.netarkivet.common.distribute.arcrepository.BitarchiveRecord; @@ -36,7 +31,7 @@ public class WarcRecordClientTester { public void testGet() throws Exception { final URI baseUri = new URI("http://localhost:8883/cgi-bin2/py1.cgi"); WarcRecordClient warcRecordClient = new WarcRecordClient(baseUri); - BitarchiveRecord bitarchiveRecord = warcRecordClient.get("10-4-20161218234343407-00000-kb-test-har-003.kb.dk.warc.gz", 3442L); + BitarchiveRecord bitarchiveRecord = warcRecordClient.getBitarchiveRecord("10-4-20161218234343407-00000-kb-test-har-003.kb.dk.warc.gz", 3442L); assertNotNull("Should have non null BitarchiveRecord", bitarchiveRecord); assertTrue("Expect a non-zero length bitarchiveRecord",IOUtils.toByteArray(bitarchiveRecord.getData()).length > 100); @@ -95,17 +90,10 @@ public void testBuildingBitarchiveRecord5() throws Exception { URI test_uri = SAMPLE_HOST; long offset = 3442L; WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri); - BitarchiveRecord warcRecord = warcRecordClient.getWarc(SAMPLE_HOST, offset); - Boolean fail = false; - - try { - BitarchiveRecord bitarchiveRecord = warcRecordClient.get(filename, offset); - bitarchiveRecord.getData(System.out); - } catch (NullPointerException e) { - System.out.println("Nullpointer Exception caused by offset errror"); - fail = true; - } - assertFalse("Exception", fail); + BitarchiveRecord warcRecord = warcRecordClient.getBitarchiveRecord(filename, offset); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + warcRecord.getData(baos); + assertTrue("Should have significant data", baos.size() > 10); } @Test @@ -115,7 +103,7 @@ public void testMultipleCalls() throws Exception { long offset = 3442L; WarcRecordClient warcRecordClient = new WarcRecordClient(HOST); for (int i = 0; i < 40; i++) { - BitarchiveRecord bitarchiveRecord = warcRecordClient.get(filename, offset); + BitarchiveRecord bitarchiveRecord = warcRecordClient.getBitarchiveRecord(filename, offset); assertNotNull("Expect non null record", bitarchiveRecord); ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); bitarchiveRecord.getData(byteArrayOutputStream); @@ -131,11 +119,11 @@ public void testPosBuildingBitarchiveRecord6() throws Exception { URI test_uri = SAMPLE_HOST; long offset = 3442L; WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri); - BitarchiveRecord warcRecord = warcRecordClient.getWarc(SAMPLE_HOST, offset); + BitarchiveRecord warcRecord = warcRecordClient.getBitarchiveRecord(filename, offset); Boolean fail = false; try { - BitarchiveRecord bitarchiveRecord = warcRecordClient.get(filename, offset); + BitarchiveRecord bitarchiveRecord = warcRecordClient.getBitarchiveRecord(filename, offset); bitarchiveRecord.getData(System.out); } catch (NullPointerException e) { System.out.println("Nullpointer Exception caused by offset errror"); @@ -173,17 +161,8 @@ public void testBuildingBitarchiveRecord001() throws Exception { URI test_uri = SAMPLE_HOST; long offset = 5000L; // 3442L; //5000L; WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri); - BitarchiveRecord warcRecord = warcRecordClient.getWarc(SAMPLE_HOST, offset); - Boolean fail = false; - - try { - BitarchiveRecord bitarchiveRecord = warcRecordClient.get("10-4-20161218234343407-00000-kb-test-har-003.kb.dk.warc.gz", offset); - bitarchiveRecord.getData(System.out); - } catch (NullPointerException e) { - System.out.println("Nullpointer Exception caused by offset errror"); - fail = true; - } - assertTrue("Exception", fail); + BitarchiveRecord warcRecord = warcRecordClient.getBitarchiveRecord("10-4-20161218234343407-00000-kb-test-har-003.kb.dk.warc", offset); + assertNull(warcRecord); } @Test @@ -211,21 +190,9 @@ public void testFailInBuildingBitarchiveRecord3() throws IOException, URISyntaxE URI SAMPLE_HOST = new URI("http://localhost:8883/cgi-bin2/py1.cgi/" + filename); URI test_uri = SAMPLE_HOST; long offset = 4000L; - File inputFile = new File("src/test/java/data.txt"); - System.out.println(inputFile.getAbsolutePath()); - FileInputStream fileInputStream = new FileInputStream(inputFile); - Boolean fail = false; - try { - WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri); - BitarchiveRecord warcRecord = warcRecordClient.getWarc(SAMPLE_HOST, offset); - - warcRecord.getData(System.out); - } - catch (Exception e) { - System.out.println("Expect NullPointerException: " + e.getMessage()); - fail = true; //Boolean.parseBoolean("Expect IOException" + e.getMessage()); - } - assertTrue("Exception: ", fail); + WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri); + BitarchiveRecord warcRecord = warcRecordClient.getBitarchiveRecord(filename, offset); + assertNull(warcRecord); } @Test @@ -234,20 +201,9 @@ public void testFailInBuildingBitarchiveRecord4() throws IOException, URISyntaxE URI SAMPLE_HOST = new URI("http://localhost:8883/cgi-bin2/py1.cgi/" + filename); URI test_uri = SAMPLE_HOST; long offset = 2L; - File inputFile = new File("src/test/java/data.txt"); - System.out.println(inputFile.getAbsolutePath()); - FileInputStream fileInputStream = new FileInputStream(inputFile); - Boolean fail = false; - try { - WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri); - BitarchiveRecord warcRecord = warcRecordClient.getWarc(SAMPLE_HOST, offset); - - warcRecord.getData(System.out); - } catch (Exception e) { - System.out.println("Exception: " + e.getMessage()); - fail = true; //Boolean.parseBoolean("Expect IOException" + e.getMessage()); - } - assertTrue("Exception: ", fail); + WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri); + BitarchiveRecord warcRecord = warcRecordClient.getBitarchiveRecord(filename, offset); + assertNull(warcRecord); } // Test for file not found @@ -259,17 +215,8 @@ public void testBuildingBitarchiveRecord4() throws Exception { URI test_uri = SAMPLE_HOST; long offset = 3442L; WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri); - BitarchiveRecord warcRecord = warcRecordClient.getWarc(SAMPLE_HOST, offset); - Boolean fail = false; - - try { - BitarchiveRecord bitarchiveRecord = warcRecordClient.get(filename, offset); - bitarchiveRecord.getData(System.out); - } catch (NullPointerException e) { - System.out.println("Nullpointer Exception caused by File Not Found Error: " + e.getMessage()); - fail = true; - } - assertTrue("Exception", fail); + BitarchiveRecord warcRecord = warcRecordClient.getBitarchiveRecord(filename, offset); + assertNull(warcRecord); } // Test using the wrong reader for .arc files @@ -295,4 +242,16 @@ public void testFailInBuildingBitarchiveRecord1() throws IOException { assertTrue("Exception: ", fail); } + @Test + public void testPosBuildingBitarchiveRecord13() throws Exception { + String filename = "42-23-20060726143926-00000-udvikling.kb.dk.arc.gz"; + URI uri = new URI("http://localhost:8883/cgi-bin2/py1.cgi/" + filename); + long offset = 789L; + WarcRecordClient warcRecordClient = new WarcRecordClient(uri); + BitarchiveRecord bitarchiveRecord = warcRecordClient.getBitarchiveRecord(filename, offset); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + bitarchiveRecord.getData(baos); + assertTrue("Should have non-zero length output", baos.size() > 10); + } + } \ No newline at end of file