Skip to content

Commit

Permalink
Tidied up logic in client and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
csrster committed Nov 6, 2020
1 parent 75a6326 commit 6996761
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 113 deletions.
@@ -1,38 +1,28 @@
package dk.netarkivet.common.utils.warc;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Paths;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.distribute.arcrepository.BitarchiveRecord;
import dk.netarkivet.common.distribute.arcrepository.bitrepository.Bitrepository;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.Settings;

import org.apache.commons.httpclient.methods.ExpectContinueMethod;
import org.apache.http.*;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.warc.WARCReaderFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import dk.netarkivet.common.utils.FileUtils;

public class WarcRecordClient {
/**
Expand All @@ -56,7 +46,6 @@ public URI getBaseUri() {
private static final String USER_AGENT = "Mozilla/5.0";
private static int timeout = MILLISECONDS_PER_SECOND;
private long offset;
boolean atFirst = true;

public long getOffset() {
return offset;
Expand Down Expand Up @@ -89,7 +78,7 @@ public WarcRecordClient(URI baseUri) {
* @throws IOException if reading file fails
* @throws UnsupportedOperationException if the method is not implemented
*/
public BitarchiveRecord getWarc(URI uri, long offset) throws Exception {
private BitarchiveRecord fetchBitarchiveRecord(URI uri, long offset) throws Exception {
RequestConfig.Builder requestBuilder = RequestConfig.custom();
requestBuilder.setConnectTimeout(timeout);
requestBuilder.setConnectionRequestTimeout(timeout);
Expand All @@ -101,42 +90,43 @@ public BitarchiveRecord getWarc(URI uri, long offset) throws Exception {
.addHeader("Range", "bytes=" + offset + "-")
.build();
log.debug("Executing request " + request.getRequestLine());
try {
CloseableHttpClient closableHttpClient = HttpClients.custom().setConnectionManager(cm).build();
try (CloseableHttpResponse httpResponse = closableHttpClient.execute(request)) {
log.debug("httpResponse status: " + httpResponse.getStatusLine().toString());
if (httpResponse.getStatusLine().getStatusCode() != 200) {
log.error("Http request error " + httpResponse.getStatusLine().getStatusCode());
return null;
}
HttpEntity entity = httpResponse.getEntity();
if (entity != null) {
InputStream iStr = entity.getContent();
ArchiveReader archiveReader = WARCReaderFactory.get("fake.warc", iStr, atFirst);
ArchiveRecord archiveRecord = archiveReader.get();
BitarchiveRecord reply = new BitarchiveRecord(archiveRecord, fileName);
log.debug("reply: " + reply.toString());
return reply;
} else {
log.warn("Received null response entity for request for {}, {}", uri, offset);
return null;
}
boolean atFirst = (offset == 0L);
CloseableHttpClient closableHttpClient = HttpClients.custom().setConnectionManager(cm).build();
try (CloseableHttpResponse httpResponse = closableHttpClient.execute(request)) {
log.debug("httpResponse status: " + httpResponse.getStatusLine().toString());
if (httpResponse.getStatusLine().getStatusCode() != 200) {
log.error("Http request error " + httpResponse.getStatusLine().getStatusCode());
return null;
}
HttpEntity entity = httpResponse.getEntity();
if (entity != null) {
InputStream iStr = entity.getContent();
//Note that the data returned by WarcRecordService has already been decompressed, so to get the
//right arc/warc parser from the ArchiveReaderFactory we have to give it the name of the
//uncompressed file.
final String inflatedName = fileName.replace(".gz", "");
ArchiveReader archiveReader = ArchiveReaderFactory.get(inflatedName, iStr, atFirst);
ArchiveRecord archiveRecord = archiveReader.get();
BitarchiveRecord reply = new BitarchiveRecord(archiveRecord, fileName);
log.debug("reply: " + reply.toString());
return reply;
} else {
log.warn("Received null response entity for request for {}, {}", uri, offset);
return null;
}
} catch (Exception e) {
throw e;
}
}

/**
* Retrieves a single BitarchiveRecord from the repository from a given file and offset. If the operation fails for
* any reason, this method returns null.
*
* @param arcfileName Name of the arcfile to retrieve.
* @param arcfileName Name of the arcfile/warcfile to retrieve.
* @param index offset to fetch specific record from warc or arc file
*/
public BitarchiveRecord get(String arcfileName, long index) {
public BitarchiveRecord getBitarchiveRecord(String arcfileName, long index) {

BitarchiveRecord warcInstance = null;
BitarchiveRecord bitarchiveRecord = null;
try {
ArgumentNotValid.checkNotNullOrEmpty(arcfileName, "arcfile");
ArgumentNotValid.checkNotNegative(index, "index");
Expand All @@ -146,11 +136,11 @@ public BitarchiveRecord get(String arcfileName, long index) {
String strUri = this.getBaseUri().toString() + "/" + arcfileName;

URI uri = new URI(strUri);
warcInstance = this.getWarc(uri, index);
bitarchiveRecord = this.fetchBitarchiveRecord(uri, index);
} catch (Exception e) {
log.error("Failed to retrieve record at offset {} from file {}.", index, arcfileName, e);
}
return warcInstance;
return bitarchiveRecord;
}


Expand Down
Expand Up @@ -13,7 +13,6 @@
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.sql.SQLOutput;

import static org.junit.Assert.*;

Expand Down Expand Up @@ -68,7 +67,7 @@ public void get() throws IOException, URISyntaxException {

//setting to NetarchiveSuite.
WarcRecordClient warcRecordClient = new WarcRecordClient(baseUri);
BitarchiveRecord bitarchiveRecord = warcRecordClient.get("10-4-20161218234343407-00000-kb-test-har-003.kb.dk.warc.gz", 3442L);
BitarchiveRecord bitarchiveRecord = warcRecordClient.getBitarchiveRecord("10-4-20161218234343407-00000-kb-test-har-003.kb.dk.warc.gz", 3442L);
assertNotNull("Should have non null BitarchiveRecord", bitarchiveRecord);
assertTrue("Expect a non-zero length bitarchiveRecord", IOUtils.toByteArray(bitarchiveRecord.getData()).length > 100);
System.out.println("\n\n" + IOUtils.toString(bitarchiveRecord.getData()));
Expand Down
Expand Up @@ -2,21 +2,16 @@

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileDescriptor;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.commons.io.IOUtils;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCReader;
import org.archive.io.warc.WARCReaderFactory;
import org.archive.io.warc.WARCRecord;
import org.junit.Test;

import dk.netarkivet.common.distribute.arcrepository.BitarchiveRecord;
Expand All @@ -36,7 +31,7 @@ public class WarcRecordClientTester {
public void testGet() throws Exception {
final URI baseUri = new URI("http://localhost:8883/cgi-bin2/py1.cgi");
WarcRecordClient warcRecordClient = new WarcRecordClient(baseUri);
BitarchiveRecord bitarchiveRecord = warcRecordClient.get("10-4-20161218234343407-00000-kb-test-har-003.kb.dk.warc.gz", 3442L);
BitarchiveRecord bitarchiveRecord = warcRecordClient.getBitarchiveRecord("10-4-20161218234343407-00000-kb-test-har-003.kb.dk.warc.gz", 3442L);
assertNotNull("Should have non null BitarchiveRecord", bitarchiveRecord);
assertTrue("Expect a non-zero length bitarchiveRecord",IOUtils.toByteArray(bitarchiveRecord.getData()).length > 100);

Expand Down Expand Up @@ -95,17 +90,10 @@ public void testBuildingBitarchiveRecord5() throws Exception {
URI test_uri = SAMPLE_HOST;
long offset = 3442L;
WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri);
BitarchiveRecord warcRecord = warcRecordClient.getWarc(SAMPLE_HOST, offset);
Boolean fail = false;

try {
BitarchiveRecord bitarchiveRecord = warcRecordClient.get(filename, offset);
bitarchiveRecord.getData(System.out);
} catch (NullPointerException e) {
System.out.println("Nullpointer Exception caused by offset errror");
fail = true;
}
assertFalse("Exception", fail);
BitarchiveRecord warcRecord = warcRecordClient.getBitarchiveRecord(filename, offset);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
warcRecord.getData(baos);
assertTrue("Should have significant data", baos.size() > 10);
}

@Test
Expand All @@ -115,7 +103,7 @@ public void testMultipleCalls() throws Exception {
long offset = 3442L;
WarcRecordClient warcRecordClient = new WarcRecordClient(HOST);
for (int i = 0; i < 40; i++) {
BitarchiveRecord bitarchiveRecord = warcRecordClient.get(filename, offset);
BitarchiveRecord bitarchiveRecord = warcRecordClient.getBitarchiveRecord(filename, offset);
assertNotNull("Expect non null record", bitarchiveRecord);
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
bitarchiveRecord.getData(byteArrayOutputStream);
Expand All @@ -131,11 +119,11 @@ public void testPosBuildingBitarchiveRecord6() throws Exception {
URI test_uri = SAMPLE_HOST;
long offset = 3442L;
WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri);
BitarchiveRecord warcRecord = warcRecordClient.getWarc(SAMPLE_HOST, offset);
BitarchiveRecord warcRecord = warcRecordClient.getBitarchiveRecord(filename, offset);
Boolean fail = false;

try {
BitarchiveRecord bitarchiveRecord = warcRecordClient.get(filename, offset);
BitarchiveRecord bitarchiveRecord = warcRecordClient.getBitarchiveRecord(filename, offset);
bitarchiveRecord.getData(System.out);
} catch (NullPointerException e) {
System.out.println("Nullpointer Exception caused by offset errror");
Expand Down Expand Up @@ -173,17 +161,8 @@ public void testBuildingBitarchiveRecord001() throws Exception {
URI test_uri = SAMPLE_HOST;
long offset = 5000L; // 3442L; //5000L;
WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri);
BitarchiveRecord warcRecord = warcRecordClient.getWarc(SAMPLE_HOST, offset);
Boolean fail = false;

try {
BitarchiveRecord bitarchiveRecord = warcRecordClient.get("10-4-20161218234343407-00000-kb-test-har-003.kb.dk.warc.gz", offset);
bitarchiveRecord.getData(System.out);
} catch (NullPointerException e) {
System.out.println("Nullpointer Exception caused by offset errror");
fail = true;
}
assertTrue("Exception", fail);
BitarchiveRecord warcRecord = warcRecordClient.getBitarchiveRecord("10-4-20161218234343407-00000-kb-test-har-003.kb.dk.warc", offset);
assertNull(warcRecord);
}

@Test
Expand Down Expand Up @@ -211,21 +190,9 @@ public void testFailInBuildingBitarchiveRecord3() throws IOException, URISyntaxE
URI SAMPLE_HOST = new URI("http://localhost:8883/cgi-bin2/py1.cgi/" + filename);
URI test_uri = SAMPLE_HOST;
long offset = 4000L;
File inputFile = new File("src/test/java/data.txt");
System.out.println(inputFile.getAbsolutePath());
FileInputStream fileInputStream = new FileInputStream(inputFile);
Boolean fail = false;
try {
WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri);
BitarchiveRecord warcRecord = warcRecordClient.getWarc(SAMPLE_HOST, offset);

warcRecord.getData(System.out);
}
catch (Exception e) {
System.out.println("Expect NullPointerException: " + e.getMessage());
fail = true; //Boolean.parseBoolean("Expect IOException" + e.getMessage());
}
assertTrue("Exception: ", fail);
WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri);
BitarchiveRecord warcRecord = warcRecordClient.getBitarchiveRecord(filename, offset);
assertNull(warcRecord);
}

@Test
Expand All @@ -234,20 +201,9 @@ public void testFailInBuildingBitarchiveRecord4() throws IOException, URISyntaxE
URI SAMPLE_HOST = new URI("http://localhost:8883/cgi-bin2/py1.cgi/" + filename);
URI test_uri = SAMPLE_HOST;
long offset = 2L;
File inputFile = new File("src/test/java/data.txt");
System.out.println(inputFile.getAbsolutePath());
FileInputStream fileInputStream = new FileInputStream(inputFile);
Boolean fail = false;
try {
WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri);
BitarchiveRecord warcRecord = warcRecordClient.getWarc(SAMPLE_HOST, offset);

warcRecord.getData(System.out);
} catch (Exception e) {
System.out.println("Exception: " + e.getMessage());
fail = true; //Boolean.parseBoolean("Expect IOException" + e.getMessage());
}
assertTrue("Exception: ", fail);
WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri);
BitarchiveRecord warcRecord = warcRecordClient.getBitarchiveRecord(filename, offset);
assertNull(warcRecord);
}

// Test for file not found
Expand All @@ -259,17 +215,8 @@ public void testBuildingBitarchiveRecord4() throws Exception {
URI test_uri = SAMPLE_HOST;
long offset = 3442L;
WarcRecordClient warcRecordClient = new WarcRecordClient(test_uri);
BitarchiveRecord warcRecord = warcRecordClient.getWarc(SAMPLE_HOST, offset);
Boolean fail = false;

try {
BitarchiveRecord bitarchiveRecord = warcRecordClient.get(filename, offset);
bitarchiveRecord.getData(System.out);
} catch (NullPointerException e) {
System.out.println("Nullpointer Exception caused by File Not Found Error: " + e.getMessage());
fail = true;
}
assertTrue("Exception", fail);
BitarchiveRecord warcRecord = warcRecordClient.getBitarchiveRecord(filename, offset);
assertNull(warcRecord);
}

// Test using the wrong reader for .arc files
Expand All @@ -295,4 +242,16 @@ public void testFailInBuildingBitarchiveRecord1() throws IOException {
assertTrue("Exception: ", fail);
}

// Test fetching a record from a gzip-compressed .arc file (rather than a .warc file) and
// check that the returned record carries data.
@Test
public void testPosBuildingBitarchiveRecord13() throws Exception {
    String filename = "42-23-20060726143926-00000-udvikling.kb.dk.arc.gz";
    // NOTE(review): this base URI already ends with the file name, and getBitarchiveRecord
    // appends "/" + filename to the base URI again -- confirm the service tolerates this.
    URI uri = new URI("http://localhost:8883/cgi-bin2/py1.cgi/" + filename);
    long offset = 789L;
    WarcRecordClient warcRecordClient = new WarcRecordClient(uri);
    BitarchiveRecord bitarchiveRecord = warcRecordClient.getBitarchiveRecord(filename, offset);
    // getBitarchiveRecord returns null on any failure; fail with a clear message here
    // instead of a NullPointerException on the getData call below.
    assertNotNull("Expect non null record", bitarchiveRecord);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    bitarchiveRecord.getData(baos);
    assertTrue("Should have non-zero length output", baos.size() > 10);
}

}

0 comments on commit 6996761

Please sign in to comment.