Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Opens gzip compressed content #513

Merged
merged 8 commits into from
Jan 2, 2024
147 changes: 114 additions & 33 deletions metafacture-io/src/main/java/org/metafacture/io/HttpOpener.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2013, 2022 Deutsche Nationalbibliothek et al
* Copyright 2013, 2023 Deutsche Nationalbibliothek et al
*
* Licensed under the Apache License, Version 2.0 the "License";
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,19 +32,22 @@
import java.io.SequenceInputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

/**
* Opens an {@link HttpURLConnection} and passes a reader to the receiver.
*
* @author Christoph Böhme
* @author Jan Schnasse
* @author Jens Wille
* @author Pascal Christoph (dr0i)
*/
@Description("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset` and `Content-Type`, as well as generic headers (separated by `\\n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header = `*/*`, `Accept-Charset` header (`encoding`) = `UTF-8`, `errorPrefix` = `ERROR: `.")
@Description("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset`, `Accept-Encoding` and `Content-Type`, as well as generic headers (separated by `\\n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header = `*/*`, `Accept-Charset` header (`encoding`) = `UTF-8`, `errorPrefix` = `ERROR: `.")
dr0i marked this conversation as resolved.
Show resolved Hide resolved
@In(String.class)
@Out(Reader.class)
@FluxCommand("open-http")
Expand All @@ -53,22 +56,21 @@ public final class HttpOpener extends DefaultObjectPipe<String, ObjectReceiver<R
public static final String ACCEPT_DEFAULT = "*/*";
public static final String ACCEPT_HEADER = "accept";
public static final String CONTENT_TYPE_HEADER = "content-type";
public static final String ACCEPT_ENCODING_HEADER = "accept-encoding";
public static final String ENCODING_HEADER = "content-encoding";
dr0i marked this conversation as resolved.
Show resolved Hide resolved
public static final String DEFAULT_PREFIX = "ERROR: ";
public static final String ENCODING_DEFAULT = "UTF-8";
public static final String ENCODING_HEADER = "accept-charset";
public static final String CHARSET_DEFAULT = "UTF-8";
public static final String ACCEPT_CHARSET_HEADER = "accept-charset";
public static final String INPUT_DESIGNATOR = "@-";

dr0i marked this conversation as resolved.
Show resolved Hide resolved
public static final String DEFAULT_METHOD_NAME = "GET";
public static final Method DEFAULT_METHOD = Method.valueOf(DEFAULT_METHOD_NAME);

public static final String HEADER_FIELD_SEPARATOR = "\n";
public static final String HEADER_VALUE_SEPARATOR = ":";

private static final Pattern HEADER_FIELD_SEPARATOR_PATTERN = Pattern.compile(HEADER_FIELD_SEPARATOR);
private static final Pattern HEADER_VALUE_SEPARATOR_PATTERN = Pattern.compile(HEADER_VALUE_SEPARATOR);

private static final int ALLOWED_REDIRECTIONS = 3;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this value be configurable?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would say: let's wait and implement if need arises. Would you be ok with this?

private static final int CONNECTION_TIMEOUT = 11000;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How did you arrive at this value? 11 seconds seems kind of arbitrary.

Should this value be configurable?

Copy link
Member Author

@dr0i dr0i Jan 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's kind of arbitrary. Setting these values at least easily prevents possible infinite loops.
Regarding configurability: I would say let's wait and implement it if the need arises. Would you be OK with this?

private final Map<String, String> headers = new HashMap<>();

private Method method;
private String body;
private String errorPrefix;
Expand Down Expand Up @@ -118,7 +120,7 @@ public boolean getResponseHasBody() {
*/
public HttpOpener() {
setAccept(ACCEPT_DEFAULT);
setEncoding(ENCODING_DEFAULT);
setAcceptCharset(CHARSET_DEFAULT);
setErrorPrefix(DEFAULT_PREFIX);
setMethod(DEFAULT_METHOD);
setUrl(INPUT_DESIGNATOR);
Expand Down Expand Up @@ -163,17 +165,50 @@ public void setContentType(final String contentType) {
setHeader(CONTENT_TYPE_HEADER, contentType);
}

/**
* Sets the HTTP {@value ACCEPT_CHARSET_HEADER} header value. This is the
* preferred charset for the HTTP response. This class also reads the value
* back from its header map to pick the charset when decoding a response body.
* The default charset is {@value CHARSET_DEFAULT}.
*
* @param charset name of the charset used for the accept-charset HTTP header
*/
public void setAcceptCharset(final String charset) {
setHeader(ACCEPT_CHARSET_HEADER, charset);
}

/**
* Sets the HTTP {@value ACCEPT_CHARSET_HEADER} header value by delegating
* to {@link #setAcceptCharset}. Kept only for backward compatibility.
*
* @deprecated Use {@link #setAcceptCharset} instead.
* @param charset name of the charset used for the accept-charset HTTP header
*/
@Deprecated
public void setEncoding(final String charset) {
setAcceptCharset(charset);
}

/**
* Sets the HTTP {@value ACCEPT_ENCODING_HEADER} header value. This is the
* preferred content encoding for the HTTP response. It accepts HTTP compression.
* Allowed values are i.a. "gzip" and "br" (Brotli); note that this class only decompresses "gzip" responses.
* The default for the content encoding is null, which means "no compression".
*
* @param contentEncoding name of content encoding used for the accept-encoding HTTP
* header
*/
public void setAcceptContentEncoding(final String contentEncoding) {
dr0i marked this conversation as resolved.
Show resolved Hide resolved
setHeader(ACCEPT_ENCODING_HEADER, contentEncoding);
}

/**
* Sets the HTTP {@value ENCODING_HEADER} header value. This is the
* preferred encoding for the HTTP response. Additionally, the encoding
* is used for reading the HTTP response if it does not specify a content
* encoding. The default for the encoding is {@value ENCODING_DEFAULT}.
* content encoding for the HTTP GET. It enables HTTP compression.
dr0i marked this conversation as resolved.
Show resolved Hide resolved
* Allowed values are "gzip".
* The default for the content encoding is null, which means "no compression".
*
* @param encoding name of the encoding used for the accept-charset HTTP
* @param contentEncoding name of content encoding used for the content-encoding HTTP
* header
*/
public void setEncoding(final String encoding) {
setHeader(ENCODING_HEADER, encoding);
public void setContentEncoding(final String contentEncoding) {
setHeader(ENCODING_HEADER, contentEncoding);
}

/**
Expand Down Expand Up @@ -244,23 +279,15 @@ public void process(final String input) {
try {
final String requestUrl = getInput(input, url);
final String requestBody = getInput(input,
body == null && method.getRequestHasBody() ? INPUT_DESIGNATOR : body);

final HttpURLConnection connection =
(HttpURLConnection) new URL(requestUrl).openConnection();

connection.setRequestMethod(method.name());
headers.forEach(connection::addRequestProperty);

body == null && method.getRequestHasBody() ? INPUT_DESIGNATOR : body);
Reader reader = null;
dr0i marked this conversation as resolved.
Show resolved Hide resolved
if (requestBody != null) {
connection.setDoOutput(true);
connection.getOutputStream().write(requestBody.getBytes());
reader = doPostOrPut(requestBody, new URL(requestUrl));
}

final InputStream inputStream = getInputStream(connection);
final String contentEncoding = getEncoding(connection.getContentEncoding());

getReceiver().process(new InputStreamReader(inputStream, contentEncoding));
else {
reader = doGet(requestUrl);
}
getReceiver().process(reader);
}
catch (final IOException e) {
throw new MetafactureException(e);
Expand All @@ -270,6 +297,32 @@ public void process(final String input) {
}
}

private Reader doPostOrPut(final String requestBody, final URL urlToOpen) throws IOException {
final HttpURLConnection connection = (HttpURLConnection) urlToOpen.openConnection();
connection.setDoOutput(true);
connection.setRequestMethod(method.name());
headers.forEach(connection::setRequestProperty);
dr0i marked this conversation as resolved.
Show resolved Hide resolved
connection.getOutputStream().write(requestBody.getBytes());
final InputStream inputStream = getInputStream(connection);
return new InputStreamReader(inputStream, headers.get(ACCEPT_CHARSET_HEADER));
dr0i marked this conversation as resolved.
Show resolved Hide resolved
}

private Reader doGet(final String requestUrl) throws IOException {
final Reader reader;
final HttpURLConnection connection;
connection = followRedirects(new URL(requestUrl));
final InputStream inputStream = getInputStream(connection);

if ("gzip".equalsIgnoreCase(connection.getContentEncoding())) {
final GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream);
reader = new InputStreamReader(gzipInputStream);
dr0i marked this conversation as resolved.
Show resolved Hide resolved
}
else {
reader = new InputStreamReader(inputStream, headers.get(ACCEPT_CHARSET_HEADER));
dr0i marked this conversation as resolved.
Show resolved Hide resolved
}
return reader;
}

private String getInput(final String input, final String value) {
final String result;

Expand Down Expand Up @@ -312,8 +365,36 @@ private InputStream getErrorStream(final InputStream errorStream) {
}
}

private String getEncoding(final String contentEncoding) {
return contentEncoding != null ? contentEncoding : headers.get(ENCODING_HEADER);
private HttpURLConnection followRedirects(final URL startingUrl) throws IOException {
int times = 0;
HttpURLConnection conn;
URL urlToFollow = startingUrl;
while (true) {
times = times + 1;

if (times > ALLOWED_REDIRECTIONS) {
throw new IOException("Stuck in redirect loop");
dr0i marked this conversation as resolved.
Show resolved Hide resolved
}

conn = (HttpURLConnection) urlToFollow.openConnection();
headers.forEach(conn::setRequestProperty);
dr0i marked this conversation as resolved.
Show resolved Hide resolved
conn.setRequestMethod(method.name());
conn.setConnectTimeout(CONNECTION_TIMEOUT);
dr0i marked this conversation as resolved.
Show resolved Hide resolved
conn.setInstanceFollowRedirects(false); // Make the logic below easier to detect redirections
dr0i marked this conversation as resolved.
Show resolved Hide resolved

switch (conn.getResponseCode()) {
case HttpURLConnection.HTTP_MOVED_PERM:
case HttpURLConnection.HTTP_MOVED_TEMP:
String location = conn.getHeaderField("Location");
location = URLDecoder.decode(location, "UTF-8");
urlToFollow = new URL(urlToFollow, location); // Deal with relative URLs
continue;
default:
break;
}
break;
}
return conn;
}

}
61 changes: 32 additions & 29 deletions metafacture-io/src/test/java/org/metafacture/io/HttpOpenerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,12 @@

package org.metafacture.io;

import org.metafacture.commons.ResourceUtil;
import org.metafacture.framework.ObjectReceiver;
dr0i marked this conversation as resolved.
Show resolved Hide resolved

import com.github.tomakehurst.wiremock.client.MappingBuilder;
import com.github.tomakehurst.wiremock.client.ResponseDefinitionBuilder;
import com.github.tomakehurst.wiremock.client.WireMock;
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
import com.github.tomakehurst.wiremock.http.HttpHeader;
import com.github.tomakehurst.wiremock.http.HttpHeaders;
import com.github.tomakehurst.wiremock.http.RequestMethod;
import com.github.tomakehurst.wiremock.junit.WireMockRule;
import com.github.tomakehurst.wiremock.matching.RequestPatternBuilder;
Expand All @@ -32,20 +31,22 @@
import org.junit.ComparisonFailure;
import org.junit.Rule;
import org.junit.Test;
import org.metafacture.commons.ResourceUtil;
import org.metafacture.framework.ObjectReceiver;
import org.mockito.ArgumentCaptor;
import org.mockito.Captor;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.MockitoJUnit;
import org.mockito.junit.MockitoRule;

import static org.mockito.Mockito.times;

import java.io.IOException;
import java.io.Reader;
import java.io.*;
dr0i marked this conversation as resolved.
Show resolved Hide resolved
import java.util.Arrays;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.zip.GZIPOutputStream;

import static org.mockito.Mockito.times;

/**
* Tests for class {@link HttpOpener}.
Expand All @@ -62,6 +63,18 @@ public final class HttpOpenerTest {

private static final String REQUEST_BODY = "request body";
private static final String RESPONSE_BODY = "response bödy"; // UTF-8
private static byte[] GZIPPED_RESPONSE_BODY;
static {
try {
ByteArrayOutputStream out = new ByteArrayOutputStream();
GZIPOutputStream gzip = new GZIPOutputStream(out);
gzip.write(RESPONSE_BODY.getBytes("UTF-8"));
gzip.close();
GZIPPED_RESPONSE_BODY = out.toByteArray();
}catch (Exception e){
dr0i marked this conversation as resolved.
Show resolved Hide resolved
e.printStackTrace();
}
}

@Rule
public MockitoRule mockitoRule = MockitoJUnit.rule();
Expand Down Expand Up @@ -226,40 +239,23 @@ public void shouldPerformPostRequestWithContentTypeParameter() throws IOExceptio
}

@Test
public void shouldPerformPostRequestWithEncodingParameter() throws IOException {
final String encoding = "ISO-8859-1";
public void shouldPerformPostRequestWithCharsetParameter() throws IOException {
final String charset = "ISO-8859-1";
final String header = "Accept-Charset";
final StringValuePattern value = WireMock.equalTo(encoding);
final StringValuePattern value = WireMock.equalTo(charset);

try {
shouldPerformRequest(REQUEST_BODY, HttpOpener.Method.POST, (o, u) -> {
o.setMethod(HttpOpener.Method.POST);
o.setUrl(u);
o.setEncoding(encoding);
o.setAcceptCharset(charset);
}, s -> s.withHeader(header, value), q -> q.withHeader(header, value), null);
}
catch (final ComparisonFailure e) {
Assert.assertEquals("expected:<response b[ö]dy> but was:<response b[ö]dy>", e.getMessage());
}
}

@Test
public void shouldPerformPostRequestWithEncodingParameterAndContentEncodingResponseHeader() throws IOException {
dr0i marked this conversation as resolved.
Show resolved Hide resolved
final String encoding = "ISO-8859-1";
final String header = "Accept-Charset";
final StringValuePattern value = WireMock.equalTo(encoding);

shouldPerformRequest(REQUEST_BODY, HttpOpener.Method.POST, (o, u) -> {
o.setMethod(HttpOpener.Method.POST);
o.setUrl(u);
o.setEncoding(encoding);
},
s -> s.withHeader(header, value),
q -> q.withHeader(header, value),
r -> r.withHeader("Content-Encoding", "UTF-8")
);
}

@Test
public void shouldPerformGetRequestWithErrorResponse() throws IOException {
shouldPerformRequest(TEST_URL, HttpOpener.Method.GET, (o, u) -> {},
Expand All @@ -278,6 +274,14 @@ public void shouldPerformGetRequestWithErrorResponseAndWithoutErrorPrefixParamet
null, null, WireMock.badRequest().withBody(RESPONSE_BODY), RESPONSE_BODY);
}

@Test
public void shouldPerformGetRequestWithGzipedContentEncoding() throws IOException {
dr0i marked this conversation as resolved.
Show resolved Hide resolved
shouldPerformRequest(TEST_URL, HttpOpener.Method.GET, (o, u) -> o.setAcceptContentEncoding("gzip"),
null, null,
WireMock.ok().withBody(GZIPPED_RESPONSE_BODY).withHeaders(new HttpHeaders(new HttpHeader(HttpOpener.ENCODING_HEADER,"gzip"))),
RESPONSE_BODY);
}

private void shouldPerformRequest(final String input, final HttpOpener.Method method, final BiConsumer<HttpOpener, String> consumer, final String... headers) throws IOException {
shouldPerformRequest(input, method, consumer,
s -> Arrays.stream(headers).forEach(h -> s.withHeader(h, TEST_VALUE)),
Expand All @@ -289,7 +293,6 @@ private void shouldPerformRequest(final String input, final HttpOpener.Method me
if (responseConsumer != null) {
responseConsumer.accept(response);
}

shouldPerformRequest(input, method,
consumer, stubConsumer, requestConsumer,
response, method.getResponseHasBody() ? RESPONSE_BODY : "");
Expand Down