add final logic; will result in rename

Signed-off-by: Nicholas Walter Knize <nknize@apache.org>
nknize · Jul 27, 2023 · 0e159f9 · 0e159f9
1 parent 05bbb5c
commit 0e159f9
Showing 1 changed file with 113 additions and 0 deletions.
diff --git a/libs/core/src/main/java/org/opensearch/core/xcontent/MediaTypeRegistry.java b/libs/core/src/main/java/org/opensearch/core/xcontent/MediaTypeRegistry.java
@@ -32,10 +32,14 @@
 
 package org.opensearch.core.xcontent;
 
+import org.opensearch.core.common.bytes.BytesArray;
+import org.opensearch.core.common.bytes.BytesReference;
 import org.opensearch.core.xcontent.spi.MediaTypeProvider;
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.OutputStream;
+import java.io.UncheckedIOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -218,6 +222,115 @@ public static MediaType xContentType(CharSequence content) {
         return null;
     }
 
+    /**
+     * Guesses the content type of the provided input stream by inspecting its leading bytes, then resets the stream to the
+     * position it had on entry so that, from the caller's point of view, nothing has been consumed.
+     * <p>
+     * Leading whitespace is skipped. The first non-whitespace byte, together with as many of the following bytes as fit into a
+     * buffer of {@code GUESS_HEADER_LENGTH} bytes, is handed to {@link #mediaTypeFromBytes(byte[], int, int)}.
+     *
+     * @param si the stream to inspect; it must support mark/reset
+     * @return whatever {@link #mediaTypeFromBytes(byte[], int, int)} returns for the inspected bytes, or {@code null} if the
+     *         stream ends before a non-whitespace byte is found
+     * @throws IllegalArgumentException if the stream does not support mark/reset
+     * @throws IOException if reading from, or resetting, the stream fails
+     *
+     * @deprecated the content type should not be guessed except for few cases where we effectively don't know the content type.
+     * The REST layer should move to reading the Content-Type header instead. There are other places where auto-detection may be needed.
+     * This method is deprecated to prevent usages of it from spreading further without specific reasons.
+     */
+    @Deprecated
+    public static MediaType xContentType(InputStream si) throws IOException {
+        // Without mark/reset there is no way to peek at the header and hand the stream back untouched.
+        if (si.markSupported() == false) {
+            throw new IllegalArgumentException("Cannot guess the xcontent type without mark/reset support on " + si.getClass());
+        }
+        // Remember the entry position; the finally block below rewinds to it on every exit path.
+        si.mark(Integer.MAX_VALUE);
+        try {
+            // Skip leading whitespace, stopping at the first significant byte or at end of stream.
+            int firstByte = si.read();
+            while (firstByte != -1 && Character.isWhitespace((char) firstByte)) {
+                firstByte = si.read();
+            }
+            if (firstByte == -1) {
+                // empty stream, or nothing but whitespace: there is nothing to guess from
+                return null;
+            }
+
+            // Collect the header: the significant byte just read, followed by whatever else fits in the buffer.
+            final byte[] header = new byte[GUESS_HEADER_LENGTH];
+            header[0] = (byte) firstByte;
+            int filled = 1;
+            int chunk;
+            // A single read may return fewer bytes than requested, so keep reading until the buffer is full or the stream ends.
+            while (filled < GUESS_HEADER_LENGTH && (chunk = si.read(header, filled, GUESS_HEADER_LENGTH - filled)) != -1) {
+                filled += chunk;
+            }
+            return mediaTypeFromBytes(header, 0, filled);
+        } finally {
+            si.reset();
+        }
+    }
+
+    /**
+     * Guesses the content type based on the provided bytes.
+     * <p>
+     * A {@link BytesArray} is inspected directly through its backing array, offset and length. Any other
+     * {@link BytesReference} is inspected through the stream returned by {@link BytesReference#streamInput()}, which is
+     * closed again before this method returns.
+     *
+     * @param bytes the bytes to inspect
+     * @return the guessed media type, or {@code null} if none could be determined
+     * @throws UncheckedIOException if reading from (or closing) the stream fails with an {@link IOException}
+     *
+     * @deprecated the content type should not be guessed except for few cases where we effectively don't know the content type.
+     * The REST layer should move to reading the Content-Type header instead. There are other places where auto-detection may be needed.
+     * This method is deprecated to prevent usages of it from spreading further without specific reasons.
+     */
+    @Deprecated
+    public static MediaType xContentType(BytesReference bytes) {
+        if (bytes instanceof BytesArray) {
+            // fast path: the backing array is directly accessible, no stream is needed
+            final BytesArray array = (BytesArray) bytes;
+            return mediaTypeFromBytes(array.array(), array.offset(), array.length());
+        }
+        // try-with-resources so the stream is closed on every exit path, including when detection throws
+        try (InputStream inputStream = bytes.streamInput()) {
+            assert inputStream.markSupported();
+            return xContentType(inputStream);
+        } catch (IOException e) {
+            assert false : "Should not happen, we're just reading bytes from memory";
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /**
+     * Guesses the content type based on the {@code length} bytes of {@code data} starting at {@code offset}.
+     * <p>
+     * Every registered media type is first asked whether it recognises the bytes. If none does, the bytes are treated as JSON
+     * when the first byte that is not whitespace (after an optional UTF-8 byte order mark) is <code>{</code>.
+     *
+     * @param data   the array holding the bytes to inspect
+     * @param offset index in {@code data} of the first byte to inspect
+     * @param length number of bytes to inspect
+     * @return the guessed media type, or {@code null} if the input is empty, if {@code offset}/{@code length} do not describe a
+     *         range inside {@code data}, or if no media type matched
+     *
+     * @deprecated the content type should not be guessed except for few cases where we effectively don't know the content type.
+     * The REST layer should move to reading the Content-Type header instead. There are other places where auto-detection may be needed.
+     * This method is deprecated to prevent usages of it from spreading further without specific reasons.
+     */
+    @Deprecated
+    public static MediaType mediaTypeFromBytes(final byte[] data, int offset, int length) {
+        final int totalLength = data.length;
+        if (totalLength == 0 || length == 0) {
+            return null;
+        }
+        // Reject ranges that fall outside the array. Written as a subtraction so that a huge offset/length pair cannot
+        // overflow int and slip through, and so that negative values are refused instead of indexing out of bounds below.
+        if (offset < 0 || length < 0 || length > totalLength - offset) {
+            return null;
+        }
+        for (var mediaType : formatToMediaType.values()) {
+            if (mediaType.detectedXContent(data, offset, length)) {
+                return mediaType;
+            }
+        }
+
+        // a last chance for JSON
+        int jsonStart = 0;
+        // JSON may be preceded by UTF-8 BOM; a range holding only the three BOM bytes has no room left for '{'
+        if (length > 3 && data[offset] == (byte) 0xEF && data[offset + 1] == (byte) 0xBB && data[offset + 2] == (byte) 0xBF) {
+            jsonStart = 3;
+        }
+
+        // accept any amount of whitespace before the opening brace, but give up at the first other byte
+        for (int i = jsonStart; i < length; i++) {
+            byte b = data[offset + i];
+            if (b == '{') {
+                return fromMediaType("application/json");
+            }
+            if (Character.isWhitespace(b) == false) {
+                break;
+            }
+        }
+
+        return null;
+    }
 
     /**
      * parsing media type that follows https://tools.ietf.org/html/rfc7231#section-3.1.1.1