Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle Character Encoding #238 #251

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/maven.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ jobs:
java-version: 17
# Because this is a multi-module project, the build takes a long time, so we enable caching.
# To know more: https://maven.apache.org/extensions/maven-build-cache-extension/cache.html
cache: 'maven'
#cache: 'maven'
- name: Build with Maven
#To see the full stack trace of the errors, re-run Maven with the -e switch.
#Re-run Maven using the -X switch to enable full debug logging.
# -B,--batch-mode Run in non-interactive (batch) mode (disables output color)
# To learn more about options: https://maven.apache.org/ref/3.6.3/maven-embedder/cli.html
run: |
mvn package -B -e -X
mvn clean test -B -e -X
mvn site -B -e -X --projects 'jsgenerator-core'
env:
MAVEN_SITE_GITHUB_OAUTH_TOKEN: ${{ secrets.MAVEN_SITE_GITHUB_OAUTH_TOKEN }}
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,60 @@
import com.osscameroon.jsgenerator.core.internal.ConverterDefault;
import lombok.NonNull;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import static java.nio.charset.StandardCharsets.UTF_8;

@FunctionalInterface
public interface Converter {
/**
 * Converts the content of {@code inputStream} into {@code outputStream} using a newly
 * constructed {@link Configuration}.
 *
 * @param inputStream  the stream to read the input from
 * @param outputStream the stream the result is written to
 * @throws IOException if the delegated conversion fails with an I/O error
 */
default void convert(@NonNull final InputStream inputStream, @NonNull final OutputStream outputStream) throws IOException {
    final var freshConfiguration = new Configuration();
    convert(inputStream, outputStream, freshConfiguration);
}

/**
 * Convenience overload for callers that work with plain strings and arrays instead of streams.
 * The input is encoded as UTF-8, run through the stream-based conversion, and the result is
 * returned as stripped, non-empty lines.
 *
 * @param input         The input HTML string
 * @param configuration The object related to variable declaration (let, const or var) and query selector
 * @return Lines of output JS code, each stripped of surrounding whitespace, with empty lines removed
 * @throws IOException if the stream-based conversion fails
 */
default String[] convert(@NonNull String input, Configuration configuration) throws IOException{
    final var sink = new ByteArrayOutputStream();
    final var source = new ByteArrayInputStream(input.getBytes(UTF_8));

    convert(source, sink, configuration);

    return outputAsStrippedLines(sink);
}

/**
 * Decodes the buffered output as UTF-8 and splits it into lines.
 *
 * @param outputStream the buffer holding the converted output
 * @return every line that is not blank, stripped of leading and trailing whitespace
 */
private String[] outputAsStrippedLines(ByteArrayOutputStream outputStream) {
    final var decoded = outputStream.toString(UTF_8);

    // Dropping blank lines first is equivalent to stripping and then dropping empty ones.
    return decoded.lines()
            .filter(line -> !line.isBlank())
            .map(String::strip)
            .toArray(String[]::new);
}

/**
 * Convenience overload for raw bytes: decodes {@code input} as UTF-8 and delegates to the
 * string-based overload.
 *
 * @param input         the input bytes, decoded as UTF-8
 * @param configuration the configuration passed on to the string-based overload
 * @return the lines returned by the string-based overload
 * @throws IOException if the delegated conversion fails
 */
default String[] convert(@NonNull byte[] input, Configuration configuration) throws IOException{
    final var decodedInput = new String(input, UTF_8);
    return convert(decodedInput, configuration);
}

/**
 * Runs the stream-based conversion on in-memory buffers and returns the output as a string.
 * An {@link IOException} raised by the conversion is rethrown wrapped in a
 * {@link RuntimeException}, so this overload declares no checked exception.
 *
 * @param configuration the configuration passed to the stream-based conversion
 * @param outputStream  the buffer that receives the converted output
 * @param inputStream   the buffer holding the input to convert
 * @return the content of {@code outputStream} decoded as UTF-8
 */
default String convert(Configuration configuration, ByteArrayOutputStream outputStream, ByteArrayInputStream inputStream) {
    try {
        convert(inputStream, outputStream, configuration);
        return outputStream.toString(UTF_8);
    } catch (IOException conversionFailure) {
        throw new RuntimeException(conversionFailure);
    }
}


/**
 * The single abstract method of this functional interface; every default {@code convert}
 * overload in this interface ends up delegating to it.
 *
 * @param inputStream   the stream the input is read from
 * @param outputStream  the stream the output is written to
 * @param configuration the configuration to apply to the conversion
 * @throws IOException declared so that implementations may report I/O failures
 */
void convert(@NonNull final InputStream inputStream, @NonNull final OutputStream outputStream, @NonNull Configuration configuration) throws IOException;

static Converter of() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,39 @@
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

import static java.lang.String.format;
import static java.lang.String.join;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.jsoup.parser.Parser.xmlParser;

//TODO: Think about how users will use this library; if needed, provide 4 explicit methods for these 4 cases:
// code html to code js
// code html to file js
// file html to code js
// file html to file js


public class ConverterDefault implements Converter {
private static final List<String> BOOLEAN_ATTRIBUTES = List.of("allowfullscreen", "async", "autofocus",
"autoplay", "checked", "controls", "default", "defer", "disabled", "formnovalidate", "ismap", "itemscope",
Expand Down Expand Up @@ -65,9 +84,12 @@ public void convert(InputStream inputStream, OutputStream outputStream, Configur
// NOTE: There is nothing to do
if (content.isBlank()) return;

//String normalisedContent = nfdNormalized(content);

final var variableNameStrategy = configuration.getVariableNameStrategy();
final var document = Jsoup.parse(content, xmlParser());
final var writer = new OutputStreamWriter(outputStream);
//final var writer = new OutputStreamWriter(outputStream, StandardCharsets.UTF_8);

final var selector = configuration.getTargetElementSelector();

Expand Down Expand Up @@ -316,4 +338,91 @@ private void visitScriptNode(Writer writer, Element element, String variable,
/**
 * Returns the lower-cased {@code name()} of the given variable declaration.
 *
 * @param variableDeclaration the declaration whose name is lower-cased
 * @return the declaration name in lower case
 */
private String resolveDeclarationKeyWord(VariableDeclaration variableDeclaration) {
    final var declarationName = variableDeclaration.name();
    return declarationName.toLowerCase();
}

/**
 * Returns {@code txt} in Unicode NFD form.
 *
 * @param txt the text to normalize
 * @return {@code txt} itself when it is already NFD-normalized, otherwise its NFD normalization
 */
private String nfdNormalized(String txt) {
    final var alreadyNormalized = Normalizer.isNormalized(txt, Normalizer.Form.NFD);
    return alreadyNormalized ? txt : Normalizer.normalize(txt, Normalizer.Form.NFD);
}

/**
 * Encodes {@code input} as UTF-8 bytes and reads the first line back through a UTF-8 reader.
 *
 * <p>The bytes are produced with an explicit UTF-8 charset so that encoding and decoding agree
 * regardless of the platform default charset, and the reader is closed once the line is read.
 *
 * @param input the text to round-trip through UTF-8
 * @return the first line of {@code input}, or {@code null} when {@code input} is empty
 * @throws IOException if reading from the in-memory stream fails
 */
private String decodeTextWithUTF8(String input) throws IOException {
    try (var reader = new BufferedReader(
            new InputStreamReader(
                    new ByteArrayInputStream(input.getBytes(UTF_8)),
                    UTF_8))) {
        // Only the first line is read, as before; any further lines are ignored.
        return reader.readLine();
    }
}

/**
 * Round-trips {@code text} through UTF-8: encodes it to bytes and decodes those bytes again
 * with a decoder configured to replace malformed or unmappable input.
 *
 * @param text the text to encode and decode
 * @return the text obtained after the UTF-8 round trip
 * @throws CharacterCodingException declared by the decoding call
 */
private String a (String text) throws CharacterCodingException {
    final CharsetDecoder replacingDecoder = UTF_8.newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);

    final ByteBuffer utf8Bytes = UTF_8.encode(text);

    // decode(ByteBuffer) resets the decoder itself before decoding.
    return replacingDecoder.decode(utf8Bytes).toString();
}

/**
 * Normalizes {@code text} to Unicode NFD form, then round-trips the result through UTF-8
 * bytes.
 *
 * @param text the text to normalize and round-trip
 * @return the NFD-normalized text after being encoded to and decoded from UTF-8
 * @throws CharacterCodingException declared in the signature; nothing in the body raises it
 */
private String b (String text) throws CharacterCodingException {
    final byte[] utf8Bytes = Normalizer
            .normalize(text, Normalizer.Form.NFD)
            .getBytes(StandardCharsets.UTF_8);

    return new String(utf8Bytes, StandardCharsets.UTF_8);
}

/**
 * Normalizes {@code text} to Unicode NFD form, encodes it as UTF-8 and decodes the bytes
 * again with a decoder configured to replace malformed or unmappable input.
 *
 * @param text the text to normalize and round-trip
 * @return the NFD-normalized text after the UTF-8 round trip
 * @throws CharacterCodingException declared by the decoding call
 */
private String c (String text) throws CharacterCodingException {
    final String nfdText = Normalizer.normalize(text, Normalizer.Form.NFD);

    final CharsetDecoder replacingDecoder = UTF_8.newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);

    final ByteBuffer utf8Bytes = UTF_8.encode(nfdText);

    // decode(ByteBuffer) resets the decoder itself before decoding.
    return replacingDecoder.decode(utf8Bytes).toString();
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
import com.osscameroon.jsgenerator.api.domain.InlineOptions;
import com.osscameroon.jsgenerator.api.domain.MultipartOptions;
import com.osscameroon.jsgenerator.api.domain.Output;
import com.osscameroon.jsgenerator.core.Configuration;
import com.osscameroon.jsgenerator.core.Converter;
import com.osscameroon.jsgenerator.core.OutputStreamResolver;
import lombok.RequiredArgsConstructor;
import org.slf4j.Logger;
import org.springframework.core.io.AbstractResource;
import org.springframework.core.io.ByteArrayResource;
import org.springframework.http.MediaType;
import org.springframework.util.LinkedMultiValueMap;
import org.springframework.util.MultiValueMap;
import org.springframework.web.bind.annotation.PostMapping;
Expand All @@ -24,6 +24,8 @@
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.List;
import java.util.Map;
import java.util.Optional;
Expand All @@ -49,7 +51,7 @@ public class ConvertController {
private final OutputStreamResolver pathOutputStreamResolver;
private final Converter converter;

//TODO: Make sure all these 4 case are taken into account
//TODO: Make sure all these 4 cases are taken into account
// code html to code js OK
// code html to file js
// file html to code js
Expand All @@ -64,9 +66,10 @@ public Reply<? extends List<? extends Output>> convertAction(@RequestBody @Valid
final var configuration = options.toConfiguration();

return Reply.ofSuccesses(options.getContents().stream()
.map(content -> convert(
.map(content -> converter.convert(
configuration,
new ByteArrayOutputStream(),
// TODO: convertInlineContentWithCopyrightCharacterWithComment works after this change; find out why.
new ByteArrayInputStream(content.getBytes(UTF_8))))
.map(content -> {
final var filename = inlineOutputStreamResolver.resolve(options.getPattern(), Map.of(
Expand All @@ -91,7 +94,7 @@ public MultiValueMap<String, AbstractResource> convertAction(@RequestPart("optio

multipartFiles.stream().map(multipartFile -> {
try {
return convert(
return converter.convert(
command.toConfiguration(),
new ByteArrayOutputStream(),
new ByteArrayInputStream(multipartFile.getBytes()));
Expand All @@ -111,18 +114,27 @@ public MultiValueMap<String, AbstractResource> convertAction(@RequestPart("optio
return new Output(filename, content);
})
.forEach(output ->
map.add(output.getFilename(), new ByteArrayResource(output.getContent().getBytes(UTF_8))));
map.add(output.getFilename(), new ByteArrayResource(output.getContent().getBytes())));

return map;
}

private String convert(Configuration configuration, ByteArrayOutputStream outputStream, ByteArrayInputStream inputStream) {
try {
converter.convert(inputStream, outputStream, configuration);
} catch (IOException exception) {
throw new RuntimeException(exception);
}
private byte[] encodingAndDecodingInUTF8(MultipartFile file) throws IOException {

return outputStream.toString(UTF_8);
// Encode the MultipartFile as UTF-8 bytes
byte[] encodedBytes = file.getBytes();
String encodedText = new String(encodedBytes,StandardCharsets.UTF_8);

// Decode the UTF-8 bytes back to MultipartFile
byte[] decodedBytes = encodedText.getBytes(StandardCharsets.UTF_8);

/*
String content = new String (multipartFile.getBytes());

byte[] encodedBytes = content.getBytes(UTF_8);

return new String(encodedBytes, UTF_8).getBytes();*/

return decodedBytes;
}
}
6 changes: 6 additions & 0 deletions jsgenerator-slim-api/src/main/resources/application.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,9 @@ springdoc:
path: /
use-root-path: false
disable-swagger-default-url: true
server:
servlet:
encoding:
charset: UTF-8
enabled: true
force: true
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ public interface Command extends Callable<Integer> {

boolean isQuerySelectorAdded();

boolean isCommentConversionModeActivated();

List<String> getInlineContents();

Converter getConverter();
Expand Down
Loading