Permalink
Browse files

Figuring out BOM, XHTML, encoding URLs (duh)

  • Loading branch information...
morphex committed Apr 23, 2018
1 parent cc37b73 commit f32b12f429e257b241cabd4f4d426f2472251065
Showing with 43 additions and 4 deletions.
  1. +6 −2 src/main/java/org/morphex/app/App.java
  2. +37 −2 src/main/java/org/morphex/app/WriteXHTML.java
@@ -7,6 +7,8 @@
import java.io.*;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.commons.codec.net.URLCodec;
import org.apache.commons.codec.EncoderException;
class TwitterStatusFetcher {
Twitter twitter;
@@ -44,8 +46,10 @@ Status getNextStatus() throws TwitterException {
// Configuration builder; starts out null and is presumably assigned during
// start-up elsewhere in this class -- TODO confirm against main().
static ConfigurationBuilder config = null;
// Starts out empty; judging by the name this accumulates the generated XHTML
// body -- verify against the code that appends to it.
static String outputXHTML = "";
// Redirect resolver; not initialised here, so it must be assigned before use.
static ResolveRedirect resolver;
// Shared Apache Commons Codec URLCodec instance.
static URLCodec URLEncoder = new URLCodec();
public static String URLtoHTML(String URL) {
/**
 * Renders a URL as an XHTML anchor whose link target and visible text are
 * both the URL itself.
 *
 * <p>The URL is escaped as XML character data instead of being
 * percent-encoded. Form-style percent-encoding of a complete URL also
 * rewrites the ':' and '/' of the scheme and path, which leaves the
 * {@code href} unusable as an absolute link. Escaping the five XML markup
 * characters keeps the address intact while making it safe inside the
 * single-quoted attribute and the element text.
 *
 * @param URL the address to link to
 * @return an {@code <a>} element with the escaped URL as {@code href} and text
 * @throws EncoderException never thrown by this implementation; kept in the
 *         signature so existing callers that catch it still compile
 */
public static String URLtoHTML(String URL) throws EncoderException {
    // '&' has to be replaced first, otherwise the entities produced by the
    // later replacements would be escaped a second time.
    // String.valueOf renders a null argument as the text "null", matching
    // what plain string concatenation would produce.
    final String escaped = String.valueOf(URL)
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace("\"", "&quot;")
            .replace("'", "&#39;");
    return "<a href='" + escaped + "'>" + escaped + "</a>";
}
@@ -105,7 +109,7 @@ public static void main( String[] args ) throws Exception
// Thread.sleep(2000);
count += 1;
// Safeguard for testing
if (count >= 50) {
if (count >= 50 && true) {
break;
}
}
@@ -6,8 +6,43 @@
public class WriteXHTML {
static String charset_ = "UTF-16LE";
// static Charset utf = Charset.forName(charset);
static String header = new String("<?xml version='1.0' encoding='" + charset_ + "' ?><!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Transitional//EN' 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'><html xmlns='http://www.w3.org/1999/xhtml'><head><title>Test</title></head><body>");
// Fragments of the XHTML prologue. getHeader() joins them as
// header1 + language + header2 + language + header3 + header6, so the language
// code lands in both the lang and xml:lang attributes of <html>.
// Plain literals are used instead of new String(...), which only creates a
// redundant copy of an already-interned constant.
static String header1 = "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Transitional//EN' 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'><html xmlns='http://www.w3.org/1999/xhtml' lang='";
static String header2 = "' xml:lang='";
static String header3 = "'><head><title>Test</title>";
// Validator https://validator.w3.org/i18n-checker/ says one
// should use meta charset tag instead of http-equiv.
// static String header4 = "<meta http-equiv='content-type' content='text/xhtml; charset=";
// header4 and header5 wrap the charset name in a <meta charset='...' /> tag.
static String header4 = "<meta charset='";
static String header5 = "' />";
// Closes <head> and opens <body>.
static String header6 = "</head><body>";
// Closes the document opened by the header fragments.
static String footer = "</body></html>";
// Language code inserted into the lang and xml:lang attributes.
static String language = "en";
/**
 * Assembles the XHTML prologue: DOCTYPE, the opening {@code <html>} tag with
 * the configured language in both {@code lang} and {@code xml:lang}, the
 * {@code <head>} section and the opening {@code <body>} tag.
 *
 * <p>No charset declaration is emitted. The W3C HTML validator gives weird
 * errors, while https://validator.w3.org/i18n-checker/ explains that UTF-16
 * should be specified as an encoding inside the document, with a BOM
 * indicating the endian orientation. An earlier variant therefore mapped
 * "UTF-16LE"/"UTF-16BE" to "UTF-16" and inserted it between header4 and
 * header5, but that was disabled because the W3C validator complains about
 * UTF-16 in meta http-equiv content-type and charset.
 *
 * @return the document prologue up to and including the opening body tag
 */
public static String getHeader() {
    StringBuilder prologue = new StringBuilder();
    // The language code is written twice: once for lang, once for xml:lang.
    prologue.append(header1).append(language);
    prologue.append(header2).append(language);
    prologue.append(header3);
    // header4/header5 (the meta charset tag) are deliberately left out here.
    prologue.append(header6);
    return prologue.toString();
}
public static byte[] e(String toEncode) throws Exception {
return toEncode.getBytes(charset_);
@@ -30,7 +65,7 @@ public static void writeFile(String html) throws Exception {
} else {
throw new java.lang.Error("Unsupported encoding for unicode BOM");
}
out.write(e(header));
out.write(e(getHeader()));
out.write(e(html));
out.write(e(footer));
out.close();

0 comments on commit f32b12f

Please sign in to comment.