Browse files

added mapping of \u<hex> strings to bytes using code adapted from rjson.

  • Loading branch information...
1 parent 15e742d commit 81de092aef31c7c3c628dc1256ab54d25c13ac9e @duncantl duncantl committed Feb 21, 2012
Showing with 225 additions and 12 deletions.
  1. +2 −0 .Rbuildignore
  2. +1 −1 .Rinstignore
  3. +16 −1 Changes.html
  4. +1 −1 DESCRIPTION
  5. +2 −1 R/curl.S
  6. +20 −4 R/dynamic.R
  7. +2 −2 config.log
  8. +5 −1 inst/doc/GNUmakefile
  9. +1 −1 inst/doc/RCurl.tex
  10. +8 −0 src/curl.c
  11. +167 −0 src/json.c
View
2 .Rbuildignore
@@ -19,3 +19,5 @@ cookies.R
hang.R
postFormEg.R
manyConnections.R
+.*\.log
+.*\.aux
View
2 .Rinstignore
@@ -1,2 +1,2 @@
RCurl.tex
-logo.jpg
+
View
17 Changes.html
@@ -5,6 +5,21 @@
<body>
+<h2>1.92-0</h2>
+<dl>
+ <dt>
+ <li> Added conversion for downloaded content to map
+ Unicode characters of the form \unnnn to proper UTF encoding in R.
+ <dd> Code is adapted from Alex Couture-Beil's rjson package. Thanks.
+</dl>
+
+<h2>1.91-1</h2>
+<dl>
+ <dt>
+ <li> Move logo.jpg and a reference to it in an example.
+ <dd>
+</dl>
+
<h2>1.91-0</h2>
<dl>
<dt>
@@ -916,6 +931,6 @@
<address><a href="http://www.stat.ucdavis.edu/~duncan">Duncan Temple Lang</a>
<a href=mailto:duncan@wald.ucdavis.edu>&lt;duncan@wald.ucdavis.edu&gt;</a></address>
<!-- hhmts start -->
-Last modified: Tue Jan 31 20:08:05 PST 2012
+Last modified: Mon Feb 20 18:10:27 PST 2012
<!-- hhmts end -->
</body> </html>
View
2 DESCRIPTION
@@ -1,5 +1,5 @@
Package: RCurl
-Version: 1.91-0
+Version: 1.92-0
Title: General network (HTTP/FTP/...) client interface for R
Author: Duncan Temple Lang
SystemRequirements: libcurl (version 7.14.0 or higher) http://curl.haxx.se.
View
3 R/curl.S
@@ -86,13 +86,14 @@ function(txt = character(), max = NA, value = NULL)
val = if(missing(value))
function(collapse="", ...) {
+ txt = mapUnicodeEscapes(txt)
if(is.null(collapse))
return(txt)
paste(txt, collapse = collapse, ...)
}
else
- function() value(txt)
+ function() value(mapUnicodeEscapes(txt))
ans = list(update = update,
View
24 R/dynamic.R
@@ -142,10 +142,9 @@ function(curl = getCurlHandle(), txt = character(), max = NA, value = NULL, verb
encode =
function(str) {
- if(grepl("\\\\u[0-9]", str))
- RCurlIconv(str, from = "C99", to = encoding)
- else
- str
+ # RCurlIconv(str, from = "C99", to = encoding)
+ mapUnicodeEscapes(str)
+
}
@@ -159,3 +158,20 @@ function(curl = getCurlHandle(), txt = character(), max = NA, value = NULL, verb
ans$reset()
ans
}
+
+mapUnicodeEscapes =
+  #
+  # Processes each element of str, converting \u<hex>{4} sequences to bytes
+  # via the C routine R_mapString, and returns the converted strings
+  # (which the C code marks as UTF-8).
+  #
+  #  str  the strings to convert; coerced with as.character().
+  #  len  suggested size (in bytes) of the working buffer for each element,
+  #       recycled to the length of str. Values that are NA or too small
+  #       are raised to the number of bytes in the element plus one for the
+  #       terminating NUL, so the C code always has enough room.
+  #
+  # When no element contains something that looks like a \u escape, the
+  # (coerced) input is returned as-is and len is never evaluated.
+  #
+function(str, len = nchar(str) * 4L)
+{
+   str = as.character(str)
+
+   if(!any(grepl("\\\\u[0-9A-Fa-f]", str)))
+      return(str)
+
+     # The converted text is never longer than the input, so bytes + 1
+     # (for the NUL) is the smallest safe buffer. NA elements are sized
+     # for the two characters "NA" plus the terminator.
+   minLen = nchar(str, type = "bytes") + 1L
+   minLen[is.na(minLen)] = 3L
+
+   len = rep(as.integer(len), length.out = length(str))
+   len = pmax(len, minLen, na.rm = TRUE)
+
+   .Call("R_mapString", str, len, PACKAGE = "RCurl")
+}
View
4 config.log
@@ -30,8 +30,8 @@ Kernel configured for up to 4 processors.
Processor type: i486 (Intel 80486)
Processors active: 0 1 2 3
Primary memory available: 8.00 gigabytes
-Default processor set: 235 tasks, 1089 threads, 4 processors
-Load average: 3.41, Mach factor: 1.39
+Default processor set: 201 tasks, 895 threads, 4 processors
+Load average: 3.42, Mach factor: 0.57
/bin/machine = unknown
/usr/bin/oslevel = unknown
/bin/universe = unknown
View
6 inst/doc/GNUmakefile
@@ -1,5 +1,9 @@
philosophy.html:
+ifndef DYN_DOCS
+ DYN_DOCS=$(HOME)/Classes/StatComputing/XDynDocs/inst
+endif
+
# HOME=D:/cygwin/home/duncan
XSLTPROC=xsltproc
@@ -27,4 +31,4 @@ WEB_SITE=www.omegahat.org:/home3/WebSites/Omega/RCurl
ship: philosophy.html
scp $^ $(WEB_SITE)
-include $(DYN_DOCS)/inst/Make/Makefile
+include $(DYN_DOCS)/Make/Makefile
View
2 inst/doc/RCurl.tex
@@ -180,7 +180,7 @@ \subsubsection{.netrc}
\subsubsection{SSL}
-\section{Advanced Features}\labe{AdvancedFeatures}
+\section{Advanced Features}\label{AdvancedFeatures}
\subsection{Accessing the header information}
View
8 src/curl.c
@@ -960,7 +960,15 @@ R_call_R_write_function(SEXP fun, void *buffer, size_t size, size_t nmemb, RWrit
// nmemb =- 2;
// }
// probably don't need the encoding at this point!
+#if 0
+ const char *tmp;
+ int len = size * nmemb;
+ tmp = Rf_reEnc(buffer, CE_NATIVE, CE_UTF8, 0);
+ len = strlen(tmp);
+ PROTECT(str = mkCharLenCE(tmp, len, encoding));
+#else
PROTECT(str = mkCharLenCE(buffer, size * nmemb, encoding));
+#endif
#else
/* PROTECT(str = mkCharLen(buffer, size * nmemb)); */
// PROTECT(str = mkCharLen(buffer, size *nmemb)); /* Problems with the upload example in complete.Rd */
View
167 src/json.c
@@ -0,0 +1,167 @@
+/*
+ This code is adapted from Alex Couture-Beil's <rjson_pkg@mofo.ca>
+ rjson package. It converts strings containing Unicode of the form
+ \u<4 hex characters> to R's format, i.e. by mapping them to
+ 1, 2, 3 or 4 bytes.
+
+ This is adapted so that it can be used independently of a JSON string
+ and simply converts an arbitrary character string.
+
+ It is distributed under the GPL-2 license.
+ */
+
+#if 0
+static int x;
+#else
+#include <Rdefines.h>
+#include <Rinternals.h>
+
+#include <stdlib.h>
+
+#define MASKBITS 0x3F /* low six bits: the payload carried by each continuation byte */
+#define MASKBYTE 0x80 /* OR-ed into every continuation byte (10xxxxxx) */
+#define MASK2BYTES 0xC0 /* OR-ed into the lead byte of a 2-byte sequence (110xxxxx) */
+#define MASK3BYTES 0xE0 /* OR-ed into the lead byte of a 3-byte sequence (1110xxxx) */
+
+
+
+/*
+ * Encode a 16-bit code point as UTF-8.
+ *
+ *   input  the code point (0x0000 - 0xFFFF).
+ *   s      destination; must have room for up to 3 bytes. No terminating
+ *          NUL is written.
+ *
+ * Returns the number of bytes stored in s (1, 2 or 3). Every path returns
+ * a value: the 3-byte form is the unconditional fallback rather than one
+ * more `else if`, so control can never run off the end of the function.
+ * Surrogate values (0xD800 - 0xDFFF) are not treated specially here.
+ */
+int UTF8Encode2BytesUnicode( unsigned short input, char * s )
+{
+    /* 0xxxxxxx */
+    if( input < 0x80 ) {
+        s[ 0 ] = (char) input;
+        return 1;
+    }
+
+    /* 110xxxxx 10xxxxxx */
+    if( input < 0x800 ) {
+        s[ 0 ] = (char) ( MASK2BYTES | ( input >> 6 ) );
+        s[ 1 ] = (char) ( MASKBYTE | ( input & MASKBITS ) );
+        return 2;
+    }
+
+    /* 1110xxxx 10xxxxxx 10xxxxxx
+       The lead byte keeps only four payload bits so the result stays a
+       well-formed lead byte even where unsigned short is wider than 16 bits. */
+    s[ 0 ] = (char) ( MASK3BYTES | ( ( input >> 12 ) & 0x0F ) );
+    s[ 1 ] = (char) ( MASKBYTE | ( ( input >> 6 ) & MASKBITS ) );
+    s[ 2 ] = (char) ( MASKBYTE | ( input & MASKBITS ) );
+    return 3;
+}
+
+
+/* Numeric value of the hexadecimal digit c, or -1 when c is not one. */
+static int mapStringHexDigit(char c)
+{
+    if (c >= '0' && c <= '9')
+        return c - '0';
+    if (c >= 'a' && c <= 'f')
+        return c - 'a' + 10;
+    if (c >= 'A' && c <= 'F')
+        return c - 'A' + 10;
+    return -1;
+}
+
+/*
+ * Read up to four hex digits starting at p, store their value in *value and
+ * return how many valid digits were seen (0 - 4). Scanning stops at the
+ * first non-digit, so it never reads beyond a terminating NUL.
+ */
+static int mapStringReadHex4(const char *p, unsigned int *value)
+{
+    unsigned int v = 0;
+    int k;
+
+    for (k = 0; k < 4; k++) {
+        int d = mapStringHexDigit(p[k]);
+        if (d < 0)
+            break;
+        v = (v << 4) | (unsigned int) d;
+    }
+    *value = v;
+    return k;
+}
+
+/* Raise an R error unless `need` more bytes can be written before `limit`. */
+static void mapStringNeedRoom(const char *cur, const char *limit, size_t need)
+{
+    if ((size_t) (limit - cur) < need)
+        Rf_error("working buffer is too small to hold the converted string");
+}
+
+/*
+ * Convert the backslash escapes in s and return the result as a UTF-8
+ * CHARSXP.
+ *
+ *   s       NUL-terminated input string.
+ *   buf     caller-supplied working buffer that receives the converted text.
+ *   bufLen  size of buf in bytes, including room for the terminating NUL.
+ *
+ * Escapes handled:
+ *   \"            kept as the two characters \"
+ *   \\  \/        the second character alone
+ *   \r \n \b \t \f  the corresponding control character
+ *   \uXXXX        the UTF-8 encoding of the code point; a high surrogate
+ *                 immediately followed by a \u low surrogate is combined
+ *                 into one 4-byte sequence
+ *   \<other>      the other character alone
+ * A lone backslash at the very end of s is dropped.
+ *
+ * An R error is raised when \u is not followed by four hex digits, or when
+ * the converted text would not fit in bufLen bytes. Every write to buf is
+ * bounds-checked. An empty s is returned without touching buf at all.
+ */
+SEXP mapString(const char *s, char *buf, size_t bufLen)
+{
+    size_t nchar = strlen(s);
+    size_t i = 0;
+    char *cur = buf;
+    const char *limit;
+
+    if (nchar == 0)
+        return(mkCharCE("", CE_UTF8));
+
+    if (buf == NULL || bufLen == 0)
+        Rf_error("working buffer is too small to hold the converted string");
+
+    /* One byte is always held back for the terminating NUL. */
+    limit = buf + (bufLen - 1);
+
+    while (i < nchar) {
+        /* Ordinary characters are copied through unchanged. */
+        if (s[i] != '\\') {
+            mapStringNeedRoom(cur, limit, 1);
+            *cur++ = s[i++];
+            continue;
+        }
+
+        i++;            /* step onto the character that selects the escape */
+        if (i >= nchar) /* lone trailing backslash: nothing to emit */
+            break;
+
+        switch (s[i]) {
+        case '"':
+            mapStringNeedRoom(cur, limit, 2);
+            *cur++ = '\\';
+            *cur++ = '"';
+            break;
+        case 'r':
+            mapStringNeedRoom(cur, limit, 1);
+            *cur++ = '\r';
+            break;
+        case 'n':
+            mapStringNeedRoom(cur, limit, 1);
+            *cur++ = '\n';
+            break;
+        case 'b':
+            mapStringNeedRoom(cur, limit, 1);
+            *cur++ = '\b';
+            break;
+        case 't':
+            mapStringNeedRoom(cur, limit, 1);
+            *cur++ = '\t';
+            break;
+        case 'f':
+            mapStringNeedRoom(cur, limit, 1);
+            *cur++ = '\f';
+            break;
+        case 'u': {
+            unsigned int code = 0;
+            int valid = mapStringReadHex4(s + i + 1, &code);
+
+            if (valid < 4)
+                Rf_error("unexpected unicode escaped char '%c'; 4 hex digits should follow the \\u (found %i valid digits)",
+                         s[i + 1 + valid], valid);
+
+            i += 4; /* now on the last hex digit; the shared i++ below steps past it */
+
+            /* A high surrogate directly followed by "\u" + low surrogate
+               denotes a single code point above U+FFFF. */
+            if (code >= 0xD800 && code <= 0xDBFF && s[i + 1] == '\\' && s[i + 2] == 'u') {
+                unsigned int low = 0;
+                if (mapStringReadHex4(s + i + 3, &low) == 4 && low >= 0xDC00 && low <= 0xDFFF) {
+                    code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00);
+                    i += 6; /* consume the second \uXXXX as well */
+                }
+            }
+
+            if (code >= 0x10000) {
+                /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+                mapStringNeedRoom(cur, limit, 4);
+                *cur++ = (char) (0xF0 | (code >> 18));
+                *cur++ = (char) (0x80 | ((code >> 12) & 0x3F));
+                *cur++ = (char) (0x80 | ((code >> 6) & 0x3F));
+                *cur++ = (char) (0x80 | (code & 0x3F));
+            } else {
+                mapStringNeedRoom(cur, limit, 3);
+                cur += UTF8Encode2BytesUnicode((unsigned short) code, cur);
+            }
+            break;
+        }
+        case '\\':
+        case '/':
+        default:
+            /* \\ and \/ and any unrecognised escape: keep the character, drop the backslash. */
+            mapStringNeedRoom(cur, limit, 1);
+            *cur++ = s[i];
+            break;
+        }
+
+        i++; /* move to next char */
+    }
+
+    *cur = '\0';
+
+    return(mkCharCE( buf, CE_UTF8 ));
+}
+
+
+
+/*
+ * .Call entry point: apply mapString() to every element of a character
+ * vector.
+ *
+ *   str           character vector to convert.
+ *   suggestedLen  integer vector of suggested working-buffer sizes, one per
+ *                 element of str; may be empty or shorter than str.
+ *
+ * Returns a new character vector of the same length as str. NA elements
+ * are passed through as NA.
+ *
+ * The buffer for each element is never smaller than strlen() + 1, whatever
+ * suggestedLen says; a suggestion is used only when it is a larger,
+ * non-NA value. The result is stored with SET_STRING_ELT because `ans` is a
+ * character vector, not a list.
+ */
+SEXP R_mapString(SEXP str, SEXP suggestedLen)
+{
+    int numEls = Rf_length(str);
+    int numLens = 0;
+    const int *lens = NULL;
+    SEXP ans;
+
+    /* Only look inside suggestedLen when it really is an integer vector. */
+    if (TYPEOF(suggestedLen) == INTSXP && Rf_length(suggestedLen) > 0) {
+        numLens = Rf_length(suggestedLen);
+        lens = INTEGER(suggestedLen);
+    }
+
+    PROTECT(ans = NEW_CHARACTER(numEls));
+    for(int i = 0; i < numEls; i++) {
+        SEXP el = STRING_ELT(str, i);
+
+        if(el == NA_STRING) {
+            SET_STRING_ELT(ans, i, NA_STRING);
+            continue;
+        }
+
+        const char *src = CHAR(el);
+        size_t num = strlen(src) + 1; /* + 1 for the terminating NUL */
+
+        if(i < numLens && lens[i] != NA_INTEGER && lens[i] > 0 && (size_t) lens[i] > num)
+            num = (size_t) lens[i];
+
+        /* R_alloc() signals an R error itself on failure, so there is no
+           NULL result to test for. The vmax mark is restored after each
+           element so working buffers do not pile up across a long vector. */
+        void *vmax = vmaxget();
+        char *buf = (char *) R_alloc(num, sizeof(char));
+
+        SET_STRING_ELT(ans, i, mapString(src, buf, num));
+        vmaxset(vmax);
+    }
+
+    UNPROTECT(1);
+    return(ans);
+}
+
+#endif
+
+

0 comments on commit 81de092

Please sign in to comment.