Permalink
Browse files

lang/perl/Encode-JP-Mobile: AirHJIS supported. Not write changes.

  • Loading branch information...
1 parent 9ce07c3 commit d60d68508765ff800dbc49657b9e6e56eb3b6177 clouder committed May 19, 2008
Showing with 236 additions and 0 deletions.
  1. +2 −0 MANIFEST
  2. +9 −0 lib/Encode/JP/Mobile.pm
  3. +186 −0 lib/Encode/JP/Mobile/AirHJIS.pm
  4. +39 −0 t/airh-jis.t
View
@@ -9,6 +9,7 @@ dat/softbank-table.pl
dat/softbank-table.yaml
dat/softbank-unicode2sjis_auto.yaml
lib/Encode/JP/Mobile.pm
+lib/Encode/JP/Mobile/AirHJIS.pm
lib/Encode/JP/Mobile/Character.pm
lib/Encode/JP/Mobile/Charnames.pm
lib/Encode/JP/Mobile/ConvertPictogramSJIS.pm
@@ -20,6 +21,7 @@ Makefile.PL
MANIFEST This list of files
Rakefile
README
+t/airh-jis.t
t/character.t
t/charnames.t
t/convert-bruteforce.t
View
@@ -33,6 +33,7 @@ define_alias( 'x-utf8-vodafone' => 'x-utf8-softbank' );
use Encode::JP::Mobile::Vodafone;
use Encode::JP::Mobile::KDDIJIS;
+use Encode::JP::Mobile::AirHJIS;
use Encode::JP::Mobile::ConvertPictogramSJIS;
require Encode::JP::Mobile::Fallback;
require Encode::JP::Mobile::Character;
@@ -237,6 +238,14 @@ I<x-sjis-airedge> は I<x-sjis-docomo> の別名、として考えておくと
SoftBank および KDDI/AU の絵文字は適切な DoCoMo 絵文字(ウェブ入力用絵文字)にマッピングされます。
+=item x-iso-2022-jp-airh
+
+AirEDGE から送られる絵文字入りのメールを変換するためのエンコーディングです。
+
+AirEDGE から送られるメールは通常のメールと同様に I<iso-2022-jp> がベースとなっていますが、絵文字部分のみ I<x-sjis-docomo-raw> という独自仕様になっており、これはその混在したメールを変換するためのエンコーディングになります。
+
+C<x-iso-2022-jp-airedge>をエイリアスとして利用できます。
+
=item x-utf8-docomo, x-utf8-softbank, x-utf8-kddi
これらのエンコーディングは、Unicode 私用領域にある各キャリアの絵文字を相互変換しながら UTF-8 互換のエンコーディングにエンコードするのに使用します。utf-8 という名前がついていますが、実際にはすべての Unicode 文字をエンコードするわけではなく、サブセットとして、
@@ -0,0 +1,186 @@
+package Encode::JP::Mobile::AirHJIS;
+use strict;
+use warnings;
+use base qw(Encode::Encoding);
+use Encode::Alias;
+use Encode::CJKConstants qw(:all);
+use Encode qw(:fallbacks);
+use Encode::JP::Mobile;
+use POSIX 'ceil';
+use Carp;
+
+define_alias('x-iso-2022-jp-airh' => 'x-iso-2022-jp-airedge');
+__PACKAGE__->Define(qw(x-iso-2022-jp-airh));
+
+my $re_scan_sjis = qr{
+ $RE{SJIS_KANA}|$RE{SJIS_C}
+}x;
+
+my $re_scan_jis = qr{
+ (?:($RE{JIS_0212})|$RE{JIS_0208}|($RE{ISO_ASC})|($RE{JIS_KANA}))([^\e]*)
+}x;
+
+sub _encoding() { 'x-sjis-docomo-raw' }
+
+sub decode($$;$) {
+ my ($self, $str, $chk) = @_;
+
+ my $residue = '';
+ if ($chk) {
+ $str =~ s/([^\x00-\x7f].*)$//so and $residue = $1;
+ }
+ $residue .= _jis_sjis( \$str );
+ $_[1] = $residue if $chk;
+
+ return Encode::decode( $self->_encoding, $str, FB_PERLQQ );
+}
+
+sub encode($$;$) {
+ my ( $obj, $utf8, $chk ) = @_;
+ my $octet = Encode::encode( $obj->_encoding, $utf8, $chk );
+ return _sjis_jis( $octet );
+}
+
+sub ASC () { 1 }
+sub JIS_0208 () { 2 }
+sub KANA () { 3 }
+sub _sjis_jis {
+ my $octet = shift;
+
+ use bytes;
+
+ my @chars = split //, $octet;
+ my $mode = ASC;
+ my $res = '';
+
+ for (my $i=0; $i<@chars; $i++) {
+ my $x = ord $chars[$i];
+ if ($x < 0x80) {
+ if ($mode != ASC) {
+ $res .= $ESC{ASC};
+ $mode = ASC;
+ }
+ $res .= chr $x;
+ } elsif (0xA1 <= $x && $x <= 0xDF) {
+ if ($mode != KANA) {
+ $res .= $ESC{KANA};
+ $mode = KANA;
+ }
+ $mode = KANA;
+ $res .= chr($x - 0x80);
+ } else {
+ if ($mode != JIS_0208) {
+ $res .= $ESC{JIS_0208};
+ $mode = JIS_0208;
+ }
+ $i++;
+ last unless $i<@chars;
+ my ($c1, $c2) = _sjis2jis_one($x, ord $chars[$i]);
+ $res .= $c2 ? chr($c1).chr($c2) : $c1;
+ }
+ }
+
+ if ($mode != ASC) {
+ $res .= $ESC{ASC};
+ }
+
+ $res;
+}
+sub _sjis2jis_one {
+ my ($c1, $c2) = @_;
+
+ # 0xF89F - 0xF949
+ # 0xF950 - 0xF952
+ # 0xF955 - 0xF957
+ # 0xF95B - 0xF95E
+ # 0xF972 - 0xF9FC
+ my $c = ($c1<<8) + $c2;
+ if (0xF89F <= $c && $c <= 0xF949 ||
+ 0xF950 <= $c && $c <= 0xF952 ||
+ 0xF955 <= $c && $c <= 0xF957 ||
+ 0xF95B <= $c && $c <= 0xF95E ||
+ 0xF972 <= $c && $c <= 0xF9FC) {
+ return pack('H*', sprintf('%X', $c));
+ }
+
+ $c1 -= ($c1 <= 0x9f) ? 0x71 : 0xB1;
+ $c1 = $c1*2 + 1;
+
+ if ($c2 > 0x7F) {
+ $c2 -= 0x01;
+ }
+
+ if ($c2>=0x9E) {
+ $c2 = $c2-0x7D;
+ $c1++;
+ } else {
+ $c2 -= 0x1F;
+ }
+
+ return ($c1, $c2);
+}
+
+sub _jis_sjis {
+ local ${^ENCODING};
+
+ my $r_str = shift;
+ $$r_str =~ s($re_scan_jis){
+ my ($esc_0212, $esc_asc, $esc_kana, $chunk) = ($1, $2, $3, $4);
+
+ if ($esc_kana) {
+ $chunk =~ s{(.)}{
+ pack "H*", sprintf "%X", (0x80 + (hex unpack "H*", $1));
+ }geox;
+ $chunk;
+ } elsif ($esc_asc) {
+ $chunk;
+ } else {
+ $chunk =~ s{(?:($re_scan_sjis)|(..))}{
+ $1 ? $1 : pack "H*", sprintf "%X", _jis2sjis_one(hex(unpack "H*", $2))
+ }geox;
+ $chunk;
+ }
+ }geox;
+
+ my ($residue) = ( $$r_str =~ s/(\e.*)$//so );
+
+ return $residue;
+}
+
+sub _jis2sjis_one { my $x = shift; return ( _xy($x) << 8 ) + _zu($x) } # input is binary
+
+sub _high { my $x = shift; $x >> 8 }
+sub _low { my $x = shift; $x & 0xff }
+
+sub _xy {
+ my $jis = shift;
+
+ my $pq = _high($jis);
+ my $t = ceil( $pq / 2 ) + 0x70;
+ my $ans = ($t <= 0x9F) ? $t : $t+0x40;
+
+ # XXX !!!
+ if (0xED == $ans || $ans == 0xEE) {
+ return $ans + 0x06;
+ } elsif (0xEB == $ans || $ans == 0xEC) {
+ return $ans + 0x0b;
+ } else {
+ return $ans;
+ }
+}
+
+sub _zu {
+ my $jis = shift;
+ my $pq = _high($jis);
+ my $rs = _low($jis);
+
+ if ( $pq % 2 ) { # odd
+ my $t = $rs + 0x20;
+ return ( $t > 0x7f ) ? $t : $t - 1;
+ }
+ else { # even
+ return $rs + 0x7E;
+ }
+}
+
+1;
View
@@ -0,0 +1,39 @@
+use strict;
+use warnings;
+use Test::More tests => 31;
+use Encode;
+use Encode::JP::Mobile;
+
+sub test_it {
+ my ($jis, $normal_uni, $case) = @_;
+ $case ||= unpack "H*", $normal_uni;
+
+ is decode("x-iso-2022-jp-airh", $jis), $normal_uni, "decoding $case";
+ is $jis, encode("x-iso-2022-jp-airh", $normal_uni), "encoding $case";
+
+ my $bytes = $jis;
+ Encode::from_to($bytes, "x-iso-2022-jp-airh" => "x-sjis-airh");
+ Encode::from_to($bytes, "x-sjis-airh", "x-iso-2022-jp-airh");
+ is $bytes, $jis, "x-sjis-airh $case";
+}
+
+test_it("\e\$B\xF8\x9F\e(B", "\x{E63E}", "pictogram");
+
+test_it "a", decode('utf8', 'a'), 'alphabet';
+test_it "\e\$B\x24\x57\e(B", "\x{3077}", 'hiragana(pu)';
+
+is encode('x-iso-2022-jp-airh', "\x{5bc5}"), encode('iso-2022-jp', "\x{5bc5}"), "kanji(tora)";
+is decode('x-iso-2022-jp-airh', "\e\$B\x24\x57\e(B", Encode::FB_PERLQQ), "\x{3077}", "test fallback branch(only for test coverage)";
+is decode('x-iso-2022-jp-airh', "\e\$B\x24\x22\xF8\xA0\e(B"), "\x{3042}\x{E63F}", "hiragana alphabet";
+is decode('x-iso-2022-jp-airh', "a\e\$B\x24\x22\xF8\xA0\e(B"), "a\x{3042}\x{E63F}", "alphabet hiragana pictogram";
+is decode('x-iso-2022-jp-airh', "\xF8\xA0\e\$B\x24\x22\e(Ba"), "\x{E63F}\x{3042}a", "pictogram hiragana alphabet(real position of escape sequence)";
+is decode('x-iso-2022-jp-airh', "\e\$B\xF8\xA0\x24\x22\e(Ba"), "\x{E63F}\x{3042}a", "pictogram hiragana alphabet(unreal position of escape sequence)";
+is decode('x-iso-2022-jp-airh', "\xF8\xA0a\e\$B\x24\x22\e(B"), "\x{E63F}a\x{3042}", "pictogram alphabet hiragana(real position of escape sequence)";
+
+test_it "\e\$B\xF8\xA0\e(B", "\x{E63F}", 'pictogram';
+test_it "\e\$B\xF8\xA2\xF8\xA1\e(B", "\x{E641}\x{E640}", 'pictogram';
+test_it encode('iso-2022-jp', decode("utf8", "")), decode('utf8', ""), 'o';
+test_it encode('iso-2022-jp', decode("utf8", "おい。山田くん。zabutonイチマイ。")), decode('utf8', "おい。山田くん。zabutonイチマイ。"), 'kanji, hiragana, alphabet';
+test_it "\e\(I\x4c\x5e\x30\x4c\x28\x30\e(B", decode('utf8', "ブーフィー"), 'half width katakana';
+
+# is decode('x-iso-2022-jp-kddi', "\e\$(D\x2B\x21\x30\x57\e(B"), "\x{00E1}\x{4F0C}", 'JIS X 0212';

0 comments on commit d60d685

Please sign in to comment.