Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit e5feb7c
Showing
20 changed files
with
410 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
cover_db | ||
META.yml | ||
Makefile | ||
blib | ||
inc | ||
pm_to_blib | ||
MANIFEST | ||
Makefile.old | ||
nytprof.out | ||
MANIFEST.bak | ||
*.sw[po] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
steps = FindVersion, ChangeVersion, CheckChangeLog, DistTest, Commit, Tag, MakeDist, UploadCPAN |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
Revision history for Perl extension Lingua::JA::NormalizeText | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
This is Perl module Lingua::JA::NormalizeText. | ||
|
||
INSTALLATION | ||
|
||
Lingua::JA::NormalizeText installation is straightforward. If your CPAN shell is set up, | ||
you should just be able to do | ||
|
||
% cpan Lingua::JA::NormalizeText | ||
|
||
Otherwise, download it, unpack it, then build it as usual: | ||
|
||
% perl Makefile.PL | ||
% make && make test | ||
|
||
Then install it: | ||
|
||
% make install | ||
|
||
DOCUMENTATION | ||
|
||
Lingua::JA::NormalizeText documentation is available as POD, so you can do: | ||
|
||
% perldoc Lingua::JA::NormalizeText | ||
|
||
to read the documentation online with your favorite pager. | ||
|
||
pawa- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
\bRCS\b | ||
\bCVS\b | ||
^MANIFEST\. | ||
^Makefile$ | ||
~$ | ||
^# | ||
\.old$ | ||
^blib/ | ||
^pm_to_blib | ||
^MakeMaker-\d | ||
\.gz$ | ||
\.cvsignore | ||
^t/9\d_.*\.t | ||
^t/perlcritic | ||
^tools/ | ||
\.svn/ | ||
^[^/]+\.yaml$ | ||
^[^/]+\.pl$ | ||
^\.shipit$ | ||
^\.git/ | ||
\.sw[po]$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
{ | ||
"abstract" : "normalizes text", | ||
"author" : [ | ||
"pawa <pawapawa@cpan.org>" | ||
], | ||
"dynamic_config" : 0, | ||
"generated_by" : "Module::Install version 1.06, CPAN::Meta::Converter version 2.112150", | ||
"license" : [ | ||
"perl_5" | ||
], | ||
"meta-spec" : { | ||
"url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", | ||
"version" : "2" | ||
}, | ||
"name" : "Lingua-JA-NormalizeText", | ||
"no_index" : { | ||
"directory" : [ | ||
"inc", | ||
"t", | ||
"xt" | ||
] | ||
}, | ||
"prereqs" : { | ||
"build" : { | ||
"requires" : { | ||
"ExtUtils::MakeMaker" : "6.59", | ||
"Test::Fatal" : 0, | ||
"Test::More" : "0.88" | ||
} | ||
}, | ||
"configure" : { | ||
"requires" : { | ||
"ExtUtils::MakeMaker" : "6.59" | ||
} | ||
}, | ||
"runtime" : { | ||
"requires" : { | ||
"Carp" : 0, | ||
"Exporter" : 0, | ||
"HTML::Entities" : "3.66", | ||
"Unicode::Normalize" : "1", | ||
"perl" : "5.008001" | ||
} | ||
} | ||
}, | ||
"release_status" : "testing", | ||
"resources" : { | ||
"bugtracker" : { | ||
"web" : "https://github.com/pawa-/Lingua-JA-NormalizeText/issues" | ||
}, | ||
"license" : [ | ||
"http://dev.perl.org/licenses" | ||
], | ||
"repository" : { | ||
"url" : "https://github.com/pawa-/Lingua-JA-NormalizeText" | ||
} | ||
}, | ||
"version" : "0.00_1" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
--- | ||
abstract: 'normalizes text' | ||
author: | ||
- 'pawa <pawapawa@cpan.org>' | ||
build_requires: | ||
ExtUtils::MakeMaker: 6.59 | ||
Test::Fatal: 0 | ||
Test::More: 0.88 | ||
configure_requires: | ||
ExtUtils::MakeMaker: 6.59 | ||
dynamic_config: 0 | ||
generated_by: 'Module::Install version 1.06, CPAN::Meta::Converter version 2.112150' | ||
license: perl | ||
meta-spec: | ||
url: http://module-build.sourceforge.net/META-spec-v1.4.html | ||
version: 1.4 | ||
name: Lingua-JA-NormalizeText | ||
no_index: | ||
directory: | ||
- inc | ||
- t | ||
- xt | ||
requires: | ||
Carp: 0 | ||
Exporter: 0 | ||
HTML::Entities: 3.66 | ||
Unicode::Normalize: 1 | ||
perl: 5.008001 | ||
resources: | ||
bugtracker: https://github.com/pawa-/Lingua-JA-NormalizeText/issues | ||
license: http://dev.perl.org/licenses | ||
repository: https://github.com/pawa-/Lingua-JA-NormalizeText | ||
version: 0.00_1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Build script for the Lingua-JA-NormalizeText distribution, written with the
# Module::Install DSL.
use inc::Module::Install;

# Distribution name; the remaining metadata is taken from the main module.
name     'Lingua-JA-NormalizeText';
all_from 'lib/Lingua/JA/NormalizeText.pm';

# Runtime prerequisites.  Minimum versions are quoted so that they are
# recorded exactly as written (a numeric 1.00 would be recorded as 1).
requires 'Carp';
requires 'Exporter';
requires 'Unicode::Normalize' => '1.00';
requires 'HTML::Entities'     => '3.66';

# Test-time prerequisites.
test_requires 'Test::More' => '0.88';    # done_testing
test_requires 'Test::Fatal';
test_requires 'Test::Warn';              # the test suite calls warning_is

tests        't/*.t';
author_tests 'xt';

readme_from 'lib/Lingua/JA/NormalizeText.pm';

resources(
    license    => 'http://dev.perl.org/licenses',
    repository => 'https://github.com/pawa-/Lingua-JA-NormalizeText',
    bugtracker => 'https://github.com/pawa-/Lingua-JA-NormalizeText/issues',
);

WriteAll;
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
package Lingua::JA::NormalizeText; | ||
|
||
use 5.008_001; | ||
use strict; | ||
use warnings; | ||
use utf8; | ||
|
||
use Carp (); | ||
use Exporter qw/import/; | ||
use Unicode::Normalize qw/NFKC NFKD NFC NFD/; | ||
use HTML::Entities qw/decode_entities/; | ||
|
||
# Module version.
our $VERSION     = '0.00_1';

# Nothing is exported by default; the function wrappers can be imported by
# name, or all at once through the ":all" tag.
our @EXPORT      = ();
our @EXPORT_OK   = qw/nfkc nfkd nfc nfd decode_entities/;
our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );

# Option names accepted by new().  The order of this list is also the order
# in which the selected converters end up being applied by normalize().
my @AVAILABLE_OPTS = qw/lc nfkc nfkd nfc nfd decode_entities/;
|
||
|
||
# Constructor.
#
# Arguments: the class name followed by one or more option names taken from
# @AVAILABLE_OPTS.
#
# Returns a blessed hash whose "converters" entry lists the selected option
# names in @AVAILABLE_OPTS order (not in the order they were passed).
#
# Croaks when no option is given, or when any given name is not available.
sub new
{
    my ($class, @opts) = @_;

    Carp::croak("at least one option is needed") unless @opts;

    my %requested = map { $_ => 1 } @opts;
    my $self      = bless { converters => [] }, $class;

    for my $available_opt (@AVAILABLE_OPTS)
    {
        push @{ $self->{converters} }, $available_opt
            if delete $requested{$available_opt};
    }

    # Whatever is left was not recognised.  The names are sorted because hash
    # key order is not stable, and the message should be reproducible.
    if (my @unknown = sort keys %requested)
    {
        Carp::croak( "unknown option(s): " . join(', ', @unknown) );
    }

    return $self;
}
|
||
# Runs $text through every converter stored in $self->{converters}, in order,
# feeding each converter the output of the previous one.
#
# Returns the converted text.  When $text is undefined, emits the warning
# 'undefined text' via Carp::carp and returns nothing.
#
# Croaks if a stored converter name does not correspond to a function of
# this package.
sub normalize
{
    my ($self, $text) = @_;

    if (!defined $text)
    {
        Carp::carp('undefined text');
        return;
    }

    for my $converter (@{ $self->{converters} })
    {
        # Look the function up by name instead of calling through a symbolic
        # reference, so strict 'refs' can stay enabled.
        my $code = __PACKAGE__->can($converter)
            or Carp::croak("unknown converter: $converter");

        $text = $code->($text);
    }

    return $text;
}
|
||
# Thin function wrappers: each takes one string and returns the converted
# copy.  CORE::lc is spelled out because this package defines its own lc(),
# and an unqualified call would raise the "Ambiguous call resolved as
# CORE::lc()" warning.
sub lc   { return CORE::lc(shift); }
sub nfkc { return Unicode::Normalize::NFKC(shift); }
sub nfkd { return Unicode::Normalize::NFKD(shift); }
sub nfc  { return Unicode::Normalize::NFC(shift); }
sub nfd  { return Unicode::Normalize::NFD(shift); }
|
||
=begin comment | ||
sub wavetilde2long | ||
{ | ||
my $tilde = chr(hex("FF5E")); | ||
my $wave = chr(hex("301C")); | ||
my $long = chr(hex("30FC")); | ||
my $text = shift; | ||
$text =~ s/[$wave$tilde]/$long/eg; | ||
return $text; | ||
} | ||
=end comment | ||
=cut | ||
|
||
# Function-style wrapper around HTML::Entities::decode_entities().
#
# A sub of the same name is already imported into this package by
# "use HTML::Entities qw/decode_entities/", so the 'redefine' warning is
# switched off for the scope in which it is replaced.
{
    no warnings 'redefine';

    sub decode_entities
    {
        # Only the first argument is forwarded; the call inherits the
        # caller's context.
        return HTML::Entities::decode_entities(shift @_);
    }
}
|
||
1; | ||
|
||
__END__ | ||
=head1 NAME | ||
Lingua::JA::NormalizeText - normalizes text | ||
=head1 SYNOPSIS | ||
use Lingua::JA::NormalizeText; | ||
=head1 DESCRIPTION | ||
Lingua::JA::NormalizeText normalizes text. | ||
=head1 AUTHOR | ||
pawa E<lt>pawapawa@cpan.orgE<gt> | ||
=head1 SEE ALSO | ||
=head1 LICENSE | ||
This library is free software; you can redistribute it and/or modify | ||
it under the same terms as Perl itself. | ||
=cut |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Smoke test: the module must load without errors.
use strict;
use warnings;
use Test::More tests => 1;

BEGIN { use_ok 'Lingua::JA::NormalizeText' }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# Tests for the public interface, the constructor's option validation, and
# normalize() when called without text.
use strict;
use warnings;
use Lingua::JA::NormalizeText;
use Test::More;
use Test::Fatal;
use Test::Warn;

my @subs = qw/new normalize lc nfkc nfkd nfc nfd decode_entities/;
can_ok('Lingua::JA::NormalizeText', @subs);

my $exception = exception { Lingua::JA::NormalizeText->new; };
like($exception, qr/at least/, 'at least one option exception');

# Accept the two names in either order: this test should not depend on how
# the module orders the unknown options in its message.
$exception = exception { Lingua::JA::NormalizeText->new(qw/cl ld/); };
like(
    $exception,
    qr/unknown option\(s\): (?:cl, ld|ld, cl)/,
    'exception when every option is unknown'
);

$exception = exception { Lingua::JA::NormalizeText->new(qw/lc cl/); };
like(
    $exception,
    qr/unknown option\(s\): cl/,
    'exception when one option is unknown'
);

$exception = exception { Lingua::JA::NormalizeText->new(qw/lc nfc/); };
is($exception, undef, 'no exception');

my $normalizer = Lingua::JA::NormalizeText->new(qw/lc/);
isa_ok($normalizer, 'Lingua::JA::NormalizeText');

my $result;
warning_is { $result = $normalizer->normalize } 'undefined text',
    'undefined text exception';
is($result, undef, 'result of normalizing undefined text');

done_testing;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Test for the "lc" option.
use strict;
use warnings;
use Lingua::JA::NormalizeText;
use Test::More;

my $normalizer = Lingua::JA::NormalizeText->new(qw/lc/);

# is() compares the result with the expected string.  ok() would treat 'ddd'
# as the test name and pass for any true return value.
is($normalizer->normalize("DdD"), 'ddd', 'lc lowercases the text');

done_testing;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Tests for the Unicode normalization wrappers (nfkc, nfkd, nfc, nfd) and for
# the "nfkc" option of normalize().
use strict;
use warnings;
use utf8;
use Lingua::JA::NormalizeText qw/nfkc nfkd nfc nfd/;
use Test::More;

binmode Test::More->builder->$_ => ':utf8'
    for qw/output failure_output todo_output/;

# Inputs and expectations are written as code point escapes.  Composed and
# decomposed forms look identical in an editor, so literal characters would
# hide what is actually being compared.
my $square_doru  = "\x{3326}";       # ㌦
my $circled_one  = "\x{2460}";       # ①
my $a_composed   = "\x{00C1}";       # Á as a single code point
my $a_decomposed = "A\x{0301}";      # A followed by COMBINING ACUTE ACCENT

is( nfkc($square_doru), "\x{30C9}\x{30EB}", 'NFKC: square DORU becomes two kana' );    # ドル
is( length( nfkc($square_doru) ), 2, 'NFKC: result length' );

is( nfkd($square_doru), "\x{30C8}\x{3099}\x{30EB}", 'NFKD: voiced mark is split off' );    # ト U+3099 ル
is( length( nfkd($square_doru) ), 3, 'NFKD: result length' );

is( nfc($a_decomposed), $a_composed,  'NFC: composes A + acute' );
is( nfc($circled_one),  $circled_one, 'NFC: circled digit is unchanged' );

is( nfd($circled_one), $circled_one,  'NFD: circled digit is unchanged' );
is( nfd($a_composed),  $a_decomposed, 'NFD: decomposes A with acute' );

my $normalizer = Lingua::JA::NormalizeText->new(qw/nfkc/);
is(
    $normalizer->normalize("\x{333B}"),    # ㌻
    "\x{30DA}\x{30FC}\x{30B8}",            # ページ
    'normalize with nfkc'
);

done_testing;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Tests for decode_entities, both as an imported function and as a
# normalize() option.
use strict;
use warnings;
use utf8;
use Lingua::JA::NormalizeText qw/decode_entities/;
use Test::More;

binmode Test::More->builder->$_ => ':utf8'
    for qw/output failure_output todo_output/;

# The input has to be the entity itself; passing an already decoded heart
# would not exercise the decoding at all.
is(decode_entities('&hearts;'), '♥', 'decode_entities function');

my $normalizer = Lingua::JA::NormalizeText->new(qw/decode_entities/);
is($normalizer->normalize('&hearts;'), '♥', 'normalize with decode_entities');

done_testing;
Oops, something went wrong.