diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c38068c --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +cover_db +META.yml +Makefile +blib +inc +pm_to_blib +MANIFEST +Makefile.old +nytprof.out +MANIFEST.bak +*.sw[po] diff --git a/.shipit b/.shipit new file mode 100644 index 0000000..8731dce --- /dev/null +++ b/.shipit @@ -0,0 +1 @@ +steps = FindVersion, ChangeVersion, CheckChangeLog, DistTest, Commit, Tag, MakeDist, UploadCPAN diff --git a/Changes b/Changes new file mode 100644 index 0000000..a8ddbe3 --- /dev/null +++ b/Changes @@ -0,0 +1,2 @@ +Revision history for Perl extension Lingua::JA::NormalizeText + diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000..ad01a7a --- /dev/null +++ b/INSTALL @@ -0,0 +1,27 @@ +This is the Perl module Lingua::JA::NormalizeText. + +INSTALLATION + +Lingua::JA::NormalizeText installation is straightforward. If your CPAN shell is set up, +you should just be able to do + + % cpan Lingua::JA::NormalizeText + +Otherwise, download it, unpack it, then build it as usual: + + % perl Makefile.PL + % make && make test + +Then install it: + + % make install + +DOCUMENTATION + +Lingua::JA::NormalizeText documentation is available in POD format, so you can run: + + % perldoc Lingua::JA::NormalizeText + +to read the documentation online with your favorite pager. + +pawa- diff --git a/MANIFEST.SKIP b/MANIFEST.SKIP new file mode 100644 index 0000000..fb4c7b4 --- /dev/null +++ b/MANIFEST.SKIP @@ -0,0 +1,21 @@ +\bRCS\b +\bCVS\b +^MANIFEST\.
+^Makefile$ +~$ +^# +\.old$ +^blib/ +^pm_to_blib +^MakeMaker-\d +\.gz$ +\.cvsignore +^t/9\d_.*\.t +^t/perlcritic +^tools/ +\.svn/ +^[^/]+\.yaml$ +^[^/]+\.pl$ +^\.shipit$ +^\.git/ +\.sw[po]$ diff --git a/MYMETA.json b/MYMETA.json new file mode 100644 index 0000000..67c921b --- /dev/null +++ b/MYMETA.json @@ -0,0 +1,59 @@ +{ + "abstract" : "normalizes text", + "author" : [ + "pawa " + ], + "dynamic_config" : 0, + "generated_by" : "Module::Install version 1.06, CPAN::Meta::Converter version 2.112150", + "license" : [ + "perl_5" + ], + "meta-spec" : { + "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", + "version" : "2" + }, + "name" : "Lingua-JA-NormalizeText", + "no_index" : { + "directory" : [ + "inc", + "t", + "xt" + ] + }, + "prereqs" : { + "build" : { + "requires" : { + "ExtUtils::MakeMaker" : "6.59", + "Test::Fatal" : 0, + "Test::More" : "0.88" + } + }, + "configure" : { + "requires" : { + "ExtUtils::MakeMaker" : "6.59" + } + }, + "runtime" : { + "requires" : { + "Carp" : 0, + "Exporter" : 0, + "HTML::Entities" : "3.66", + "Unicode::Normalize" : "1", + "perl" : "5.008001" + } + } + }, + "release_status" : "testing", + "resources" : { + "bugtracker" : { + "web" : "https://github.com/pawa-/Lingua-JA-NormalizeText/issues" + }, + "license" : [ + "http://dev.perl.org/licenses" + ], + "repository" : { + "url" : "https://github.com/pawa-/Lingua-JA-NormalizeText" + } + }, + "version" : "0.00_1" +} diff --git a/MYMETA.yml b/MYMETA.yml new file mode 100644 index 0000000..6346b94 --- /dev/null +++ b/MYMETA.yml @@ -0,0 +1,33 @@ +--- +abstract: 'normalizes text' +author: + - 'pawa ' +build_requires: + ExtUtils::MakeMaker: 6.59 + Test::Fatal: 0 + Test::More: 0.88 +configure_requires: + ExtUtils::MakeMaker: 6.59 +dynamic_config: 0 +generated_by: 'Module::Install version 1.06, CPAN::Meta::Converter version 2.112150' +license: perl +meta-spec: + url: http://module-build.sourceforge.net/META-spec-v1.4.html + version: 1.4 +name: Lingua-JA-NormalizeText +no_index: + 
directory: + - inc + - t + - xt +requires: + Carp: 0 + Exporter: 0 + HTML::Entities: 3.66 + Unicode::Normalize: 1 + perl: 5.008001 +resources: + bugtracker: https://github.com/pawa-/Lingua-JA-NormalizeText/issues + license: http://dev.perl.org/licenses + repository: https://github.com/pawa-/Lingua-JA-NormalizeText +version: 0.00_1 diff --git a/Makefile.PL b/Makefile.PL new file mode 100644 index 0000000..584bc12 --- /dev/null +++ b/Makefile.PL @@ -0,0 +1,26 @@ +use inc::Module::Install; + +name 'Lingua-JA-NormalizeText'; +all_from 'lib/Lingua/JA/NormalizeText.pm'; + +requires 'Carp'; +requires 'Exporter'; +requires 'Unicode::Normalize' => 1.00; +requires 'HTML::Entities' => 3.66; + +test_requires 'Test::More' => 0.88; # done_testing +test_requires 'Test::Fatal'; +test_requires 'Test::Warn'; # warning_is, used by t/01_basic.t + +tests 't/*.t'; +author_tests 'xt'; + +readme_from 'lib/Lingua/JA/NormalizeText.pm'; + +resources( + license => 'http://dev.perl.org/licenses', + repository => 'https://github.com/pawa-/Lingua-JA-NormalizeText', + bugtracker => 'https://github.com/pawa-/Lingua-JA-NormalizeText/issues', +); + +WriteAll; diff --git a/README b/README new file mode 100644 index 0000000..e69de29 diff --git a/lib/Lingua/JA/NormalizeText.pm b/lib/Lingua/JA/NormalizeText.pm new file mode 100644 index 0000000..1cee6e2 --- /dev/null +++ b/lib/Lingua/JA/NormalizeText.pm @@ -0,0 +1,116 @@ +package Lingua::JA::NormalizeText; + +use 5.008_001; +use strict; +use warnings; +use utf8; + +use Carp (); +use Exporter qw/import/; +use Unicode::Normalize qw/NFKC NFKD NFC NFD/; +use HTML::Entities qw/decode_entities/; + +our $VERSION = '0.00_1'; +our @EXPORT = qw(); +our @EXPORT_OK = qw(nfkc nfkd nfc nfd decode_entities); +our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] ); + +my @AVAILABLE_OPTS = qw/lc nfkc nfkd nfc nfd decode_entities/; + + +sub new +{ + my ($class, @opts) = @_; + my $self = bless {}, $class; + + $self->{converters} = []; + + my %set = map { $_ => 1 } @opts; + + Carp::croak("at least one option is needed") unless scalar
@opts; + + for my $available_opt (@AVAILABLE_OPTS) # walk the canonical list so converters always run in this fixed order + { + if (delete $set{$available_opt}) + { + push(@{ $self->{converters} }, $available_opt); + } + } + + Carp::croak( "unknown option(s): " . join(', ', sort keys %set) ) if keys %set; # sorted: hash key order is not stable + + return $self; +} + +sub normalize +{ + my ($self, $text) = @_; + + if (!defined $text) + { + Carp::carp('undefined text'); + return; + } + + { + no strict 'refs'; # converters are stored as sub names and invoked as symbolic references + $text = $_->($text) for @{ $self->{converters} }; + } + + return $text; +} + +sub lc { CORE::lc(shift); } # CORE:: names the builtin; a bare lc() is ambiguous inside this package +sub nfkc { Unicode::Normalize::NFKC(shift); } +sub nfkd { Unicode::Normalize::NFKD(shift); } +sub nfc { Unicode::Normalize::NFC(shift); } +sub nfd { Unicode::Normalize::NFD(shift); } + +# Disabled draft converter (not listed in @AVAILABLE_OPTS), kept for reference. +# sub wavetilde2long +# { +# my $tilde = chr(hex("FF5E")); +# my $wave = chr(hex("301C")); +# my $long = chr(hex("30FC")); +# +# my $text = shift; +# $text =~ s/[$wave$tilde]/$long/eg; +# +# return $text; +# } +# Commented out with '#' rather than a POD =begin/=end region, which would +# need a format name and blank-line-separated paragraphs to be valid POD. + +{ + no warnings 'redefine'; # this sub replaces the decode_entities imported from HTML::Entities + sub decode_entities { HTML::Entities::decode_entities(shift); } +} + +1; + +__END__ + +=head1 NAME + +Lingua::JA::NormalizeText - normalizes text + +=head1 SYNOPSIS + + use Lingua::JA::NormalizeText; + +=head1 DESCRIPTION + +Lingua::JA::NormalizeText normalizes text. + +=head1 AUTHOR + +pawa E<lt>pawapawa@cpan.orgE<gt> + +=head1 SEE ALSO + +=head1 LICENSE + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself.
+ +=cut diff --git a/t/00_compile.t b/t/00_compile.t new file mode 100644 index 0000000..68b8374 --- /dev/null +++ b/t/00_compile.t @@ -0,0 +1,5 @@ +use strict; +use warnings; +use Test::More tests => 1; + +BEGIN { use_ok 'Lingua::JA::NormalizeText' } diff --git a/t/01_basic.t b/t/01_basic.t new file mode 100644 index 0000000..9e4d2ab --- /dev/null +++ b/t/01_basic.t @@ -0,0 +1,33 @@ +use strict; +use warnings; +use Lingua::JA::NormalizeText; +use Test::More; +use Test::Fatal; +use Test::Warn; + + +my @subs = qw/new normalize lc nfkc nfkd nfc nfd decode_entities/; +can_ok('Lingua::JA::NormalizeText', @subs); + +my $exception = exception{ Lingua::JA::NormalizeText->new; }; +like($exception, qr/at least/, 'at least one option exception'); + +$exception = exception{ Lingua::JA::NormalizeText->new(qw/cl ld/); }; +like($exception, qr/unknown option\(s\): (?:cl, ld|ld, cl)/, 'unknown option exception'); # names come from hash keys, so accept either order + +$exception = exception{ Lingua::JA::NormalizeText->new(qw/lc cl/); }; +like($exception, qr/unknown option\(s\): cl/, 'unknown option exception'); + +$exception = exception{ Lingua::JA::NormalizeText->new(qw/lc nfc/); }; +is($exception, undef, 'no exception'); + + +my $normalizer = Lingua::JA::NormalizeText->new(qw/lc/); +isa_ok($normalizer, 'Lingua::JA::NormalizeText'); + +my $result; +warning_is { $result = $normalizer->normalize } 'undefined text', +'undefined text exception'; +is($result, undef, 'result of normalizing undefined text'); + +done_testing; diff --git a/t/02_lc.t b/t/02_lc.t new file mode 100644 index 0000000..82860c8 --- /dev/null +++ b/t/02_lc.t @@ -0,0 +1,10 @@ +use strict; +use warnings; +use Lingua::JA::NormalizeText; +use Test::More; + +my $normalizer = Lingua::JA::NormalizeText->new(qw/lc/); + +is($normalizer->normalize("DdD"), 'ddd', 'lc'); # is() compares the value; ok() would only check truthiness + +done_testing; diff --git a/t/03_nfkc_nfkd_nfc_nfd.t b/t/03_nfkc_nfkd_nfc_nfd.t new file mode 100644 index 0000000..33df7b9 --- /dev/null +++ b/t/03_nfkc_nfkd_nfc_nfd.t @@ -0,0 +1,25 @@ +use strict; +use warnings; +use utf8; +use
Lingua::JA::NormalizeText qw/nfkc nfkd nfc nfd/; +use Test::More; + +binmode Test::More->builder->$_ => ':utf8' +for qw/output failure_output todo_output/; + +is( nfkc('㌦'), "\x{30C9}\x{30EB}", 'NFKC' ); # ドル +is( length nfkc('㌦'), 2, 'NFKC' ); + +is( nfkd('㌦'), "\x{30C8}\x{3099}\x{30EB}", 'NFKD' ); # ト U+3099 ル (length: 3) +is( length nfkd('㌦'), 3, 'NFKD' ); + +is( nfc("A\x{0301}"), "\x{00C1}", 'NFC' ); # escapes pin the composed/decomposed forms explicitly +is( nfc('①'), '①', 'NFC' ); + +is( nfd('①'), '①', 'NFD' ); +is( nfd("\x{00C1}"), "A\x{0301}", 'NFD' ); + +my $normalizer = Lingua::JA::NormalizeText->new(qw/nfkc/); +is($normalizer->normalize('㌻'), "\x{30DA}\x{30FC}\x{30B8}", 'NFKC'); # ページ + +done_testing; diff --git a/t/04_decode_entities.t b/t/04_decode_entities.t new file mode 100644 index 0000000..8510b7c --- /dev/null +++ b/t/04_decode_entities.t @@ -0,0 +1,15 @@ +use strict; +use warnings; +use utf8; +use Lingua::JA::NormalizeText qw/decode_entities/; +use Test::More; + +binmode Test::More->builder->$_ => ':utf8' +for qw/output failure_output todo_output/; + +is(decode_entities('&hearts;'), '♥', 'decode_entities function'); # input must be an entity for the test to exercise decoding + +my $normalizer = Lingua::JA::NormalizeText->new(qw/decode_entities/); +is($normalizer->normalize('&hearts;'), '♥', 'decode_entities via normalize'); + +done_testing; diff --git a/xt/01_podspell.t b/xt/01_podspell.t new file mode 100644 index 0000000..bf9258e --- /dev/null +++ b/xt/01_podspell.t @@ -0,0 +1,12 @@ +use strict; +use warnings; +use Test::More; +eval q{ use Test::Spelling }; +plan skip_all => "Test::Spelling is not installed." if $@; +add_stopwords(map { split /[\s\:\-]/ } <DATA>); # stopwords are read from the __DATA__ section below +$ENV{LANG} = 'C'; +all_pod_files_spelling_ok('lib'); +__DATA__ +pawa +pawapawa@cpan.org +Lingua::JA::NormalizeText diff --git a/xt/02_perlcritic.t b/xt/02_perlcritic.t new file mode 100644 index 0000000..b977df8 --- /dev/null +++ b/xt/02_perlcritic.t @@ -0,0 +1,9 @@ +use strict; +use warnings; +use Test::More; +eval { + require Test::Perl::Critic; + Test::Perl::Critic->import( -profile => 'xt/perlcriticrc'); +}; +plan skip_all => "Test::Perl::Critic is not installed."
if $@; +all_critic_ok('lib'); diff --git a/xt/03_pod.t b/xt/03_pod.t new file mode 100644 index 0000000..437887a --- /dev/null +++ b/xt/03_pod.t @@ -0,0 +1,6 @@ +use strict; +use warnings; +use Test::More; +eval "use Test::Pod 1.00"; # string eval: a missing Test::Pod only sets $@ instead of aborting the script +plan skip_all => "Test::Pod 1.00 required for testing POD" if $@; +all_pod_files_ok(); diff --git a/xt/04_synopsis.t b/xt/04_synopsis.t new file mode 100644 index 0000000..d414066 --- /dev/null +++ b/xt/04_synopsis.t @@ -0,0 +1,6 @@ +use strict; +use warnings; +use Test::More; +eval "use Test::Synopsis"; # string eval: a missing Test::Synopsis only sets $@ +plan skip_all => "Test::Synopsis required for testing" if $@; +all_synopsis_ok(); diff --git a/xt/perlcriticrc b/xt/perlcriticrc new file mode 100644 index 0000000..fa96144 --- /dev/null +++ b/xt/perlcriticrc @@ -0,0 +1,2 @@ +[TestingAndDebugging::ProhibitNoStrict] +allow=refs