Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
pawa- committed May 28, 2012
0 parents commit e5feb7c
Show file tree
Hide file tree
Showing 20 changed files with 410 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .gitignore
@@ -0,0 +1,11 @@
cover_db
META.yml
Makefile
blib
inc
pm_to_blib
MANIFEST
Makefile.old
nytprof.out
MANIFEST.bak
*.sw[po]
1 change: 1 addition & 0 deletions .shipit
@@ -0,0 +1 @@
steps = FindVersion, ChangeVersion, CheckChangeLog, DistTest, Commit, Tag, MakeDist, UploadCPAN
2 changes: 2 additions & 0 deletions Changes
@@ -0,0 +1,2 @@
Revision history for Perl extension Lingua::JA::NormalizeText

27 changes: 27 additions & 0 deletions INSTALL
@@ -0,0 +1,27 @@
This is Perl module Lingua::JA::NormalizeText.

INSTALLATION

Lingua::JA::NormalizeText installation is straightforward. If your CPAN shell is set up,
you should just be able to do

% cpan Lingua::JA::NormalizeText

Otherwise, download it, unpack it, then build it as usual:

% perl Makefile.PL
% make && make test

Then install it:

% make install

DOCUMENTATION

Lingua::JA::NormalizeText documentation is available in POD format, so you can do:

% perldoc Lingua::JA::NormalizeText

to read the documentation online with your favorite pager.

pawa-
21 changes: 21 additions & 0 deletions MANIFEST.SKIP
@@ -0,0 +1,21 @@
\bRCS\b
\bCVS\b
^MANIFEST\.
^Makefile$
~$
^#
\.old$
^blib/
^pm_to_blib
^MakeMaker-\d
\.gz$
\.cvsignore
^t/9\d_.*\.t
^t/perlcritic
^tools/
\.svn/
^[^/]+\.yaml$
^[^/]+\.pl$
^\.shipit$
^\.git/
\.sw[po]$
59 changes: 59 additions & 0 deletions MYMETA.json
@@ -0,0 +1,59 @@
{
"abstract" : "normalizes text",
"author" : [
"pawa <pawapawa@cpan.org>"
],
"dynamic_config" : 0,
"generated_by" : "Module::Install version 1.06, CPAN::Meta::Converter version 2.112150",
"license" : [
"perl_5"
],
"meta-spec" : {
"url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
"version" : "2"
},
"name" : "Lingua-JA-NormalizeText",
"no_index" : {
"directory" : [
"inc",
"t",
"xt"
]
},
"prereqs" : {
"build" : {
"requires" : {
"ExtUtils::MakeMaker" : "6.59",
"Test::Fatal" : 0,
"Test::More" : "0.88"
}
},
"configure" : {
"requires" : {
"ExtUtils::MakeMaker" : "6.59"
}
},
"runtime" : {
"requires" : {
"Carp" : 0,
"Exporter" : 0,
"HTML::Entities" : "3.66",
"Unicode::Normalize" : "1",
"perl" : "5.008001"
}
}
},
"release_status" : "testing",
"resources" : {
"bugtracker" : {
"web" : "https://github.com/pawa-/Lingua-JA-NormalizeText/issues"
},
"license" : [
"http://dev.perl.org/licenses"
],
"repository" : {
"url" : "https://github.com/pawa-/Lingua-JA-NormalizeText"
}
},
"version" : "0.00_1"
}
33 changes: 33 additions & 0 deletions MYMETA.yml
@@ -0,0 +1,33 @@
---
abstract: 'normalizes text'
author:
- 'pawa <pawapawa@cpan.org>'
build_requires:
ExtUtils::MakeMaker: 6.59
Test::Fatal: 0
Test::More: 0.88
configure_requires:
ExtUtils::MakeMaker: 6.59
dynamic_config: 0
generated_by: 'Module::Install version 1.06, CPAN::Meta::Converter version 2.112150'
license: perl
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
version: 1.4
name: Lingua-JA-NormalizeText
no_index:
directory:
- inc
- t
- xt
requires:
Carp: 0
Exporter: 0
HTML::Entities: 3.66
Unicode::Normalize: 1
perl: 5.008001
resources:
bugtracker: https://github.com/pawa-/Lingua-JA-NormalizeText/issues
license: http://dev.perl.org/licenses
repository: https://github.com/pawa-/Lingua-JA-NormalizeText
version: 0.00_1
25 changes: 25 additions & 0 deletions Makefile.PL
@@ -0,0 +1,25 @@
# Build script (Module::Install DSL) for Lingua-JA-NormalizeText.
# Name, version, abstract, author and license are read from the main module.
use inc::Module::Install;

name 'Lingua-JA-NormalizeText';
all_from 'lib/Lingua/JA/NormalizeText.pm';

# Runtime prerequisites.
# Versions are quoted: a bare numeric literal such as 1.00 is stringified
# as "1", which is what ends up in the generated metadata.
requires 'Carp';
requires 'Exporter'           => '5.57'; # the module does `use Exporter qw/import/`
requires 'Unicode::Normalize' => '1.00';
requires 'HTML::Entities'     => '3.66';

# Test-only prerequisites.
test_requires 'Test::More' => '0.88'; # done_testing
test_requires 'Test::Fatal';
test_requires 'Test::Warn';           # warning_is in t/01_basic.t

tests 't/*.t';
author_tests 'xt';

readme_from 'lib/Lingua/JA/NormalizeText.pm';

resources(
    license    => 'http://dev.perl.org/licenses',
    repository => 'https://github.com/pawa-/Lingua-JA-NormalizeText',
    bugtracker => 'https://github.com/pawa-/Lingua-JA-NormalizeText/issues',
);

WriteAll;
Empty file added README
Empty file.
116 changes: 116 additions & 0 deletions lib/Lingua/JA/NormalizeText.pm
@@ -0,0 +1,116 @@
package Lingua::JA::NormalizeText;

use 5.008_001;
use strict;
use warnings;
use utf8;

use Carp ();
use Exporter qw/import/;
use Unicode::Normalize qw/NFKC NFKD NFC NFD/;
use HTML::Entities qw/decode_entities/;

# Module version.
our $VERSION = '0.00_1';
# Nothing is exported by default.
our @EXPORT = qw();
# Functions that callers may import on request. Note that `lc` is accepted
# as a constructor option (see @AVAILABLE_OPTS) but is not listed here.
our @EXPORT_OK = qw(nfkc nfkd nfc nfd decode_entities);
# The `:all` tag covers everything in @EXPORT and @EXPORT_OK.
our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );

# Option names accepted by new(). new() walks this list in order, so it
# also fixes the order in which the selected converters are stored.
my @AVAILABLE_OPTS = qw/lc nfkc nfkd nfc nfd decode_entities/;


# Constructor.
#
# Arguments: a list of option names, each of which must appear in
#            @AVAILABLE_OPTS. Duplicates are collapsed.
# Returns:   a blessed object whose {converters} array holds the selected
#            option names in @AVAILABLE_OPTS order (not in the order given).
# Croaks:    "at least one option is needed" when called without options;
#            "unknown option(s): ..." when any name is not recognised.
sub new
{
    my ($class, @opts) = @_;

    Carp::croak("at least one option is needed") unless @opts;

    my %requested = map { $_ => 1 } @opts;

    # delete() returns the stored true value for requested names, so this
    # both selects the known options and removes them from %requested.
    my @converters = grep { delete $requested{$_} } @AVAILABLE_OPTS;

    # Whatever is left was not recognised. Sort the names so that the error
    # message does not depend on hash ordering.
    if (my @unknown = sort keys %requested)
    {
        Carp::croak( "unknown option(s): " . join(', ', @unknown) );
    }

    return bless { converters => \@converters }, $class;
}

# Applies every converter stored in $self->{converters}, in order, to $text.
#
# Arguments: $text - the string to normalize.
# Returns:   the converted string; nothing (undef in scalar context, the
#            empty list in list context) when $text is undefined.
# Warns:     carps 'undefined text' when $text is undefined.
# Croaks:    when a stored converter name has no sub in this package.
sub normalize
{
    my ($self, $text) = @_;

    if (!defined $text)
    {
        Carp::carp('undefined text');
        return;
    }

    for my $name (@{ $self->{converters} })
    {
        # Look the converter up by name in this package instead of calling
        # it through a symbolic reference, so strict refs can stay enabled.
        my $converter = __PACKAGE__->can($name)
            or Carp::croak("unknown converter: $name");

        $text = $converter->($text);
    }

    return $text;
}

# Returns a lower-cased copy of the first argument.
sub lc
{
    my ($text) = @_;

    # CORE:: makes it explicit that the builtin, not this sub, is called.
    return CORE::lc($text);
}
# Returns Unicode::Normalize::NFKC applied to the first argument.
sub nfkc
{
    my $text = shift;
    return Unicode::Normalize::NFKC($text);
}
# Returns Unicode::Normalize::NFKD applied to the first argument.
sub nfkd
{
    my $text = shift;
    return Unicode::Normalize::NFKD($text);
}
# Returns Unicode::Normalize::NFC applied to the first argument.
sub nfc
{
    my $text = shift;
    return Unicode::Normalize::NFC($text);
}
# Returns Unicode::Normalize::NFD applied to the first argument.
sub nfd
{
    my $text = shift;
    return Unicode::Normalize::NFD($text);
}

=begin
sub wavetilde2long
{
my $tilde = chr(hex("FF5E"));
my $wave = chr(hex("301C"));
my $long = chr(hex("30FC"));
my $text = shift;
$text =~ s/[$wave$tilde]/$long/eg;
return $text;
}
=end
=cut

# This package defines its own decode_entities even though the module also
# imports a sub of that name from HTML::Entities, so the 'redefine' warning
# is switched off for the definition.
{
    no warnings 'redefine';

    # Forwards the first argument to HTML::Entities::decode_entities and
    # returns whatever that call returns.
    sub decode_entities
    {
        return HTML::Entities::decode_entities(shift @_);
    }
}

1;

__END__
=head1 NAME
Lingua::JA::NormalizeText - normalizes text
=head1 SYNOPSIS
use Lingua::JA::NormalizeText;
=head1 DESCRIPTION
Lingua::JA::NormalizeText normalizes text.
=head1 AUTHOR
pawa E<lt>pawapawa@cpan.orgE<gt>
=head1 SEE ALSO
=head1 LICENSE
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
4 changes: 4 additions & 0 deletions t/00_compile.t
@@ -0,0 +1,4 @@
# Smoke test: the module must load without errors.
use strict;
use warnings;
use Test::More tests => 1;

BEGIN { use_ok 'Lingua::JA::NormalizeText' }
33 changes: 33 additions & 0 deletions t/01_basic.t
@@ -0,0 +1,33 @@
# Basic API checks: available subs, constructor validation, and the
# behaviour of normalize() on undefined input.
use strict;
use warnings;
use Lingua::JA::NormalizeText;
use Test::More;
use Test::Fatal;
use Test::Warn;


my @subs = qw/new normalize lc nfkc nfkd nfc nfd decode_entities/;
can_ok('Lingua::JA::NormalizeText', @subs);

my $exception = exception{ Lingua::JA::NormalizeText->new; };
like($exception, qr/at least/, 'at least one option exception');

# The unknown names may be reported in either order: the constructor builds
# the list from hash keys, whose order is not guaranteed.
$exception = exception{ Lingua::JA::NormalizeText->new(qw/cl ld/); };
like($exception, qr/unknown option\(s\): (?:cl, ld|ld, cl)/, 'unknown option exception');

$exception = exception{ Lingua::JA::NormalizeText->new(qw/lc cl/); };
like($exception, qr/unknown option\(s\): cl/, 'unknown option exception');

$exception = exception{ Lingua::JA::NormalizeText->new(qw/lc nfc/); };
is($exception, undef, 'no exception');


my $normalizer = Lingua::JA::NormalizeText->new(qw/lc/);
isa_ok($normalizer, 'Lingua::JA::NormalizeText');

# normalize() without text must warn and return nothing.
my $result;
warning_is { $result = $normalizer->normalize } 'undefined text',
    'undefined text exception';
is($result, undef, 'result of normalizing undefined text');

done_testing;
10 changes: 10 additions & 0 deletions t/02_lc.t
@@ -0,0 +1,10 @@
# Checks the 'lc' converter through the object interface.
use strict;
use warnings;
use Lingua::JA::NormalizeText;
use Test::More;

my $normalizer = Lingua::JA::NormalizeText->new(qw/lc/);

# is() compares the result with the expected value; ok() would only check
# that the result is true and would treat 'ddd' as the test name.
is($normalizer->normalize("DdD"), 'ddd', 'lc');

done_testing;
25 changes: 25 additions & 0 deletions t/03_nfkc_nfkd_nfc_nfd.t
@@ -0,0 +1,25 @@
# Checks the four Unicode normalization converters, both as imported
# functions and through the object interface.
use strict;
use warnings;
use utf8;
use Lingua::JA::NormalizeText qw/nfkc nfkd nfc nfd/;
use Test::More;

binmode Test::More->builder->$_ => ':utf8'
    for qw/output failure_output todo_output/;

# All test data is spelled with \x{...} escapes so that precomposed and
# decomposed forms cannot be confused or lost when the file is edited,
# copied or re-encoded.
my $square_doru       = "\x{3326}";                 # SQUARE DORU
my $square_peezi      = "\x{333B}";                 # SQUARE PEEZI
my $doru_composed     = "\x{30C9}\x{30EB}";         # DO, RU
my $doru_decomposed   = "\x{30C8}\x{3099}\x{30EB}"; # TO, combining voiced mark, RU
my $peezi_composed    = "\x{30DA}\x{30FC}\x{30B8}"; # PE, long mark, ZI
my $a_acute_composed  = "\x{C1}";                   # A WITH ACUTE
my $a_acute_decomposed = "A\x{301}";                # A, COMBINING ACUTE

is( nfkc($square_doru), $doru_composed, 'NFKC' );
is( length nfkc($square_doru), 2, 'NFKC' );

is( nfkd($square_doru), $doru_decomposed, 'NFKD' );
is( length nfkd($square_doru), 3, 'NFKD' );

is( nfc($a_acute_decomposed), $a_acute_composed, 'NFC' );
# NOTE(review): the input for this check and for the first NFD check was not
# recoverable from the file; the square character is used on the assumption
# that canonical forms must leave compatibility characters alone - confirm.
is( nfc($square_doru), $square_doru, 'NFC' );

is( nfd($square_doru), $square_doru, 'NFD' );
is( nfd($a_acute_composed), $a_acute_decomposed, 'NFD' );

my $normalizer = Lingua::JA::NormalizeText->new(qw/nfkc/);
is($normalizer->normalize($square_peezi), $peezi_composed, 'NFKC');

done_testing;
15 changes: 15 additions & 0 deletions t/04_decode_entities.t
@@ -0,0 +1,15 @@
# Checks the decode_entities converter, both as an imported function and
# through the object interface.
use strict;
use warnings;
use utf8;
use Lingua::JA::NormalizeText qw/decode_entities/;
use Test::More;

binmode Test::More->builder->$_ => ':utf8'
    for qw/output failure_output todo_output/;

# &hearts; is the HTML entity for U+2665 (BLACK HEART SUIT). The expected
# value is written as an escape so it survives re-encoding of this file.
my $heart = "\x{2665}";

is(decode_entities('&hearts;'), $heart, 'decode_entities function');

my $normalizer = Lingua::JA::NormalizeText->new(qw/decode_entities/);
is($normalizer->normalize('&hearts;'), $heart, 'decode_entities converter');

done_testing;

0 comments on commit e5feb7c

Please sign in to comment.