Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

added _h2z subroutine

  • Loading branch information...
commit 0f8b8c4e2fe0ba07134e2ea99f94b1052acce084 1 parent 82f7fed
@pawa- authored
View
14 README
@@ -3,6 +3,7 @@ NAME
SYNOPSIS
use Text::KyTea;
+ use utf8;
my $kytea = Text::KyTea->new(%config);
my $results = $kytea->parse($text);
@@ -29,14 +30,14 @@ DESCRIPTION
Japanese, Chinese and other languages requiring word or morpheme
segmentation.
- This module works under KyTea Ver.0.4.x. Under old versions of KyTea,
- this might not works.
+ This module works under KyTea Ver.0.3.2 and later. Under old versions of
+ KyTea, this might not work.
If you've changed default install directory of KyTea, please install
- Text::KyTea with interactive mode (e.g. cpanm --interactive or cpanm
+ Text::KyTea with interactive mode (e.g., cpanm --interactive or cpanm
-v).
- For more information about KyTea, please see the SEE ALSO.
+ For more information about KyTea, please see the "SEE ALSO" section.
METHODS
new(%config)
@@ -44,6 +45,7 @@ METHODS
my $kytea = Text::KyTea->new(
model => 'model.bin', # default is '/usr/local/share/kytea/model.bin'
+ h2z => 0, # default is 1 (enable)
notag => [1,2], # default is []
nounk => 0, # default is 0 (estimates the pronunciation of unknown words)
unkbeam => 50, # default is 50
@@ -52,6 +54,10 @@ METHODS
unktag => '', # default is ''
);
+ new(h2z => 1)
+ Converts $text from hankaku (half-width) to zenkaku (full-width) before
+ parsing it. This option improves the parsing accuracy for most model files.
+
read_model($path)
Reads the given model file. The model file should be read by
new(model => $path) method.
View
53 lib/Text/KyTea.pm
@@ -3,18 +3,21 @@ use 5.008_001;
use strict;
use warnings;
use Carp;
+use Data::Recursive::Encode;
+use Lingua::JA::Regular::Unicode qw/alnum_h2z space_h2z katakana_h2z/;
-our $VERSION = '0.23_1';
-
+our $VERSION = '0.30';
require XSLoader;
XSLoader::load(__PACKAGE__, $VERSION);
+
sub _options
{
return {
# analysis options
model => '/usr/local/share/kytea/model.bin',
+ h2z => 1,
nows => 0,
notags => 0,
notag => [],
@@ -55,6 +58,36 @@ sub new
return _init_text_kytea($class, $options);
}
+sub _h2z { katakana_h2z( space_h2z( alnum_h2z($_[0]) ) ); } # Returns the first argument passed through alnum_h2z, then space_h2z, then katakana_h2z (hankaku -> zenkaku conversion).
+
+sub parse # Parses $text via $self->_parse and returns the result structure after UTF-8 decoding.
+{
+ my ($self, $text) = @_;
+ # When h2z is enabled, the text is converted with _h2z before being handed to _parse.
+ my $is_h2z_enable = $self->_is_h2z_enable;
+
+ if ($is_h2z_enable)
+ {
+ my @original_chars = split(//, $text); # keep the unconverted characters so surfaces can be restored below
+ my $text = _h2z($text); # inner `my $text` shadows the parameter; the right-hand side still reads the outer, unconverted $text
+ # Parse the converted text; decode_utf8 is applied recursively to the whole result structure.
+ my $results = Data::Recursive::Encode->decode_utf8( $self->_parse($text) );
+ # $i is the running character offset into @original_chars.
+ my $i = 0;
+
+ # Replace each converted surface with the original characters at the same offsets, walking left to right.
+ for my $result (@{$results})
+ {
+ $result->{surface} = join( '', @original_chars[$i .. $i + (length $result->{surface}) - 1] );
+ $i += length $result->{surface};
+ }
+ # NOTE(review): the offset mapping assumes _h2z preserves the character count (one output char per input char) - TODO confirm for katakana with voiced-sound marks.
+ return $results;
+ }
+ # h2z disabled: parse the text exactly as given.
+ return Data::Recursive::Encode->decode_utf8( $self->_parse($text) );
+}
+
1;
__END__
@@ -70,6 +103,7 @@ my ($text, %config, $path);
=head1 SYNOPSIS
use Text::KyTea;
+ use utf8;
my $kytea = Text::KyTea->new(%config);
my $results = $kytea->parse($text);
@@ -98,14 +132,14 @@ KyTea is a general toolkit developed for analyzing text,
with a focus on Japanese, Chinese and other languages
requiring word or morpheme segmentation.
-This module works under KyTea Ver.0.4.x.
-Under old versions of KyTea, this might not works.
+This module works under KyTea Ver.0.3.2 and later.
+Under old versions of KyTea, this might not work.
If you've changed default install directory of KyTea,
please install Text::KyTea with interactive mode
-(e.g. cpanm --interactive or cpanm -v).
+(e.g., cpanm --interactive or cpanm -v).
-For more information about KyTea, please see the SEE ALSO.
+For more information about KyTea, please see the "SEE ALSO" section.
=head1 METHODS
@@ -118,6 +152,7 @@ Creates a new Text::KyTea instance.
my $kytea = Text::KyTea->new(
model => 'model.bin', # default is '/usr/local/share/kytea/model.bin'
+ h2z => 0, # default is 1 (enable)
notag => [1,2], # default is []
nounk => 0, # default is 0 (estimates the pronunciation of unknown words)
unkbeam => 50, # default is 50
@@ -127,6 +162,12 @@ Creates a new Text::KyTea instance.
);
+=item new(h2z => 1)
+
+Converts $text from hankaku (half-width) to zenkaku (full-width) before parsing it.
+This option improves the parsing accuracy for most model files.
+
+
=item read_model($path)
Reads the given model file.
View
3  model/train.sh
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-train-kytea -full test.txt -model test.mod
View
85 t/03_notag.t
@@ -1,85 +0,0 @@
-use strict;
-use warnings;
-use Test::Base;
-plan tests => 6 * blocks;
-
-use Text::KyTea;
-
-my $model_path = './model/test.mod';
-
-my $kytea_notag1 = Text::KyTea->new(
- model => $model_path,
- notag => [1],
-);
-
-my $kytea_notag2 = Text::KyTea->new(
- model => $model_path,
- notag => [2],
-);
-
-my $kytea_notag12 = Text::KyTea->new(
- model => $model_path,
- notag => [1,2],
-);
-
-run
-{
- my $block = shift;
- my $results_notag1 = $kytea_notag1->parse($block->input);
- my $results_notag2 = $kytea_notag2->parse($block->input);
- my $results_notag12 = $kytea_notag12->parse($block->input);
-
- my ($notag1_surface, @notag1_features) = split_results($results_notag1);
- my ($notag2_surface, @notag2_features) = split_results($results_notag2);
- my ($notag12_surface, @notag12_features) = split_results($results_notag12);
-
- is($notag1_surface, $block->expected_notag1_surf);
- is($notag2_surface, $block->expected_notag2_surf);
- is($notag12_surface, $block->expected_notag12_surf);
- is("@notag1_features", $block->expected_notag1_features);
- is("@notag2_features", $block->expected_notag2_features);
- is("@notag12_features", $block->expected_notag12_features);
-};
-
-
-sub split_results
-{
- my $results = shift;
-
- my ($surf, @features);
-
- for my $result (@{$results})
- {
- $surf .= $result->{surface};
-
- for my $tags (@{$result->{tags}})
- {
- if ($tags->[0])
- {
- push(@features, $tags->[0]{feature});
- }
- }
- }
-
- return ($surf, @features);
-}
-
-
-__DATA__
-===
---- input: コーパスの文です。
---- expected_notag1_surf: コーパスの文です。
---- expected_notag2_surf: コーパスの文です。
---- expected_notag12_surf: コーパスの文です。
---- expected_notag1_features: こーぱす の ぶん で す 。
---- expected_notag2_features: 名詞 助詞 名詞 助動詞 語尾 補助記号
---- expected_notag12_features:
-
-===
---- input: もうひとつの文です。
---- expected_notag1_surf: もうひとつの文です。
---- expected_notag2_surf: もうひとつの文です。
---- expected_notag12_surf: もうひとつの文です。
---- expected_notag1_features: もう ひと つ の ぶん で す 。
---- expected_notag2_features: 副詞 名詞 接尾辞 助詞 名詞 助動詞 語尾 補助記号
---- expected_notag12_features:
View
30 t/04_tagmax.t
@@ -1,30 +0,0 @@
-use strict;
-use warnings;
-use Text::KyTea;
-use Test::More;
-
-my $kytea = Text::KyTea->new(
- model => './model/test.mod',
- tagmax => 1,
-);
-
-tagmax_test( $kytea->parse("コーパスの文です。") );
-tagmax_test( $kytea->parse("もうひとつの文です。") );
-
-
-done_testing;
-
-
-sub tagmax_test
-{
- my $results = shift;
-
- for my $result (@{$results})
- {
- my $p_of_s_tag = $result->{tags}[0];
- is(scalar @{$p_of_s_tag}, 1);
-
- my $pron = $result->{tags}[1];
- is(scalar @{$pron}, 1);
- }
-}
View
64 t/05_deftag.t
@@ -1,64 +0,0 @@
-use strict;
-use warnings;
-use Test::Base;
-plan tests => 3 * blocks;
-
-use Text::KyTea;
-
-my $kytea = Text::KyTea->new(
- model => './model/test.mod',
- deftag => '(´・ω・`)',
-);
-
-run
-{
- my $block = shift;
- my $results = $kytea->parse($block->input);
-
- my ($surf, $pron, @p_of_s) = split_results($results);
-
- is($surf, $block->expected_surf);
- is($pron, $block->expected_pron);
- is("@p_of_s", $block->expected_p_of_s);
-};
-
-
-sub split_results
-{
- my $results = shift;
-
- my ($surf, $pron, @p_of_s);
-
- for my $result (@{$results})
- {
- $surf .= $result->{surface};
-
- my $p_of_s_tag = $result->{tags}[0];
- push(@p_of_s, $p_of_s_tag->[0]{feature});
-
- my $pron_tag = $result->{tags}[1];
- $pron .= $pron_tag->[0]{feature};
- }
-
- return ($surf, $pron, @p_of_s);
-}
-
-
-__DATA__
-===
---- input: コーパスの文です。
---- expected_surf: コーパスの文です。
---- expected_pron: こーぱすのぶんです。
---- expected_p_of_s: 名詞 助詞 名詞 助動詞 語尾 補助記号
-
-===
---- input: もうひとつの文です。
---- expected_surf: もうひとつの文です。
---- expected_pron: もうひとつのぶんです。
---- expected_p_of_s: 副詞 名詞 接尾辞 助詞 名詞 助動詞 語尾 補助記号
-
-===
---- input: XXYBA
---- expected_surf: XXYBA
---- expected_pron: (´・ω・`)(´・ω・`)(´・ω・`)(´・ω・`)(´・ω・`)
---- expected_p_of_s: (´・ω・`) (´・ω・`) (´・ω・`) (´・ω・`) (´・ω・`)
Please sign in to comment.
Something went wrong with that request. Please try again.