Skip to content

Commit

Permalink
added db_auto option, changed Lingua::JA::TFWebIDF dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
pawa- committed Jul 30, 2012
1 parent d8f1fe9 commit d9e16f5
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 30 deletions.
3 changes: 2 additions & 1 deletion Makefile.PL
Expand Up @@ -4,11 +4,12 @@ name 'Lingua-JA-TermExtractor';
all_from 'lib/Lingua/JA/TermExtractor.pm';

requires 'Carp';
requires 'Lingua::JA::TFWebIDF' => 0.34;
requires 'Lingua::JA::TFWebIDF' => 0.36;

test_requires 'Test::More' => 0.88; # done_testing
test_requires 'Test::Fatal';
test_requires 'Test::Warn' => 0.20;
test_requires 'Test::Requires';

tests 't/*.t';
author_tests 'xt';
Expand Down
20 changes: 11 additions & 9 deletions README
Expand Up @@ -18,10 +18,10 @@ SYNOPSIS
term_length_min => 2,
tf_min => 2,
df_min => 1_0000,
df_max => 500_0000,
df_max => 1000_0000,
ng_word => [qw/編集 本人 自身 自分 たち さん/],
fetch_unk_word_df => 0,
concatenation_max => 100,
concat_max => 100,
);

p $extractor->extract($document)->dump;
Expand All @@ -38,6 +38,7 @@ DESCRIPTION
Lingua::JA::TermExtractor is a term extractor. This extracts terms from
a document or documents.

METHODS
new( %config || \%config )
Creates a new Lingua::JA::TermExtractor instance.

Expand All @@ -54,11 +55,12 @@ DESCRIPTION
ng_word []
term_length_min 2
term_length_max 30
concatenation_max 30
concat_max 30
tf_min 1
df_min 0
df_max 250_0000_0000
fetch_unk_word_df 0
db_auto 1

idf_type 1
api 'Yahoo'
Expand All @@ -76,8 +78,8 @@ DESCRIPTION
b => $value
The weight of document length normalization.

pos(1|2|3)_filter, ng_word, term_length_(min|max), concatenation_max,
tf_min, df_(min|max), fetch_unk_word_df
pos(1|2|3)_filter, ng_word, term_length_(min|max), concat_max, tf_min,
df_(min|max), fetch_unk_word_df, db_auto
See Lingua::JA::TFWebIDF.

idf_type, api, appid, driver, df_file, fetch_df, expires_in, documents,
Expand All @@ -86,7 +88,7 @@ DESCRIPTION

extract( $document || \@documents )
Extracts terms from $document or \@documents. Word segmentation and POS
tagging is done with MeCab.
tagging are done with MeCab.

tfidf, tf
See Lingua::JA::TFWebIDF.
Expand All @@ -98,11 +100,11 @@ AUTHOR
pawa <pawapawa@cpan.org>

SEE ALSO
Lingua::JA::WebIDF.
Lingua::JA::WebIDF

Lingua::JA::WebIDF::Driver::TokyoTyrant.
Lingua::JA::WebIDF::Driver::TokyoTyrant

Lingua::JA::TFWebIDF.
Lingua::JA::TFWebIDF

LICENSE
This library is free software; you can redistribute it and/or modify it
Expand Down
62 changes: 50 additions & 12 deletions lib/Lingua/JA/TermExtractor.pm
Expand Up @@ -17,14 +17,23 @@ sub new
my %args = (ref $_[0] eq 'HASH' ? %{$_[0]} : @_);

$args{idf_type} = 3 unless defined $args{idf_type};
$args{db_auto} = 1 unless defined $args{db_auto};

my %options;
$options{k1} = defined $args{k1} ? delete $args{k1} : 2.0;
$options{b} = defined $args{b} ? delete $args{b} : 0.75;

if ($args{db_auto})
{
$options{db_auto_child} = 1;
$args{db_auto} = 0;
}
else { $options{db_auto_child} = 0; }

my $self = $class->SUPER::new(\%args);
$self->{k1} = $options{k1};
$self->{b} = $options{b};
$self->{k1} = $options{k1};
$self->{b} = $options{b};
$self->{db_auto_child} = $options{db_auto_child};

return $self;
}
Expand All @@ -34,6 +43,9 @@ sub extract
my ($self, $arg) = @_;

my ($k1, $b) = ($self->{k1}, $self->{b});
my $db_auto = $self->{db_auto_child};
my $fetch_df = $self->{fetch_df};
my $fetch_unk_word_df = $self->{fetch_unk_word_df};

my $data = {};

Expand All @@ -52,9 +64,16 @@ sub extract

my $dl_avg = $dl_sum / scalar @{$arg};


if ($db_auto)
{
if ($fetch_df || $fetch_unk_word_df) { $self->db_open('write'); }
else { $self->db_open('read'); }
}

for my $text (@{$arg})
{
my $tfidf = $self->tfidf($text)->dump;
my $tfidf = $self->SUPER::tfidf($text)->dump;
my $dl = length $text;

for my $word (keys %{$tfidf})
Expand All @@ -80,7 +99,14 @@ sub extract
}
else
{
$data = $self->tfidf($arg)->dump;

if ($db_auto)
{
if ($fetch_df || $fetch_unk_word_df) { $self->db_open('write'); }
else { $self->db_open('read'); }
}

$data = $self->SUPER::tfidf($arg)->dump;

#for my $word (keys %{$data})
#{
Expand All @@ -97,6 +123,15 @@ sub extract
#}
}

$self->db_close if $db_auto;

return Lingua::JA::TermExtractor::Result->new($data);
}

sub tfidf
{
my ($self, $arg) = @_;
my $data = $self->SUPER::tfidf($arg, $self->{db_auto_child})->dump;
return Lingua::JA::TermExtractor::Result->new($data);
}

Expand Down Expand Up @@ -131,10 +166,10 @@ my ($appid, $document, @documents);
term_length_min => 2,
tf_min => 2,
df_min => 1_0000,
df_max => 500_0000,
df_max => 1000_0000,
ng_word => [qw/編集 本人 自身 自分 たち さん/],
fetch_unk_word_df => 0,
concatenation_max => 100,
concat_max => 100,
);
p $extractor->extract($document)->dump;
Expand All @@ -153,6 +188,8 @@ my ($appid, $document, @documents);
Lingua::JA::TermExtractor is a term extractor.
This extracts terms from a document or documents.
=head1 METHODS
=head2 new( %config || \%config )
Creates a new Lingua::JA::TermExtractor instance.
Expand All @@ -170,11 +207,12 @@ The following configuration is used if you don't set %config.
ng_word []
term_length_min 2
term_length_max 30
concatenation_max 30
concat_max 30
tf_min 1
df_min 0
df_max 250_0000_0000
fetch_unk_word_df 0
db_auto 1
idf_type 1
api 'Yahoo'
Expand All @@ -196,7 +234,7 @@ The weight of term frequency(TF).
The weight of document length normalization.
=item pos(1|2|3)_filter, ng_word, term_length_(min|max), concatenation_max, tf_min, df_(min|max), fetch_unk_word_df
=item pos(1|2|3)_filter, ng_word, term_length_(min|max), concat_max, tf_min, df_(min|max), fetch_unk_word_df, db_auto
See L<Lingua::JA::TFWebIDF>.
Expand All @@ -209,7 +247,7 @@ See L<Lingua::JA::WebIDF>.
=head2 extract( $document || \@documents )
Extracts terms from $document or \@documents.
Word segmentation and POS tagging is done with MeCab.
Word segmentation and POS tagging are done with MeCab.
=head2 tfidf, tf
Expand All @@ -225,11 +263,11 @@ pawa E<lt>pawapawa@cpan.orgE<gt>
=head1 SEE ALSO
L<Lingua::JA::WebIDF>.
L<Lingua::JA::WebIDF>
L<Lingua::JA::WebIDF::Driver::TokyoTyrant>.
L<Lingua::JA::WebIDF::Driver::TokyoTyrant>
L<Lingua::JA::TFWebIDF>.
L<Lingua::JA::TFWebIDF>
=head1 LICENSE
Expand Down
24 changes: 16 additions & 8 deletions t/04_data_structure.t
Expand Up @@ -19,26 +19,31 @@ my %config = (
term_length_min => 1,
term_length_max => 30,
df_min => 0,
concatenation_max => 0,
concat_max => 0,
);

my $texts = texts();

my $extractor = Lingua::JA::TermExtractor->new(\%config);
ds_check( $extractor->tfidf('テスト')->dump, 'SCALAR', 'TFIDF' );
ds_check( $extractor->extract('テスト')->dump, 'SCALAR' );
ds_check( $extractor->extract($texts)->dump, '' );
ds_check( $extractor->extract($texts)->dump );

$config{concatenation_max} = 100;
$config{concat_max} = 100;
$extractor = Lingua::JA::TermExtractor->new(\%config);
ds_check_concat( $extractor->tfidf('テスト')->dump, 'SCALAR', 'TFIDF' );
ds_check_concat( $extractor->extract('テスト')->dump, 'SCALAR' );
ds_check_concat( $extractor->extract($texts)->dump, '' );
ds_check_concat( $extractor->extract($texts)->dump );

done_testing;


sub ds_check
{
my ($data, $type) = @_;
my ($data, $type, $algorithm) = @_;

$type = 'ARRAY' unless defined $type;
$algorithm = 'BM25' unless defined $algorithm;

for my $word (keys %{$data})
{
Expand All @@ -48,13 +53,16 @@ sub ds_check
like($data->{$word}{unknown}, qr/^[01]$/, 'unknown');
like($data->{$word}{tf}, qr/^[0-9]+$/, 'tf');
like($data->{$word}{tfidf}, qr/^[\.0-9]+$/, 'tfidf') if $type eq 'SCALAR';
like($data->{$word}{bm25}, qr/^[\.0-9]+$/, 'bm25') if $type ne 'SCALAR';
like($data->{$word}{bm25}, qr/^[\.0-9]+$/, 'bm25') if $type ne 'SCALAR' && $algorithm ne 'TFIDF';
}
}

sub ds_check_concat
{
my ($data, $type) = @_;
my ($data, $type, $algorithm) = @_;

$type = 'ARRAY' unless defined $type;
$algorithm = 'BM25' unless defined $algorithm;

for my $word (keys %{$data})
{
Expand All @@ -64,7 +72,7 @@ sub ds_check_concat
is(ref $data->{$word}{unknown}, 'ARRAY', 'unknown');
like($data->{$word}{tf}, qr/^[0-9]+$/, 'tf');
like($data->{$word}{tfidf}, qr/^[\.0-9]+$/, 'tfidf') if $type eq 'SCALAR';
like($data->{$word}{bm25}, qr/^[\.0-9]+$/, 'bm25') if $type ne 'SCALAR';
like($data->{$word}{bm25}, qr/^[\.0-9]+$/, 'bm25') if $type ne 'SCALAR' && $algorithm ne 'TFIDF';
}
}

Expand Down
56 changes: 56 additions & 0 deletions t/05_db_auto.t
@@ -0,0 +1,56 @@
use strict;
use warnings;
use utf8;
use Lingua::JA::TermExtractor;
use Test::More;
use Test::Fatal;
use Test::Requires qw/TokyoCabinet/;

binmode Test::More->builder->$_ => ':utf8'
for qw/output failure_output todo_output/;


unlink './test.tch';

my %config = (
appid => 'test',
fetch_df => 0,
driver => 'TokyoCabinet',
df_file => './test.tch',
pos1_filter => [],
pos2_filter => [],
pos3_filter => [],
ng_word => [],
tf_min => 1,
term_length_min => 1,
term_length_max => 30,
df_min => 0,
db_auto => 0,
);

my $extractor = Lingua::JA::TermExtractor->new(\%config);
my $exception = exception{ $extractor->extract('テスト'); };
like($exception, qr/not opened/, 'not opened');

$extractor->db_open('read');
$exception = exception{ $extractor->extract('テスト'); };
is($exception, undef, 'opened');
$extractor->db_close;

$extractor = Lingua::JA::TermExtractor->new(\%config);
$exception = exception{ $extractor->tfidf('テスト'); };
like($exception, qr/not opened/, 'not opened (tfidf)');

$extractor->db_open('read');
$exception = exception{ $extractor->tfidf('テスト'); };
is($exception, undef, 'opened (tfidf)');
$extractor->db_close;

$config{'db_auto'} = 1;
$extractor = Lingua::JA::TermExtractor->new(\%config);
$exception = exception{ $extractor->tfidf('テスト'); };
is($exception, undef, 'tfidf auto db open');

unlink './test.tch';

done_testing;

0 comments on commit d9e16f5

Please sign in to comment.