added db_auto option, changed Lingua::JA::TFWebIDF dependency

pawa- · Jul 30, 2012 · d9e16f5 · d9e16f5
1 parent d8f1fe9
commit d9e16f5
Show file tree

Hide file tree

Showing 5 changed files with 135 additions and 30 deletions.
diff --git a/Makefile.PL b/Makefile.PL
@@ -4,11 +4,12 @@ name     'Lingua-JA-TermExtractor';
 all_from 'lib/Lingua/JA/TermExtractor.pm';
 
 requires 'Carp';
-requires 'Lingua::JA::TFWebIDF' => 0.34;
+requires 'Lingua::JA::TFWebIDF' => 0.36;
 
 test_requires 'Test::More' => 0.88; # done_testing
 test_requires 'Test::Fatal';
 test_requires 'Test::Warn' => 0.20;
+test_requires 'Test::Requires';
 
 tests 't/*.t';
 author_tests 'xt';

diff --git a/README b/README
@@ -18,10 +18,10 @@ SYNOPSIS
           term_length_min   => 2,
           tf_min            => 2,
           df_min            => 1_0000,
-          df_max            => 500_0000,
+          df_max            => 1000_0000,
           ng_word           => [qw/編集 本人 自身 自分 たち さん/],
           fetch_unk_word_df => 0,
-          concatenation_max => 100,
+          concat_max        => 100,
       );
 
       p $extractor->extract($document)->dump;
@@ -38,6 +38,7 @@ DESCRIPTION
     Lingua::JA::TermExtractor is a term extractor. This extracts terms from
     a document or documents.
 
+METHODS
   new( %config || \%config )
     Creates a new Lingua::JA::TermExtractor instance.
 
@@ -54,11 +55,12 @@ DESCRIPTION
       ng_word             []
       term_length_min     2
       term_length_max     30
-      concatenation_max   30
+      concat_max          30
       tf_min              1
       df_min              0
       df_max              250_0000_0000
       fetch_unk_word_df   0
+      db_auto             1
 
       idf_type            1
       api                 'Yahoo'
@@ -76,8 +78,8 @@ DESCRIPTION
     b => $value
         The weight of document length normalization.
 
-    pos(1|2|3)_filter, ng_word, term_length_(min|max), concatenation_max,
-    tf_min, df_(min|max), fetch_unk_word_df
+    pos(1|2|3)_filter, ng_word, term_length_(min|max), concat_max, tf_min,
+    df_(min|max), fetch_unk_word_df, db_auto
         See Lingua::JA::TFWebIDF.
 
     idf_type, api, appid, driver, df_file, fetch_df, expires_in, documents,
@@ -86,7 +88,7 @@ DESCRIPTION
 
   extract( $document || \@documents )
     Extracts terms from $document or \@documents. Word segmentation and POS
-    tagging is done with MeCab.
+    tagging are done with MeCab.
 
   tfidf, tf
     See Lingua::JA::TFWebIDF.
@@ -98,11 +100,11 @@ AUTHOR
     pawa <pawapawa@cpan.org>
 
 SEE ALSO
-    Lingua::JA::WebIDF.
+    Lingua::JA::WebIDF
 
-    Lingua::JA::WebIDF::Driver::TokyoTyrant.
+    Lingua::JA::WebIDF::Driver::TokyoTyrant
 
-    Lingua::JA::TFWebIDF.
+    Lingua::JA::TFWebIDF
 
 LICENSE
     This library is free software; you can redistribute it and/or modify it

diff --git a/lib/Lingua/JA/TermExtractor.pm b/lib/Lingua/JA/TermExtractor.pm
@@ -17,14 +17,23 @@ sub new
     my %args  = (ref $_[0] eq 'HASH' ? %{$_[0]} : @_);
 
     $args{idf_type} = 3 unless defined $args{idf_type};
+    $args{db_auto}  = 1 unless defined $args{db_auto};
 
     my %options;
     $options{k1} = defined $args{k1} ? delete $args{k1} : 2.0;
     $options{b}  = defined $args{b}  ? delete $args{b}  : 0.75;
 
+    if ($args{db_auto})
+    {
+        $options{db_auto_child} = 1;
+        $args{db_auto}          = 0;
+    }
+    else { $options{db_auto_child} = 0; }
+
     my $self = $class->SUPER::new(\%args);
-    $self->{k1} = $options{k1};
-    $self->{b}  = $options{b};
+    $self->{k1}            = $options{k1};
+    $self->{b}             = $options{b};
+    $self->{db_auto_child} = $options{db_auto_child};
 
     return $self;
 }
@@ -34,6 +43,9 @@ sub extract
     my ($self, $arg) = @_;
 
     my ($k1, $b) = ($self->{k1}, $self->{b});
+    my $db_auto  = $self->{db_auto_child};
+    my $fetch_df = $self->{fetch_df};
+    my $fetch_unk_word_df = $self->{fetch_unk_word_df};
 
     my $data = {};
 
@@ -52,9 +64,16 @@ sub extract
 
         my $dl_avg = $dl_sum / scalar @{$arg};
 
+
+        if ($db_auto)
+        {
+            if ($fetch_df || $fetch_unk_word_df) { $self->db_open('write'); }
+            else                                 { $self->db_open('read');  }
+        }
+
         for my $text (@{$arg})
         {
-            my $tfidf = $self->tfidf($text)->dump;
+            my $tfidf = $self->SUPER::tfidf($text)->dump;
             my $dl    = length $text;
 
             for my $word (keys %{$tfidf})
@@ -80,7 +99,14 @@ sub extract
     }
     else
     {
-        $data = $self->tfidf($arg)->dump;
+
+        if ($db_auto)
+        {
+            if ($fetch_df || $fetch_unk_word_df) { $self->db_open('write'); }
+            else                                 { $self->db_open('read');  }
+        }
+
+        $data = $self->SUPER::tfidf($arg)->dump;
 
         #for my $word (keys %{$data})
         #{
@@ -97,6 +123,15 @@ sub extract
         #}
     }
 
+    $self->db_close if $db_auto;
+
+    return Lingua::JA::TermExtractor::Result->new($data);
+}
+
+sub tfidf
+{
+    my ($self, $arg) = @_;
+    my $data = $self->SUPER::tfidf($arg, $self->{db_auto_child})->dump;
     return Lingua::JA::TermExtractor::Result->new($data);
 }
 
@@ -131,10 +166,10 @@ my ($appid, $document, @documents);
       term_length_min   => 2,
       tf_min            => 2,
       df_min            => 1_0000,
-      df_max            => 500_0000,
+      df_max            => 1000_0000,
       ng_word           => [qw/編集 本人 自身 自分 たち さん/],
       fetch_unk_word_df => 0,
-      concatenation_max => 100,
+      concat_max        => 100,
   );
 
   p $extractor->extract($document)->dump;
@@ -153,6 +188,8 @@ my ($appid, $document, @documents);
 Lingua::JA::TermExtractor is a term extractor.
 This extracts terms from a document or documents.
 
+=head1 METHODS
+
 =head2 new( %config || \%config )
 
 Creates a new Lingua::JA::TermExtractor instance.
@@ -170,11 +207,12 @@ The following configuration is used if you don't set %config.
   ng_word             []
   term_length_min     2
   term_length_max     30
-  concatenation_max   30
+  concat_max          30
   tf_min              1
   df_min              0
   df_max              250_0000_0000
   fetch_unk_word_df   0
+  db_auto             1
 
   idf_type            1
   api                 'Yahoo'
@@ -196,7 +234,7 @@ The weight of term frequency(TF).
 
 The weight of document length normalization.
 
-=item pos(1|2|3)_filter, ng_word, term_length_(min|max), concatenation_max, tf_min, df_(min|max), fetch_unk_word_df
+=item pos(1|2|3)_filter, ng_word, term_length_(min|max), concat_max, tf_min, df_(min|max), fetch_unk_word_df, db_auto
 
 See L<Lingua::JA::TFWebIDF>.
 
@@ -209,7 +247,7 @@ See L<Lingua::JA::WebIDF>.
 =head2 extract( $document || \@documents )
 
 Extracts terms from $document or \@documents.
-Word segmentation and POS tagging is done with MeCab.
+Word segmentation and POS tagging are done with MeCab.
 
 =head2 tfidf, tf
 
@@ -225,11 +263,11 @@ pawa E<lt>pawapawa@cpan.orgE<gt>
 
 =head1 SEE ALSO
 
-L<Lingua::JA::WebIDF>.
+L<Lingua::JA::WebIDF>
 
-L<Lingua::JA::WebIDF::Driver::TokyoTyrant>.
+L<Lingua::JA::WebIDF::Driver::TokyoTyrant>
 
-L<Lingua::JA::TFWebIDF>.
+L<Lingua::JA::TFWebIDF>
 
 =head1 LICENSE
 

diff --git a/t/04_data_structure.t b/t/04_data_structure.t
@@ -19,26 +19,31 @@ my %config = (
     term_length_min   => 1,
     term_length_max   => 30,
     df_min            => 0,
-    concatenation_max => 0,
+    concat_max        => 0,
 );
 
 my $texts = texts();
 
 my $extractor = Lingua::JA::TermExtractor->new(\%config);
+ds_check( $extractor->tfidf('テスト')->dump, 'SCALAR', 'TFIDF' );
 ds_check( $extractor->extract('テスト')->dump, 'SCALAR' );
-ds_check( $extractor->extract($texts)->dump, '' );
+ds_check( $extractor->extract($texts)->dump );
 
-$config{concatenation_max} = 100;
+$config{concat_max} = 100;
 $extractor = Lingua::JA::TermExtractor->new(\%config);
+ds_check_concat( $extractor->tfidf('テスト')->dump, 'SCALAR', 'TFIDF' );
 ds_check_concat( $extractor->extract('テスト')->dump, 'SCALAR' );
-ds_check_concat( $extractor->extract($texts)->dump, '' );
+ds_check_concat( $extractor->extract($texts)->dump );
 
 done_testing;
 
 
 sub ds_check
 {
-    my ($data, $type) = @_;
+    my ($data, $type, $algorithm) = @_;
+
+    $type      = 'ARRAY' unless defined $type;
+    $algorithm = 'BM25'  unless defined $algorithm;
 
     for my $word (keys %{$data})
     {
@@ -48,13 +53,16 @@ sub ds_check
         like($data->{$word}{unknown}, qr/^[01]$/,      'unknown');
         like($data->{$word}{tf},      qr/^[0-9]+$/,    'tf');
         like($data->{$word}{tfidf},   qr/^[\.0-9]+$/,  'tfidf') if $type eq 'SCALAR';
-        like($data->{$word}{bm25},    qr/^[\.0-9]+$/,  'bm25')  if $type ne 'SCALAR';
+        like($data->{$word}{bm25},    qr/^[\.0-9]+$/,  'bm25')  if $type ne 'SCALAR' && $algorithm ne 'TFIDF';
     }
 }
 
 sub ds_check_concat
 {
-    my ($data, $type) = @_;
+    my ($data, $type, $algorithm) = @_;
+
+    $type      = 'ARRAY' unless defined $type;
+    $algorithm = 'BM25'  unless defined $algorithm;
 
     for my $word (keys %{$data})
     {
@@ -64,7 +72,7 @@ sub ds_check_concat
         is(ref $data->{$word}{unknown}, 'ARRAY',    'unknown');
         like($data->{$word}{tf},    qr/^[0-9]+$/,   'tf');
         like($data->{$word}{tfidf}, qr/^[\.0-9]+$/, 'tfidf') if $type eq 'SCALAR';
-        like($data->{$word}{bm25},  qr/^[\.0-9]+$/, 'bm25')  if $type ne 'SCALAR';
+        like($data->{$word}{bm25},  qr/^[\.0-9]+$/, 'bm25')  if $type ne 'SCALAR' && $algorithm ne 'TFIDF';
     }
 }
 

diff --git a/t/05_db_auto.t b/t/05_db_auto.t
@@ -0,0 +1,56 @@
+use strict;
+use warnings;
+use utf8;
+use Lingua::JA::TermExtractor;
+use Test::More;
+use Test::Fatal;
+use Test::Requires qw/TokyoCabinet/;
+
+binmode Test::More->builder->$_ => ':utf8'
+    for qw/output failure_output todo_output/;
+
+
+unlink './test.tch';
+
+my %config = (
+    appid           => 'test',
+    fetch_df        => 0,
+    driver          => 'TokyoCabinet',
+    df_file         => './test.tch',
+    pos1_filter     => [],
+    pos2_filter     => [],
+    pos3_filter     => [],
+    ng_word         => [],
+    tf_min          => 1,
+    term_length_min => 1,
+    term_length_max => 30,
+    df_min          => 0,
+    db_auto         => 0,
+);
+
+my $extractor = Lingua::JA::TermExtractor->new(\%config);
+my $exception = exception{ $extractor->extract('テスト'); };
+like($exception, qr/not opened/, 'not opened');
+
+$extractor->db_open('read');
+$exception = exception{ $extractor->extract('テスト'); };
+is($exception, undef, 'opened');
+$extractor->db_close;
+
+$extractor = Lingua::JA::TermExtractor->new(\%config);
+$exception = exception{ $extractor->tfidf('テスト'); };
+like($exception, qr/not opened/, 'not opened (tfidf)');
+
+$extractor->db_open('read');
+$exception = exception{ $extractor->tfidf('テスト'); };
+is($exception, undef, 'opened (tfidf)');
+$extractor->db_close;
+
+$config{'db_auto'} = 1;
+$extractor = Lingua::JA::TermExtractor->new(\%config);
+$exception = exception{ $extractor->tfidf('テスト'); };
+is($exception, undef, 'tfidf auto db open');
+
+unlink './test.tch';
+
+done_testing;