Started new design

patspam · May 17, 2010 · bc93eab · bc93eab
1 parent 8f6f292
commit bc93eab
Show file tree

Hide file tree

Showing 8 changed files with 155 additions and 151 deletions.
diff --git a/Changes b/Changes
@@ -1,5 +1,4 @@
-Revision history for App::BookIndex
+Revision history for Book-Index
 
 {{$NEXT}}
-    First version
-
+    First version with new design
diff --git a/bin/indexer b/bin/indexer
diff --git a/dist.ini b/dist.ini
@@ -1,4 +1,4 @@
-name    = Indexer
+name    = Book-Index
 author  = Patrick Donelan <pat@patspam.com>
 license = Perl_5
 copyright_holder = Patrick Donelan

diff --git a/lib/Book/Index.pm b/lib/Book/Index.pm
@@ -0,0 +1,102 @@
+package Book::Index;
+
+# ABSTRACT: Create an index for a book manuscript
+
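+=head1 DESCRIPTION
+
+Book::Index creates an index for a book manuscript.
+
+Loading this module also loads L<Book::Index::Tables>, which uses L<ORLite>
+to set up the SQLite-backed database layer in the C<Book::Index> package.
+
+=cut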
+
+use 5.010;
+use strict;
+use warnings;
+
+use Book::Index::Tables;
+
+# use File::Slurp qw(read_file);
+# use Lingua::EN::Splitter;
+# use Lingua::Stem::Snowball;
+# use Lingua::EN::StopWords qw(%StopWords);
+# 
+# use ORLite {
+    # file     => 'sqlite.db',
+    # readonly => 0,
+    # create   => sub {
+        # my $dbh = shift;
+        # $dbh->do(<<END_SQL);
+# CREATE TABLE word ( 
+    # id INTEGER PRIMARY KEY, 
+    # word TEXT, count INTEGER, 
+    # page INTEGER
+# )
+# END_SQL
+      # }
+# };
+# 
+# sub process {
+    # my ( $class, $filename ) = @_;
+# 
+    # my $doc      = read_file($filename);
+    # my $splitter = new Lingua::EN::Splitter;
+    # my $stemmer  = Lingua::Stem::Snowball->new( lang => 'en' );
+# 
+    # my @pages = split "\f", $doc;
+# 
+    # say "Generating index for file: $filename";
+# 
+    # for my $page ( 0 .. $#pages ) {
+        # say "Processing page $page..";
+# 
+        # my @words =
+          # grep { !$StopWords{$_} }
+          # map  { lc } @{ $splitter->words( $pages[$page] ) };
+# 
+        # $stemmer->stem_in_place( \@words );
+# 
+        # my %freq;
+        # map { $freq{$_}++ } grep { !$StopWords{$_} } @words;
+# 
+        # for my $word ( keys %freq ) {
+            # Indexer::Word->new(
+                # word  => $word,
+                # count => $freq{$word},
+                # page  => $page
+            # )->insert;
+        # }
+    # }
+# 
+    # say "Finished processing all pages.";
+# }
+# 
+# sub word {
+    # my ( $class, $word ) = @_;
+    # my $rows = Indexer->selectall_arrayref(
+        # 'select page, count from word where word = ?',
+        # undef, $word );
+    # print "$word: ";
+    # if ( !@$rows ) {
+        # say "not found.";
+        # return;
+    # }
+    # my @hits;
+    # for my $row (@$rows) {
+        # my $page  = $row->[0];
+        # my $times = $row->[1];
+        # push @hits, $page . ( $times > 1 ? " (x$times)" : '' );
+    # }
+    # say join ', ', @hits;
+# }
+# 
+# sub top {
+    # my ( $class, $n ) = @_;
+    # $n ||= 10;
+    # my $rows = Indexer->selectall_arrayref(
+# 'select sum(count) as count, word from word group by word order by count desc limit ?',
+        # undef, $n
+    # );
+    # say "Top $n Words:\n";
+    # for my $row (@$rows) {
+        # printf( "%5d %s\n", @$row );
+    # }
+# }
+
+1;
diff --git a/lib/Book/Index/Tables.pm b/lib/Book/Index/Tables.pm
@@ -0,0 +1,45 @@
+package Book::Index::Tables;
+
+# ABSTRACT: Defines ORLite tables
+
+use strict;
+use warnings;
+
+# Schema version number. create() writes it to the database with
+# PRAGMA user_version, and the same constant is passed to ORLite as its
+# user_version option below, so the two can never disagree.
+sub USER_VERSION() {1}
+
+# create($dbh)
+#
+# Callback handed to ORLite through its `create` option (see the
+# `use ORLite` statement at the bottom of this file). It stamps the
+# database with USER_VERSION and then creates every table in %schema.
+#
+#   $dbh - database handle; only its ->do method is used here
+#          (presumably the DBI handle ORLite opens - TODO confirm).
+#
+# Each table gets an `id INTEGER PRIMARY KEY` column followed by the listed
+# columns, which are declared without a type. Returns nothing.
+sub create {
+    my $dbh = shift;
+
+    $dbh->do( 'PRAGMA user_version = ' . USER_VERSION );
+
+    # Table name => list of column names (the id column is added below).
+    my %schema = (
+        page             => [qw(page contents)],
+        phrase           => [qw(phrase original primary)],
+        stem             => [qw(stem)],
+        word_page        => [qw(word page n)],
+        # NOTE(review): the columns here are (phrase word), which does not
+        # match the table name; compare phrase_stem => (phrase stem). This
+        # may have been meant as `phrase_word` - confirm before relying on it.
+        phrase_page      => [qw(phrase word)],
+        phrase_stem      => [qw(phrase stem)],
+        phrase_word_page => [qw(phrase word page n)],
+        phrase_stem_page => [qw(phrase stem page n)],
+    );
+
+    # Sorted so the tables are always created in the same order.
+    for my $table ( sort keys %schema ) {
+
+        # Identifiers are backtick-quoted because some of them (for example
+        # `primary`) are SQL keywords.
+        my $cols = join ",\n    ", map { "`$_`" } @{ $schema{$table} };
+        my $sql = <<END_SQL;
+CREATE TABLE `$table` (
+    id INTEGER PRIMARY KEY,
+    $cols
+)
+END_SQL
+        # warn $sql;
+        $dbh->do($sql);
+    }
+
+    return;
+}
+
+# Generate the database layer. This statement must stay below
+# USER_VERSION and create(): `use` runs at compile time, so both subs have
+# to be compiled already when ORLite's import is called.
+use ORLite {
+    'package'    => 'Book::Index',      # generated code goes into Book::Index
+    file         => 'data/sqlite.db',   # relative path
+    user_version => USER_VERSION,
+    cleanup      => 'VACUUM',
+    create       => \&create,
+    prune        => 1,                  # while developing
+};
+
+1;
diff --git a/lib/Indexer.pm b/lib/Indexer.pm
diff --git a/lib/Indexer/Cmd.pm b/lib/Indexer/Cmd.pm
diff --git a/t/tables.t b/t/tables.t
@@ -0,0 +1,5 @@
+use strict;
+use warnings;
+use Test::More tests => 1;
+use Book::Index;
+
+# Loading Book::Index pulls in Book::Index::Tables, which asks ORLite to
+# generate the database layer in the Book::Index package. Check that the
+# resulting dsn points at a SQLite database.
+like( Book::Index->dsn, qr/^dbi:SQLite:/, 'Book::Index->dsn is a SQLite DSN' );