Skip to content

Commit

Permalink
Bundled Japanese WordNet Database file
Browse files Browse the repository at this point in the history
  • Loading branch information
pawa- committed Dec 25, 2012
1 parent fc2cb2e commit 5c53419
Show file tree
Hide file tree
Showing 17 changed files with 230 additions and 100 deletions.
118 changes: 77 additions & 41 deletions lib/Lingua/JA/WordNet.pm
Expand Up @@ -4,37 +4,61 @@ use 5.008_001;
use strict;
use warnings;

use Carp ();
use DBI;
use Carp ();
use File::ShareDir ();

our $VERSION = '0.05';

my $DB_FILE = 'wnjpn-1.1.db';


# Builds the default constructor options and returns them as a fresh hash
# reference on every call:
#   data        - path returned by File::ShareDir::dist_file for $DB_FILE
#                 in the 'Lingua-JA-WordNet' distribution
#   enable_utf8 - 0
#   verbose     - 0
sub _options
{
    my %defaults = (
        data        => File::ShareDir::dist_file('Lingua-JA-WordNet', $DB_FILE),
        enable_utf8 => 0,
        verbose     => 0,
    );

    return \%defaults;
}

# new($db_path) or new(%config)
#
# Constructor. Starts from the defaults supplied by $class->_options and
# overrides them with the caller's arguments:
#   * exactly one argument  - taken as the path of the SQLite database file
#   * any other arity       - taken as key/value pairs; every key must already
#                             exist in the defaults
#
# Croaks with "Unknown option: '<key>'" for an unrecognised key and with
# 'WordNet data file is not found' when the data path is not a plain file.
# Returns a blessed hash holding the DBI handle (dbh) and the verbose flag.
sub new
{
    my $class = shift;

    my $options = $class->_options;

    if (scalar @_ == 1) { $options->{data} = shift; }
    else
    {
        my %args = @_;

        # Sorted so that the key named in the error message does not depend
        # on hash ordering when several unknown keys are passed.
        for my $key (sort keys %args)
        {
            if ( ! exists $options->{$key} ) { Carp::croak "Unknown option: '$key'"; }
            else { $options->{$key} = $args{$key}; }
        }
    }

    # The defined() guard keeps an explicit "data => undef" from raising an
    # "uninitialized value" warning before the croak.
    Carp::croak 'WordNet data file is not found'
        unless defined $options->{data} && -f $options->{data};

    my $dbh = DBI->connect("dbi:SQLite:dbname=$options->{data}", '', '', {
        #Warn => 0, # get rid of annoying disconnect message
        # The Warn attribute enables useful warnings for certain bad practices.
        # It is enabled by default and should only be disabled in rare circumstances.
        # (see http://search.cpan.org/dist/DBI/DBI.pm#Warn)

        RaiseError     => 1,
        PrintError     => 0,
        AutoCommit     => 0,
        sqlite_unicode => $options->{enable_utf8},
    });

    return bless { dbh => $dbh, verbose => $options->{verbose} }, $class;
}

# Destructor: disconnects the DBI handle stored in the object's dbh slot.
sub DESTROY
{
    my $self = shift;
    my $dbh  = $self->{dbh};

    # During global destruction the handle may already have been freed;
    # calling a method on undef here would die inside the destructor.
    return unless defined $dbh;

    $dbh->disconnect;

    return;
}

sub Word
{
my ($self, $synset, $lang) = @_;
Expand All @@ -49,9 +73,9 @@ sub Word

$sth->execute($synset, $lang);

my @words = map { $_->[0] =~ s/_/ /go; $_->[0]; } @{$sth->fetchall_arrayref};
my @words = map { $_->[0] =~ s/_/ /g; $_->[0]; } @{$sth->fetchall_arrayref};

Carp::carp "Word: no words for $synset in $lang" if $self->{verbose} && !scalar @words;
Carp::carp "Word: there are no words for $synset in $lang" if $self->{verbose} && ! scalar @words;

return @words;
}
Expand Down Expand Up @@ -79,7 +103,7 @@ sub Synset
push(@synsets, $synset);
}

Carp::carp "Synset: no synsets for $word in $lang" if $self->{verbose} && !scalar @synsets;
Carp::carp "Synset: there are no synsets for $word in $lang" if $self->{verbose} && ! scalar @synsets;

return @synsets;
}
Expand Down Expand Up @@ -108,16 +132,16 @@ sub SynPos
push(@synsets, $synset);
}

Carp::carp "SynPos: no synsets for $word in $lang with pos: $pos" if $self->{verbose} && !scalar @synsets;
Carp::carp "SynPos: there are no synsets for $word corresponding to '$pos' and '$lang'" if $self->{verbose} && ! scalar @synsets;

return @synsets;
}

# Pos($synset)
#
# Extracts the part-of-speech letter from a synset ID of the form
# "<eight ASCII digits>-<letter>", where the letter is one of a, r, n or v.
#
# Returns the letter when $synset has that form. Otherwise returns nothing
# (empty list, or undef in scalar context) and, if the object's verbose flag
# is true, emits a single warning through Carp::carp.
sub Pos
{
    my ($self, $synset) = @_;

    # \A ... \z anchor the whole string, so an ID followed by a stray newline
    # is reported as malformed; [0-9] keeps the digits ASCII-only.
    return $1 if $synset =~ /\A[0-9]{8}-([arnv])\z/;

    Carp::carp "Pos: '$synset' is wrong synset format" if $self->{verbose};

    return;
}

Expand All @@ -137,7 +161,7 @@ sub Rel

my @synsets = map {$_->[0]} @{$sth->fetchall_arrayref};

Carp::carp "Rel: no $rel links for $synset" if $self->{verbose} && !scalar @synsets;
Carp::carp "Rel: there are no $rel links for $synset" if $self->{verbose} && ! scalar @synsets;

return @synsets;
}
Expand Down Expand Up @@ -165,7 +189,7 @@ sub Def
$defs[$sid] = $def;
}

Carp::carp "Def: no definitions for $synset in $lang" if $self->{verbose} && !scalar @defs;
Carp::carp "Def: there are no definition sentences for $synset in $lang" if $self->{verbose} && ! scalar @defs;

return @defs;
}
Expand Down Expand Up @@ -193,7 +217,7 @@ sub Ex
$exs[$sid] = $ex;
}

Carp::carp "Ex: no examples for $synset in $lang" if $self->{verbose} && !scalar @exs;
Carp::carp "Ex: there are no example sentences for $synset in $lang" if $self->{verbose} && ! scalar @exs;

return @exs;
}
Expand Down Expand Up @@ -223,7 +247,7 @@ my ($db_path, %config, $synset, $lang, $pos, $rel);
use Lingua::JA::WordNet;
my $wn = Lingua::JA::WordNet->new('wnjpn-1.1.db');
my $wn = Lingua::JA::WordNet->new;
my @synsets = $wn->Synset('相撲', 'jpn');
my @hypes = $wn->Rel($synsets[0], 'hype');
my @words = $wn->Word($hypes[0], 'jpn');
Expand All @@ -238,66 +262,65 @@ Lingua::JA::WordNet is yet another Perl module to look up
entries in Japanese WordNet.
The original Perl module is WordNet::Multi.
WordNet::Multi is awkward to use and not maintained.
WordNet::Multi is awkward to use and no longer maintained.
Because of this, I uploaded this module.
=head1 METHODS
=head2 new($db_path) or new(%config)
=head2 $wn = new($db_path) or new(%config)
Creates a new Lingua::JA::WordNet instance.
my $wn = Lingua::JA::WordNet->new(
data => $db_path, # default is undef
enable_utf8 => 1, # default is 0 (see sqlite_unicode attribute of DBD::SQLite)
data => $db_path, # default is File::ShareDir::dist_file('Lingua-JA-WordNet', 'wnjpn-1.1.db')
enable_utf8 => 1, # default is 0 (see sqlite_unicode attribute of L<DBD::SQLite>)
verbose => 0, # default is 0 (all warnings are ignored)
);
The data must be Japanese WordNet and English WordNet in an SQLite3 database.
(Please download it from L<http://nlpwww.nict.go.jp/wn-ja/>)
=head2 Word($synset, $lang)
=head2 @words = $wn->Word($synset, $lang)
Returns the words corresponding to $synset and $lang.
=head2 Synset($word, $lang)
=head2 @synsets = $wn->Synset($word, $lang)
Returns the synsets corresponding to $word and $lang.
=head2 SynPos($word, $pos, $lang)
=head2 @synsets = $wn->SynPos($word, $pos, $lang)
Returns the synsets corresponding to $word, $pos and $lang.
=head2 Pos($synset)
=head2 $pos = $wn->Pos($synset)
Returns the part of speech of $synset.
=head2 Rel($synset, $rel)
=head2 @synsets = $wn->Rel($synset, $rel)
Returns the relational synsets corresponding to $synset and $rel.
=head2 Def($synset, $lang)
=head2 @defs = $wn->Def($synset, $lang)
Returns the definition sentences corresponding to $synset and $lang.
=head2 Ex($synset, $lang)
=head2 @exs = $wn->Ex($synset, $lang)
Returns the example sentences corresponding to $synset and $lang.
=head2 AllSynsets()
=head2 @allsynsets = $wn->AllSynsets()
Returns all synsets.
=head2 LANGUAGES
The values which can be set to $lang are 'jpn' and 'eng'.
$lang can take 'jpn' or 'eng'.
=head2 PARTS OF SPEECH
The values which can be set to $pos are left side values of the following table.
$pos can take the left side values of the following table.
a|adjective
r|adverb
Expand All @@ -308,12 +331,12 @@ The values which can be set to $pos are left side values of the following table.
n|名詞
v|動詞
This is the result of SQLite3 command 'SELECT pos, def FROM pos_def'.
This is the result of the SQL query 'SELECT pos, def FROM pos_def'.
=head2 RELATIONS
The values which can be set to $rel are left side values of the following table.
$rel can take the left side values of the following table.
also|See also
syns|Synonyms
Expand Down Expand Up @@ -341,7 +364,7 @@ The values which can be set to $rel are left side values of the following table.
dmtr|In Domain --- Region
ants|Antonyms
This is the result of SQLite3 command 'SELECT link, def FROM link_def'.
This is the result of the SQL query 'SELECT link, def FROM link_def'.
=head1 AUTHOR
Expand All @@ -352,9 +375,22 @@ pawa E<lt>pawapawa@cpan.orgE<gt>
Japanese WordNet: L<http://nlpwww.nict.go.jp/wn-ja/>
L<http://twitter.com/LinguaJAWordNet>
=head1 LICENSE
This library is free software; you can redistribute it and/or modify
This library, except for the bundled WordNet database file, is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
The bundled WordNet database file is distributed under the following licenses:
=over 4
=item * For Japanese data: L<http://nlpwww.nict.go.jp/wn-ja/license.txt>
=item * For English data: L<http://wordnet.princeton.edu/wordnet/license/>
=back
=cut
35 changes: 35 additions & 0 deletions share/LICENSE.txt
@@ -0,0 +1,35 @@
Copyright: 2009, 2010 NICT

Japanese WordNet

This software and database is being provided to you, the LICENSEE, by
the National Institute of Information and Communications Technology
under the following license. By obtaining, using and/or copying this
software and database, you agree that you have read, understood, and
will comply with these terms and conditions:

Permission to use, copy, modify and distribute this software and
database and its documentation for any purpose and without fee or
royalty is hereby granted, provided that you agree to comply with
the following copyright notice and statements, including the
disclaimer, and that the same appear on ALL copies of the software,
database and documentation, including modifications that you make
for internal use or for distribution.

Japanese WordNet Copyright 2009, 2010 by the National Institute of
Information and Communications Technology (NICT). All rights
reserved.

THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND NICT MAKES NO
REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE,
BUT NOT LIMITATION, NICT MAKES NO REPRESENTATIONS OR WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE
ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS.

The name of the National Institute of Information and Communications
Technology may not be used in advertising or publicity pertaining to
distribution of the software and/or database. Title to copyright in
this software, database and any associated documentation shall at all
times remain with National Institute of Information and Communications
Technology and LICENSEE agrees to preserve same.
Binary file added share/wnjpn-1.1.db
Binary file not shown.
21 changes: 19 additions & 2 deletions t/01_basic.t
Expand Up @@ -4,9 +4,26 @@ use Lingua::JA::WordNet;
use Test::More;
use Test::Exception;

my $wn = Lingua::JA::WordNet->new('./wordnet/test.db');
my $wn = Lingua::JA::WordNet->new;
isa_ok($wn, 'Lingua::JA::WordNet');
can_ok('Lingua::JA::WordNet', qw/Word Synset SynPos Pos Rel Def Ex AllSynsets/);
throws_ok { Lingua::JA::WordNet->new; } qr/WordNet data path is not set/;

throws_ok { Lingua::JA::WordNet->new('./hoge/hage/hige.db'); } qr/WordNet data file is not found/;
throws_ok { Lingua::JA::WordNet->new('./share/'); } qr/WordNet data file is not found/;
throws_ok { Lingua::JA::WordNet->new(data => './hage.db'); } qr/WordNet data file is not found/;
throws_ok { Lingua::JA::WordNet->new(sqlite_unicode => 1); } qr/Unknown option: 'sqlite_unicode'/;


# Constructor options without a data key, so the constructor falls back to
# its default database path.
my %config = (
    enable_utf8 => 1,
    verbose     => 1,
);

# lives_ok takes (BLOCK, $description): the second argument is a plain test
# name, so a string is passed rather than a compiled pattern.
lives_ok { Lingua::JA::WordNet->new(%config); } 'new() lives with enable_utf8 and verbose set';

lives_ok { $wn = Lingua::JA::WordNet->new(data => './wordnet/test.db'); } 'new() lives with an explicit data path';

my @words = $wn->Word('00000001-n', 'jpn');
is($words[0], 'ミク');

done_testing;
10 changes: 6 additions & 4 deletions t/02_word.t
Expand Up @@ -5,15 +5,17 @@ use Test::More;
use Test::Warn;

my $wn = Lingua::JA::WordNet->new(
data => './wordnet/test.db',
verbose => 1
);

my @words = $wn->Word('00000001-n', 'jpn');
is($words[0], 'ミク');
my @words = $wn->Word('00448232-n', 'jpn');
is_deeply(\@words, [qw/大相撲 角力 角技 相撲/]);

@words = $wn->Word('00448232-n', 'eng');
is_deeply(\@words, [qw/sumo/]);

warning_is { @words = $wn->Word('3939-miku', 'negi') }
'Word: no words for 3939-miku in negi', 'word of unknown synset';
'Word: there are no words for 3939-miku in negi', 'word of unknown synset';

is(scalar @words, 0);

Expand Down
10 changes: 6 additions & 4 deletions t/03_synset.t
Expand Up @@ -5,15 +5,17 @@ use Test::More;
use Test::Warn;

my $wn = Lingua::JA::WordNet->new(
data => './wordnet/test.db',
verbose => 1,
);

my @synsets = $wn->Synset('ミク', 'jpn');
is($synsets[0], '00000001-n');
my @synsets = $wn->Synset('相撲', 'jpn');
is_deeply(\@synsets, [qw/00448232-n 10674713-n/]);

@synsets = $wn->Synset('sumo', 'eng');
is_deeply(\@synsets, [qw/00448232-n/]);

warning_is { @synsets = $wn->Synset('Perl', 'jpn') }
'Synset: no synsets for Perl in jpn', 'synset of unknown word';
'Synset: there are no synsets for Perl in jpn', 'synset of unknown word';

is(scalar @synsets, 0);

Expand Down

0 comments on commit 5c53419

Please sign in to comment.