Skip to content
Permalink
Browse files

Initial import.

  • Loading branch information...
mamorlis committed Nov 17, 2006
0 parents commit b7a8bf2d0c55ab78a8b8cdfecce809d996c6ede6
Showing with 182,300 additions and 0 deletions.
  1. +5 −0 .cvsignore
  2. +5 −0 Makefile
  3. +104 −0 bin/stc_splitter.pl
  4. +146 −0 bin/syncha
  5. +28 −0 ena/README
  6. +65 −0 ena/bin/ena-learn
  7. +84 −0 ena/bin/ena-mkdata
  8. +74 −0 ena/bin/ena-mkmodel
  9. +82 −0 ena/bin/ena-tagger
  10. +1,104 −0 ena/dat/mod/train.svmdata
  11. +973 −0 ena/dat/mod/train.svmmodel
  12. +2,824 −0 ena/lib/Cab.pm
  13. +3,231 −0 ena/lib/Cab2.pm
  14. +902 −0 ena/lib/ENA.pm
  15. +20 −0 ena/lib/ENA/Conf.pm
  16. +582 −0 ena/lib/MyCabocha.pm
  17. +135 −0 ena/lib/add_func_exp.pl
  18. +84 −0 ena/lib/calc_mi.pl
  19. +36 −0 ena/lib/check_edr.pl
  20. +36 −0 ena/lib/check_event.pl
  21. +30 −0 ena/lib/check_opinion.pl
  22. +27 −0 ena/lib/check_pronoun_type.pl
  23. +32 −0 ena/lib/ext_cab.pl
  24. +231 −0 ena/lib/ext_mod.pl
  25. +59 −0 ena/lib/ext_txt.pl
  26. +85 −0 ena/lib/fix_id.pl
  27. +1,452 −0 ena/lib/func_exp.tsv
  28. +1 −0 ena/lib/memo
  29. +85 −0 ena/lib/mod.pm
  30. +281 −0 ena/lib/tgr.pm
  31. +27 −0 resolveZero/README
  32. +85 −0 resolveZero/bin/resolveZero
  33. +1,452 −0 resolveZero/dat/func/func_exp.tsv
  34. +62 −0 resolveZero/hoge
  35. +1 −0 resolveZero/models/TMP_FE_DUMMY
  36. +1 −0 resolveZero/models/TMP_FE_DUMMY2
  37. +4,380 −0 resolveZero/models/fe_0_GA
  38. +4,416 −0 resolveZero/models/fe_0_NI
  39. +4,380 −0 resolveZero/models/fe_0_WO
  40. +2,915 −0 resolveZero/models/fe_inter_0_GA
  41. +2,915 −0 resolveZero/models/fe_inter_0_NI
  42. +2,916 −0 resolveZero/models/fe_inter_0_WO
  43. +511 −0 resolveZero/models/fe_inter_t_0_GA
  44. +80 −0 resolveZero/models/fe_inter_t_0_NI
  45. +275 −0 resolveZero/models/fe_inter_t_0_WO
  46. +5,330 −0 resolveZero/models/fe_t_0_GA
  47. +2,179 −0 resolveZero/models/fe_t_0_NI
  48. +3,708 −0 resolveZero/models/fe_t_0_WO
  49. +4 −0 resolveZero/models/make_ex_bact_comp.opt
  50. +4 −0 resolveZero/models/make_ex_bact_comp2.opt
  51. +10,000 −0 resolveZero/models/model_0_GA
  52. +1,169 −0 resolveZero/models/model_0_GA.O
  53. BIN resolveZero/models/model_0_GA.bin
  54. +10,000 −0 resolveZero/models/model_0_NI
  55. +763 −0 resolveZero/models/model_0_NI.O
  56. BIN resolveZero/models/model_0_NI.bin
  57. +10,000 −0 resolveZero/models/model_0_WO
  58. +957 −0 resolveZero/models/model_0_WO.O
  59. BIN resolveZero/models/model_0_WO.bin
  60. +10,000 −0 resolveZero/models/model_inter_0_GA
  61. +420 −0 resolveZero/models/model_inter_0_GA.O
  62. BIN resolveZero/models/model_inter_0_GA.bin
  63. +10,000 −0 resolveZero/models/model_inter_0_NI
  64. +130 −0 resolveZero/models/model_inter_0_NI.O
  65. BIN resolveZero/models/model_inter_0_NI.bin
  66. +10,000 −0 resolveZero/models/model_inter_0_WO
  67. +355 −0 resolveZero/models/model_inter_0_WO.O
  68. BIN resolveZero/models/model_inter_0_WO.bin
  69. +10,000 −0 resolveZero/models/model_inter_t_0_GA
  70. +246 −0 resolveZero/models/model_inter_t_0_GA.O
  71. BIN resolveZero/models/model_inter_t_0_GA.bin
  72. +10,000 −0 resolveZero/models/model_inter_t_0_NI
  73. +37 −0 resolveZero/models/model_inter_t_0_NI.O
  74. BIN resolveZero/models/model_inter_t_0_NI.bin
  75. +10,000 −0 resolveZero/models/model_inter_t_0_WO
  76. +160 −0 resolveZero/models/model_inter_t_0_WO.O
  77. BIN resolveZero/models/model_inter_t_0_WO.bin
  78. +10,000 −0 resolveZero/models/model_t_0_GA
  79. +781 −0 resolveZero/models/model_t_0_GA.O
  80. BIN resolveZero/models/model_t_0_GA.bin
  81. +10,000 −0 resolveZero/models/model_t_0_NI
  82. +235 −0 resolveZero/models/model_t_0_NI.O
  83. BIN resolveZero/models/model_t_0_NI.bin
  84. +10,000 −0 resolveZero/models/model_t_0_WO
  85. +404 −0 resolveZero/models/model_t_0_WO.O
  86. BIN resolveZero/models/model_t_0_WO.bin
  87. +3,252 −0 resolveZero/script/cab.pl
  88. +118 −0 resolveZero/script/centerList.pl
  89. +1,418 −0 resolveZero/script/common.pl
  90. +1,273 −0 resolveZero/script/extractFeatures.pl
  91. +212 −0 resolveZero/script/makeTrainingInstancesInter.pl
  92. +190 −0 resolveZero/script/makeTrainingInstancesInter2.pl
  93. +172 −0 resolveZero/script/makeTrainingInstancesIntra.pl
  94. +180 −0 resolveZero/script/makeTrainingInstancesIntra2.pl
  95. +138 −0 resolveZero/script/misc/add_func_exp.pl
  96. +114 −0 resolveZero/script/misc/calc_mi.pl
  97. +42 −0 resolveZero/script/misc/check_edr.pl
  98. +104 −0 resolveZero/script/misc/check_ntt.pl
  99. +33 −0 resolveZero/script/misc/check_pronoun_type.pl
  100. +26 −0 resolveZero/script/mkmodelInter.pl
  101. +26 −0 resolveZero/script/mkmodelInter2.pl
  102. +26 −0 resolveZero/script/mkmodelIntra.pl
  103. +26 −0 resolveZero/script/mkmodelIntra2.pl
  104. +79 −0 resolveZero/script/openModFile.pl
  105. +193 −0 resolveZero/script/resolveZero.pl
  106. +26 −0 resolveZero/script/trainInter.pl
  107. +26 −0 resolveZero/script/trainInter2.pl
  108. +26 −0 resolveZero/script/trainIntra.pl
  109. +26 −0 resolveZero/script/trainIntra2.pl
  110. +74 −0 resolveZero/script/view.pl
  111. +399 −0 resolveZero/testout
@@ -0,0 +1,5 @@
CVS
.svn
*.o
*~
*.bak
@@ -0,0 +1,5 @@
# Build a dated source tarball (syncha-YYYYMMDD.tar.gz) in the current directory.
# The archive paths are resolved from the parent directory (-C ..), so the
# checkout directory itself must be named "syncha".
# NOTE(review): syncha/sentence_splitter is listed here; confirm that the
# directory exists in the checkout, otherwise tar will fail.
# "dist" names an action, not a file, so it is declared phony: a stray file
# called "dist" must not make the target look up to date.
.PHONY: dist
dist:
	tar -czf syncha-`date +%Y%m%d`.tar.gz -C .. \
	--exclude=.svn --exclude=CVS --exclude='*.bak' --exclude='*~' \
	syncha/bin syncha/ena syncha/resolveZero \
	syncha/sentence_splitter
@@ -0,0 +1,104 @@
#! /usr/local/bin/perl -w

#------------------------------------------------------
# 入力ファイルの文字コードはeuc-jpと仮定.
# ./stc_splitter.pl < file.euc
#------------------------------------------------------

use strict;

use Getopt::Std;

# Set by getopts() below when -h is given. Declared with "our" instead of the
# obsolete "use vars" pragma; it is still the package variable $main::opt_h
# that Getopt::Std fills in.
our $opt_h;

# Help text printed (via die) when -h is given.
my $usage = <<"USAGE";
This script reads input from stdin.
USAGE

getopts('h');
die $usage if $opt_h;

# Byte-level pattern fragments, kept as plain strings and interpolated into
# regexes later. The header comment of this script states that the input is
# assumed to be EUC-JP, so every pattern describes raw bytes.
my $ascii = '[\x00-\x7F]';                         # one 7-bit byte
my $twoBytes = '[\x8E\xA1-\xFE][\xA1-\xFE]';       # lead byte 0x8E or 0xA1-0xFE, then 0xA1-0xFE
# Lead bytes 0xA9-0xAF and 0xF5-0xFE (presumably rows without standard
# assignments, hence the name -- TODO confirm). Every string this matches is
# also matched by $twoBytes, whose lead-byte range includes these values.
my $undef = '[\xA9-\xAF\xF5-\xFE][\xA1-\xFE]';
my $Zdigit = '(?:\xA3[\xB0-\xB9])';                # two-byte digit: 0xA3 followed by 0xB0-0xB9
# Two-byte space 0xA1 0xA1; not referenced anywhere else in the visible code.
my $Zspace = '(?:\xA1\xA1)'; # EUC-JP

# Replacement table for two-byte codes whose lead byte is 0xAD. Judging by
# the replacements, these are platform-dependent symbols (circled numbers,
# Roman numerals, unit names, era names, bracketed abbreviations).
# sentence_splitter() looks each extracted character up here and appends the
# mapped text instead of the original bytes when a key exists.
# NOTE(review): several values are empty strings in this copy of the file.
# An empty value makes the character disappear from the output; confirm
# whether that is intended or whether the replacement text was lost.
my %table = (
"\xad\xa1" => '(1)', "\xad\xa2" => '(2)',
"\xad\xa3" => '(3)', "\xad\xa4" => '(4)',
"\xad\xa5" => '(5)', "\xad\xa6" => '(6)',
"\xad\xa7" => '(7)', "\xad\xa8" => '(8)',
"\xad\xa9" => '(9)', "\xad\xaa" => '(10)',
"\xad\xab" => '(11)',"\xad\xac" => '(12)',
"\xad\xad" => '(13)',"\xad\xae" => '(14)',
"\xad\xaf" => '(15)',"\xad\xb0" => '(16)',
"\xad\xb1" => '(17)',"\xad\xb2" => '(18)',
"\xad\xb3" => '(19)',"\xad\xb4" => '(20)',
"\xad\xb5" => '', "\xad\xb6" => 'II',
"\xad\xb7" => 'III', "\xad\xb8" => 'IV',
"\xad\xb9" => '', "\xad\xba" => 'VI',
"\xad\xbb" => 'VII', "\xad\xbc" => 'VIII',
"\xad\xbd" => 'IX', "\xad\xbe" => '',
"\xad\xc0" => 'ミリ', "\xad\xc1" => 'キロ',
"\xad\xc2" => 'センチ', "\xad\xc3" => 'メートル', "\xad\xc4" => 'グラム',
"\xad\xc5" => 'トン', "\xad\xc6" => 'アール',
"\xad\xc7" => 'ヘクタール',"\xad\xc8" => 'リットル',
"\xad\xc9" => 'ワット', "\xad\xca" => 'カロリー',
"\xad\xcb" => 'ドル', "\xad\xcc" => 'センチ',
"\xad\xcd" => 'パーセント', "\xad\xce" => 'ミリバール',
"\xad\xcf" => 'ページ', "\xad\xd0" => 'mm',
"\xad\xd1" => 'cm', "\xad\xd2" => 'km',
"\xad\xd3" => 'mg', "\xad\xd4" => 'kg',
"\xad\xd5" => 'cc', "\xad\xd6" => '平方メートル',
"\xad\xdc" => '平成', "\xad\xe0" => '',
"\xad\xe1" => '', "\xad\xe2" => 'No.',
"\xad\xe3" => 'K.K.',"\xad\xe4" => 'TEL',
"\xad\xe5" => '(上)', "\xad\xe6" => '(中)',
"\xad\xe7" => '(下)', "\xad\xe8" => '(左)',
"\xad\xe9" => '(右)', "\xad\xea" => '(株)',
"\xad\xeb" => '(有)', "\xad\xec" => '(代)',
"\xad\xed" => '明治', "\xad\xee" => '大正',
"\xad\xef" => '昭和', "\xad\xf0" => '',
"\xad\xf1" => '', "\xad\xf2" => '',
"\xad\xf3" => 'c∫', "\xad\xf4" => 'Σ',
"\xad\xf5" => '', "\xad\xf6" => '',
"\xad\xf7" => '', "\xad\xf8" => '',
"\xad\xf9" => 'Δ', "\xad\xfa" => '',
"\xad\xfb" => '', "\xad\xfc" => '',
);

# Script entry point. Called with explicit parentheses: the "&main;" form
# would hand the caller's current @_ to the sub implicitly.
main();

# Run the sentence splitter over the script's input.
# Returns whatever sentence_splitter() returns; the caller ignores it.
sub main {
    return sentence_splitter();
}

#----------------------------------------------------------------------------------------------------
# sentence_splitter()
#
# Reads the input line by line through <> and prints each line split into
# sentences, one sentence per output line. Takes no arguments.
#
# Each line is consumed one character at a time (a match of $twoBytes, $ascii
# or $undef). $c accumulates the current sentence and @s_array collects the
# finished ones. $flag is raised after a character that equals one of the
# literals tested at the bottom of the inner loop; while it is raised, the
# next character decides whether the sentence ends there.
#
# NOTE(review): in this copy the punctuation literals look damaged. The
# comparisons near the end of the loop are against empty strings, which a
# matched character can never equal, and the character pattern in the first
# test contains a bare "?". The original presumably held full-width
# sentence-ending punctuation and closing brackets -- verify against the
# original EUC-JP file before changing any logic here.
sub sentence_splitter {
while(my $stc = <>) {
chomp($stc);
my @s_array = (); my $c = '';
my $flag = 0;
# Take the first recognisable character and keep only what follows it ($').
# Bytes in front of the match are not copied anywhere.
while($stc =~ /($twoBytes|$ascii|$undef)/) {
$stc = $';
my $one = $1;
if($flag) {
# NOTE(review): when $one DOES match this pattern nothing is appended and
# $flag stays set, so that character never reaches the output. Confirm that
# dropping it is intended.
if($one !~ /^(?:。|!|?|.)/) {
#--- do not split if the next character is a closing bracket ---#
if($one eq ')' or $one eq '' or $one eq '' or $one eq '') { $c .= $one }
#--- do not split if the next character is a digit ---#
# NOTE(review): "(:?" looks like a typo for the non-capturing "(?:"; as
# written it is a capturing group with an optional colon. It still matches
# a digit.
elsif($one =~ /(:?[0-9]|$Zdigit)/) { $c .= $one }
else { push @s_array, $c; $c = $one; }
$flag = 0;
}
} else {
#--- replace platform-dependent characters using %table ---#
# The %table lookup happens only in this branch; characters appended while
# $flag is set are copied unchanged.
if(exists $table{$one}) { $c .= $table{$one} }
else {$c .= $one }
$flag = 1 if($one eq '' or $one eq '' or $one eq '' or $one eq '');
}
}
push @s_array, $c;
# The truth test skips empty entries; it also skips an entry that is exactly "0".
foreach my $k (@s_array) { print $k,"\n" if($k) }
}
}
@@ -0,0 +1,146 @@
#!/usr/bin/env perl
#òµÌ¥ò³ò´

use strict;
use warnings;

our $VERSION = qw(1.2);

use Carp qw(croak);
use File::Temp qw(tempfile tempdir);
use Getopt::Std;

my $usage =<<"__USAGE__";
USAGE: $0 [-a intraParam] [-e interParam] [-r chasenrc] [-s sentence splitter] [-hv123] [input-file(s)]
__USAGE__

# Command-line switches: -a/-e/-r/-s take a value, -h/-v/-1/-2/-3 are flags.
my %options;
getopts("a:e:hvr:s:123", \%options);
if ($options{h}) {
    die $usage;
}
if ($options{v}) {
    # Leave only once the version line has actually been printed.
    exit 0 if print "Syncha version $VERSION\n";
}

# Every path below is relative to the directory above this script's own
# directory, so move there first.
use FindBin qw($Bin);
unless (chdir "$Bin/..") {
    croak "Cannot chdir to $Bin/..: $!";
}

# Programs and data files used by the pipeline.
my $resolve_zero = 'resolveZero/script/resolveZero.pl';
my $ena_tagger = 'ena/bin/ena-tagger';
my $splitter_cxx = 'sentence_splitter/prog_split/sentence_splitter';
my $splitter_pl = 'bin/stc_splitter.pl';
my $sent_model = 'sentence_splitter/classifier/study.model';
my $chasen = 'chasen';
# The value given with -r replaces the default chasen argument '-j'.
my $chasen_flag = $options{r} || '-j';
my $cabocha = 'cabocha';
my $fix_id = 'ena/lib/fix_id.pl';
# Thresholds from -a and -e; both fall back to 0.
my $intra_param = $options{a} ? $options{a} : 0;
my $inter_param = $options{e} ? $options{e} : 0;

=pod
=head1 NAME
Syncha -- SYNtactic CHunk Annotator
=head1 SYNOPSIS
syncha [-a intraParam] [-e interParam] [-r chasenrc] [-s splitter] [-hv123] [input-file(s)]
=head1 DESCRIPTION
Syncha is a Japanese syntactic chunk annotator. It detects the predicate-
argument structure of verbs, adjectives, and verbal nouns. The task
includes:
=item * recognize event noun
=item * fill zero anaphora
=item * identify predicate-argument structure
=item * annotate opinion marker (optional)
=head1 OPTIONS
=item -a
-a option specifies a threshold (0 - 1) to find intra-sentential arguments.
Defaults to 0.
=item -e
-e option specifies a threshold (0 - 1) to find inter-sentential arguments.
Defaults to 0.
=item -s
-s option specifies which sentence splitter to use. Defaults to cxx.
Takes either cxx or pl.
=item -r
-r option is passed to chasen. Defaults to '-j'.
=cut

# Write $text to a fresh temporary file and return the file's path.
# Croaks when the data cannot be written out completely. The file is also
# registered for removal at program exit (UNLINK => 1), so it does not stay
# behind when a later pipeline stage croaks before the explicit unlink.
sub _spool_to_temp {
    my ($text) = @_;
    my ($fh, $path) = tempfile( UNLINK => 1 );
    print {$fh} $text or croak "Cannot write to $path: $!";
    close $fh or croak "Cannot close $path: $!";
    return $path;
}

# Sentence splitter command: the Perl splitter only when "-s pl" is given;
# the C++ splitter (with its model file) for "-s cxx", for any other value,
# and when -s is absent. It does not depend on the input, so it is chosen
# once here instead of once per input file.
my $splitter = ( $options{s} and $options{s} eq 'pl' )
    ? $splitter_pl
    : "$splitter_cxx $sent_model";

# Slurp mode: each read from <> returns the rest of the current input file
# in one piece.
local $/ = undef;
while ( my $text = <> ) {

    # Stage 1: sentence splitting, then chasen, then cabocha.
    my $input_file = _spool_to_temp($text);
    my $cab = `$splitter $input_file | $chasen $chasen_flag | $cabocha -I1 -f1`
        or croak "Cannot exec $cabocha:$!";
    unlink $input_file;
    if ( $options{'1'} ) {
        print $cab;
        # NOTE(review): exit status 1 (2 and 3 below) on a successful early
        # stop is kept as in the original; callers may rely on it.
        exit 1;
    }

    # Stage 2: run the ena tagger on the cabocha output.
    my $cab_file = _spool_to_temp($cab);
    my $ena = `$ena_tagger $cab_file` or croak "Cannot exec $ena_tagger:$!";
    unlink $cab_file;
    if ( $options{'2'} ) {
        print $ena;
        exit 2;
    }

    # Stage 3: run resolveZero on a private copy of the model files. The
    # copy lives in a temporary directory that is removed at program exit.
    my $ena_file = _spool_to_temp($ena);
    my $dir = tempdir( CLEANUP => 1, );
    system("cp resolveZero/models/* $dir") == 0
        or croak "Cannot exec cp resolveZero/models/:$!";
    my $zero_flag = "-d $dir -a ${intra_param} -e ${inter_param}";
    my $zero = `cat $ena_file | $resolve_zero $zero_flag`
        or croak "Cannot exec $resolve_zero:$!";
    unlink $ena_file;
    if ( $options{'3'} ) {
        print $zero;
        exit 3;
    }

    # Stage 4: pass the result through fix_id.pl and print it.
    my $zero_file = _spool_to_temp($zero);
    my $result = `cat $zero_file | $fix_id` or croak "Cannot exec $fix_id:$!";
    unlink $zero_file;
    print $result;
}
@@ -0,0 +1,28 @@
README

* 動作確認されている環境

たぶんどのような UNIX 系 OS でも動きますが、動作確認されているのは
以下の環境です。(64bit 環境や PPC などアーキテクチャが違うと動きません)

- Gentoo Linux 2005.1 (gcc-3.4.5, glibc-2.3.6-r3, 2.6.14-gentoo-r5 i686)

* 必要なソフトウェア

- TinySVM http://chasen.org/~taku/software/TinySVM/
- BACT http://chasen.org/~taku/software/bact/
- Perl モジュール CDB_File

* スクリプト

* ena-mkmodel

SVM �Υ�ǥ�Ȥʤ�����ꥹ�ȹ���

* ena-learn

ʬ������ؽ�

* ena-tagger

cabocha �ν��Ϥ���ä� event ���ɤ����Ǥ��Ф�
@@ -0,0 +1,65 @@
#!/usr/bin/perl -w

=head1 NAME
ENA -- Event Noun Annotator
$Id$
Copyright (C) 2005-2006 Mamoru KOMACHI <mamoru-k@is.naist.jp>
=head1 SYNOPSIS
ena-learn [-m file.svmmodel] file.svmdata
=head1 DESCRIPTION
Ena-learn reads SVM features and constructs SVM models for event noun
annotation. You can obtain features from ena-mkmodel, and feed them
to this program. You can specify the model file name by -m flag, or
it is calculated from given data file name.
=cut

use strict;
use warnings;

use Carp qw(croak carp);
use Data::Dumper;
use File::Temp qw(tempdir);

use FindBin qw($Bin);
use lib "$Bin/../lib";
#use ENA;

use Getopt::Std;

# -h asks for the usage text; -m FILE names the model file explicitly.
my %options;
getopts("hm:", \%options);

my $usage = "USAGE: $0 [-m file.svmmodel] file.svmdata\n";

# Exactly one positional argument must remain after option parsing; -h ends
# the run with the same usage text.
if (@ARGV != 1 or $options{h}) {
    die $usage;
}

# main

# Derive the default model file name from the training data file name by
# replacing the final extension with ".svmmodel"
# (train.svmdata -> train.svmmodel). A name without an extension gets
# ".svmmodel" appended. Dots inside directory components are left alone.
#
# The previous pattern, s/\.*/\.svmmodel/, matched the empty string at the
# start of the name and therefore prepended ".svmmodel" instead.
sub _default_model_file {
    my ($data_file) = @_;
    ( my $model = $data_file ) =~ s{ (?: \. [^./]* )? \z }{.svmmodel}x;
    return $model;
}

my $train_file = shift;

# An explicit -m always wins; otherwise the name is derived from the data file.
my $model_file = defined $options{m}
    ? $options{m}
    : _default_model_file($train_file);

# NOTE(review): $temp_dir is not used anywhere in the visible script; the
# directory is created here and removed again at exit.
my $temp_dir = tempdir( CLEANUP => 1 );

# Train the model by running svm_learn on $train_file, writing $model_file.
# (svm_learn is presumably the TinySVM trainer named in the README, with
# "-t 1 -d 3" selecting a degree-3 polynomial kernel -- TODO confirm.)
#
# The command is started in list form, so the file names never pass through
# a shell. Success is judged by the exit status: the earlier backtick form
# croaked whenever the command printed nothing and accepted any run that
# printed something, even a failed one.
#
# Returns the text svm_learn wrote to standard output.
# Croaks when the command cannot be started or exits with a non-zero status.
sub main {
    my @learn = ('svm_learn', '-t', '1', '-d', '3', $train_file, $model_file);

    open my $learner, '-|', @learn
        or croak "Cannot run @learn: $!";
    my $output = do { local $/; <$learner> };

    # For a pipe, close() waits for the child and returns false when it
    # exited with a non-zero status; the status itself is left in $?.
    close $learner
        or croak "@learn failed: $?";

    return $output;
}

main();

1;

0 comments on commit b7a8bf2

Please sign in to comment.
You can’t perform that action at this time.