Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
add moses' truecaser
  • Loading branch information
pks committed Nov 12, 2015
1 parent d9896c2 commit 17f5ee8
Show file tree
Hide file tree
Showing 3 changed files with 304 additions and 0 deletions.
88 changes: 88 additions & 0 deletions detruecase.perl
@@ -0,0 +1,88 @@
#!/usr/bin/perl -w

use strict;
use Getopt::Long "GetOptions";

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

my ($SRC,$INFILE,$UNBUFFERED);
die("detruecase.perl < in > out")
unless &GetOptions('headline=s' => \$SRC,
'in=s' => \$INFILE,
'b|unbuffered' => \$UNBUFFERED);
if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }

my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&quot;"=>1,"&apos;"=>1,"&#91;"=>1,"&#93;"=>1);

# lowercase even in headline
my %ALWAYS_LOWER;
foreach ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with") { $ALWAYS_LOWER{$_} = 1; }

# find out about the headlines
my @HEADLINE;
if (defined($SRC)) {
open(SRC,$SRC);
my $headline_flag = 0;
while(<SRC>) {
$headline_flag = 1 if /<hl>/;
$headline_flag = 0 if /<.hl>/;
next unless /^<seg/;
push @HEADLINE, $headline_flag;
}
close(SRC);
}

my $sentence = 0;
if ($INFILE) {
open(IN,$INFILE) || die("ERROR: could not open file '$INFILE'");
binmode(IN, ":utf8");
while(<IN>) {
&process($_,$sentence++);
}
close(IN);
}
else {
while(<STDIN>) {
&process($_,$sentence++);
}
}

sub process {
my $line = $_[0];
chomp($line);
$line =~ s/^\s+//;
$line =~ s/\s+$//;
my @WORD = split(/\s+/,$line);

# uppercase at sentence start
my $sentence_start = 1;
for(my $i=0;$i<scalar(@WORD);$i++) {
&uppercase(\$WORD[$i]) if $sentence_start;
if (defined($SENTENCE_END{ $WORD[$i] })) { $sentence_start = 1; }
elsif (!defined($DELAYED_SENTENCE_START{$WORD[$i] })) { $sentence_start = 0; }
}

# uppercase headlines {
if (defined($SRC) && $HEADLINE[$sentence]) {
foreach (@WORD) {
&uppercase(\$_) unless $ALWAYS_LOWER{$_};
}
}

# output
my $first = 1;
foreach (@WORD) {
print " " unless $first;
$first = 0;
print $_;
}
print "\n";
$sentence++;
}

sub uppercase {
my ($W) = @_;
$$W = uc(substr($$W,0,1)).substr($$W,1);
}
112 changes: 112 additions & 0 deletions train-truecaser.perl
@@ -0,0 +1,112 @@
#!/usr/bin/perl -w

# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $

#
# Options:
#
# --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token.
#

use strict;
use Getopt::Long "GetOptions";

# apply switches
my ($MODEL,$CORPUS);
die("train-truecaser.perl --model truecaser --corpus cased [--possiblyUseFirstToken]")
unless &GetOptions('corpus=s' => \$CORPUS,
'model=s' => \$MODEL,
'possiblyUseFirstToken' => \(my $possiblyUseFirstToken = 0))
&& defined($CORPUS) && defined($MODEL);
my %CASING;
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
open(CORPUS,$CORPUS) || die("ERROR: could not open '$CORPUS'");
binmode(CORPUS, ":utf8");
while(<CORPUS>) {
chop;
my ($WORD,$MARKUP) = split_xml($_);
my $start = 0;
while($start<=$#$WORD && defined($DELAYED_SENTENCE_START{$$WORD[$start]})) { $start++; }
my $firstWordOfSentence = 1;
for(my $i=$start;$i<=$#$WORD;$i++) {
my $currentWord = $$WORD[$i];
if (! $firstWordOfSentence && defined($SENTENCE_END{$$WORD[$i-1]})) {
$firstWordOfSentence = 1;
}

my $currentWordWeight = 0;
if (! $firstWordOfSentence) {
$currentWordWeight = 1;
} elsif ($possiblyUseFirstToken) {
# gated special handling of first word of sentence
my $firstChar = substr($currentWord, 0, 1);
if (lc($firstChar) eq $firstChar) {
# if the first character is not upper case, count the token as full evidence (because if it's not capitalized, then there's no reason to be wary that the given casing is only due to being sentence-initial)
$currentWordWeight = 1;
} elsif (scalar(@$WORD) == 1) {
# if the first character is upper case, but the current token is the only token of the segment, then count the token as partial evidence (because the segment is presumably not a sentence and the token is therefore not the first word of a sentence and is possibly in its natural case)
$currentWordWeight = 0.1;
}
}
if ($currentWordWeight > 0) {
$CASING{ lc($currentWord) }{ $currentWord } += $currentWordWeight;
}

$firstWordOfSentence = 0;
}
}
close(CORPUS);

open(MODEL,">$MODEL") || die("ERROR: could not create '$MODEL'");
binmode(MODEL, ":utf8");
foreach my $type (keys %CASING) {
my ($score,$total,$best) = (-1,0,"");
foreach my $word (keys %{$CASING{$type}}) {
my $count = $CASING{$type}{$word};
$total += $count;
if ($count > $score) {
$best = $word;
$score = $count;
}
}
print MODEL "$best ($score/$total)";
foreach my $word (keys %{$CASING{$type}}) {
print MODEL " $word ($CASING{$type}{$word})" unless $word eq $best;
}
print MODEL "\n";
}
close(MODEL);


# store away xml markup
sub split_xml {
my ($line) = @_;
my (@WORD,@MARKUP);
my $i = 0;
$MARKUP[0] = "";
while($line =~ /\S/) {
# XML tag
if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
$MARKUP[$i] .= $1." ";
$line = $2;
}
# non-XML text
elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
$WORD[$i++] = $1;
$MARKUP[$i] = "";
$line = $2;
}
# '<' or '>' occurs in word, but it's not an XML tag
elsif ($line =~ /^\s*(\S+)(.*)$/) {
$WORD[$i++] = $1;
$MARKUP[$i] = "";
$line = $2;
}
else {
die("ERROR: huh? $line\n");
}
}
chop($MARKUP[$#MARKUP]);
return (\@WORD,\@MARKUP);
}
104 changes: 104 additions & 0 deletions truecase.perl
@@ -0,0 +1,104 @@
#!/usr/bin/perl -w

# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
use strict;
use Getopt::Long "GetOptions";

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

# apply switches
my ($MODEL, $UNBUFFERED);
die("truecase.perl --model MODEL [-b] < in > out")
unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED)
&& defined($MODEL);
if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }

my (%BEST,%KNOWN);
open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'");
binmode(MODEL, ":utf8");
while(<MODEL>) {
my ($word,@OPTIONS) = split;
$BEST{ lc($word) } = $word;
$KNOWN{ $word } = 1;
for(my $i=1;$i<$#OPTIONS;$i+=2) {
$KNOWN{ $OPTIONS[$i] } = 1;
}
}
close(MODEL);

my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);

while(<STDIN>) {
chop;
my ($WORD,$MARKUP) = split_xml($_);
my $sentence_start = 1;
for(my $i=0;$i<=$#$WORD;$i++) {
print " " if $i && $$MARKUP[$i] eq '';
print $$MARKUP[$i];

my ($word,$otherfactors);
if ($$WORD[$i] =~ /^([^\|]+)(.*)/)
{
$word = $1;
$otherfactors = $2;
}
else
{
$word = $$WORD[$i];
$otherfactors = "";
}

if ($sentence_start && defined($BEST{lc($word)})) {
print $BEST{lc($word)}; # truecase sentence start
}
elsif (defined($KNOWN{$word})) {
print $word; # don't change known words
}
elsif (defined($BEST{lc($word)})) {
print $BEST{lc($word)}; # truecase otherwise unknown words
}
else {
print $word; # unknown, nothing to do
}
print $otherfactors;

if ( defined($SENTENCE_END{ $word })) { $sentence_start = 1; }
elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; }
}
print $$MARKUP[$#$MARKUP];
print "\n";
}

# store away xml markup
sub split_xml {
my ($line) = @_;
my (@WORD,@MARKUP);
my $i = 0;
$MARKUP[0] = "";
while($line =~ /\S/) {
# XML tag
if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
$MARKUP[$i] .= $1." ";
$line = $2;
}
# non-XML text
elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
$WORD[$i++] = $1;
$MARKUP[$i] = "";
$line = $2;
}
# '<' or '>' occurs in word, but it's not an XML tag
elsif ($line =~ /^\s*(\S+)(.*)$/) {
$WORD[$i++] = $1;
$MARKUP[$i] = "";
$line = $2;
}
else {
die("ERROR: huh? $line\n");
}
}
chop($MARKUP[$#MARKUP]);
return (\@WORD,\@MARKUP);
}

0 comments on commit 17f5ee8

Please sign in to comment.