/
naive_bayesian.pl
92 lines (78 loc) · 2.27 KB
/
naive_bayesian.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
use v6;
my %words;
# Fill the global %words table from the "words.db.pl" file.
#
# Each line of the file is split on a tab; the first field becomes the key
# and the second field the value stored in %words.
# When the file does not exist the sub returns without touching %words.
# Dies when the file exists but cannot be opened.
sub load_db returns Void {
    unless ("words.db.pl" ~~ :e) {
        return();
    }
    my $fh = open("words.db.pl") orelse die "Cannot open the words.db.pl file: $!";
    for $fh.lines -> $record {
        my ($name, $tally) = split("\t", $record);
        %words{"$name"} = $tally;
    }
    $fh.close();
}
# Write the global %words table to the "words.db.pl" file, opened in write
# mode, as one "key<TAB>value" line per entry.
# Dies when the file cannot be opened.
sub save_db returns Void {
    my $out = open("words.db.pl", :w) orelse die "Cannot open the words.db.pl file: $!";
    for (%words.keys) -> $name {
        $out.say(join("\t", $name, %words{$name}));
    }
    $out.close();
}
# Count the words found in a text file.
#
# $file - path of the file to read.
#
# Returns a hash that maps each lower-cased word to the number of times it
# was counted. A word is a run of \w characters that is followed by a space,
# tab, newline or carriage return, or that ends the line.
# Dies with a message naming $file when the file cannot be opened.
sub parse_file (Str $file) returns Hash {
    my %words_in_file;
    my $fh = open("$file") orelse die "Cannot open the '$file' file: $!";
    for $fh.lines -> $_line {
        # Work on a copy: the substitution below consumes the line one word
        # at a time.
        my $line = $_line;
        # The `|$` alternative keeps the last word of a line that has no
        # terminator after it (a chomped line, or a final line without a
        # trailing newline). Requiring a whitespace character there would
        # silently drop that word.
        while ($line ~~ s:P5/(\w+)(?:[ \t\n\r]|$)//) {
            %words_in_file{lc($0)}++;
        }
    }
    $fh.close;
    return %words_in_file;
}
# Add the word counts of one document to the global %words table.
#
# $category      - category the document belongs to.
# %words_in_file - word => occurrence count for the document.
#
# Every count is added to the %words entry whose key is "category-word".
sub add_words (Str $category, %words_in_file) returns Void {
    for (%words_in_file.keys) -> $word {
        %words{"$category-$word"} += %words_in_file{$word};
    }
}
# Score a document against every category found in the global %words table
# and print one "category score" line per category, highest score first.
#
# %words_in_file - word => occurrence count for the document; only the keys
#                  (the distinct words) are used.
#
# The score of a category is
#     log(count(category) / total)
#   + the sum, over the distinct words of the document, of
#     log(count(category-word) / count(category))
# where a word that has no "category-word" entry contributes
# log(0.01 / count(category)) instead.
sub classify (%words_in_file) returns Void {
    my %count;
    my $total = 0;
    for (%words.kv) -> $key, $value {
        # Keys are expected to look like "category-word". Skip any key that
        # does not, so that it is not tallied under a category taken from an
        # undefined capture and does not distort $total.
        next unless ($key ~~ rx:P5/^(.+)-(.+)$/);
        %count{$0} += $value;
        $total += $value;
    }
    my %score;
    for (%words_in_file.keys) -> $word {
        for (%count.kv) -> $category, $count {
            if (defined(%words{"$category-$word"})) {
                %score{$category} += log(%words{"$category-$word"} / $count);
            }
            else {
                # Word not recorded for this category: use a small constant
                # in place of its count so the logarithm stays defined.
                %score{$category} += log(0.01 / $count);
            }
        }
    }
    # Add the share of all counted words that belongs to each category.
    for (%count.kv) -> $category, $count {
        %score{$category} += log($count / $total);
    }
    # The comparison is spelled out by hand because
    #   %score{$^a} <=> %score{$^b}
    # does not currently work.
    for (%count.keys.sort: { %score{$^a} == %score{$^b} ?? 0 !! %score{$^a} > %score{$^b} ?? -1 !! 1 }) -> $category {
        say("$category %score{$category}");
    }
}
# Command-line entry point.
#   add <category> <file>  - add the word counts of <file> under <category>
#   classify <file>        - print a score for every known category
# Any other invocation prints the usage text.
load_db();
# Check the argument count before reading @*ARGS[0], so that running the
# script without arguments never compares an undefined value with a string.
if (+@*ARGS == 3 && @*ARGS[0] eq 'add') {
    add_words(@*ARGS[1], parse_file(@*ARGS[2]));
    # 'add' is the only command that changes %words, so it is the only one
    # that has to write the database back.
    save_db();
}
elsif (+@*ARGS == 2 && @*ARGS[0] eq 'classify') {
    classify(parse_file(@*ARGS[1]));
}
else {
    say("USAGE:
add <category> <file>
classify <file>");
}