/
usage.pl
105 lines (88 loc) · 3.89 KB
/
usage.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/perl -w
use utf8;
use strict;
use warnings;
use diagnostics;
use HTMLPage;
use Benchmark::Timer;
use Encode;
use constant SEPARATOR => '&#&';
open(FILE, 'input.html');
my $html = join('', <FILE>);
close(FILE);
my $replacements = {
'Nylofor' =>
'<a class="replaced" href="test-nylofor-1.html">{keyword}</a>' . SEPARATOR .
'<a class="replaced" href="test-nylofor-2.html">{keyword}</a>' . SEPARATOR .
'<a class="replaced" href="test-nylofor-3.html">{keyword}</a>' . SEPARATOR .
'<a class="replaced" href="test-nylofor-4.html">{keyword}</a>' . SEPARATOR .
'<a class="replaced" href="test-nylofor-5.html">{keyword}</a>' . SEPARATOR .
'<a class="replaced" href="test-nylofor-6.html">{keyword}</a>',
'kontakt\\w*' =>
'<a class="replaced" href="test-kontakt-1.html">{keyword}</a>' . SEPARATOR .
'<a class="replaced" href="test-kontakt-2.html">{keyword}</a>' . SEPARATOR .
'<a class="replaced" href="test-kontakt-3.html">{keyword}</a>' . SEPARATOR .
'<a class="replaced" href="test-kontakt-4.html">{keyword}</a>',
'video' => '<a class="replaced" href="test-video.html">{keyword}</a>',
'lze vůbec vybrat' => '<a class="replaced" href="test-lze-vybrat.html">{keyword}</a>',
'špatném utkání' => '<a class="replaced" href="test-spatnem-utkani.html">{keyword}</a>',
'klubu však zabodl' => '<a class="replaced" href="test-klubu-zabodl.html">{keyword}</a>',
'zázra\\w*' => '<a class="replaced" href="test-zazra.html">{keyword}</a>',
'komplet\\w* péč\\w*' => '<a class="replaced" href="test-komplet-pec.html">{keyword}</a>',
'dámsk\\w* mód\\w*' => '<a class="replaced" href="test-damsk-mod.html">{keyword}</a>',
'oblečení' => '<a class="replaced" href="test-obleceni.html">{keyword}</a>',
'křesl\\w*' => '<a class="replaced" href="test-kresl.html">{keyword}</a>',
# 'nabíz\\w*' => '<a class="replaced" href="test-nabiz.html">{keyword}</a>',
'nabíz\\w*' =>
'<a class="replaced" href="test-nabiz-novy-ako-slak_priorita.html">{keyword}</a>' . SEPARATOR .
'<a class="replaced" href="test-nabiz.html">{keyword}</a>',
'hydin\\w*' =>
'<a class="replaced" href="hydina-1">{keyword}</a>' . '####' .
'<a class="replaced" href="hydina-2">{keyword}</a>',
'o’neill' => '<a class="replaced" href="test-regexp.html">{keyword}</a>',
'dolce & gabbana' => '<a class="replaced" href="test-regexp.html">{keyword}</a>',
'croc´s' => '<a class="replaced" href="test-regexp.html">{keyword}</a>',
'traction 17,5“ 2012' => '<a class="replaced" href="test-regexp.html">{keyword}</a>',
'aslı çakır alptekinová' => '<a class="replaced" href="test-regexp.html">{keyword}</a>',
'petr rezek (zpěvák)' => '<a class="replaced" href="test-regexp.html">{keyword}</a>',
'black+blum' => '<a class="replaced" href="test-regexp.html">{keyword}</a>',
'!solid' => '<a class="replaced" href="test-regexp.html">{keyword}</a>',
'smash!' => '<a class="replaced" href="test-regexp.html">{keyword}</a>',
};
my $searches = [
'video',
'chelsea',
'důkladn\\w* rozčešte'
];
binmode(STDERR, ":utf8");
binmode(STDOUT, ":utf8");
my $timer = Benchmark::Timer->new();
$timer->start('creating page');
$html = Encode::decode_utf8($html);
my $page = HTMLPage->from_string($html);
$timer->stop;
eval {
$timer->start('parsing words');
$page->replace($replacements, $searches, SEPARATOR);
$timer->stop;
};
if(my $e = $@) {
print STDERR "Logging of unexpected exception: ", $e;
}
else {
$html = $page->get_html();
}
$timer->report;
print $html;
print STDERR "---------- replacements -------\n";
my @replacements = @{$page->{'word_replacements'}};
my @snippets = @{$page->{'replaced_snippets'}};
for my $index (0 .. $page->get_replacements_count() - 1) {
my $word = $replacements[$index];
my $snippet = $snippets[$index];
print STDERR "$word: $snippet\n";
}
print STDERR "---------- all found -------\n";
foreach my $word (@{$page->{'all_words'}}) {
print STDERR "$word\n";
}