/
Default.pm
243 lines (203 loc) · 6.61 KB
/
Default.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
package HTML::Robot::Scrapper::UserAgent::Default;
use Moose;
use Data::Printer;
use HTTP::Tiny;
use HTTP::Headers::Util qw(split_header_words);
use Digest::SHA1 qw(sha1 sha1_hex sha1_base64);
#use utf8;
# Read/write back-references handed to this user agent.
# `robot` is the owning robot object; this class reaches its cache, queue,
# parser and benchmark helpers through it. `engine` is stored here but is
# not referenced anywhere else in this file.
has [ qw/ robot engine / ] => ( is => 'rw' );
=head1 DESCRIPTION
This is the user agent class. It is responsible for handling page visits
and for dispatching the page content to the parsing calls.
=cut
# Per-request state, overwritten on every visit/parse cycle.
# All of these are plain read/write attributes without defaults.
for my $attribute (
    qw(
        headers
        request_headers
        response_headers
        content
        content_type
        charset
        url
    )
  )
{
    has $attribute => ( is => 'rw' );
}
=head2 ua
The default ua is HTTP::Tiny. However, it is possible to create a new class
just like this one and make it work with other user agents.
=cut
# HTTP client used to fetch pages. Defaults to a fresh HTTP::Tiny instance;
# _visit() calls its get() and request() methods.
has ua => (
    is      => 'rw',
    default => sub {
        return HTTP::Tiny->new;
    },
);
# Combined getter/setter around the `headers` attribute.
# A defined argument is stored first; the current attribute value is
# returned either way. Passing undef never clears the stored value.
sub _headers {
    my $self        = shift;
    my $new_headers = shift;

    if ( defined $new_headers ) {
        $self->headers($new_headers);
    }

    return $self->headers;
}
# Combined getter/setter around the `request_headers` attribute.
# Stores the argument when it is defined, then returns the current value.
sub _request_headers {
    my $self      = shift;
    my $new_value = shift;

    if ( defined $new_value ) {
        $self->request_headers($new_value);
    }

    return $self->request_headers;
}
# Combined getter/setter around the `response_headers` attribute.
# Stores the argument when it is defined, then returns the current value.
sub _response_headers {
    my $self      = shift;
    my $new_value = shift;

    if ( defined $new_value ) {
        $self->response_headers($new_value);
    }

    return $self->response_headers;
}
# Combined getter/setter around the `content` attribute.
# Stores the argument when it is defined (an empty string counts as
# defined), then returns the current value.
sub _content {
    my $self     = shift;
    my $new_body = shift;

    if ( defined $new_body ) {
        $self->content($new_body);
    }

    return $self->content;
}
# Combined getter/setter around the `content_type` attribute.
# Stores the argument when it is defined, then returns the current value.
sub _content_type {
    my $self     = shift;
    my $new_type = shift;

    if ( defined $new_type ) {
        $self->content_type($new_type);
    }

    return $self->content_type;
}
# Combined getter/setter around the `charset` attribute.
# Stores the argument when it is defined, then returns the current value.
sub _charset {
    my $self        = shift;
    my $new_charset = shift;

    if ( defined $new_charset ) {
        $self->charset($new_charset);
    }

    return $self->charset;
}
# Combined getter/setter around the `url` attribute.
# Stores the argument when it is defined, then returns the current value.
sub _url {
    my $self    = shift;
    my $new_url = shift;

    if ( defined $new_url ) {
        $self->url($new_url);
    }

    return $self->url;
}
# Visits the URL, loads the content into the parser (e.g. XPath), and redirects to the method.
=head2 visit
Visits the URL you appended/prepended to the queue.
Example:
$self->robot->queue->append( search => 'http://www.url.com', {
passed_key_values => {
send => 'var across requests',
some => 'vars i collected here...... and ....',
i_will => 'pass them to the next page because ...',
i_need => 'stuff from this page and the other ',
},
request => [ <---- OPTIONAL... force custom request
'GET',
'http://www.lopes.com.br/imoveis/busca/-/'.$estado.'/-/-/-/aluguel-de-0-a-10000/de-0-ate-1000-m2/-/60',
{
headers => {
'Content-Type' => 'application/x-www-form-urlencoded',
},
content => '',
}
]
} );
=cut
# Fetch the page described by a queue item (or load it from the robot's
# cache) and make sure it is parsed exactly once.
#
# Parameters:
#   $item - hashref describing the visit. $item->{url} is read here;
#           _visit() additionally honours an optional $item->{request}.
# Returns:
#   The response returned by _visit(), or the cached copy of it.
#
# _visit() already calls parse_response() on every response it fetches, so
# this method only parses explicitly on a cache hit. Parsing again after
# _visit() would run every registered parser twice for the same page.
#
# NOTE(review): only the cached path records the URL through
# queue->add_visited; confirm whether the uncached path should do the same.
# NOTE(review): the cache key is derived from $item->{url} alone, so custom
# requests that share a URL also share a cache entry - confirm this is
# intended.
sub visit {
    my ( $self, $item ) = @_;

    my $cache = $self->robot->cache;

    if ( !defined $cache ) {
        $self->url( $item->{url} ) if exists $item->{url};
        my $res = $self->_visit($item);    # fetches and parses
        return $res;
    }

    my $url = $item->{url};
    $self->url($url);
    $self->robot->queue->add_visited($url);

    my $cache_key = Digest::SHA1->new->add($url)->hexdigest;
    my $res       = $cache->get($cache_key);

    if ($res) {
        # Cache hit: nothing was fetched, so the parsers have not run yet.
        $self->parse_response($res);
        return $res;
    }

    # Cache miss: _visit() fetches and parses; only store the result here.
    # TODO: pass cache parameters properly, e.g. a 10-minute expiry.
    $res = $self->_visit($item);
    $cache->set( $cache_key, $res );
    return $res;
}
# Perform the actual HTTP request for a queue item and parse the response.
#
# When $item->{request} exists and is an array reference, its elements are
# passed straight to the user agent's request() method and element [1] is
# recorded as the current URL. Otherwise a plain get() of $item->{url} is
# issued and that URL is recorded.
#
# The response is passed to parse_response() before being returned.
sub _visit {
    my ( $self, $item ) = @_;

    my $use_custom_request =
      exists $item->{request} && ref( $item->{request} ) eq 'ARRAY';

    my $response;
    my $visited_url;

    if ($use_custom_request) {
        $response    = $self->ua->request( @{ $item->{request} } );
        $visited_url = $item->{request}[1];
    }
    else {
        $response    = $self->ua->get( $item->{url} );
        $visited_url = $item->{url};
    }

    $self->url($visited_url);
    $self->parse_response($response);

    return $response;
}
# Store the body of a response and hand the response over to
# parse_content().
#
# Parameters:
#   $res - response hashref; its `content` entry is stored in the `content`
#          attribute before the parsers run.
# Returns:
#   Whatever parse_content() returns.
sub parse_response {
    my ( $self, $res ) = @_;

    $self->content( $res->{content} );

    return $self->parse_content($res);
}
=head2 parse_content
Loops over the configured content types and, for every one that matches the
response's content type, calls the registered parser subroutine(s) on the
page content.
=cut
# Record the response metadata and run every parser registered for the
# response's content type.
#
# Parameters:
#   $res - response hashref; only $res->{headers} is read here. The body
#          handed to the parsers comes from the `content` attribute.
#
# Side effects:
#   * sets the headers, response_headers, content_type and charset
#     attributes from the response headers;
#   * calls $self->robot->parser->$parse_method( $self->content ) for every
#     parser entry whose content-type key matches;
#   * prints a notice on the selected output handle when nothing matched.
#
# No meaningful value is returned.
sub parse_content {
    my ( $self, $res ) = @_;

    my $content_types_avail = $self->robot->parser->content_types;

    # Headers.
    $self->headers( $res->{headers} );
    $self->response_headers( $res->{headers} );

    # Content type (stored as received, possibly undef).
    my $content_type = $res->{headers}->{'content-type'};
    $self->content_type($content_type);

    # Charset.
    my $content_charset = $self->charset_from_headers( $res->{headers} );
    $self->charset($content_charset);

    # A response without a content-type header must not raise
    # "uninitialized" warnings in the match or in the message below.
    my $type_to_match = defined $content_type ? $content_type : '';

    my $content_type_found = 0;
    foreach my $ct ( keys %{$content_types_avail} ) {

        # Deliberately no /g here: in scalar context /g leaves pos() set on
        # the matched string, which makes the next ^-anchored match fail
        # and silently skips every second parser for the same type.
        # NOTE(review): the key is interpolated as a regex, so a key such
        # as "application/xhtml+xml" treats "+" as a quantifier - confirm
        # whether keys are meant to be patterns or literal strings.
        next unless $type_to_match =~ m/^$ct/i;

        foreach my $parser ( @{ $content_types_avail->{$ct} } ) {
            my $parse_method = $parser->{parse_method};
            $self->robot->parser->$parse_method( $self->content );
            $content_type_found = 1;
        }
    }

    print "**** Content type not set for: " . $type_to_match . '... please configure it correctly adding a parser for that content type'."\n" if !$content_type_found;

    return;
}
# Extract the charset parameter from a response's content-type header.
#
# Parameters:
#   $headers - hashref of response headers; only the 'content-type' entry
#              is read.
# Returns:
#   The charset value (e.g. "utf-8"), with surrounding quotes removed when
#   the parameter was sent as a quoted string, or undef when the header is
#   missing or carries no charset parameter.
sub charset_from_headers {
    my ( $self, $headers ) = @_;

    my $content_type = $headers->{'content-type'};
    my $charset;

    # The value ends at the first separator, whitespace or quote, so both
    # charset=utf-8 and charset="utf-8" yield a bare "utf-8".
    if ( defined $content_type
        && $content_type =~ m/charset=["']?([^\s;"']+)/i )
    {
        $charset = $1;
    }

    return $charset;
}
# Turn a possibly relative URL into an absolute one.
#
# Parameters:
#   $url - URL to normalize; may be relative.
# Returns:
#   The absolute URL as a string, resolved against the `url` attribute, or
#   nothing (undef in scalar context) when $url is undef.
#
# Side effect: when the `url` attribute is still undefined, $url itself is
# stored there first and therefore serves as its own base.
sub normalize_url {
    my ( $self, $url ) = @_;

    return if !defined $url;

    # URI is not loaded anywhere else in this file; load it on demand so
    # the call below does not depend on another module having done so.
    require URI;

    $self->url($url) if !defined $self->url;

    return URI->new_abs( $url, $self->url )->as_string;
}
# Method modifier: tell the robot's benchmark object that 'visit' is
# starting, right before every call to visit().
before visit => sub {
    my $self = shift;
    $self->robot->benchmark->method_start('visit');
};
# Method modifier: tell the robot's benchmark object that 'visit' has
# finished, right after every call to visit() returns.
after visit => sub {
    my $self = shift;
    $self->robot->benchmark->method_finish('visit');
};
1;