Skip to content
Newer
Older
100755 322 lines (251 sloc) 10 KB
6d294d1 initial import
Maurice Aubrey authored
1 #!/usr/bin/perl -w
2
3 # ip2host - Resolve IPs to hostnames in web server logs
4 # Maurice Aubrey <maurice.aubrey+ip2host@gmail.com>
5 #
6 # Usage: ip2host [OPTIONS] [cache_file] < infile > outfile
7 #
8 # For a usage summary, run: ip2host --usage
9 # For documentation, run: perldoc ip2host
10
11 our $CHILDREN = 40; # Number of processes to fork
12 our $TIMEOUT = 20; # DNS timeout
13 our $BUFFER = 50000; # Maximum number of log lines to keep in memory
14 our $FLUSH = 500; # Flush output buffer every $FLUSH lines
15 our $CACHE = ''; # Optional disk cache file to use
16 our $TTL = 86400 * 7; # Seconds until disk cached ips are expired
17 our $DEBUG = 0;
23e29d7 @gwolf Add IPv6 support
gwolf authored
18 # Regular expression for matching either an IPv4 or an IPv6 address
6d294d1 initial import
Maurice Aubrey authored
19 # $1 should be the IP
23e29d7 @gwolf Add IPv6 support
gwolf authored
20 # IPv6 regex from http://people.spodhuis.org/phil.pennock/software/emit_ipv6_regexp-0.304
21 # Just added the IPv4 bit at the beginning.
22 our $REGEX = '\b (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|(?^x:(?:(?:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):){6})(?^:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):(?^:(?:[0-9a-fA-F]{1,4})))|(?^:(?:(?:(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:::(?:(?^:(?:[0-9a-fA-F]{1,4})):){5})(?^:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):(?^:(?:[0-9a-fA-F]{1,4})))|(?^:(?:(?:(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})))?::(?:(?^:(?:[0-9a-fA-F]{1,4})):){4})(?^:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):(?^:(?:[0-9a-fA-F]{1,4})))|(?^:(?:(?:(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):){0,1}(?^:(?:[0-9a-fA-F]{1,4})))?::(?:(?^:(?:[0-9a-fA-F]{1,4})):){3})(?^:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):(?^:(?:[0-9a-fA-F]{1,4})))|(?^:(?:(?:(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):){0,2}(?^:(?:[0-9a-fA-F]{1,4})))?::(?:(?^:(?:[0-9a-fA-F]{1,4})):){2})(?^:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):(?^:(?:[0-9a-fA-F]{1,4})))|(?^:(?:(?:(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):){0,3}(?^:(?:[0-9a-fA-F]{1,4})))?::(?^:(?:[0-9a-fA-F]{1,4})):)(?^:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):(?^:(?:[0-9a-fA-F]{1,4})))|(?^:(?:(?:(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):){0,4}(?^:(?:[0-9a-fA-F]{1,4})))?::)(?^:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):(?^:(?:[0-9a-fA-F]{1,4})))|(?^:(?:(?:(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?^:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):){0,5}(?^:(?:[0-9a-fA-F]{1,4})))?::)(?^:(?:[0-9a-fA-F]{1,4})))|(?:(?:(?:(?:(?^:(?:[0-9a-fA-F]{1,4})):){0,6}(?^:(?:[0-9a-fA-F]{1,4})))?::))))) \b';
6d294d1 initial import
Maurice Aubrey authored
23
24 use strict;
25 use vars qw( %Opt %Buffer %Pending %Cache $Output_Line $Input_Line );
26 use Socket;
27 use IO::Handle;
28 use IO::Select;
29 use Getopt::Long;
30
0bec752 @mla merge gunnar wolf's ipv6 changes
authored
31 our $VERSION = '1.12';
6d294d1 initial import
Maurice Aubrey authored
32
33 BEGIN {
34 package IP_Cache;
35
36 use strict;
37
38 our $TTL;
39 our @ISA = qw/ DB_File /;
40
41 sub TIEHASH {
42 my $class = shift;
43 $TTL = shift;
44
45 require DB_File;
46 $class->SUPER::TIEHASH(@_);
47 }
48
49 # In order to implement EXISTS, we'd need to parse
50 # the value to see if the IP has expired, which is just
51 # as expensive as FETCH. So we'll just make sure we
52 # never use it.
53 sub EXISTS { die 'exists not implemented!' }
54
55 sub _DB_FETCH {
56 my $self = shift;
57 my $ip = shift;
58
59 my $val = $self->SUPER::FETCH($ip) or return;
60
61 my($utc, $host) = split /:/, $val, 2;
62 time - $utc < $TTL or return;
63
64 return $host;
65 }
66
67 {
68 my %cache;
69
70 sub FETCH {
71 my $self = shift;
72 my $ip = shift;
73
74 return $cache{ $ip } if exists $cache{ $ip };
75 return $cache{ $ip } = $self->_DB_FETCH($ip);
76 }
77
78 sub STORE {
79 my $self = shift;
80 my($ip, $host) = @_;
81
82 $cache{ $ip } = $host;
83 $self->SUPER::STORE( $ip => (time . ':' . $host) );
84 }
85 }
86 }
87
88 sub usage {
89 my $exit = shift || 0;
90
91 print STDERR <<EOF;
92 $0 version $VERSION
93
94 Usage: $0 [OPTIONS] [cache_file] < input_log > output_log
95
96 See the POD for more details: perldoc $0
97
f860971 @mla update copyright
authored
98 Copyright 1999-2012, Maurice Aubrey <maurice\@hevanet.com>
6d294d1 initial import
Maurice Aubrey authored
99
100 This module is free software; you may redistribute it and/or
101 modify it under the same terms as Perl itself.
102 EOF
103
104 exit $exit;
105 }
106
107 sub resolve_ips($) {
108 my $parent_fh = shift;
109
110 $SIG{'ALRM'} = sub { die 'alarmed' };
111
112 while(my $ip = <$parent_fh>) { # Get IP to resolve
113 chomp($ip);
114 my $host = undef;
115 eval { # Try to resolve, but give up after $TIMEOUT seconds
116 alarm($Opt{timeout});
0bec752 @mla merge gunnar wolf's ipv6 changes
authored
117
118 my $ip_struct;
23e29d7 @gwolf Add IPv6 support
gwolf authored
119 if ($ip_struct = inet_aton($ip)) {
0bec752 @mla merge gunnar wolf's ipv6 changes
authored
120 $host = gethostbyaddr $ip_struct, AF_INET;
23e29d7 @gwolf Add IPv6 support
gwolf authored
121 } elsif ($ip_struct = Socket::inet_pton(AF_INET6, $ip)) {
0bec752 @mla merge gunnar wolf's ipv6 changes
authored
122 $host = gethostbyaddr $ip_struct, AF_INET6;
046fc29 apply patch to silence warnings
Maurice Aubrey authored
123 }
0bec752 @mla merge gunnar wolf's ipv6 changes
authored
124
6d294d1 initial import
Maurice Aubrey authored
125 alarm(0);
126 };
127 # XXX Debug
128 if ($Opt{debug} and $@ =~ /alarmed/) {
129 $host ||= 'TIMEOUT';
130 # warn "Alarming ($ip)...\n";
131 }
132 $host ||= $ip;
133 print $parent_fh "$ip $host\n";
134 }
135 }
136
137 sub fork_children($) {
138 my $num_children = shift;
139
140 my $read_set = IO::Select->new;
141 my $write_set = IO::Select->new;
142
143 for(my $child = 1; $child <= $num_children; $child++) {
144 my($child_fh, $parent_fh) = (IO::Handle->new, IO::Handle->new);
145 socketpair($child_fh, $parent_fh, AF_UNIX, SOCK_STREAM, PF_UNSPEC)
146 or die "socketpair failed: $!";
147 $child_fh->autoflush;
148 $parent_fh->autoflush;
149
150 if (my $pid = fork) { # parent
151 close $parent_fh;
152 $write_set->add($child_fh); # Start out writing to children
153 } else { # child
154 defined $pid or die "fork failed: $!";
155 close $child_fh; close STDIN; close STDOUT;
156 resolve_ips($parent_fh);
157 exit 0;
158 }
159 }
160
161 return ($read_set, $write_set);
162 }
163
164 # Write as many lines as we can until we come across one
165 # that's missing (that means it's still pending DNS).
166 sub flush_output {
167 for (; exists $Buffer{ $Output_Line }; $Output_Line++) {
168 print delete $Buffer{ $Output_Line };
169 }
170 }
171
172 %Opt = (
173 children => $CHILDREN,
174 timeout => $TIMEOUT,
175 buffer => $BUFFER,
176 flush => $FLUSH,
177 cache => $CACHE,
178 ttl => $TTL,
179 debug => $DEBUG,
180 regex => $REGEX,
181 );
182 GetOptions(\%Opt,
183 "children|kids=i",
184 "timeout=i",
185 "buffer=i",
186 "flush=i",
187 "ttl=i",
188 "cache=s",
189 "usage|help|version",
190 "debug",
191 "regex=s",
192 );
193 usage(0) if $Opt{usage};
194 usage(1) if @ARGV > 1;
195 $Opt{cache} = shift if @ARGV;
196 $Opt{regex} = qr/$Opt{regex}/sx;
197
198 my($read_set, $write_set) = fork_children($Opt{children});
199
200 if ($Opt{cache}) { # Cache results to disk if asked
201 tie %Cache, 'IP_Cache', $Opt{ttl}, $Opt{cache}
202 or die "unable to tie '$Opt{cache}': $!";
203 }
204
205 $Output_Line = 1;
206 $Input_Line = 0;
207 while (1) {
208 my $buffer_full = $Input_Line - $Output_Line >= $Opt{buffer};
209
210 my($readable, $writable) = IO::Select->select(
211 $read_set,
212 $buffer_full ? undef : $write_set, # Throttle if buffer is full
213 undef
214 );
215
216 while (@$writable) { # One or more children ready for IP
217 my $line = <STDIN>;
218 $Input_Line++;
219 unless (defined $line) {
220 undef $write_set;
221 last;
222 }
223 my($ip) = ($line =~ /$Opt{regex}/);
224 my($before, $after) = ($`, $');
225 if (not defined $ip) { # No IP seen, pass it through unmolested
226 $Buffer{ $Input_Line } = $line;
227 } elsif (my $host = $Cache{ $ip }) { # We found this answer already
228 $Buffer{ $Input_Line } = "$before$host$after";
229 } elsif (exists $Pending{ $ip }) { # We're still looking
230 push @{ $Pending{ $ip } }, [ $Input_Line, $before, $after ];
231 } else { # Send IP to child
232 my $fh = shift @$writable;
233 print $fh "$ip\n";
234 $Pending{ $ip } = [ [ $Input_Line, $before, $after ] ];
235 $write_set->remove($fh);
236 $read_set->add($fh); # Move to read set to wait for answer
237 }
238
239 flush_output if $Input_Line % $Opt{flush} == 0;
240 }
241
242 while (@$readable) { # One or more children have an answer
243 my $fh = shift @$readable;
244 chomp(my $str = <$fh>);
245 my($ip, $host) = split / /, $str, 2;
246 $Cache{ $ip } = $host;
247 # Take all the lines that were pending for this IP and
248 # toss them into the output buffer
249 foreach my $pending (@{ $Pending{ $ip } }) {
250 $Buffer{ $pending->[0] } = "$pending->[1]$host$pending->[2]";
251 }
252 delete $Pending{ $ip };
253 $read_set->remove($fh);
254 $write_set->add($fh) if defined $write_set; # Ready for new question
255 }
256
257 last if not defined $write_set and not keys %Pending;
258 flush_output if $buffer_full;
259 }
260
261 flush_output;
262 exit 0;
263
264 =pod
265
266 =head1 NAME
267
268 ip2host - Resolves IPs to hostnames in web server logs
269
270 =head1 SYNOPSIS
271
272 ip2host [OPTIONS] [cache_file] < infile > outfile
273
274 infile - Web server log file.
275
276 outfile - Same as input file, but with IPs resolved to hostnames.
277
278 Options:
279
280 --children=... Number of child processes to spawn (default: 40)
281 --timeout=... Seconds to wait on DNS response (default: 20)
282 --buffer=... Maximum number of log lines to keep in
283 memory (default: 50000)
284 --flush=... Number of lines to process before flushing
285 output buffer (default: 500)
286 --cache=... Filename to use as disk cache (default: none)
287 --ttl=... Number of seconds before IPs cached on disk are expired
288 (default: 604800 - One week)
289
290 =head1 DESCRIPTION
291
292 This is a faster, drop-in replacement for the logresolve
293 utility distributed with the Apache web server.
294
295 It's been reported to work under Linux, FreeBSD, Solaris,
296 Tru64, and IRIX.
297
298 =head1 AUTHOR
299
300 Maurice Aubrey E<lt>maurice.aubrey+ip2host@gmail.comE<gt>
301
302 Based on the logresolve.pl script by Rob Hartill.
303
304 =head1 COPYRIGHT
305
f860971 @mla update copyright
authored
306 Copyright 1999-2012, Maurice Aubrey E<lt>maurice.aubrey+ip2host@gmail.comE<gt>.
6d294d1 initial import
Maurice Aubrey authored
307
308 This module is free software; you may redistribute it and/or
309 modify it under the same terms as Perl itself.
310
311 =head1 README
312
313 Resolves IPs to hostnames in web server logs.
314 This is a faster, drop-in replacement for the logresolve utility
315 distributed with the Apache web server.
316
317 =head1 SCRIPT CATEGORIES
318
319 Web
320
321 =cut
Something went wrong with that request. Please try again.