Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 117 lines (89 sloc) 3.51 KB
#!/usr/bin/php
<?php
//
//
// By Pete Warden <pete@petewarden.com>, freely reusable, see http://petewarden.typepad.com for more
require_once('parallelcurl.php');
require_once('cliargs.php');
// Pattern for a single school entry in a rank listing page. It is matched
// against the page with newlines removed, so the single spaces between tags
// are significant. Capture groups: 1 = link href, 2 = text of the bold
// <span>, 3 = text of the italic <p>.
define(
    'SCHOOL_RE',
    '@<a href="([^"]+)">'
    .'<div class="va-search-item"> '
    .'<div class="clear" > '
    .'<span style="font-size:15px; font-weight: bold" >([^<]+)</span> '
    .'</div> '
    .'<p style="font-style:italic;">([^<]+)</p>'
    .'</div>@'
);
// This function gets called back for each request that completes
/**
 * Completion callback for one fetched rank listing page: pulls every school
 * entry out of the HTML with SCHOOL_RE and appends one CSV row per school.
 *
 * Each row is: address, "<name> - <rank>" tooltip, absolute URL, rank.
 *
 * @param string   $content Response body of the finished request.
 * @param string   $url     URL that was requested (only used in the error message).
 * @param resource $ch      cURL handle of the finished request.
 * @param array    $data    Caller context with keys 'output_handle' (stream the
 *                          CSV rows are written to) and 'rank'.
 * @return void
 */
function on_request_done($content, $url, $ch, $data) {
    $httpcode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    if ($httpcode !== 200) {
        // Goes to the error log, not stdout: the CSV itself may be written to
        // php://stdout, and a printed message would end up inside the data.
        error_log("Fetch error $httpcode for '$url'");
        return;
    }

    $output_handle = $data['output_handle'];
    $rank = $data['rank'];

    // SCHOOL_RE expects each entry on a single line with single spaces between
    // tags, so drop line breaks first. "\r" is removed as well so a CRLF
    // response does not leave stray characters that defeat the pattern.
    $text = str_replace(array("\r", "\n"), '', $content);

    // preg_match_all() yields 0 for "no entries" and false for a PCRE error;
    // either way there is nothing to write for this page.
    if (!preg_match_all(SCHOOL_RE, $text, $matches, PREG_SET_ORDER)) {
        error_log("Failed to match RE with '$text'");
        return;
    }

    foreach ($matches as $match) {
        $url_suffix = trim($match[1]);
        $name = trim($match[2]);
        $address = trim($match[3]);

        $tooltip = $name.' - '.$rank;
        // The captured href is site-relative, so prefix the host.
        $full_url = 'http://projects.latimes.com'.$url_suffix;

        fputcsv($output_handle, array($address, $tooltip, $full_url, $rank));
    }
}
// Command-line option table, keyed by long option name. Every entry gives a
// one-letter short form, whether the option is 'required' or 'optional', a
// help description, and (for optional ones) a default value.
$cliargs = array();

// Where the CSV goes.
$cliargs['output'] = array(
    'short' => 'o',
    'type' => 'optional',
    'description' => 'The file to write the output list of URLs to - if unset will write to stdout',
    'default' => 'php://stdout',
);

// Concurrency cap for the crawler.
$cliargs['maxrequests'] = array(
    'short' => 'm',
    'type' => 'optional',
    'description' => 'How many requests to run in parallel',
    'default' => '10',
);

// Identification of whoever runs the crawl; both values are mandatory.
$cliargs['organization'] = array(
    'short' => 'r',
    'type' => 'required',
    'description' => 'The name of the organization or company running this crawler',
);
$cliargs['email'] = array(
    'short' => 'e',
    'type' => 'required',
    'description' => 'An email address where server owners can report any problems with this crawler',
);
// Remove PHP's memory ceiling for this run.
ini_set('memory_limit', '-1');

$options = cliargs_get_options($cliargs);
$output = $options['output'];
$max_requests = $options['maxrequests'];
$organization = $options['organization'];
$email = $options['email'];

// The crawler must identify its operator in the user agent, so refuse to run
// without both values. strpos() gives false when '@' is missing and 0 when it
// is the first character; "!strpos" deliberately rejects both.
if (empty($organization) || empty($email) || (!strpos($email, '@'))) {
    // Report on stderr and exit non-zero so the failure is visible to callers
    // and never lands in a CSV that is being captured from stdout.
    fwrite(STDERR, "You need to specify a valid organization and email address (found '$organization', '$email')\n");
    exit(1);
}

$agent = 'Crawler from '.$organization;
$agent .= ' - contact '.$email;
$agent .= ' to report any problems with my crawling. Based on code from http://petewarden.typepad.com';

// NOTE(review): certificate and host verification are switched off. The base
// URL below is plain http, but redirects are followed, so an https hop would
// go unverified - confirm this is still wanted.
$curl_options = array(
    CURLOPT_SSL_VERIFYPEER => FALSE,
    CURLOPT_SSL_VERIFYHOST => FALSE,
    CURLOPT_FOLLOWLOCATION => TRUE,
    CURLOPT_USERAGENT => $agent,
);

$base_url = 'http://projects.latimes.com/value-added/rank/school/';
$ranks = range(1, 5);

$output_handle = fopen($output, 'w');
if ($output_handle === false) {
    // Without this check every later fputcsv() would be handed false.
    fwrite(STDERR, "Could not open '$output' for writing\n");
    exit(1);
}

// Header row; the callback writes the data rows in the same column order.
fputcsv($output_handle, array('address', 'tooltip', 'url', 'value'));

$parallel_curl = new ParallelCurl($max_requests, $curl_options);

// One listing page per rank; the rank and the shared output stream travel
// with the request so the callback can label and write its rows.
foreach ($ranks as $rank) {
    $full_url = $base_url.$rank.'/';
    $data = array('output_handle' => $output_handle, 'rank' => $rank);
    $parallel_curl->startRequest($full_url, 'on_request_done', $data);
}

// Wait for every outstanding request to finish. This call is required here:
// all callbacks must have written their rows before the output is closed.
$parallel_curl->finishAllRequests();

fclose($output_handle);
?>
You can’t perform that action at this time.