Permalink
Fetching contributors…
Cannot retrieve contributors at this time
337 lines (273 sloc) 8.82 KB
<?php
//
// Open Web Analytics - An Open Source Web Analytics Framework
//
// Copyright 2006 Peter Adams. All rights reserved.
//
// Licensed under GPL v2.0 http://www.gnu.org/copyleft/gpl.html
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// $Id$
//
if(!class_exists('Snoopy')) {
require_once(OWA_INCLUDE_DIR.'/Snoopy.class.php');
}
require_once(OWA_HTTPCLIENT_DIR.'http.php');
/**
* Wrapper for the Snoopy and http_class HTTP request classes
*
* @author Peter Adams <peter@openwebanalytics.com>
* @copyright Copyright &copy; 2006 Peter Adams <peter@openwebanalytics.com>
* @license http://www.gnu.org/copyleft/gpl.html GPL v2.0
* @category owa
* @package owa
* @version $Revision$
* @since owa 1.0.0
*/
class owa_http {

    /**
     * Configuration
     *
     * @var array
     */
    public $config;

    /**
     * Error handler
     *
     * @var object
     */
    public $e;

    /**
     * Maximum length of the text kept on each side of the anchor
     * when a snippet is built.
     *
     * @var int
     */
    public $snip_len = 100;

    /**
     * The string that is added to the beginning and
     * end of snippet text.
     *
     * @var string
     */
    public $snip_str = '...';

    /**
     * Anchor information for a particular link. Filled by extract_anchor()
     * with the keys 'anchor_tag' and 'anchor_text'; null when the last
     * search found nothing.
     *
     * @var array|null
     */
    public $anchor_info;

    /**
     * Snoopy instance created by the constructor and used by fetch().
     *
     * @var object
     */
    public $crawler;

    /**
     * Not used by any method of this class.
     */
    public $testcrawler;

    /**
     * http_class instance, created on demand by SetupHTTP().
     *
     * @var object
     */
    public $http;

    /**
     * Document that the extract_* methods search. getRequest() stores the
     * fetched body here.
     *
     * @var string
     */
    public $response;

    /**
     * Not written by any method of this class.
     */
    public $response_headers;

    /**
     * Not written by any method of this class.
     */
    public $response_code;

    /**
     * Not written by any method of this class.
     */
    public $request_headers;

    /**
     * Last error message returned by an http_class call; empty when the
     * call succeeded.
     *
     * @var string
     */
    public $error = '';

    /**
     * Loads the base configuration and error handler and prepares the
     * Snoopy crawler.
     */
    public function __construct() {

        $c = owa_coreAPI::configSingleton();
        $this->config = $c->fetch('base');
        $this->e = owa_coreAPI::errorSingleton();

        $this->crawler = new Snoopy;
        // cap the number of redirects snoopy is allowed to follow
        $this->crawler->maxredirs = 5;
        $this->crawler->agent = owa_coreAPI::getSetting('base', 'owa_user_agent');
    }

    /**
     * Fetches a URI through the Snoopy crawler.
     *
     * @param string $uri
     * @return mixed whatever Snoopy's fetch() returns
     */
    public function fetch($uri) {

        return $this->crawler->fetch($uri);
    }

    /**
     * Diagnostic fetch through a throw-away http_class instance. The first
     * chunk of the reply body (or the error that prevented reading it) is
     * written to the debug log. Nothing is returned.
     *
     * @param string $url
     */
    public function testFetch($url) {

        $http = new http_class;
        owa_coreAPI::debug('hello owa_http testfetch method');

        /* Connection timeout */
        $http->timeout = 0;
        /* Data transfer timeout */
        $http->data_timeout = 0;
        /* Output debugging information about the progress of the connection */
        $http->debug = 1;
        $http->user_agent = owa_coreAPI::getSetting('base', 'owa_user_agent');
        $http->follow_redirect = 1;
        $http->redirection_limit = 5;
        $http->exclude_address = "";
        $http->prefer_curl = 0;

        $arguments = array();
        $error = $http->GetRequestArguments($url, $arguments);
        if ($error != "") {
            owa_coreAPI::debug('owa_http testfetch error: '.$error);
            return;
        }

        $error = $http->Open($arguments);
        if ($error != "") {
            owa_coreAPI::debug('owa_http testfetch error: '.$error);
            return;
        }

        // Same request sequence as OpenRequest(): send, read headers, read body.
        $headers = array();
        $body = '';
        $error = $http->SendRequest($arguments);
        if ($error == "") {
            $error = $http->ReadReplyHeaders($headers);
        }
        if ($error == "") {
            $error = $http->ReadReplyBody($body, 50000);
        }

        if ($error != "") {
            owa_coreAPI::debug('owa_http testfetch error: '.$error);
        } elseif (strlen($body) > 0) {
            owa_coreAPI::debug(HtmlSpecialChars($body));
        }

        $http->Close();
    }

    /**
     * Searches the document in $this->response for the anchor of a
     * specific url.
     *
     * If nothing matches and the link ends with a slash, the search is
     * repeated without the trailing slash. On success $this->anchor_info
     * holds the complete matched tag ('anchor_tag') and that same tag passed
     * through owa_lib::inputFilter() ('anchor_text'); otherwise it is null.
     *
     * @param string $link
     */
    public function extract_anchor($link) {

        // Forget the result of any earlier search so that a miss cannot be
        // mistaken for the previously found anchor.
        $this->anchor_info = null;

        $matches = array();
        $regex = '/<a[^>]*href=\"%s\"[^>]*>(.*?)<\/a>/i';
        $document = (string) $this->response;

        $pattern = trim(sprintf($regex, preg_quote($link, '/')));
        $search = preg_match($pattern, $document, $matches);

        if (empty($matches) && substr($link, -1) === '/') {
            $link = substr($link, 0, -1);
            $pattern = trim(sprintf($regex, preg_quote($link, '/')));
            $search = preg_match($pattern, $document, $matches);
        }

        $this->e->debug('ref search: '.$search);

        if (isset($matches[0])) {
            $this->anchor_info = array('anchor_tag' => $matches[0], 'anchor_text' => owa_lib::inputFilter($matches[0]));
            $this->e->debug('Anchor info: '.print_r($this->anchor_info, true));
        }
    }

    /**
     * Creates a text snippet of the portion of page where the
     * specific link is found.
     *
     * Takes fully qualified URL for the link to search for.
     *
     * @param string $link
     * @return string the snippet, or an empty string when the anchor is not
     *                found in the document
     */
    public function extract_anchor_snippet($link) {

        // Search the page for a specific anchor
        $this->extract_anchor($link);

        if (empty($this->anchor_info['anchor_tag'])) {
            return '';
        }

        $anchor_tag = $this->anchor_info['anchor_tag'];

        // drop certain HTML elements and their content
        $nohtml = $this->strip_selected_tags(
            $this->response,
            array('title',
                'head',
                'script',
                'object',
                'style',
                'meta',
                'link',
                'rdf:'),
            true);

        // find position within the stripped document of the anchor tag
        $start = strpos($nohtml, $anchor_tag);

        if ($start === false) {
            // The anchor was inside one of the stripped elements, so there
            // is no surrounding text to build a snippet from.
            return '';
        }

        $replace_items = array("\r\n", "\n\n", "\t", "\r", "\n");

        // Create first segment of snippet: the filtered text that precedes
        // the anchor, cut down to its last snip_len bytes.
        $first_part = substr($nohtml, 0, $start);
        $first_part = str_replace($replace_items, '', $first_part);
        $first_part = strip_tags(owa_lib::inputFilter($first_part));

        if (strlen($first_part) > $this->snip_len) {
            $part1 = substr($first_part, -$this->snip_len);
        } else {
            $part1 = $first_part;
        }

        // Create second segment of snippet. 300 extra bytes are read because
        // the filtering below removes markup before the final cut.
        $part2 = trim(substr($nohtml, $start + strlen($anchor_tag), $this->snip_len + 300));
        $part2 = str_replace($replace_items, '', $part2);
        $part2 = substr(strip_tags(owa_lib::inputFilter($part2)), 0, $this->snip_len);

        // Put the pieces back together again and create the actual snippet
        return $this->snip_str.$part1.' <span class="snippet_anchor">'.owa_lib::inputFilter($anchor_tag).'</span> '.$part2.$this->snip_str;
    }

    /**
     * Extracts the page title from the document in $this->response.
     *
     * The pattern stops at whichever of </head>, <body> or a <title>
     * element comes first, so a title is only reported when it appears
     * before the other two.
     *
     * @return string the trimmed title, or an empty string when none is found
     */
    public function extract_title() {

        $m = array();
        preg_match('~(</head>|<body>|(<title>\s*(.*?)\s*</title>))~i', (string) $this->response, $m);

        $this->e->debug("referer title extract: ". print_r($m, true));

        // group 3 only exists when the <title> alternative matched
        return isset($m[3]) ? $m[3] : '';
    }

    /**
     * Removes the given elements from a piece of markup.
     *
     * Each entry of $tags is matched as a prefix of the element name (this
     * is what lets 'rdf:' cover every element in that namespace). Only
     * elements that have both an opening and a closing tag are affected.
     *
     * @param string $str          markup to clean
     * @param array  $tags         element names (or name prefixes) to remove
     * @param bool   $stripContent true removes the element together with its
     *                             content, false removes only the tags and
     *                             keeps the content
     * @return string
     */
    public function strip_selected_tags($str, $tags = array(), $stripContent = false) {

        // '${2}' re-inserts the text captured between the two tags
        $replacement = $stripContent ? '' : '${2}';

        foreach ($tags as $tag) {

            $quoted = preg_quote($tag, '#');
            $pattern = sprintf('#(<%s.*?>)(.*?)(<\/%s.*?>)#is', $quoted, $quoted);
            $result = preg_replace($pattern, $replacement, $str);

            // preg_replace() returns null on a PCRE error; keep the text
            // gathered so far rather than losing the whole document.
            if ($result !== null) {
                $str = $result;
            }
        }

        return $str;
    }

    /**
     * Creates and configures the shared http_class instance on first use.
     * Later calls leave the existing instance untouched.
     */
    public function SetupHTTP() {

        if (isset($this->http)) {
            return;
        }

        $this->http = new http_class;
        $this->http->follow_redirect = 1;
        $this->http->debug = 0;
        $this->http->debug_response_body = 0;
        $this->http->html_debug = 1;
        $this->http->user_agent = owa_coreAPI::getSetting('base', 'owa_user_agent');
        /* Connection timeout */
        $this->http->timeout = 3;
        /* Data transfer timeout */
        $this->http->data_timeout = 3;
    }

    /**
     * Opens the connection, sends the request and reads the reply headers.
     *
     * On any failure, or when the reply status is not 200, the message is
     * left in $this->error.
     *
     * @param array $arguments request arguments as prepared by
     *                         http_class::GetRequestArguments()
     * @param array $headers   receives the reply headers
     * @return int 1 on success, 0 on failure
     */
    public function OpenRequest($arguments, &$headers) {

        // make direct calls safe when getRequest() has not run yet
        $this->SetupHTTP();

        $this->error = $this->http->Open($arguments);
        if (strlen($this->error)) {
            return(0);
        }

        $this->error = $this->http->SendRequest($arguments);
        if (!strlen($this->error)) {
            $this->error = $this->http->ReadReplyHeaders($headers);
        }
        if (strlen($this->error)) {
            $this->http->Close();
            return(0);
        }

        if ($this->http->response_status != 200) {
            $this->error = 'the HTTP request returned the status '.$this->http->response_status;
            $this->http->Close();
            return(0);
        }

        return(1);
    }

    /**
     * Reads the complete reply body of the request opened by OpenRequest()
     * and closes the connection.
     *
     * @param string $response receives the body read so far
     * @return mixed the body as a string, or 0 when reading failed (the
     *               message is left in $this->error)
     */
    public function GetRequestResponse(&$response) {

        $response = '';

        for (;;) {

            $body = '';
            $this->error = $this->http->ReadReplyBody($body, 500000);

            if (strlen($this->error)) {
                $this->http->Close();
                return(0);
            }

            // an empty chunk marks the end of the body
            if (strlen($body) == 0) {
                break;
            }

            $response .= $body;
        }

        $this->http->Close();
        owa_coreAPI::debug('http response code: '.$this->http->response_status);

        return($response);
    }

    /**
     * Performs a GET request and stores the reply body in $this->response.
     *
     * @param string $url
     * @param mixed  $arguments handed to http_class::GetRequestArguments()
     *                          by reference; anything that is not an array
     *                          is replaced by an empty array first
     * @param string $response  scratch buffer for the reply body
     * @return mixed the reply body, or 0 on failure (the message is left in
     *               $this->error)
     */
    public function getRequest($url, $arguments = '', $response = '') {

        $this->SetupHTTP();

        // The default is a string, which cannot take the 'RequestMethod' key.
        if (!is_array($arguments)) {
            $arguments = array();
        }

        $this->error = (string) $this->http->GetRequestArguments($url, $arguments);
        if (strlen($this->error)) {
            return(0);
        }

        $arguments['RequestMethod'] = 'GET';

        $headers = array();
        if (!$this->OpenRequest($arguments, $headers)) {
            return(0);
        }

        $this->response = $this->GetRequestResponse($response);

        return($this->response);
    }
}
?>