Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

312 lines (275 sloc) 11.807 kb
<!---
Sitemap 1.1
@description:
Spider to create site navigation tree in query format or xml google sitemap format.
@orginal author:
merlinox
@current updater:
mike henke
@project site: http://googlesitemapxmlgenerator.riaforge.org/
@dateLastMod:
1.33 - 2007/09/13
Added Escaping Entities
Added Technorati.com pinging
1.32 - 2007/09/11
Added blogsearch.google.com pinging
Corrected an external link check issue
1.31 - 2007/09/08
Added MSN to pinging
1.3 - 2007/09/05
Added a submitting sitemap.xml pinging function for Yahoo, Google, and Ask.
Updated the sitemap protocal and made sure the xml validates
1.2 - 2006/07/11
insertion of the title it of the page in the xml of the contents
1.1 - 2006/01/30
modernization of the script of analysis to the aim to before analyze to all link the first page
then the below pages
1.0 - 2006/01/27
--->
<cfcomponent>
<cfprocessingdirective pageencoding="UTF-8">
<cffunction access="private" name="scan" output="false" returntype="any" description="
Scan page and extract all html link (<a href=""..."" ...>...</a>)
">
<cfargument name="root" required="yes" type="string" hint="Root of analyzed site: http + domain + path (es.: http://www.example.com/site1/page)">
<cfargument name="url" required="yes" type="string" hint="Start page of site (es.: index.cfm)">
<cfargument name="depth" required="yes" type="numeric" hint="Click number (depth) of the page from start page">
<!--- local variables --->
<cfset var reg="<a [^>]*href=""([^""]+)""[^>]*>">
<cfset var res="">
<cfset var rootTmp="">
<cfset var text="">
<cfset var link="">
<cfset var temp="">
<cfset var levelNew = depth + 1>
<cfset var startPos=0>
<cfset var qry_insert="">
<cfset var qry_update="">
<cfset var qry_check="">
<!--- verify of path syntax. if wrong it repairs it --->
<cfif (root is not "") and (right(root,1) is not "/") and (left(url,1) is not "/")>
<cfset rootTmp = root & "/">
<cfelse>
<cfset rootTmp = root>
</cfif>
<cftry>
<cfhttp url="#rootTmp##url#"></cfhttp>
<cfset text=cfhttp.FileContent>
<cfcatch>
<!--- scan wrong --->
<cfset registro=registro & rootTmp & " - " & depth & " - " & "<b>lettura fallita</b><br>">
<cfreturn>
</cfcatch>
</cftry>
<cfset registro=registro & "<br>">
<!--- verify if page is indexable --->
<cfif text contains "<noindex>">
<cfreturn>
</cfif>
<!--- set page checked on result query (remove it from memory query and create new one checked) --->
<cfquery name="qry_pageList" dbtype="query">
SELECT *
FROM qry_pageList
WHERE page<>'#url#'
</cfquery>
<cfset temp=queryAddRow(qry_pageList)>
<cfset temp=querySetCell(qry_pageList,"page",url)>
<cfset temp=querySetCell(qry_pageList,"depth",depth)>
<cfset temp=querySetCell(qry_pageList,"analyzed",1)>
<!--- search all matches (link) with regular expression and save them on result query --->
<cfloop condition="true">
<!--- search match --->
<cfset res=REFindNoCase(reg,text,startPos,true)>
<cfif res.pos[1] is 0>
<!--- noone --->
<cfreturn>
<cfelse>
<!--- search link and saves match on temp variables --->
<cfset startPos = res.pos[1] + res.len[1]>
<cfset link= mid(text,res.pos[2],res.len[2])>
<!--- se il link è senza il nome della pagina, lo aggiungo --->
<cfif left(link,1) is "?">
<cfset link = url & link>
</cfif>
</cfif>
<!--- save page if doesn't still exist --->
<cfquery name="qry_check" dbtype="query">
SELECT *
FROM qry_pageList
WHERE page = '#link#'
</cfquery>
<cfif qry_check.recordCount is 0>
<!--- check fully qualified link and remove root --->
<cfif link CONTAINS root>
<cfset link = "#replace(link,root,de(''))#">
</cfif>
<cfif (len(link) gt 5) and
(link does not contain "://") and
(link does not contain "mailto:") and
(link does not contain "javascript:") >
<!--- sava match --->
<cfif not (right(link,4) is ".pdf" OR
right(link,4) is ".doc" OR
right(link,4) is ".xls" OR
right(link,4) is ".doc" OR
right(link,4) is ".txt")>
<cfset temp=queryAddRow(qry_pageList)>
<cfset temp=querySetCell(qry_pageList,"page",link)>
<cfset temp=querySetCell(qry_pageList,"depth",levelNew)>
<cfset temp=querySetCell(qry_pageList,"analyzed",0)>
</cfif>
</cfif>
</cfif>
</cfloop>
</cffunction>
<!--- cerco links --->
<cffunction access="remote" name="sitemap" output="true" returntype="query" description="
Start spider to scan all pages from start page
to all link reachable with *depthMax* click number
">
<cfargument name="root" required="yes" type="string" hint="Root of analyzed site: http + domain + path (es.: http://www.example.com/site1/page)">
<cfargument name="url" required="yes" type="string" hint="Start page of site (es.: index.cfm)">
<cfargument name="depthMax" required="yes" type="numeric" hint="Max click number (depth) of the page from start page">
<!--- regular exprezzion --->
<cfset var reg="<a [^>]*href=""([^""]+)""[^>]*>">
<!--- local variables --->
<cfset var temp="">
<cfset var qry_root="">
<cfset var qry_cerca="">
<cfset var depth=0>
<!--- algorith registry --->
<cfset registro="">
<!--- result query creation --->
<cfset qry_pageList=queryNew("page,depth,analyzed","VarChar,Integer,Bit")>
<!--- insert root on result query (not analyzed) --->
<cfset temp=queryAddRow(qry_pageList)>
<cfset temp=querySetCell(qry_pageList,"page",url)>
<cfset temp=querySetCell(qry_pageList,"depth",0)>
<cfset temp=querySetCell(qry_pageList,"analyzed",0)>
<cfset registro=registro & "<h3>Inizio scansione</h3>">
<!--- start spider scan (loop since depth less then depthMax) --->
<cfloop condition="depth lt depthMax">
<!--- load scannable pages --->
<cfquery name="qry_cerca" dbtype="query">
SELECT *
FROM qry_pageList
WHERE analyzed=0
</cfquery>
<!--- start page scan --->
<cfloop query="qry_cerca">
<cfset registro=registro & root & " - " & page & " - " & depth & " - ">
<cfset temp=scan(root,page,depth,indexIt)>
</cfloop>
<!--- depth increment --->
<cfset depth=depth+1>
</cfloop>
<cfreturn qry_pageList>
</cffunction>
<cffunction name="googleSitemap" access="remote" output="true" description="
Google Sitemap Creation from sitemap result query
Page who called ""googleSitemap"" functon may prepend:
<cfcontent type=""text/xml; charset=UTF-8"">
">
<cfargument name="query" required="yes" type="query" hint="Result query: it needs ""page"" data column">
<cfargument name="root" required="yes" type="string" hint="Root of analyzed site: http + domain + path (es.: http://www.example.com/site1/page)">
<cfset var qry_view="">
<!--- build sitemap xml --->
<cfquery name="qry_view" dbtype="query">
SELECT *
FROM query
ORDER BY depth
</cfquery>
<cfsavecontent variable="sitemap">
<cfoutput><?xml version="1.0" encoding="UTF-8" ?>
<urlset
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
<cfloop query="qry_view">
<!--- Entity Escaping --->
<cfset cleanURL = replace(page,'&','&amp;',"all") >
<cfset cleanURL = replace(cleanURL,"'",'&apos;',"all") >
<cfset cleanURL = replace(cleanURL,'"','&quot;',"all") >
<cfset cleanURL = replace(cleanURL,'>','&gt;',"all") >
<cfset cleanURL = replace(cleanURL,'<','&lt;',"all") >
<url>
<loc>#root#/#cleanURL#</loc>
<changefreq>daily</changefreq>
<priority>0.5</priority>
</url>
</cfloop></urlset></cfoutput>
</cfsavecontent>
<cfreturn sitemap>
</cffunction>
<cffunction name="indexIt" access="remote" output="false" description="
Spider who builds query with pages name and all texts in plain/text format
starting from navigation tree query (page column)
Result query may insert on a verity collection to create search engine data source
">
<cfargument name="root" required="yes" type="string" hint="Root of analyzed site: http + domain + path (es.: http://www.example.com/site1/page)">
<cfargument name="query" required="yes" type="query" hint="Result query: it needs ""page"" data column">
<cfargument name="maxrows" required="no" type="numeric" default="500" hint="Max number of indexed pages">
<cfset var date=now()>
<cfset var pagine="">
<cfset var qry_indice = QueryNew("date, page, text, titleS") >
<cfset var i=1>
<cfset var text="">
<cfset var tmp="">
<cfloop query="query" startrow="1" endrow="#maxrows#">
<!--- verify of path syntax. if wrong it repairs it --->
<cfif (root is not "") and (right(root,1) is not "/") and (left(page,1) is not "/")>
<cfset rootTmp = root & "/">
<cfelse>
<cfset rootTmp = root>
</cfif>
<!--- load page contents --->
<cfif findNoCase("?",page) gt 0>
<cfset pageTmp = page & "&debug=false">
<cfelse>
<cfset pageTmp = page & "?debug=false">
</cfif>
<cfhttp url="#rootTmp##pageTmp#"></cfhttp>
<cfset text=cfhttp.FileContent>
<cfif cfhttp.errorDetail is "">
<!--- rewrite title appending variable --->
<cfset temp=ReFindNoCase("<title>(.*)</title>",text,1,true)>
<!--- check if some occurence exist --->
<cfif isDefined("temp.len") and temp.len[1] gt 0>
<cfset titleS=mid(text,temp.pos[2],temp.len[2])>
<cfelse>
<cfset titleS="">
</cfif>
<!--- remove comment head & script tags --->
<cfset text=ReReplaceNoCase(text, "<head>.*</head>","","all")>
<cfset text=ReReplaceNoCase(text, "<script>.*</script>","","all")>
<!--- remove html tags --->
<cfset text=ReReplaceNoCase(text, "<[^>]*>", "", "ALL")>
<!--- result query building --->
<cfset temp = QueryAddRow(qry_indice)>
<cfset temp = QuerySetCell(qry_indice,"date",date)>
<cfset temp = QuerySetCell(qry_indice,"page",page)>
<cfset temp = QuerySetCell(qry_indice,"text",text)>
<cfset temp = QuerySetCell(qry_indice,"titleS",titleS)>
</cfif>
</cfloop>
<cfreturn qry_indice>
</cffunction>
<cffunction name="submitSitemap" access="remote" output="false" description="
Submits sitemap.xml to Ask.com, Google, and Yahoo
">
<cfargument name="url" required="yes" type="string" hint="Location of sitemap.xml (es.: http://www.domain.com/sitemap.xml)">
<!--- Ask.com --->
<cfhttp url="http://submissions.ask.com/ping?sitemap=#urlencodedformat(url, 'utf-8')#" >
<!--- Google --->
<cfhttp url="http://www.google.com/webmasters/sitemaps/ping?sitemap=#urlencodedformat(url, 'utf-8')#">
<!--- Yahoo --->
<cfhttp url="http://search.yahooapis.com/SiteExplorerService/V1/updateNotification?appid=YahooDemo&url=#url#">
<!--- MSN (moreover.com for inclusion within the MSN Content Search)--->
<cfhttp url="http://api.moreover.com/ping?u=#url#">
<!--- blogsearch.google.com --->
<cfhttp url="http://blogsearch.google.com/ping?URL=#urlencodedformat(url, 'utf-8')#">
<!--- Technorati --->
<cfhttp url="http://technorati.com/ping/?url=#urlencodedformat(url, 'utf-8')#">
</cffunction>
</cfcomponent>
Jump to Line
Something went wrong with that request. Please try again.