/
_sitemap.cfc
311 lines (275 loc) · 11.5 KB
/
_sitemap.cfc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
<!---
Sitemap 1.1
@description:
Spider to create site navigation tree in query format or xml google sitemap format.
@original author:
merlinox
@current updater:
mike henke
@project site: http://googlesitemapxmlgenerator.riaforge.org/
@dateLastMod:
1.33 - 2007/09/13
Added Escaping Entities
Added Technorati.com pinging
1.32 - 2007/09/11
Added blogsearch.google.com pinging
Corrected an external link check issue
1.31 - 2007/09/08
Added MSN to pinging
1.3 - 2007/09/05
Added a submitting sitemap.xml pinging function for Yahoo, Google, and Ask.
Updated the sitemap protocol and made sure the XML validates
1.2 - 2006/07/11
inserted the page title into the XML content
1.1 - 2006/01/30
reworked the analysis script so that all links on the first page are analyzed
before descending to the deeper pages
1.0 - 2006/01/27
--->
<cfcomponent>
<cfprocessingdirective pageencoding="UTF-8">
<cffunction access="private" name="scan" output="false" returntype="any" description="
Scan one page and record every HTML link (<a href=""..."" ...>...</a>) it contains
into the shared qry_pageList query, queueing unseen internal links at depth + 1
">
<cfargument name="root" required="yes" type="string" hint="Root of analyzed site: http + domain + path (es.: http://www.example.com/site1/page)">
<cfargument name="url" required="yes" type="string" hint="Start page of site (es.: index.cfm)">
<cfargument name="depth" required="yes" type="numeric" hint="Click number (depth) of the page from start page">
<!--- local variables (registro and qry_pageList are deliberately NOT var-scoped:
      they are shared component state created by sitemap()) --->
<cfset var reg="<a [^>]*href=""([^""]+)""[^>]*>">
<cfset var res="">
<cfset var rootTmp="">
<cfset var text="">
<cfset var link="">
<cfset var temp="">
<cfset var levelNew = depth + 1>
<cfset var startPos=0>
<cfset var qry_check="">
<!--- normalize the root so root + url always joins with exactly one "/" --->
<cfif (root is not "") and (right(root,1) is not "/") and (left(url,1) is not "/")>
<cfset rootTmp = root & "/">
<cfelse>
<cfset rootTmp = root>
</cfif>
<cftry>
<cfhttp url="#rootTmp##url#"></cfhttp>
<cfset text=cfhttp.FileContent>
<cfcatch>
<!--- fetch failed: note it in the shared log and give up on this page --->
<cfset registro=registro & rootTmp & " - " & depth & " - " & "<b>lettura fallita</b><br>">
<cfreturn>
</cfcatch>
</cftry>
<cfset registro=registro & "<br>">
<!--- honor an opt-out marker: skip pages that declare themselves non-indexable --->
<cfif text contains "<noindex>">
<cfreturn>
</cfif>
<!--- mark this page analyzed: drop its old row from the in-memory query
      and re-insert it with analyzed=1 --->
<cfquery name="qry_pageList" dbtype="query">
SELECT *
FROM qry_pageList
WHERE page<>'#url#'
</cfquery>
<cfset temp=queryAddRow(qry_pageList)>
<cfset temp=querySetCell(qry_pageList,"page",url)>
<cfset temp=querySetCell(qry_pageList,"depth",depth)>
<cfset temp=querySetCell(qry_pageList,"analyzed",1)>
<!--- walk every regex match (link) and queue unseen internal pages for the next level --->
<cfloop condition="true">
<!--- search next match; the only exit from this loop is the cfreturn below --->
<cfset res=REFindNoCase(reg,text,startPos,true)>
<cfif res.pos[1] is 0>
<!--- no more links on this page --->
<cfreturn>
<cfelse>
<!--- advance past this match and capture the href value --->
<cfset startPos = res.pos[1] + res.len[1]>
<cfset link= mid(text,res.pos[2],res.len[2])>
<!--- a bare query string links back to the current page: prepend the page name --->
<cfif left(link,1) is "?">
<cfset link = url & link>
</cfif>
</cfif>
<!--- only queue the link if it is not already known --->
<cfquery name="qry_check" dbtype="query">
SELECT *
FROM qry_pageList
WHERE page = '#link#'
</cfquery>
<cfif qry_check.recordCount is 0>
<!--- strip the site root from fully qualified internal links --->
<cfif link CONTAINS root>
<cfset link = replace(link,root,"")>
</cfif>
<cfif (len(link) gt 5) and
(link does not contain "://") and
(link does not contain "mailto:") and
(link does not contain "javascript:") >
<!--- skip document downloads (BUGFIX: the original tested ".doc" twice;
      the duplicate condition was removed) --->
<cfif not (right(link,4) is ".pdf" OR
right(link,4) is ".doc" OR
right(link,4) is ".xls" OR
right(link,4) is ".txt")>
<cfset temp=queryAddRow(qry_pageList)>
<cfset temp=querySetCell(qry_pageList,"page",link)>
<cfset temp=querySetCell(qry_pageList,"depth",levelNew)>
<cfset temp=querySetCell(qry_pageList,"analyzed",0)>
</cfif>
</cfif>
</cfif>
</cfloop>
</cffunction>
<!--- search for links --->
<cffunction access="remote" name="sitemap" output="true" returntype="query" description="
Start spider to scan all pages from start page
to all link reachable with *depthMax* click number
">
<cfargument name="root" required="yes" type="string" hint="Root of analyzed site: http + domain + path (es.: http://www.example.com/site1/page)">
<cfargument name="url" required="yes" type="string" hint="Start page of site (es.: index.cfm)">
<cfargument name="depthMax" required="yes" type="numeric" hint="Max click number (depth) of the page from start page">
<!--- local variables (unused reg and qry_root removed) --->
<cfset var temp="">
<cfset var qry_cerca="">
<cfset var depth=0>
<!--- registro and qry_pageList are deliberately NOT var-scoped:
      scan() reads and writes them as shared component state --->
<cfset registro="">
<!--- result query creation --->
<cfset qry_pageList=queryNew("page,depth,analyzed","VarChar,Integer,Bit")>
<!--- seed the result query with the start page (not yet analyzed) --->
<cfset temp=queryAddRow(qry_pageList)>
<cfset temp=querySetCell(qry_pageList,"page",url)>
<cfset temp=querySetCell(qry_pageList,"depth",0)>
<cfset temp=querySetCell(qry_pageList,"analyzed",0)>
<cfset registro=registro & "<h3>Inizio scansione</h3>">
<!--- breadth-first spider: at each depth, scan every page not yet analyzed --->
<cfloop condition="depth lt depthMax">
<!--- load scannable pages --->
<cfquery name="qry_cerca" dbtype="query">
SELECT *
FROM qry_pageList
WHERE analyzed=0
</cfquery>
<!--- scan each pending page (scan() marks it analyzed and queues its links) --->
<cfloop query="qry_cerca">
<cfset registro=registro & root & " - " & page & " - " & depth & " - ">
<!--- BUGFIX: scan() takes exactly three arguments; the stray fourth
      argument (indexIt) passed by the original was removed --->
<cfset temp=scan(root,page,depth)>
</cfloop>
<!--- depth increment --->
<cfset depth=depth+1>
</cfloop>
<cfreturn qry_pageList>
</cffunction>
<cffunction name="googleSitemap" access="remote" output="true" returntype="string" description="
Google Sitemap Creation from sitemap result query
Page who called ""googleSitemap"" function may prepend:
<cfcontent type=""text/xml; charset=UTF-8"">
">
<cfargument name="query" required="yes" type="query" hint="Result query: it needs ""page"" data column">
<cfargument name="root" required="yes" type="string" hint="Root of analyzed site: http + domain + path (es.: http://www.example.com/site1/page)">
<cfset var qry_view="">
<cfset var cleanURL="">
<!--- BUGFIX: the buffer is var-scoped and renamed; the original unscoped
      <cfsavecontent variable="sitemap"> clobbered variables.sitemap,
      i.e. the sibling sitemap() function --->
<cfset var sitemapXml="">
<!--- order pages by crawl depth so shallow pages come first --->
<cfquery name="qry_view" dbtype="query">
SELECT *
FROM query
ORDER BY depth
</cfquery>
<cfsavecontent variable="sitemapXml">
<cfoutput><?xml version="1.0" encoding="UTF-8" ?>
<urlset
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
<cfloop query="qry_view">
<!--- Entity Escaping. BUGFIX: the original five replace() calls swapped each
      character for itself (no-ops); xmlFormat() performs the intended XML
      entity escaping of & ' " < and > --->
<cfset cleanURL = xmlFormat(page)>
<url>
<loc>#root#/#cleanURL#</loc>
<changefreq>daily</changefreq>
<priority>0.5</priority>
</url>
</cfloop></urlset></cfoutput>
</cfsavecontent>
<cfreturn sitemapXml>
</cffunction>
<cffunction name="indexIt" access="remote" output="false" returntype="query" description="
Spider who builds query with pages name and all texts in plain/text format
starting from navigation tree query (page column)
Result query may insert on a verity collection to create search engine data source
">
<cfargument name="root" required="yes" type="string" hint="Root of analyzed site: http + domain + path (es.: http://www.example.com/site1/page)">
<cfargument name="query" required="yes" type="query" hint="Result query: it needs ""page"" data column">
<cfargument name="maxrows" required="no" type="numeric" default="500" hint="Max number of indexed pages">
<cfset var date=now()>
<cfset var qry_indice = QueryNew("date, page, text, titleS") >
<cfset var text="">
<!--- BUGFIX: the original declared "var tmp" but used the name "temp",
      leaking temp (plus rootTmp, pageTmp, titleS) into shared scope;
      all are now var-scoped. Unused locals pagine and i were removed. --->
<cfset var temp="">
<cfset var rootTmp="">
<cfset var pageTmp="">
<cfset var titleS="">
<cfloop query="query" startrow="1" endrow="#maxrows#">
<!--- normalize the root so root + page always joins with exactly one "/" --->
<cfif (root is not "") and (right(root,1) is not "/") and (left(page,1) is not "/")>
<cfset rootTmp = root & "/">
<cfelse>
<cfset rootTmp = root>
</cfif>
<!--- append debug=false so CF debugging output does not pollute the indexed text --->
<cfif findNoCase("?",page) gt 0>
<cfset pageTmp = page & "&debug=false">
<cfelse>
<cfset pageTmp = page & "?debug=false">
</cfif>
<cfhttp url="#rootTmp##pageTmp#"></cfhttp>
<cfset text=cfhttp.FileContent>
<cfif cfhttp.errorDetail is "">
<!--- extract the page title, empty string when no <title> tag matches --->
<cfset temp=ReFindNoCase("<title>(.*)</title>",text,1,true)>
<cfif isDefined("temp.len") and temp.len[1] gt 0>
<cfset titleS=mid(text,temp.pos[2],temp.len[2])>
<cfelse>
<cfset titleS="">
</cfif>
<!--- strip head and script sections before removing the remaining tags
      (BUGFIX: the script pattern now also matches <script src=...> tags
      that carry attributes) --->
<cfset text=ReReplaceNoCase(text, "<head>.*</head>","","all")>
<cfset text=ReReplaceNoCase(text, "<script[^>]*>.*</script>","","all")>
<!--- remove html tags --->
<cfset text=ReReplaceNoCase(text, "<[^>]*>", "", "ALL")>
<!--- result query building --->
<cfset temp = QueryAddRow(qry_indice)>
<cfset temp = QuerySetCell(qry_indice,"date",date)>
<cfset temp = QuerySetCell(qry_indice,"page",page)>
<cfset temp = QuerySetCell(qry_indice,"text",text)>
<cfset temp = QuerySetCell(qry_indice,"titleS",titleS)>
</cfif>
</cfloop>
<cfreturn qry_indice>
</cffunction>
<cffunction name="submitSitemap" access="remote" output="false" returntype="void" description="
Submits sitemap.xml to Ask.com, Google, Yahoo, MSN, Google Blog Search, and Technorati
">
<cfargument name="url" required="yes" type="string" hint="Location of sitemap.xml (es.: http://www.domain.com/sitemap.xml)">
<!--- each ping is fire-and-forget: responses are ignored.
      BUGFIX: the Yahoo and MSN pings now urlencode the sitemap url,
      consistently with the other four (the original interpolated it raw,
      producing malformed query strings for urls containing ? or &) --->
<!--- Ask.com --->
<cfhttp url="http://submissions.ask.com/ping?sitemap=#urlencodedformat(url, 'utf-8')#" >
<!--- Google --->
<cfhttp url="http://www.google.com/webmasters/sitemaps/ping?sitemap=#urlencodedformat(url, 'utf-8')#">
<!--- Yahoo --->
<cfhttp url="http://search.yahooapis.com/SiteExplorerService/V1/updateNotification?appid=YahooDemo&url=#urlencodedformat(url, 'utf-8')#">
<!--- MSN (moreover.com for inclusion within the MSN Content Search)--->
<cfhttp url="http://api.moreover.com/ping?u=#urlencodedformat(url, 'utf-8')#">
<!--- blogsearch.google.com --->
<cfhttp url="http://blogsearch.google.com/ping?URL=#urlencodedformat(url, 'utf-8')#">
<!--- Technorati --->
<cfhttp url="http://technorati.com/ping/?url=#urlencodedformat(url, 'utf-8')#">
</cffunction>
</cfcomponent>