-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
88 lines (71 loc) · 2.61 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from lxml import etree
import requests
import re
import rssmanager as RSS
import time
import sys
from ConfigParser import ConfigParser
cp=ConfigParser()
cp.read('alboPretorio.cfg')
BASE_OUT_URL=cp.get('settings', 'ALBO_BASE_URL')
DIRNAME=cp.get('settings', 'FILES_BASE_PATH')
SAGA_BASE="http://pubblicazioni.saga.it"
SAGA_ORGS=SAGA_BASE+"/orgs/"
if len(BASE_OUT_URL) == 0 or len(DIRNAME) == 0:
print "ERROR SETTINGS"
sys.exit(1)
def outputRSS(alboName,data,dirName):
rssName="alboPOP - Comune - " + alboName.title()
rssTitle="*non ufficiale* RSS feed dell'Albo Pretorio del Comune di " + alboName.title()
rss=RSS.rssElaboraNuovi(name=rssName,title=rssTitle,url=BASE_OUT_URL+"/alboPOP"+alboName+".xml",categoryType="Comune",categoryName="Comune di "+alboName.title())
for row in data:
nReg=row[0]
dataReg=row[1]
nAtto=row[2]
tipo=row[3]
oggetto=re.sub(r'\s*/g',' ',row[4])
inizioPub,finePub=row[6].split('-')
inizioPub=re.sub(r'(\d*)\/(\d*)\/(\d*)',r'\1/\2/20\3',inizioPub.strip())
finePub=re.sub(r'(\d*)\/(\d*)\/(\d*)',r'\1/\2/20\3',finePub.strip())
link=row[7].strip()
rss.do_rss(nReg,dataReg, tipo, oggetto,link,inizioPub,finePub)
outputFileName=dirName+'/'+'alboPOP'+alboName+'.xml'
print "writing to",outputFileName
rss.out_rss(outputFileName)
def main():
page = requests.get(SAGA_ORGS)
tree = etree.HTML(page.content)
links=tree.xpath('//td/a/@href')
links=map(lambda x:x.replace('/publishing/','/publishing/AP/'),links)
for l in links:
nomealbo=re.sub(r'.*org=','',l)
print l,nomealbo
url=SAGA_BASE+l
page = requests.get(url)
myparser=etree.HTMLParser(encoding='utf-8')
tree = etree.HTML(page.content,parser=myparser)
tables=tree.xpath('//table')
if len(tables)>0:
table = tables[0]
data=[]
rows=table.xpath('*/tr')
for r in rows[1:]:
datarow=[]
for cell in r.xpath('td'):
href=cell.xpath('a/@href')
text=u''.join(cell.xpath('.//text()')).strip()
if len(href) > 0:
detailurl=SAGA_BASE+re.sub(r'jsessionid=\S*\?','?',href[0])
datarow.append(detailurl)
else:
datarow.append(text)
data.append(datarow)
print data
outputRSS(nomealbo,data,DIRNAME)
time.sleep(2)
else:
print "ERROR"
# Script entry point: run the scraper only when executed directly.
if __name__ == '__main__':
    main()