#!/usr/bin/python
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup, SoupStrainer
import re
import urllib2
import codecs
import sys
import copy
import string

from datetime import datetime, date, time
###########

# constants
base_dir = "./"
namesFile = "./names.txt"
listFile = "index.html"

imgDomainMobile = "http://mobile.meteofrance.com/"
# the header is geared toward the iPhone
head = u"""
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=350, user-scalable=yes">
<link rel="stylesheet" href="css/weather.css" type="text/css" />
"""
foot = u"""
<div class="footer">
Les pr&eacute;visions m&eacute;t&eacute;o sont extraites du site de M&eacute;t&eacute;o-France. Elles sont reproduites en accord avec les informations pr&eacute;sentes sur la page concernant les <a href="http://france.meteofrance.com/france/accueil/informations_publiques">informations publiques</a>.
</div>
"""
#timeFormat = "%A %d %B %Y - %H:%M:%S "
timeFormat = "%Y/%m/%d &agrave; %H:%M:%S "
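# For reference, datetime.now().strftime(timeFormat) renders like
# "2009/12/05 &agrave; 14:30:00" (a hypothetical timestamp); the entity
# &agrave; displays as "à" in the generated HTML.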


###########
def parseAndDisplay(period, line, domain):
    # builds the <tr> for one forecast period as a list of unicode tokens
    list = [u'']
    weather = line.contents[2]
    ###########################
    temperatures = weather.contents[1]
    weatherName = weather.img['alt']
    weatherImg = weather.img['src']
    ###########################
    wind = line.contents[3]
    i = wind.contents[0]
    windSpeed = wind.contents[1]
    windDir = wind.img['title']
    windImg = wind.img['src']
    ###########################
    rafales = line.contents[4]
    windMax = rafales.contents[0]
    ###########################
    weatherName = unicode.strip(weatherName)
    weatherImg = unicode.strip(weatherImg)

    windD = unicode.strip(windDir)
    windI = unicode.strip(windImg)
    windS = unicode.strip(windSpeed)[3:]
    windM = unicode.strip(windMax)

    temp = temperatures
    temp = unicode.strip(temp)

    # a "/" in the temperature (min/max) marks a daily summary row
    classLine = "period"
    if temp.find("/") > 0:
        classLine = "day"

    #http://france.meteofrance.com/meteo/pictos/web/SITE/16/sud-sud-ouest.gif
    #http://france.meteofrance.com/meteo/pictos/web/SITE/30/32_c.gif
    # I prefer smaller icons
    weatherImg = weatherImg.replace("CARTE/40", "SITE/30")

    TD = u'</td>\n\t<td class="' + classLine + '">'
    RTD = u'</td>\n\t<td class="' + classLine + '" align="right">'

    list.append(u'<tr class="' + classLine + '">\n\t')
    list.append(u'<td class="' + classLine + '1">')
    list.append(period)
    list.append(TD + u'<img src="' + imgDomainMobile + weatherImg + '" width="30" height="30" ')
    list.append(u' alt="')
    list.append(weatherName)
    list.append(u'" title="')
    list.append(weatherName)
    list.append(u'" />' + RTD)
    list.append(temp)
    list.append(TD + u'<img src="' + imgDomainMobile + windImg + '"')
    list.append(u' width="16" height="16" alt="')
    list.append(windDir)
    list.append(u'" title="')
    list.append(windDir)
    list.append(u'" />' + RTD)
    list.append(windSpeed)
    list.append(RTD)
    list.append(windM)
    list.append(u"</td>\n</tr>\n")
    return list
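# For illustration only (hypothetical values), the tokens for one period row
# join up roughly into:
#   <tr class="period">
#       <td class="period1">Matin</td>
#       <td class="period"><img src="http://mobile.meteofrance.com/.../30/32_c.gif" ... /></td>
#       <td class="period" align="right">12&deg;C</td>
#       ...wind direction, speed and gust cells...
#   </tr>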


###########
def parseMeteoPage(dico, content, tracking=""):
    name = dico["name"]
    domain = dico["domain"]
    suffix = dico["suffix"]
    list = [u'']

    day = ""
    indent = " "
    indent2 = indent + indent

    cityInfosFilter = SoupStrainer('div', {'class': 'infos'})
    otherSoup = BeautifulSoup(content, parseOnlyThese=cityInfosFilter)
    infosPage = otherSoup("p", text=True)
    # no post code on foreign cities
    cityName = infosPage[0]
    if len(infosPage) == 3:
        lastUpdate = infosPage[2]
    else:
        #cityPostcode = infosPage[1]
        lastUpdate = infosPage[3]

    links = SoupStrainer('table', {'class': 'tableWeather'})
    soup = BeautifulSoup(content, parseOnlyThese=links)

    pageName = u"Pr&eacute;visions m&eacute;t&eacute;o pour " + cityName

    title = u"<html>\n<head>" + head + "<title>" + pageName + "</title>\n</head>\n<body>\n<div class=\"content\">\n"
    title += u"<h1>" + pageName + "</h1>\n"
    title += u"<h3>R&eacute;cup&eacute;r&eacute; le " + datetime.now().strftime(timeFormat) + "</h3>\n"

    source = getSourceSentence(domain + suffix, cityName)

    list.append(title)
    list.append(source)
    list.append(u"<h3>" + lastUpdate + "</h3>")
    list.append(u"<table>")

    for line in soup("tr"):
        period = ""
        # for the last line
        if len(line) < 4:
            continue
        # for the daily summary
        if line.attrs:
            s = str(line.attrs[0][1]).decode('iso-8859-1')
            n = s.find(" ")
            if n >= 0:
                if day != s[:n]:
                    day = s[:n]
            else:  # summary of that day or empty line?
                sDay = s
                weather = line.contents[2]
                if len(weather.contents) == 0:
                    # for the first line
                    continue
                periodLine = parseAndDisplay("# " + sDay, line, domain)
                list.extend(periodLine)
                continue
        ###########################
        ## morning, afternoon etc ...
        period = line.contents[1]
        if len(period) == 0:
            continue
        period = period.contents[0]
        ############################
        ## render
        periodLine = parseAndDisplay(period, line, domain)
        list.extend(periodLine)

    list.append(u"</table>\n" + foot + "\n</div>\n")
    list.append(u"<div class=\"nav\">Retourner &agrave; <a href=\"/\">la liste des villes</a></div>\n")
    list.append(tracking)
    list.append("</body>\n</html>")
    return list


def parseMeteoPageFrance(dico, content, tracking=""):
    name = dico["name"]
    domain = dico["domain"]
    suffix = dico["suffix"]
    list = [u'']

    day = ""
    indent = " "
    indent2 = indent + indent

    cityInfosFilter = SoupStrainer('div', {'class': 'choix'})
    otherSoup = BeautifulSoup(content, parseOnlyThese=cityInfosFilter)
    infosPage = otherSoup("p", text=True)
    # no post code on foreign cities
    cityName = infosPage[0]

    if len(infosPage) == 3:
        lastUpdate = infosPage[2]
    else:
        #cityPostcode = infosPage[1]
        lastUpdate = infosPage[3]

    links = SoupStrainer('div', {'class': 'bloc_details'})
    soup = BeautifulSoup(content, parseOnlyThese=links)

    pageName = u"Pr&eacute;visions m&eacute;t&eacute;o pour " + cityName

    title = u"<html>\n<head>" + head + "<title>" + pageName + "</title>\n</head>\n<body>\n<div class=\"content\">\n"
    title += u"<h1>" + pageName + "</h1>\n"
    title += u"<h3>R&eacute;cup&eacute;r&eacute; le " + datetime.now().strftime(timeFormat) + "</h3>\n"

    source = getSourceSentence(domain + suffix, cityName)

    list.append(title)
    list.append(source)
    list.append(u"<h3>" + lastUpdate + "</h3>")
    list.append(u"<table>")

    # current format in july-august 2010:
    # summaries in divs
    #   id -> jour0, jour1, jour2
    #   class -> "jour show_first" then "jour"
    # daily forecasts in divs
    #   id -> blockDetails0, blockDetails1, blockDetails2
    #   class -> bloc_details
    # tendances in a ul
    #   id -> "suivants "
    #   class -> listeJoursLE
    #   class (li) -> lijourle0, lijourle1, lijourle2, lijourle3 ... 5
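    # A hypothetical sketch (an assumption, not in the original code): the day
    # summaries described above could be filtered out the same way as the
    # bloc_details divs, e.g.:
    #   dayFilter = SoupStrainer('div', {'id': re.compile(r'^jour\d$')})
    #   daySoup = BeautifulSoup(content, parseOnlyThese=dayFilter)
    # The loop below still walks <tr> rows as in the old layout; parsing of the
    # div-based design had only been started at this point.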
    for line in soup("tr"):
        period = ""
        # for the last line
        if len(line) < 4:
            continue
        # for the daily summary
        if line.attrs:
            s = str(line.attrs[0][1]).decode('iso-8859-1')
            n = s.find(" ")
            if n >= 0:
                if day != s[:n]:
                    day = s[:n]
            else:  # summary of that day or empty line?
                sDay = s
                weather = line.contents[2]
                if len(weather.contents) == 0:
                    # for the first line
                    continue
                periodLine = parseAndDisplay("# " + sDay, line, domain)
                list.extend(periodLine)
                continue
        ###########################
        ## morning, afternoon etc ...
        period = line.contents[1]
        if len(period) == 0:
            continue
        period = period.contents[0]
        ############################
        ## render
        periodLine = parseAndDisplay(period, line, domain)
        list.extend(periodLine)

    list.append(u"</table>\n" + foot + "\n</div>\n")
    list.append(u"<div class=\"nav\">Retourner &agrave; <a href=\"/\">la liste des villes</a></div>\n")
    list.append(tracking)
    list.append("</body>\n</html>")
    return list


def getSourceSentence(sourceUrl, pageName):
    return u"<div class=\"source\">Les informations sur cette page proviennent de la page de pr&eacute;visions de <a href=\"" + sourceUrl + "\">M&eacute;t&eacute;o-France pour " + pageName + "</a>.</div>\n"

def generateIndex(infos, pagesAreFiles, tracking=""):
    list = []
    list.append(u"<html>\n<head>")
    list.append(u"\t<title>M&eacute;t&eacute;o extraite des sites de M&eacute;t&eacute;o-France</title>")
    list.append(head)
    list.append(u"</head>\n<body><div class=\"content\"><p>Ce site web a &eacute;t&eacute; &eacute;crit pour pouvoir consulter les pr&eacute;visions de M&eacute;t&eacute;o-France sans crasher le navigateur web de l'iPhone.</p><div class=\"source\">Source des pr&eacute;visions : <a href=\"http://www.meteofrance.com/\">M&eacute;t&eacute;o-France</a></div>\n")
    list.append(u"<table>\n<thead><tr><th>Simple</th><th>Original</th></tr></thead>\n")

    for name, dico in infos.iteritems():
        pageAddress = name
        if pagesAreFiles:
            pageAddress = dico["file"]

        list.append(u"<tr>\n\t<td><a href=\"" + pageAddress + "\">" + name + "</a></td>\n")
        list.append(u"\t<td><a href=\"" + dico["domain"] + dico["suffix"] + "\">" + name + " chez M&eacute;t&eacute;o-France</a></td>\n</tr>\n")

    list.append(u"</table>\n" + foot + "\n</div>\n" + tracking + "\n</body></html>")
    return list


def getInfos():
    infos = {}
    f = open(namesFile)
    for line in f:
        # strip the trailing newline so the output file name stays clean
        l = line.strip().split(',')
        infos[l[0]] = {"name": l[0], "domain": l[1], "suffix": l[2], "file": l[3]}
        #print "name : ", l[0]
        #print "domain : ", l[1]
        #print "suffix : ", l[2]
        #print "fileOut : ", l[3]
    return infos

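# Example names.txt line (hypothetical values, the real file is not shown here):
#   Paris,http://france.meteofrance.com/,france/meteo?PREVISIONS=paris,paris.html
# i.e. name,domain,suffix,output file -- comma-separated, one city per line.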

def getContent(dico):
    resp = urllib2.urlopen(dico["domain"] + dico["suffix"])
    return resp.read()


def writeFile(fileName, contentList):
    fileOut = open(base_dir + fileName, 'w')
    outText = u''.join(contentList)
    # the generated pages declare charset=utf-8 in their <head>, so encode to match
    text = outText.encode("utf-8")
    fileOut.write(text)
    fileOut.close()
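# Typical use, as in main() below:
#   writeFile(dico["file"], parseMeteoPage(dico, content))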

###############################
def main():
    import time
    # get the files information
    infos = getInfos()

    if len(sys.argv) > 1 and sys.argv[1] in infos:
        page = sys.argv[1]
        dico = infos[page]

        content = getContent(dico)
        if "monde" in dico["domain"]:
            list = parseMeteoPage(dico, content)
        else:
            # the france web page layout is very different
            list = parseMeteoPageFrance(dico, content)
        writeFile(dico["file"], list)
        print "did parse one of one page:", page
        return
    else:
        print "Darn, asking for a nil or non-existing page. Parsing all the pages every hour."

    indexStrings = generateIndex(infos, True)
    writeFile(listFile, indexStrings)
    i = 0
    while 1:
        try:
            for name, dico in infos.iteritems():
                content = getContent(dico)
                list = ""
                if "monde" in dico["domain"]:
                    list = parseMeteoPage(dico, content)
                else:
                    # the france web page layout is very different
                    list = parseMeteoPageFrance(dico, content)

                writeFile(dico["file"], list)
                print "did parse one of many pages:", name

        except Exception, e:
            print "Error while trying to parse/write the webpage:"
            print type(e)   # the exception instance
            print e.args    # arguments stored in .args
            print e

        print "## " + datetime.now().strftime(timeFormat) + " # Sleeping for an hour."
        time.sleep(3600)


if __name__ == '__main__':
    main()