Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100755 366 lines (307 sloc) 10.86 kb
0c574be Nicolas Hoibian Initial commit
authored
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3 from BeautifulSoup import BeautifulSoup , SoupStrainer
4 import re
5 import urllib2
6 import codecs
7 import sys
50e9ab1 Nicolas Hoibian Added the todo list for the project.
authored
8 import copy
0c574be Nicolas Hoibian Initial commit
authored
9 import string
10
11 from datetime import datetime, date, time
12
13 ###########
77d14d3 Nicolas Hoibian Prettyfied a little the generated html
authored
14
2b581dd Nicolas Hoibian Prettyfied the generated html, made the information source more explicit...
authored
15 #constants
77d14d3 Nicolas Hoibian Prettyfied a little the generated html
authored
# Where generated pages are written, and the bookkeeping files used by main().
base_dir = "./"
namesFile = "./names.txt"
listFile = "index.html"

# Prefix put in front of every weather / wind pictogram path.
imgDomainMobile = "http://mobile.meteofrance.com/"

# <head> fragment shared by every generated page; header is geared toward iphone.
head = u"""
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=350, user-scalable=yes">
<link rel="stylesheet" href="css/weather.css" type="text/css" />
"""

# Footer crediting the data source, appended below every table.
foot = u"""
<div class="footer">
Les pr&eacute;visions m&eacute;t&eacute;o sont extraites du site de M&eacute;t&eacute;o-France. Elles sont reproduites en accord avec les informations pr&eacute;sentes sur la page concernant les <a href="http://france.meteofrance.com/france/accueil/informations_publiques">informations publiques</a>.
</div>
"""

# strftime pattern for the "fetched at" stamp and the console log.
# A longer alternative that was tried: "%A %d %B %Y - %H:%M:%S "
timeFormat = "%Y/%m/%d &agrave; %H:%M:%S "
77d14d3 Nicolas Hoibian Prettyfied a little the generated html
authored
34
35
36 ###########
6af56ba Nicolas Hoibian Moved to storing the output text as a list of unicode string tokens unti...
authored
def parseAndDisplay(period,line,domain):
    """Render one forecast row as a list of unicode HTML fragments.

    period -- content of the first cell of the generated row.
    line   -- parsed <tr> node. ``line.contents[2]`` supplies the weather
              pictogram and the temperature text, ``line.contents[3]`` the
              wind pictogram and speed, ``line.contents[4]`` the gust value.
    domain -- not used by this function; kept so existing callers still work.

    Returns a list of fragments that the caller joins into the final page.
    """
    weather = line.contents[2]
    wind = line.contents[3]
    rafales = line.contents[4]

    weatherName = unicode.strip(weather.img['alt'])
    weatherImg = unicode.strip(weather.img['src'])
    temp = unicode.strip(weather.contents[1])

    # NOTE(review): the previous code also built stripped copies of the wind
    # direction and speed (the latter minus its first three characters) but
    # still emitted the raw values. The raw values are kept so the visible
    # output does not change - confirm which one was intended.
    windSpeed = wind.contents[1]
    windDir = wind.img['title']
    windImg = wind.img['src']
    windM = unicode.strip(rafales.contents[0])

    # A "/" after the first character switches the row to the "day" style
    # (presumably a min/max temperature pair - TODO confirm).
    classLine = "period"
    if temp.find("/") > 0:
        classLine = "day"

    #http://france.meteofrance.com/meteo/pictos/web/SITE/16/sud-sud-ouest.gif
    #http://france.meteofrance.com/meteo/pictos/web/SITE/30/32_c.gif
    # I prefer smaller icons
    weatherImg = weatherImg.replace("CARTE/40","SITE/30")

    # Cell separators: close the current cell and open the next one.
    TD = u'</td>\n\t<td class="'+classLine+'">'
    RTD = u'</td>\n\t<td class="'+classLine+'" align="right">'

    return [
        u'',
        u'<tr class="'+classLine+'">\n\t',
        u'<td class="'+classLine+'1">',
        period,
        TD+'<img src="'+imgDomainMobile+weatherImg+'" width="30" height="30" ',
        u' alt="',
        weatherName,
        u'" title="',
        weatherName,
        u'" />' + RTD,
        temp,
        TD + u'<img src="'+imgDomainMobile+windImg+'"',
        u' width="16" height="16" alt="',
        windDir,
        # The separating space was missing here, which produced
        # alt="..."title="..." (two attributes glued together).
        u'" title="',
        windDir,
        u'" />'+RTD,
        windSpeed,
        RTD,
        windM,
        u"</td>\n</tr>\n",
    ]
0c574be Nicolas Hoibian Initial commit
authored
98
99
100 ###########
ad2f7a0 Nicolas Hoibian Added google analytics tracking.
authored
def _buildForecastPage(dico, content, tracking, infosFilter, forecastFilter):
    """Shared implementation of parseMeteoPage and parseMeteoPageFrance.

    dico           -- city description; "domain" and "suffix" are read.
    content        -- raw HTML of the source page.
    tracking       -- HTML fragment inserted just before </body>.
    infosFilter    -- SoupStrainer selecting the block whose <p> texts give
                      the city name and the last-update sentence.
    forecastFilter -- SoupStrainer selecting the markup holding the <tr> rows.

    Returns the page as a list of unicode fragments.
    Raises IndexError when the info block has fewer <p> texts than expected.
    """
    domain = dico["domain"]
    suffix = dico["suffix"]

    infosPage = BeautifulSoup(content, parseOnlyThese=infosFilter)("p", text=True)
    cityName = infosPage[0]
    # no post code on foreign cities, so the update text sits one slot earlier
    if len(infosPage) == 3:
        lastUpdate = infosPage[2]
    else:
        lastUpdate = infosPage[3]

    soup = BeautifulSoup(content, parseOnlyThese=forecastFilter)

    pageName = u"Pr&eacute;visions m&eacute;t&eacute;o pour "+cityName+" "
    title = u"<html>\n<head>"+head+"<title>"+pageName+"</title>\n</head>\n<body>\n<div class=\"content\">\n"
    title += u"<h1>"+pageName+"</h1>\n"
    title += u"<h3>R&eacute;cuper&eacute; le "+datetime.now().strftime(timeFormat)+"</h3>\n"

    pieces = [
        u'',
        title,
        getSourceSentence(domain+suffix, cityName),
        u"<h3>"+lastUpdate+"</h3>",
        u"<table>",
    ]

    # NOTE(review): the indentation of this loop was lost in the copy this
    # was rebuilt from; the nesting below is the reading in which every
    # variable is defined before use - confirm against the real file.
    for line in soup("tr"):
        # for the last line
        if len(line) < 4:
            continue
        if line.attrs:
            s = str(line.attrs[0][1]).decode('iso-8859-1')
            if s.find(" ") < 0:
                # summary of that day or empty line?
                if len(line.contents[2].contents) == 0:
                    # for the first line
                    continue
                pieces.extend(parseAndDisplay("# "+s, line, domain))
                continue
        ## morning, afternoon etc ...
        period = line.contents[1]
        if len(period) == 0:
            continue
        pieces.extend(parseAndDisplay(period.contents[0], line, domain))

    # "</div>" closes the content div opened in the title block; the old
    # code emitted "<div>" here and left both divs open.
    pieces.append(u"</table>\n"+foot+"\n</div>\n")
    pieces.append(u"<div class=\"nav\">Retourner &agrave; <a href=\"/\">la liste des villes</a></div>\n")
    pieces.append(tracking)
    pieces.append("</body>\n</html>")
    return pieces


def parseMeteoPage(dico,content,tracking=""):
    """Convert a page using the 'infos' / 'tableWeather' layout.

    main() selects this parser when "monde" occurs in dico["domain"].
    Returns the generated page as a list of unicode fragments.
    """
    return _buildForecastPage(
        dico, content, tracking,
        SoupStrainer('div',{'class':'infos'}),
        SoupStrainer('table',{'class':'tableWeather'}))


def parseMeteoPageFrance(dico,content,tracking=""):
    """Convert a page using the 'choix' / 'bloc_details' layout.

    main() selects this parser for every domain without "monde" in it.
    Returns the generated page as a list of unicode fragments.
    """
    # current format in july-august / 2010
    # summaries in divs
    #   id -> jour0, jour1, jour2
    #   class -> "jour show_first" then "jour"
    # daily forecast in divs
    #   id -> blockDetails0, blockDetails1, blockDetails2
    #   class -> bloc_details
    # tendances in ul
    #   id -> "suivants "
    #   class -> listeJoursLE
    #   class (li) -> lijourle0,lijourle1,lijourle2,lijourle3 ... 5
    return _buildForecastPage(
        dico, content, tracking,
        SoupStrainer('div',{'class':'choix'}),
        SoupStrainer('div',{'class':'bloc_details'}))
262
50e9ab1 Nicolas Hoibian Added the todo list for the project.
authored
263
34ec621 Nicolas Hoibian Added a footer and some copy about the source.
authored
def getSourceSentence(sourceUrl,pageName):
    """Return the <div class="source"> block linking back to the original page.

    sourceUrl -- address used as the link target.
    pageName  -- text appended after "pour" inside the link.
    """
    fragments = (
        u"<div class=\"source\">Les informations sur cette page proviennent de la page de pr&eacute;visions de <a href=\"",
        sourceUrl,
        "\">M&eacute;t&eacute;o-France pour ",
        pageName,
        "</a>.</div>\n",
    )
    return u"".join(fragments)
2b581dd Nicolas Hoibian Prettyfied the generated html, made the information source more explicit...
authored
266
267
ad2f7a0 Nicolas Hoibian Added google analytics tracking.
authored
def generateIndex(infos,pagesAreFiles,tracking=""):
    """Build the index page listing every known city.

    infos         -- mapping name -> dict with "domain", "suffix" and, when
                     pagesAreFiles is true, "file".
    pagesAreFiles -- when true each row links to dico["file"], otherwise to
                     the city name itself.
    tracking      -- HTML fragment inserted just before </body>.

    Returns the page as a list of unicode fragments.
    """
    pieces = [
        u"<html>\n<head>",
        u"\t<title>M&eacute;t&eacute;o extraite des sites de M&eacute;t&eacute;o-France</title>",
        head,
        u"</head>\n<body><div class=\"content\"><p>Ce site web a &eacute;t&eacute; &eacute;crit pour pouvoir consulter les pr&eacute;visions de M&eacute;t&eacute;o-France sans crasher le navigateur web de l'iPhone.</p><div class=\"source\">Source des pr&eacute;visions : <a href=\"http://www.meteofrance.com/\">M&eacute;t&eacute;o-France</a></div>\n",
        u"<table>\n<thead><tr><th>Simple</th><th>Original</th></tr></thead>\n",
    ]

    for name, dico in infos.items():
        pageAddress = name
        if pagesAreFiles:
            pageAddress = dico["file"]

        pieces.append(u"<tr>\n\t<td><a href=\""+pageAddress+"\">"+name+"</a></td>\n")
        pieces.append(u"\t<td><a href=\""+dico["domain"]+dico["suffix"]+"\">"+name+" chez M&eacute;t&eacute;o-France</a></td>\n</tr>\n")

    # The document used to end with "<body></html>", i.e. a second opening
    # tag instead of the closing one.
    pieces.append(u"</table>\n"+foot+"\n</div>\n"+tracking+"\n</body></html>")
    return pieces
77d14d3 Nicolas Hoibian Prettyfied a little the generated html
authored
286
50e9ab1 Nicolas Hoibian Added the todo list for the project.
authored
287
2ff2c4a Nicolas Hoibian Modified weather.py to be able to use it from the google app engine :
authored
def getInfos(path=None):
    """Load the city table from a comma-separated text file.

    path -- file to read; defaults to the module-level ``namesFile``.

    Each non-blank line holds four fields: name,domain,suffix,file.
    Returns a dict mapping name to
    {"name": ..., "domain": ..., "suffix": ..., "file": ...}.
    Raises IndexError when a non-blank line has fewer than four fields.
    """
    if path is None:
        path = namesFile

    infos = {}
    f = open(path)
    try:
        for line in f:
            # Drop the line terminator: it used to stay glued to the "file"
            # field and ended up in output file names and index links.
            line = line.strip()
            if not line:
                # tolerate blank lines (typically a trailing one)
                continue
            l = line.split(',')
            infos[l[0]] = {"name":l[0],"domain":l[1],"suffix":l[2],"file":l[3]}
    finally:
        # the handle was previously never closed
        f.close()
    return infos
299
50e9ab1 Nicolas Hoibian Added the todo list for the project.
authored
300
def getContent(dico):
    """Download the page at dico["domain"] + dico["suffix"] and return its body.

    Errors raised by urllib2.urlopen or by the read propagate to the caller.
    """
    resp = urllib2.urlopen(dico["domain"]+dico["suffix"])
    try:
        return resp.read()
    finally:
        # The response was previously left open; main() calls this for every
        # city once an hour, so release the connection explicitly.
        resp.close()
304
305
2ff2c4a Nicolas Hoibian Modified weather.py to be able to use it from the google app engine :
authored
def writeFile(fileName,contentList):
    """Join the fragments of a page and write them to base_dir + fileName.

    fileName    -- name of the output file, appended to ``base_dir``.
    contentList -- iterable of string fragments, joined without separator.
    """
    # Join and encode before opening: if this step raises, the previously
    # generated file is left untouched instead of being truncated.
    outText = u''.join(contentList)
    # The shared <head> declares charset=utf-8, so the bytes on disk have to
    # be UTF-8 as well. They used to be written as iso-8859-1, which
    # contradicted that declaration and failed on characters outside Latin-1.
    text = outText.encode("utf-8")

    fileOut = open(base_dir+fileName,'wb')
    try:
        fileOut.write(text)
    finally:
        fileOut.close()
50e9ab1 Nicolas Hoibian Added the todo list for the project.
authored
312
313 ###############################
2ff2c4a Nicolas Hoibian Modified weather.py to be able to use it from the google app engine :
authored
def _renderPage(dico):
    """Fetch one city's page, convert it and write it to dico["file"]."""
    content = getContent(dico)
    if "monde" in dico["domain"]:
        pieces = parseMeteoPage(dico, content)
    else:
        #france web page layout is very different
        pieces = parseMeteoPageFrance(dico, content)
    writeFile(dico["file"], pieces)


def main():
    """Command-line entry point.

    When sys.argv[1] names a known city, that single page is generated and
    the function returns. Otherwise the index is written once and every
    city is regenerated in an endless loop with an hour of sleep between
    rounds.
    """
    # Local import: at module level the name ``time`` is bound to
    # datetime.time by ``from datetime import datetime, date, time``.
    import time

    #get the files informations
    infos = getInfos()

    if len(sys.argv) > 1 and sys.argv[1] in infos:
        page = sys.argv[1]
        _renderPage(infos[page])
        print("did parse one of one page : " + page)
        return

    print("Darn, asking for a nil or non existing page. Parsing all the pages every hours")

    writeFile(listFile, generateIndex(infos, True))

    while 1:
        for name, dico in infos.items():
            # One try per city: a failure on one page no longer prevents the
            # remaining cities from being refreshed in the same round.
            try:
                _renderPage(dico)
                print("did parse one of many page :  " + name)
            except Exception:
                # sys.exc_info() instead of binding in the except clause, so
                # the statement parses on both old and new interpreters.
                e = sys.exc_info()[1]
                print("Error while trying to parse/write the webpage : ")
                print(type(e))     # the exception instance
                print(e.args)      # arguments stored in .args
                print(e)

        print("## " + datetime.now().strftime(timeFormat) + " # Sleeping for an hour.")
        time.sleep(3600)
360
361
362
363
# Start the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
Something went wrong with that request. Please try again.