From 547ba4823dddd2711c4780125bdec3e554562879 Mon Sep 17 00:00:00 2001 From: "M. Nasimul Haque" Date: Tue, 23 Feb 2010 23:03:31 +0000 Subject: [PATCH 1/2] cse parsing started --- main.py | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index f720a6e..635ad23 100755 --- a/main.py +++ b/main.py @@ -23,10 +23,17 @@ dsesanere = re.compile(r']*>') datere = re.compile(r'[a-zA-Z]{3}\s*\d{2},\s*\d{4}\s*at\s*\d{2}:\d{2}:\d{2}') +cselatest = "http://www.csebd.com/trade/top.htm" +csedatere = re.compile(r'Date: ' + r'[a-zA-Z]{3}\s*\d{2}\s*\d{4}\s*\d{1,2}:\d{1,2}(AM|PM)') +csedatare = re.compile(r'\w+\s*') + fetch_error_message = 'Sorry, there was an error fetching data from main server.' time_key = 'timekey' data_key = 'csvdata' +cse_key = 'csedata' +csedate_key = 'csedate' cache_time = 10 * 60 # ten minutes class DSEHandler(webapp.RequestHandler): @@ -101,10 +108,37 @@ def get(self): logging.info('fetched real data') +class CSEHandler(webapp.RequestHandler): + + def get(self): + last_update = memcache.get(csedate_key) + if last_update: + csvname = 'cse-%s.csv' % last_update.isoformat() + + cseresult = urlfetch.fetch(cselatest) + if not cseresult.status_code == 200: + self.response.out.write(fetch_error_message) + return + + content = cseresult.content + if not last_update: + csvname = csedatere.search(content).group() + logging.info(csvname) + + print content + print csedatare.search(content).group() + + self.response.headers.add_header('content-disposition', + 'attachment', filename=csvname) + self.response.headers['Content-Type'] = 'text/csv' + csvdata = memcache.get(cse_key) + if csvdata: + return def main(): - application = webapp.WSGIApplication([('/', DSEHandler)], + application = webapp.WSGIApplication([('/', DSEHandler), + ('/cse', CSEHandler)], debug=True) util.run_wsgi_app(application) From 8f2a7d9d9a2fe32088670f12418de46255e74a9c Mon Sep 17 00:00:00 2001 From: "M. Nasimul Haque" Date: Wed, 24 Feb 2010 14:54:58 +0000 Subject: [PATCH 2/2] cse export complete --- main.py | 94 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 23 deletions(-) diff --git a/main.py b/main.py index 635ad23..001435f 100755 --- a/main.py +++ b/main.py @@ -20,32 +20,30 @@ dseroot = "http://www.dsebd.org/" dselatest = dseroot + "latest_share_price_all.php" -dsesanere = re.compile(r']*>') -datere = re.compile(r'[a-zA-Z]{3}\s*\d{2},\s*\d{4}\s*at\s*\d{2}:\d{2}:\d{2}') cselatest = "http://www.csebd.com/trade/top.htm" -csedatere = re.compile(r'Date: ' - r'[a-zA-Z]{3}\s*\d{2}\s*\d{4}\s*\d{1,2}:\d{1,2}(AM|PM)') -csedatare = re.compile(r'\w+\s*') fetch_error_message = 'Sorry, there was an error fetching data from main server.' -time_key = 'timekey' -data_key = 'csvdata' +time_key = 'dsedate' +data_key = 'dsedata' cse_key = 'csedata' csedate_key = 'csedate' -cache_time = 10 * 60 # ten minutes +cache_time = 1 * 60 class DSEHandler(webapp.RequestHandler): + dsesanere = re.compile(r']*>') + datere = re.compile(r'[a-zA-Z]{3}\s*\d{2},\s*\d{4}\s*at\s*\d{2}:\d{2}:\d{2}') + def _get_time(self): last_update = memcache.get(time_key) - if last_update: + if last_update is not None: return last_update response = urlfetch.fetch(dseroot) if response.status_code == 200: - last_update = datere.search(response.content).group() + last_update = self.datere.search(response.content).group() last_update = last_update.replace(' ', '') last_update = datetime.datetime.strptime(last_update, "%b%d,%Yat" "%H:%M:%S") @@ -73,7 +71,7 @@ def get(self): return dsecontent = dseresult.content - dsecontent = dsesanere.sub('', dsecontent) + dsecontent = self.dsesanere.sub('', dsecontent) soup = BeautifulSoup(dsecontent) headtr = soup.body.table.tr.findAll('b') @@ -90,8 +88,8 @@ def get(self): data = soup.body.table.findAll('tr')[1:] for row in data: row = row.findAll('td')[1:] - d = [row[0].a.contents[0],] - d.append(last_update) + + d = [row[0].a.contents[0],last_update] for col in row[1:]: d.append(col.find(text=True)) @@ -110,34 +108,84 @@ def get(self): class CSEHandler(webapp.RequestHandler): + csedatere = re.compile(r'Date: ' + r'([a-zA-Z]{3})\s*(\d{2})\s*(\d{4})\s*(\d{1,2}):(\d{1,2})(AM|PM)') + csedatare = re.compile(r'^\s*(\w{3,5}).*?' + '(\d+\.{0,1}\d*)\s+' + '(\d+\.{0,1}\d*)\s+' + '(\d+\.{0,1}\d*)\s+' + '(\d+\.{0,1}\d*)\s+' + '(\d+\.{0,1}\d*)\s+' + '(-{0,1}\d+\.{0,1}\d*)\s+' + '(\d+\.{0,1}\d*)\s+' + '(\d+\.{0,1}\d*)\s+', re.MULTILINE) + def get(self): last_update = memcache.get(csedate_key) - if last_update: + csvdata = memcache.get(cse_key) + if csvdata and last_update is not None: csvname = 'cse-%s.csv' % last_update.isoformat() + self.response.headers.add_header('content-disposition', + 'attachment', filename=csvname) + self.response.headers['Content-Type'] = 'text/csv' + self.response.out.write(csvdata) + + logging.info('retrieved from cache') + return + cseresult = urlfetch.fetch(cselatest) if not cseresult.status_code == 200: self.response.out.write(fetch_error_message) return content = cseresult.content - if not last_update: - csvname = csedatere.search(content).group() - logging.info(csvname) + soup = BeautifulSoup(content) + precontents = soup.body.findAll('pre') + + sdate = list(self.csedatere.search(precontents[0].contents[0]).groups()) + for i in [1, 3, 4]: + if len(sdate[i]) == 1: + sdate[i] = '0' + sdate[i] + sdate = ' '.join(sdate) + last_update = datetime.datetime.strptime(sdate, + '%b %d %Y %I %M %p') + csvname = 'cse-%s.csv' % last_update.isoformat() + + output = StringIO.StringIO() + csvfile = csv.writer(output) + heads = ['Company', 'Date Time', 'Open', 'High', 'Low', 'Close', + 'Prev. Close', 'Difference', 'Trades', 'Volume',] + csvfile.writerow(heads) - print content - print csedatare.search(content).group() + contents = precontents[1].contents[0].split('\n') + for content in contents: + try: + data = list(self.csedatare.search(content).groups()) + if data[-1] == '0': + continue + data.insert(1, last_update) + csvfile.writerow(data) + except: + pass self.response.headers.add_header('content-disposition', 'attachment', filename=csvname) self.response.headers['Content-Type'] = 'text/csv' - csvdata = memcache.get(cse_key) - if csvdata: - return + + csvdata = output.getvalue() + output.close() + + self.response.out.write(csvdata) + + memcache.set(cse_key, csvdata, cache_time) + memcache.set(csedate_key, last_update, cache_time) + + logging.info('fetched real data') def main(): - application = webapp.WSGIApplication([('/', DSEHandler), + application = webapp.WSGIApplication([('/dse', DSEHandler), ('/cse', CSEHandler)], debug=True) util.run_wsgi_app(application)