Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

full async info from banq

  • Loading branch information...
commit 46e5f1ceac9253f1e75e72f8d5275703ded42bc7 1 parent 1963000
Matthew Leon authored
Showing with 55 additions and 54 deletions.
  1. +40 −45 banq.py
  2. +3 −9 nelligan.py
  3. +12 −0 utils.py
View
85 banq.py
@@ -1,12 +1,21 @@
-import requests
+import gevent.monkey
+gevent.monkey.patch_all()
+
import re
+import requests
+import grequests
+
from urlparse import urljoin
from pprint import pprint
from lxml import etree
+from utils import standardize_key
+
IRIS_URL = 'http://iris.banq.qc.ca/alswww2.dll/APS_ZONES?fn=QuickSearch'
SEARCH_URL_BASE = 'http://iris.banq.qc.ca/alswww2.dll/'
+POOL_SIZE = 4
+
def search(query, results=10):
s = requests.session()
@@ -28,49 +37,32 @@ def search(query, results=10):
'q.Query': query,
}
resp = s.get(search_url, params=params)
- tree = etree.fromstring(resp.text, etree.HTMLParser())
- return books(tree)
-
-def books(tree):
- for elem in tree.xpath('//table[@id="BrowseList"]/tr'):
- info = {}
- fields = elem.xpath('.//td[@class="SummaryFieldData"]')
-
- info['title'] = fields[0][0].text
- path = fields[0][0].get('href')
- info['url'] = urljoin(SEARCH_URL_BASE, path)
-
- if len(fields) > 2:
- info['author'] = fields[1].text
- info['publisher'] = fields[2].text
- else: # the last field is either an author or a publisher
- if ':' in fields[-1].text:
- info['publisher'] = fields[-1].text
- else:
- info['author'] = fields[-1].text
-
- try:
- img_id = elem.xpath('.//img/@id')[0]
- info['isbn'] = re.search('\d+', img_id).group(0)
- except:
- pass
-
- details = book_details(info['url'])
-
- yield dict(info.items() + details.items())
-
-def book_details(url):
- page_details = requests.get(url)
- tree = etree.fromstring(page_details.text, etree.HTMLParser())
-
- try:
- availability = tree.xpath('//div[@class="darkheading"]')[0].getnext()
- except IndexError:
- # no availability details
- return {}
-
+ book_requests = (grequests.get(url) for url in book_urls(resp.text))
+ book_responses = grequests.imap(book_requests, size=POOL_SIZE)
+ return (book_details(response) for response in book_responses)
+
+def book_urls(html):
+ tree = etree.fromstring(html, etree.HTMLParser())
+ XPATH = '//a[@class="SummaryFieldLink"]/@href'
+ return (urljoin(SEARCH_URL_BASE, path) for path in tree.xpath(XPATH))
+
+def book_details(response):
+ tree = etree.fromstring(response.text, etree.HTMLParser())
+ book = {'url': response.url}
+
+ # get basic book info
+ for elem in tree.xpath('//td[@style="width:15%; "]'):
+ cell = 'following-sibling::td[2]/'
+ if elem.text.strip() == 'Sujets':
+ book['subjets'] = elem.xpath(cell + 'a/text()[normalize-space()]')
+ else:
+ key = standardize_key(elem.text.strip())
+ val = ''.join(elem.xpath(cell + '/text()')).strip()
+ book[key] = val
+
+ # get book locations
locations = []
- for elem in availability.iterfind('li'):
+ for elem in tree.xpath('//div[@style="margin-left:20px"]/li'):
location = {'shelf': None, 'copies': []}
shelf = elem.find('b')
location['shelf'] = shelf.xpath('span/text()')
@@ -79,9 +71,12 @@ def book_details(url):
status = copy.tail.strip()[1:].lstrip()
location['copies'].append((call_number, status))
locations.append(location)
+ book['locations'] = locations
- return {'locations': locations}
+ return book
if __name__ == "__main__":
"""test with a simple search"""
- pprint(list(search('portnoy')))
+ for book in search('portnoy'):
+ pprint(book)
+
View
12 nelligan.py
@@ -11,6 +11,8 @@
from lxml import etree
from gevent.pool import Pool
+from utils import standardize_key
+
# might use the EN URL at some point
SEARCH_EN_URL = 'http://nelligan.ville.montreal.qc.ca/search/X'
SEARCH_FR_URL = 'http://nelligan.ville.montreal.qc.ca/search*frc/X'
@@ -31,17 +33,9 @@ def book_urls(html):
'//span[@class="brief-lien-titre"]/a/@href')
return (urljoin(SEARCH_FR_URL, url) for url in tree.xpath(XPATH))
-STD_KEYS = {'Titre': 'title', 'Auteur': 'author'}
-def standardize_key(key):
- """Change a book's "Titre" to a book's "title", etc."""
- try:
- return STD_KEYS[key]
- except KeyError:
- return key.lower()
-
def book_details(response):
tree = etree.fromstring(response.text, etree.HTMLParser())
- book = {url: response.url}
+ book = {'url': response.url}
# get basic book info
for elem in tree.xpath('//td[@class="bibInfoLabel"]'):
View
12 utils.py
@@ -0,0 +1,12 @@
+STD_KEYS = {
+ 'Titre': 'title',
+ 'Auteur': 'author'
+}
+
+def standardize_key(key):
+ """Change a book's "Titre" to a book's "title", etc."""
+ try:
+ return STD_KEYS[key]
+ except KeyError:
+ return key.lower()
+
Please sign in to comment.
Something went wrong with that request. Please try again.