Skip to content

Commit

Permalink
Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…
Browse files Browse the repository at this point in the history
  • Loading branch information
mdugmore committed May 17, 2016
0 parents commit 5529532
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
1 change: 1 addition & 0 deletions README.textile
@@ -0,0 +1 @@
Lists recipes, ingredients and categories from the BBC Food website.
148 changes: 148 additions & 0 deletions scraper.py
@@ -0,0 +1,148 @@
import scraperwiki
import lxml.html
from urlparse import urljoin

# When False, recipes whose URL is already saved in the "recipes" table are
# skipped (see scrape_recipe / recipe_exists); set True to force a re-scrape.
process_existing = False

def scrape_recipe_list(url) :

print "Scraping " + url

html = scraperwiki.scrape(page)

root = lxml.html.fromstring(html)

recipes = root.cssselect("#the-collection li")

process_recipe_list(url, recipes)

def process_recipe_list(url, recipes):

print "process_recipe_list : " + str(len(recipes)) + " recipes found"

for recipe in recipes :

recipe_link = recipe.cssselect("a")[0]

recipe_url_relative = recipe_link.attrib.get('href')

recipe_url = urljoin(url, recipe_url_relative)

print recipe_url

scrape_recipe(recipe_url)

def scrape_recipe(recipe_url) :

if not process_existing and recipe_exists(recipe_url):

print "Not Scraping " + recipe_url + ", already exists"

else:

print "Scraping " + recipe_url

html = scraperwiki.scrape(recipe_url)
root = lxml.html.fromstring(html)

hrecipe_element = root.cssselect(".hrecipe")[0]
hrecipe = lxml.html.tostring(hrecipe_element)

print hrecipe

recipe_name = hrecipe_element.cssselect(".fn")[0].text_content()
ingredients = hrecipe_element.cssselect("#ingredients .ingredient")

print recipe_name, str(len(ingredients)) + " ingredients"

recipe_model = { "url" : recipe_url, "name" : recipe_name, "ingredients" : list() }

for ingredient in ingredients:
recipe_model["ingredients"].append(process_ingredient(ingredient))

scraperwiki.sqlite.save(unique_keys=["url"], table_name="recipes", data=recipe_model)

def process_ingredient(element) :

ingredient = element.text_content() #lxml.html.text_content(
ingredient_model = { "name" : ingredient }

print ingredient

scraperwiki.sqlite.save(unique_keys=["name"], table_name="ingredients", data=ingredient_model)

return ingredient_model

def recipe_exists (url):

print "recipe_exists : " + url

count = 0

try:
result = scraperwiki.sqlite.execute("select count(*) from recipes where url = ?", (url))
count = int(result["data"][0][0])

#print "result : " + str(result)
#print "result['data'] : " + str(result["data"])
#print "result['data'][0] : " + str(result["data"][0])
#print "result['data'][0][0] : " + str(result["data"][0][0])

#print "result is list : " + str(type(result) is list)
#print "result['data'] is list : " + str(type(result["data"]) is list)
#print "result['data'][0][0] is list : " + str(type(result["data"][0][0]) is list)

#print "Count : " + str(count)

except scraperwiki.sqlite.SqliteError, e:
print "Unexpected error:" + str(e)

return count > 0

def scrape_dishes_az() :
print "Scraping dishes a-z"

#For a-z - 97, 122
for i in range(97, 122):

#print i , chr(i)
page = "http://www.bbc.co.uk/food/dishes/by/letter/" + chr(i)

print "Scraping page : " + page

html = scraperwiki.scrape(page)

root = lxml.html.fromstring(html)

recipe_categories = root.cssselect(".resource.food")

for category in recipe_categories:

recipes_link = category.cssselect("a")[0]
category_name = recipes_link.text_content()

recipes_url_relative = recipes_link.attrib.get('href')

recipes_url = urljoin(page, recipes_url_relative)

print recipes_url, category_name

category_model = { "url" : recipes_url, "name" : category_name }

scraperwiki.sqlite.save(unique_keys=["url"], table_name="categories", data=category_model)

html = scraperwiki.scrape(recipes_url)

root = lxml.html.fromstring(html)

recipes = root.cssselect(".resource-list li")

print str(len(recipes)) + " found"

for category in recipe_categories:

process_recipe_list(page, recipes)

# Script entry point: scrape the whole dishes A-Z index.
# `page` is left over from an earlier single-collection run (the call below
# is commented out) and is otherwise unused by scrape_dishes_az.
page = "http://www.bbc.co.uk/food/collections/slow_cooker_recipes"
#scrape_recipe_list(page)
scrape_dishes_az()

0 comments on commit 5529532

Please sign in to comment.