Browse files

Spreadsheet scraper: Handle many-to-many lookups, passed as a list of names.

Don't blow up if list_record['location'] is not a string.
Docstring fixes.
Allow list_record to already have an 'attributes' key.
  • Loading branch information...
slinkp committed Sep 26, 2012
1 parent 8872fa2 commit f792d9483cc735d630c283121adcce481a7d16cd
Showing with 22 additions and 7 deletions.
  1. +22 −7 ebdata/ebdata/scrapers/general/spreadsheet/retrieval.py
@@ -257,16 +257,17 @@ def clean_list_record(self, list_record):
Given a dict, prepare it for saving as a newsitem.
Result will be a dictionary of anything from list_record
that looks like a known field of the NewsItem model.
Anything that looks like a known SchemaField of the item's Schema
will be set as an 'attributes' sub-dictionary.
will be set as an item in an 'attributes' sub-dictionary.
Unrecognized keys will be ignored (and logged).
Locations are found heuristically:
- If there's a 'location' key, try to split the value into (lat, lon) points
- If there are keys like 'latitude'/'lat' and 'longitude'/'lon'/'long'/'lng', use those
- If there's a 'location_name', geocode if needed
- If there's no 'location_name', reverse-geocode if needed
- If there's no 'location_name', reverse-geocode if possible
from ebpub.db.models import NewsItem
@@ -278,7 +279,7 @@ def clean_list_record(self, list_record):
# original, this gives us a way to use it by mapping it to
# "location"
lat, lon = re.split(r'[\s,]+', list_record.pop('location'))
lat, lon = re.split(r'[\s,]+', str(list_record.pop('location')))
list_record.setdefault('lat', lat)
list_record.setdefault('lon', lon)
except ValueError:
@@ -301,18 +302,32 @@ def clean_list_record(self, list_record):
core_fields['location_name'] = location_name
# Attributes.
attributes = {}
attributes = list_record.get('attributes', {})
schemafields = self.schema.schemafield_set.all()
for sf in schemafields:
if sf.name in list_record:
# TODO: coerce types? Or maybe Django's implicit conversion is OK.
value = unicode(list_record.pop(sf.name))
value = list_record.pop(sf.name)
if sf.is_many_to_many_lookup():
self.logger.error("We can't currently handle many-to-many lookups in this scraper, dunno what to do with field %s" % sf.name)
if sf.is_lookup:
# Passed value needs to be a list of strings.
if isinstance(value, basestring):
value = [value]
lookups = [
Lookup.objects.get_or_create_lookup(
sf, name=v, code=v, make_text_slug=False)
for v in value]
value = ','.join([str(lookup.id) for lookup in lookups])
elif sf.is_lookup:
# Need an int id.
value = unicode(value)
value = Lookup.objects.get_or_create_lookup(
sf, name=value, code=value, make_text_slug=False)
value = value.id
# TODO: handle other types?
value = unicode(value)
attributes[sf.name] = value
core_fields['attributes'] = attributes
if len(list_record):

0 comments on commit f792d94

Please sign in to comment.