Skip to content

Commit

Permalink
Add a script to fix inconsistent references to people from SayIt spea…
Browse files Browse the repository at this point in the history
…kers

We have changed the form of IDs in the South African PopIt instance
several times, but without removing the entries corresponding to old
PopIt IDs from the popit-django popit_person table. This meant that the
name resolution on importing new committee proceedings could associate
the speeches with various SayIt speakers corresponding to different
entries in that table.

This commit introduces a script to hopefully clean up that mess; it
tries to remove any row with an deprecated format ID from the
popit_person table, safely removing any SayIt Speaker or EntityName
associated with that row. It also removes any rows from this table that
can't be associated with any Pombola core Person any more, so long as
they are not referenced any more.

This should fix #1597 and hopefully #1601.

The underlying issue (that Pombola ZA stores the identity of people in
five different places) is being addressed by #1594.
  • Loading branch information
mhl committed Feb 2, 2015
1 parent 0bc38c9 commit c3ea9ca
Showing 1 changed file with 207 additions and 0 deletions.
207 changes: 207 additions & 0 deletions pombola/core/management/commands/core_fix_sayit_speakers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
import re

from django.conf import settings
from django.core.management.base import BaseCommand

from popit.models import Person as PopItPerson

from pombola.core.models import Person, Identifier

from speeches.models import Speaker, Speech

# We've ended up in a situation where there are lots of Person objects
# from popit-django that refer to old PopIt URLs still in the database
# - even worse, they may still be referenced by SayIt Speaker
# objects. This script should safely get rid of these without breaking
# any references from Speaker objects.

current_popit_api_base_url = settings.POPIT_API_URL

# This is the format of the URLs currently in PopIt:

current_popit_id_re = re.compile(
r'^{0}persons//core_person/(?P<pombola_id>\d+)$'.format(
current_popit_api_base_url
)
)

# This matches any PopIt URLs that have the Pombola Person ID embedded
# in them.

other_popit_id_re = re.compile('za-new-import.*core_person/(?P<pombola_id>\d+)$')

# This matches the older form of PopIt URL, which had a
# pombola.core.models.Identifer encoded instead of a Person Person ID

This comment has been minimized.

Copy link
@dracos

dracos Feb 2, 2015

Member

"Pombola Person ID"? :)

# directly embedded in the URL.

identifier_based_re = re.compile(
r'http://za-peoples-assembly.popit.mysociety.org/api/v0.1/' +
r'persons/org.mysociety.za(?P<pombola_identifier>.*)$'
)

def get_popit_url_info(popit_url):
"""Return all the information that we can derive from a PopIt URL
This returns a dict, with key / value pairs representing whether
this is a current PopIt URL (i.e. matches the format currently in
use), a Pombola Person object ID (if one could be derived from the
URL) or whether it's "impossible" to deal with - we couldn't parse
it or it referred to a non-existent person or identifier."""

m_current = current_popit_id_re.search(popit_url)
m_other_popit = other_popit_id_re.search(popit_url)
m_old_identifier = identifier_based_re.search(popit_url)

result = {
'impossible': False,
'current': False,
'pombola_id': None,
}

m_either = m_current or m_other_popit
if m_either:
pombola_id = int(m_either.group('pombola_id'))
result['pombola_id'] = pombola_id
result['current'] = bool(m_current)
try:
Person.objects.get(pk=pombola_id)
except Person.DoesNotExist:
message = "Warning: no Pombola Person corresponding to {0}"
print message.format(popit_url)
result['impossible'] = True
elif m_old_identifier:
# Also look out for older URLs that are based on an
# identifier object with scheme org.mysociety.za:
try:
identifier = Identifier.objects.get(
scheme='org.mysociety.za',
identifier=m_old_identifier.group('pombola_identifier')
)
pombola_person = identifier.content_object
if pombola_person:
result['pombola_id'] = pombola_person.id
else:
print "Warning: {0} had no content_object set".format(
identifier
)

This comment has been minimized.

Copy link
@dracos

dracos Feb 2, 2015

Member

I don't think this matters, as the case when impossible and pombola_id are both False is the same as when impossible is True, but do you want to set impossible to True here anyway?

except Identifier.DoesNotExist:
message = "Warning, no Pombola Person corresponding to {0}"
print message.format(popit_url)
result['impossible'] = True
else:
print "Warning: no case would handle {0}".format(popit_url)
result['impossible'] = True

return result

def safely_delete(p):
# Since the django-sayit Speaker references the PopIt Person with
# on_delete=models.PROTECT, we have to delete any speaker that
# refers to it first; also double-check that there are no speeches
# still associated with that speaker before doing that. In
# addition, there may be EntityName objects still referring to
# them, which also use PROTECT, so delete any of them too.
all_deleted = True
for s in p.speaker_set.all():
if s.speech_set.count() == 0:
s.delete()
else:
message = "!!! Couldn't delete; there were still {0} speeches associated with {1} {2}"
all_deleted = False
print message.format(s.speech_set.count(), s.id, s)
for en in p.entityname_set.all():
en.delete()
if all_deleted:
p.delete()


class Command(BaseCommand):

def handle(*args, **options):

popit_url_to_pk = {p.popit_url: p.id for p in PopItPerson.objects.all()}

# Build up some preliminary mappings:

popit_url_to_pombola_id = {}
pombola_id_to_current_popit_url = {}

impossible_ids = set()

for popit_url, pk in popit_url_to_pk.items():
details = get_popit_url_info(popit_url)
pombola_id = details['pombola_id']
if details['impossible']:
impossible_ids.add(pk)
elif pombola_id:
popit_url_to_pombola_id[popit_url] = pombola_id
if details['current']:
pombola_id_to_current_popit_url[pombola_id] = popit_url

# Now build a mapping from the id of each row of the
# popit_person table that should be deleted to the id of its
# replacement.

mapping_from_deleted = {}

for popit_url, pk in popit_url_to_pk.items():
details = get_popit_url_info(popit_url)

This comment has been minimized.

Copy link
@dracos

dracos Feb 2, 2015

Member

Probably not worth it, but you could cache these details in the first loop to save them being looked up again. And then only loop through [ people for which not impossible and not current and pombola_id ] perhaps.

if details['impossible']:
continue
# The rows with canonical URLs won't be rewritten:
if details['current']:
continue
# Ignore any row where we couldn't find the Pombola Person
# that it refers to:
pombola_id = details['pombola_id']
if not pombola_id:
continue
# Otherwise we should be able to delete these rows after
# replacing any references to them.
current_popit_url = pombola_id_to_current_popit_url[pombola_id]
mapping_from_deleted[pk] = popit_url_to_pk[current_popit_url]

# Now replace all the references in the speeches_speaker table:

for speaker in Speaker.objects.all():
if not speaker.person:
continue
popit_person_id = speaker.person.id
new_popit_person_id = mapping_from_deleted.get(popit_person_id)
if not new_popit_person_id:
continue
new_popit_person = PopItPerson.objects.get(pk=new_popit_person_id)
print "Replacing {0} with {1}".format(
speaker.person, new_popit_person
)
# There might already be a Speaker that points to the new
# popit-django Person; if there is, then use that instead,
# otherwise modify the speaker to point to the new person.
try:
existing_speaker = Speaker.objects.get(
instance=speaker.instance,
person=new_popit_person
)
message = "A Speaker referring to {0} already existed..."
message += "\n... replacing the speaker in all speeches"
print message.format(new_popit_person)
Speech.objects.filter(speaker=speaker). \
update(speaker=existing_speaker)
except Speaker.DoesNotExist:
speaker.person = new_popit_person
speaker.save()

# Now remove all the deleted PopIt Person rows and Speaker
# rows that reference them, that we've now ensured they're
# safe to delete.

print "Deleting replaced PopIt Person and Speaker objects that we've replaced"
for p in PopItPerson.objects.filter(id__in=mapping_from_deleted.keys()):
safely_delete(p)

# Finally, delete any rows which are "impossible" (i.e. those
# that we could not find a corresponding Pombola Person for).

print "Deleting any PopIt Person and Speaker objects that no longer refer to people"
for p in PopItPerson.objects.filter(id__in=impossible_ids):
safely_delete(p)

0 comments on commit c3ea9ca

Please sign in to comment.