-
Notifications
You must be signed in to change notification settings - Fork 42
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a script to fix inconsistent references to people from SayIt spea…
…kers We have changed the form of IDs in the South African PopIt instance several times, but without removing the entries corresponding to old PopIt IDs from the popit-django popit_person table. This meant that the name resolution on importing new committee proceedings could associate the speeches with various SayIt speakers corresponding to different entries in that table. This commit introduces a script to hopefully clean up that mess; it tries to remove any row with an deprecated format ID from the popit_person table, safely removing any SayIt Speaker or EntityName associated with that row. It also removes any rows from this table that can't be associated with any Pombola core Person any more, so long as they are not referenced any more. This should fix #1597 and hopefully #1601. The underlying issue (that Pombola ZA stores the identity of people in five different places) is being addressed by #1594.
- Loading branch information
Showing
1 changed file
with
207 additions
and
0 deletions.
There are no files selected for viewing
207 changes: 207 additions & 0 deletions
207
pombola/core/management/commands/core_fix_sayit_speakers.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
import re | ||
|
||
from django.conf import settings | ||
from django.core.management.base import BaseCommand | ||
|
||
from popit.models import Person as PopItPerson | ||
|
||
from pombola.core.models import Person, Identifier | ||
|
||
from speeches.models import Speaker, Speech | ||
|
||
# We've ended up in a situation where there are lots of Person objects | ||
# from popit-django that refer to old PopIt URLs still in the database | ||
# - even worse, they may still be referenced by SayIt Speaker | ||
# objects. This script should safely get rid of these without breaking | ||
# any references from Speaker objects. | ||
|
||
current_popit_api_base_url = settings.POPIT_API_URL | ||
|
||
# This is the format of the URLs currently in PopIt: | ||
|
||
current_popit_id_re = re.compile( | ||
r'^{0}persons//core_person/(?P<pombola_id>\d+)$'.format( | ||
current_popit_api_base_url | ||
) | ||
) | ||
|
||
# This matches any PopIt URLs that have the Pombola Person ID embedded | ||
# in them. | ||
|
||
other_popit_id_re = re.compile('za-new-import.*core_person/(?P<pombola_id>\d+)$') | ||
|
||
# This matches the older form of PopIt URL, which had a | ||
# pombola.core.models.Identifer encoded instead of a Person Person ID | ||
This comment has been minimized.
Sorry, something went wrong. |
||
# directly embedded in the URL. | ||
|
||
identifier_based_re = re.compile( | ||
r'http://za-peoples-assembly.popit.mysociety.org/api/v0.1/' + | ||
r'persons/org.mysociety.za(?P<pombola_identifier>.*)$' | ||
) | ||
|
||
def get_popit_url_info(popit_url): | ||
"""Return all the information that we can derive from a PopIt URL | ||
This returns a dict, with key / value pairs representing whether | ||
this is a current PopIt URL (i.e. matches the format currently in | ||
use), a Pombola Person object ID (if one could be derived from the | ||
URL) or whether it's "impossible" to deal with - we couldn't parse | ||
it or it referred to a non-existent person or identifier.""" | ||
|
||
m_current = current_popit_id_re.search(popit_url) | ||
m_other_popit = other_popit_id_re.search(popit_url) | ||
m_old_identifier = identifier_based_re.search(popit_url) | ||
|
||
result = { | ||
'impossible': False, | ||
'current': False, | ||
'pombola_id': None, | ||
} | ||
|
||
m_either = m_current or m_other_popit | ||
if m_either: | ||
pombola_id = int(m_either.group('pombola_id')) | ||
result['pombola_id'] = pombola_id | ||
result['current'] = bool(m_current) | ||
try: | ||
Person.objects.get(pk=pombola_id) | ||
except Person.DoesNotExist: | ||
message = "Warning: no Pombola Person corresponding to {0}" | ||
print message.format(popit_url) | ||
result['impossible'] = True | ||
elif m_old_identifier: | ||
# Also look out for older URLs that are based on an | ||
# identifier object with scheme org.mysociety.za: | ||
try: | ||
identifier = Identifier.objects.get( | ||
scheme='org.mysociety.za', | ||
identifier=m_old_identifier.group('pombola_identifier') | ||
) | ||
pombola_person = identifier.content_object | ||
if pombola_person: | ||
result['pombola_id'] = pombola_person.id | ||
else: | ||
print "Warning: {0} had no content_object set".format( | ||
identifier | ||
) | ||
This comment has been minimized.
Sorry, something went wrong.
dracos
Member
|
||
except Identifier.DoesNotExist: | ||
message = "Warning, no Pombola Person corresponding to {0}" | ||
print message.format(popit_url) | ||
result['impossible'] = True | ||
else: | ||
print "Warning: no case would handle {0}".format(popit_url) | ||
result['impossible'] = True | ||
|
||
return result | ||
|
||
def safely_delete(p): | ||
# Since the django-sayit Speaker references the PopIt Person with | ||
# on_delete=models.PROTECT, we have to delete any speaker that | ||
# refers to it first; also double-check that there are no speeches | ||
# still associated with that speaker before doing that. In | ||
# addition, there may be EntityName objects still referring to | ||
# them, which also use PROTECT, so delete any of them too. | ||
all_deleted = True | ||
for s in p.speaker_set.all(): | ||
if s.speech_set.count() == 0: | ||
s.delete() | ||
else: | ||
message = "!!! Couldn't delete; there were still {0} speeches associated with {1} {2}" | ||
all_deleted = False | ||
print message.format(s.speech_set.count(), s.id, s) | ||
for en in p.entityname_set.all(): | ||
en.delete() | ||
if all_deleted: | ||
p.delete() | ||
|
||
|
||
class Command(BaseCommand): | ||
|
||
def handle(*args, **options): | ||
|
||
popit_url_to_pk = {p.popit_url: p.id for p in PopItPerson.objects.all()} | ||
|
||
# Build up some preliminary mappings: | ||
|
||
popit_url_to_pombola_id = {} | ||
pombola_id_to_current_popit_url = {} | ||
|
||
impossible_ids = set() | ||
|
||
for popit_url, pk in popit_url_to_pk.items(): | ||
details = get_popit_url_info(popit_url) | ||
pombola_id = details['pombola_id'] | ||
if details['impossible']: | ||
impossible_ids.add(pk) | ||
elif pombola_id: | ||
popit_url_to_pombola_id[popit_url] = pombola_id | ||
if details['current']: | ||
pombola_id_to_current_popit_url[pombola_id] = popit_url | ||
|
||
# Now build a mapping from the id of each row of the | ||
# popit_person table that should be deleted to the id of its | ||
# replacement. | ||
|
||
mapping_from_deleted = {} | ||
|
||
for popit_url, pk in popit_url_to_pk.items(): | ||
details = get_popit_url_info(popit_url) | ||
This comment has been minimized.
Sorry, something went wrong.
dracos
Member
|
||
if details['impossible']: | ||
continue | ||
# The rows with canonical URLs won't be rewritten: | ||
if details['current']: | ||
continue | ||
# Ignore any row where we couldn't find the Pombola Person | ||
# that it refers to: | ||
pombola_id = details['pombola_id'] | ||
if not pombola_id: | ||
continue | ||
# Otherwise we should be able to delete these rows after | ||
# replacing any references to them. | ||
current_popit_url = pombola_id_to_current_popit_url[pombola_id] | ||
mapping_from_deleted[pk] = popit_url_to_pk[current_popit_url] | ||
|
||
# Now replace all the references in the speeches_speaker table: | ||
|
||
for speaker in Speaker.objects.all(): | ||
if not speaker.person: | ||
continue | ||
popit_person_id = speaker.person.id | ||
new_popit_person_id = mapping_from_deleted.get(popit_person_id) | ||
if not new_popit_person_id: | ||
continue | ||
new_popit_person = PopItPerson.objects.get(pk=new_popit_person_id) | ||
print "Replacing {0} with {1}".format( | ||
speaker.person, new_popit_person | ||
) | ||
# There might already be a Speaker that points to the new | ||
# popit-django Person; if there is, then use that instead, | ||
# otherwise modify the speaker to point to the new person. | ||
try: | ||
existing_speaker = Speaker.objects.get( | ||
instance=speaker.instance, | ||
person=new_popit_person | ||
) | ||
message = "A Speaker referring to {0} already existed..." | ||
message += "\n... replacing the speaker in all speeches" | ||
print message.format(new_popit_person) | ||
Speech.objects.filter(speaker=speaker). \ | ||
update(speaker=existing_speaker) | ||
except Speaker.DoesNotExist: | ||
speaker.person = new_popit_person | ||
speaker.save() | ||
|
||
# Now remove all the deleted PopIt Person rows and Speaker | ||
# rows that reference them, that we've now ensured they're | ||
# safe to delete. | ||
|
||
print "Deleting replaced PopIt Person and Speaker objects that we've replaced" | ||
for p in PopItPerson.objects.filter(id__in=mapping_from_deleted.keys()): | ||
safely_delete(p) | ||
|
||
# Finally, delete any rows which are "impossible" (i.e. those | ||
# that we could not find a corresponding Pombola Person for). | ||
|
||
print "Deleting any PopIt Person and Speaker objects that no longer refer to people" | ||
for p in PopItPerson.objects.filter(id__in=impossible_ids): | ||
safely_delete(p) |
"Pombola Person ID"? :)