In [13]:
# Fetch (or create, when missing) the canonical contributor and the misspelled
# duplicate; get_or_create returns an (object, created) pair, so keep item 0.
mari, juksemari = (
    Contributor.objects.get_or_create(display_name=display_name)[0]
    for display_name in ("Mari Lund Haanshus", "Mari Lund Hånshus")
)

print(mari, juksemari)

Mari Lund Haanshus Mari Lund Hånshus


In [14]:
from django.db import transaction
from django.db.models import get_models, Model
from django.contrib.contenttypes.generic import GenericForeignKey

@transaction.atomic
def merge_model_objects(primary_object, alias_objects=(), keep_old=False):
    """
    Merge duplicate model objects (i.e. Users, Organizations, Polls, etc.)
    into ``primary_object`` and migrate all of the related fields from the
    alias objects to the primary object.

    The whole merge is wrapped in ``transaction.atomic``.

    Args:
        primary_object: the model instance that survives the merge.
        alias_objects: a single duplicate instance, or a list/tuple of
            duplicates. An entry that is the primary object itself (same
            instance or same primary key) is ignored instead of being
            merged into itself and deleted.
        keep_old: when true the duplicates are kept after their relations
            have been moved; otherwise each duplicate is deleted.

    Returns:
        ``primary_object`` after it has been saved. Local fields that were
        blank (``None`` or ``''``) on the primary object are filled with the
        first non-blank value found among the duplicates.

    Raises:
        TypeError: if ``primary_object`` is not a ``Model`` instance, or an
            alias is not an instance of the primary object's class.

    Usage:
    from django.contrib.auth.models import User
    primary_user = User.objects.get(email='good_email@example.com')
    duplicate_user = User.objects.get(email='good_email+duplicate@example.com')
    merge_model_objects(primary_user, duplicate_user)
    """
    # Local import: only needed for the reverse one-to-one case below.
    from django.core.exceptions import ObjectDoesNotExist

    # A bare instance is accepted as shorthand for a one-element list.
    if not isinstance(alias_objects, (list, tuple)):
        alias_objects = [alias_objects]

    # check that all aliases are the same class as primary one and that
    # they are subclass of model
    primary_class = primary_object.__class__

    if not issubclass(primary_class, Model):
        raise TypeError('Only django.db.models.Model subclasses can be merged')

    for alias_object in alias_objects:
        if not isinstance(alias_object, primary_class):
            raise TypeError('Only models of same class can be merged')

    # Never treat the primary object as its own duplicate: its relations
    # would be re-pointed at itself and the row would then be deleted.
    primary_pk = primary_object._get_pk_val()
    alias_objects = [
        alias_object for alias_object in alias_objects
        if alias_object is not primary_object
        and (primary_pk is None or alias_object._get_pk_val() != primary_pk)
    ]

    # Get a list of all GenericForeignKeys in all models
    # TODO: this is a bit of a hack, since the generics framework should provide a similar
    # method to the ForeignKey field for accessing the generic related fields.
    generic_fields = [
        attribute
        for model in get_models()
        for attribute in model.__dict__.values()
        if isinstance(attribute, GenericForeignKey)
    ]

    # Names of the primary object's own columns that currently hold no value.
    blank_local_fields = set(
        field.attname
        for field in primary_object._meta.local_fields
        if getattr(primary_object, field.attname) in [None, '']
    )

    # Loop through all alias objects and migrate their data to the primary object.
    for alias_object in alias_objects:
        # Migrate all foreign key references from alias object to primary object.
        for related_object in alias_object._meta.get_all_related_objects():
            # The variable name on the alias_object model.
            alias_varname = related_object.get_accessor_name()
            # The variable name on the related model.
            obj_varname = related_object.field.name
            try:
                related = getattr(alias_object, alias_varname)
            except ObjectDoesNotExist:
                # Reverse one-to-one relation with nothing attached to
                # this alias: there is nothing to migrate.
                continue
            if isinstance(related, Model):
                # Reverse one-to-one accessor yields the instance itself.
                referring_objects = [related]
            else:
                # Reverse foreign key accessor yields a manager. Materialise
                # the rows first because the loop changes the relation.
                referring_objects = list(related.all())
            for obj in referring_objects:
                setattr(obj, obj_varname, primary_object)
                obj.save()

        # Migrate all many to many references from alias object to primary object.
        for related_many_object in alias_object._meta.get_all_related_many_to_many_objects():
            alias_varname = related_many_object.get_accessor_name()
            obj_varname = related_many_object.field.name

            if alias_varname is not None:
                # standard case
                related_manager = getattr(alias_object, alias_varname)
            else:
                # special case, symmetrical relation, no reverse accessor
                related_manager = getattr(alias_object, obj_varname)
            # Materialise before mutating the relation that is being read.
            for obj in list(related_manager.all()):
                getattr(obj, obj_varname).remove(alias_object)
                getattr(obj, obj_varname).add(primary_object)

        # Migrate all generic foreign key references from alias object to primary object.
        for field in generic_fields:
            filter_kwargs = {
                field.fk_field: alias_object._get_pk_val(),
                field.ct_field: field.get_content_type(alias_object),
            }
            for generic_related_object in field.model.objects.filter(**filter_kwargs):
                setattr(generic_related_object, field.name, primary_object)
                generic_related_object.save()

        # Try to fill all missing values in primary object by values of duplicates
        filled_up = set()
        for field_name in blank_local_fields:
            val = getattr(alias_object, field_name)
            if val not in [None, '']:
                setattr(primary_object, field_name, val)
                filled_up.add(field_name)
        # A field filled by this alias is no longer a candidate for later ones.
        blank_local_fields -= filled_up

        if not keep_old:
            alias_object.delete()
    primary_object.save()
    return primary_object

# Fold the duplicate into the canonical contributor. keep_old is left at its
# default (False), so `juksemari` is deleted once its relations have moved.
# The returned primary object is the cell's displayed value.
merge_model_objects(mari, juksemari)



<Contributor: Mari Lund Haanshus>

In [None]:
from fuzzywuzzy import process
from django.db.models import Count
# Contributors annotated with their byline count, highest count first.
contributors = (
    Contributor.objects
    .annotate(num_bylines=Count('byline'))
    .order_by('-num_bylines')
)
# Flat sequence of every display name; used as the fuzzy-matching choices.
everyone = Contributor.objects.values_list('display_name', flat=True)

In [18]:

# List each contributor's display name, in the order produced by the
# `contributors` queryset (sorted by descending byline count where defined).
for person in contributors:
    print(person.display_name)

Ketil Blom
Nomen Nescio
Brian Olguin
Hans Dalane-Hval
Skjalg Vold
Robin Røkke Johansen
Anders R. Christensen
Stéphane Lelarge
Kristian Ridder–Nielsen
Helle Gannestad
Heljar Havnes
Leif Martin Kirknes
Michael Brøndbo
Filip Roshauw
Aslak Johannessen
Åshild Støylen
Christine Skogen Nyhagen
Geir Molnes
Dag Stian Husby
Hans J. Skjong
Øyvind Bosnes Engen
Petter Fløttum
Åshild Bekke Eidem
Tanja Christiansen
Marius Nergård Pettersen
Svein Egil Hatlevik
Marianne Granheim Trøyflat
Kjetil Strømme
Ellinor Bente Dalbye
Anders Schiøtz Worren
Anna Young
Helena Nielsen
Peter Vollset
Henrik Evertsson
Audun Halaas
Mari Lund Haanshus
Anne Ogundipe
Thomas Karlsen
Anne B. Viken
Espen A. Eik
Jana Kristensen
Axel M. K. Hærland
Ellen Sofie Lauritzen
Egil Ellenes
Patrick Da Silva Sæther
Silje Bekeng
Eirik Omvik
Rolv Christian Topdahl
Anders Rikstad
Kristoffer Hatteland Endresen
Tonje T. Larsen
Unni Claussen
Andreas Slettholm
Richard Eriksen
Magnus Newth
Jørgen Brynhildsvoll
Halvor Finess Tretvoll
Solveig Nygaa

In [15]:
# For the first 100 contributors of the queryset, look up the three display
# names in `everyone` that fuzzywuzzy scores as closest to their own name.
for contributor in contributors[:100]:
    # Each result is a list of (name, score) pairs; the contributor's own
    # name is itself one of the choices, so it shows up with score 100.
    best = process.extract(contributor.display_name, everyone, limit=3)
    print(best)
    

[('Ketil Blom', 100), ('Universitas’ Matpanel. Foto: Ketil Blom', 90), ('Melike Leblebicio?Lu Ketil Blom', 90)]
[('Nomen Nescio', 100), ('Sbio', 67), ('Noen', 67)]
[('Brian Olguin', 100), ('Adrian Plau', 61), ('«Ine»', 59)]
[('Hans Dalane-Hval', 100), ('Photo Hans Dalane Hval', 95), ('Olav Hval', 85)]
[('Skjalg Vold', 100), ('Tia Karlsen Skjalg Bøhmer Vold', 85), ('Tone Vold-Johansen', 85)]
[('Robin Røkke Johansen', 100), ('Robin Sande', 85), ('Finn-Eirik Johansen', 74)]
[('Anders R. Christensen', 100), ('Kaare R. Norum', 85), ('Anders Moe', 85)]
[('Kristian Ridder–Nielsen', 100), ('Kristian', 90), ('Harriet Bjerrum Nielsen Kollegiemedlem', 85)]
[('Stéphane Lelarge', 100), ('Heidi Thaulow Harto Stephane Lelarge', 85), ('Ane Hem', 63)]
[('Helle Gannestad', 100), ('Solveig N. Langvad Helle Gannestad', 90), ('Ellen Engelstad', 73)]
[('Heljar Havnes', 100), ('Anton Havnes', 64), ('Aksel Kjær Vidnes', 62)]
[('Leif Martin Kirknes', 100), ('Leif Is Leif', 85), ('Leif Lømo', 85)]
[('Michael Br