# Clustering of Wikipedia Names

In [2]:
import csv
import html

from fog.clustering import key_collision

## Constants

In [3]:
# Maps a language name, as used in the `<language>_link` input columns,
# to the matching Wikipedia subdomain code.
LANGS = dict(
    english='en',
    french='fr',
    spanish='es',
    portuguese='pt',
    german='de',
    italian='it',
    swedish='sv',
)

## Helpers

In [19]:
def is_cluster_relevant(cluster):
    """
    Function returning whether a cluster is relevant and the set of found ids.

    A cluster will be deemed relevant if:
        1) It contains names from more than one single id.
        2) Its members do not disagree on their exact birth date, i.e. at
           most one distinct non-missing `exact_birth` value is found.

    Args:
        cluster (iterable of dict): Persons of the cluster. Each one must
            have an `id` and an `exact_birth` key (`None` when unknown).

    Returns:
        tuple: `(relevant, ids)` where `relevant` is a bool and `ids` is the
            set of distinct ids found in the cluster.

    """
    ids = set()
    exact_birth_dates = set()

    for person in cluster:
        ids.add(person['id'])

        exact_birth_date = person['exact_birth']

        # Unknown dates are ignored: they can neither confirm nor contradict.
        if exact_birth_date is not None:
            exact_birth_dates.add(exact_birth_date)

    # An empty cluster, or one describing a single id, holds no duplicate.
    if len(ids) < 2:
        return False, ids

    # Conflicting exact birth dates: the cluster is rejected.
    if len(exact_birth_dates) > 1:
        return False, ids

    return True, ids

In [13]:
def print_cluster(cluster, ids):
    """
    Print a plain-text summary of a cluster to stdout.

    A header line gives the number of ids and persons, followed by one
    indented line per person (name, id, birth, death, lang) and a final
    blank line.

    Args:
        cluster (list of dict): Persons of the cluster.
        ids (set): Distinct ids found in the cluster.

    """
    header = 'Found cluster containing %i ids and %i persons:' % (len(ids), len(cluster))
    print(header)

    for person in cluster:
        fields = (
            person['name'],
            person['id'],
            person['birth'],
            person['death'],
            person['lang'],
        )
        print('  %s (%i) (%s - %s) (%s)' % fields)

    print()

In [25]:
def print_cluster_html(cluster, ids):
    """
    Print an HTML fragment describing a cluster to stdout.

    The fragment is a `<div>` holding a summary paragraph and a list with one
    item per person, each linking to its Wikipedia page. The cluster is
    labelled "Low" confidence as soon as one person has no exact birth date,
    "High" otherwise.

    Values read from the input files (name, exact birth date) are
    HTML-escaped before being interpolated, so that characters such as
    `&`, `<` or `"` cannot break the markup or the `href` attribute.

    Args:
        cluster (list of dict): Persons of the cluster.
        ids (set): Distinct ids found in the cluster.

    """
    confidence = 'Low' if any(p['exact_birth'] is None for p in cluster) else 'High'

    print('<div>')
    print('  <p>')
    print('    (%s confidence) Found cluster containing %i ids and %i persons:' % (confidence, len(ids), len(cluster)))
    print('  </p>')

    print('  <ul>')
    for person in cluster:
        name = html.escape(str(person['name']))
        exact_birth = html.escape(str(person['exact_birth']))

        # Escaped with quotes included since it ends up in an attribute.
        link = html.escape('https://%s.wikipedia.org/wiki/%s' % (person['lang'], person['name']))

        print('    <li>%s (%i) (%s - %s) (%s) (%s) <u>(<a href="%s" target="_blank">link</a>)</u></li>' % (name, person['id'], person['birth'], person['death'], exact_birth, person['lang'], link))
    print('  </ul>')

    print('</div>')

## Processing data

In [22]:
INPUT = './clustering.csv'
BIRTH_DATE_INPUT = './birthdates.csv'
OUTPUT = './found.csv'
BIRTH_DATE_INDEX = {}
PERSONS = []

# Index the known exact birth dates by (lang code, page name).
# Files handed to the csv module are opened with newline='' so that the
# reader handles line endings (including ones embedded in quoted fields).
with open(BIRTH_DATE_INPUT, 'r', newline='') as f:
    reader = csv.DictReader(f)

    for line in reader:
        BIRTH_DATE_INDEX[(line['lang'], line['name'])] = line['birth_date']

# Each input row is expanded into one person record per language for which
# the row's `<language>_link` column is filled.
with open(INPUT, 'r', newline='') as f:
    reader = csv.DictReader(f)

    for line in reader:
        for lang, lang_code in LANGS.items():
            name = line['%s_link' % lang]

            if not name:
                continue

            person = {
                'id': int(line['id']),
                'lang': lang_code,
                'name': name,
                'birth': int(line['birth_min']) if line['birth_min'] else None,
                'death': int(line['death_min']) if line['death_min'] else None,
                # Missing or empty birth dates are both normalized to None.
                'exact_birth': BIRTH_DATE_INDEX.get((lang_code, name)) or None
            }

            PERSONS.append(person)

## Key collision with string normalization

In [26]:
def grouper(i):
    """
    Key function computing the clustering key of the i-th person.

    Args:
        i (int): Index of the person in `PERSONS`.

    Returns:
        tuple or None: `(birth, death, normalized name)` where the name is
            lowercased with hyphens replaced by underscores, or `None` when
            the person has no birth value.

    """
    person = PERSONS[i]
    birth = person['birth']

    if not birth:
        return None

    normalized_name = person['name'].lower().replace('-', '_')

    return (birth, person['death'], normalized_name)

# Cluster person indices by the key computed by `grouper`.
# NOTE(review): presumably key_collision groups items sharing a key and
# skips items whose key is None - confirm against the fog documentation.
clusters = key_collision(range(len(PERSONS)), key=grouper)

RELEVANT_CLUSTERS = 0

# The context manager guarantees the output file is closed even if an error
# occurs while processing a cluster; newline='' lets the csv writer control
# line endings itself.
with open(OUTPUT, 'w', newline='') as of:
    writer = csv.DictWriter(of, fieldnames=['lang', 'name'])
    writer.writeheader()

    for indices in clusters:
        # Resolve indices back to the person records.
        cluster = [PERSONS[i] for i in indices]
        relevant, ids = is_cluster_relevant(cluster)

        if not relevant:
            continue

        RELEVANT_CLUSTERS += 1

        print_cluster_html(cluster, ids)
        for person in cluster:
            writer.writerow({'lang': person['lang'], 'name': person['name']})

print('Found %i relevant clusters' % RELEVANT_CLUSTERS)

<div>
  <p>
    (High confidence) Found cluster containing 2 ids and 2 persons:
  </p>
  <ul>
    <li>Julio_Edson_Uribe (30507659) (1982 - None) (1982-05-09) (es) <u>(<a href="https://es.wikipedia.org/wiki/Julio_Edson_Uribe" target="_blank">link</a>)</u></li>
    <li>Julio_Edson_Uribe (3946320) (1982 - None) (1982-05-09) (it) <u>(<a href="https://it.wikipedia.org/wiki/Julio_Edson_Uribe" target="_blank">link</a>)</u></li>
  </ul>
</div>
<div>
  <p>
    (Low confidence) Found cluster containing 2 ids and 3 persons:
  </p>
  <ul>
    <li>%C3%89ric_Laurent (47693368) (1947 - None) (None) (en) <u>(<a href="https://en.wikipedia.org/wiki/%C3%89ric_Laurent" target="_blank">link</a>)</u></li>
    <li>%C3%89ric_Laurent (47693480) (1947 - None) (1947-03-04) (fr) <u>(<a href="https://fr.wikipedia.org/wiki/%C3%89ric_Laurent" target="_blank">link</a>)</u></li>
    <li>%C3%89ric_Laurent (47693480) (1947 - None) (1947-03-04) (it) <u>(<a href="https://it.wikipedia.org/wiki/%C3%89ric_Laurent" target="_b