In [1]:
import json
import os
import string
from difflib import SequenceMatcher

# Name tokens that stand in for a withheld or missing name. They must never
# count as evidence that two ancestors are the same person.
_PLACEHOLDER_NAME_TOKENS = frozenset({'Private', ''})


def _load_group_pedigrees(group):
    """Load every ``*.json`` file in ``pedigrees/<group>``, keyed by match name."""
    filepath = f'pedigrees/{group}'

    pedigrees = {}
    # Sorted so the result does not depend on the filesystem's listing order.
    for file in sorted(os.listdir(filepath)):
        # Skip stray entries (e.g. macOS ``.DS_Store``) that are not pedigrees.
        if not file.endswith('.json'):
            continue
        with open(f'{filepath}/{file}') as f:
            pedigrees[file.replace('.json', '')] = json.load(f)
    return pedigrees


def _name_tokens(raw_name, translator):
    """Return the title-cased words of a name; punctuation counts as a space."""
    if not isinstance(raw_name, str):
        # Missing or null names contribute no tokens and so can never match.
        return set()
    return set(raw_name.translate(translator).title().split(' '))


def _event_years(event, translator):
    """Return every 4-digit number in ``event['Date']`` as a set of ints."""
    if not isinstance(event, dict):
        return set()
    date = event.get('Date')
    if not isinstance(date, str):
        return set()
    return {int(token) for token in date.translate(translator).split(' ')
            if len(token) == 4 and token.isdigit()}


def _normalise_ancestor(record, translator):
    """Reduce one raw ancestor record to the token/year sets used for matching."""
    marriage_years = set()
    for field, value in record.items():
        # Any field whose name starts with 'Marriage' is treated as a marriage.
        if field.startswith('Marriage'):
            marriage_years |= _event_years(value, translator)

    return {
        'Forename': _name_tokens(record.get('Forename'), translator),
        'Surname': _name_tokens(record.get('Surname'), translator),
        'Birth': _event_years(record.get('Birth'), translator),
        'Death': _event_years(record.get('Death'), translator),
        # Collected for completeness; the matching below does not consult it.
        'Marriage': marriage_years,
    }


def _years_compatible(years_1, years_2, tolerance=10):
    """True if either side has no year, or some pair of years is within tolerance."""
    if not years_1 or not years_2:
        return True
    return any(abs(year_1 - year_2) <= tolerance
               for year_1 in years_1 for year_2 in years_2)


def _names_similar(tokens_1, tokens_2, threshold=.75):
    """True if some pair of real (non-placeholder) tokens is similar enough."""
    usable_1 = tokens_1 - _PLACEHOLDER_NAME_TOKENS
    usable_2 = tokens_2 - _PLACEHOLDER_NAME_TOKENS
    return any(SequenceMatcher(None, token_1, token_2).ratio() >= threshold
               for token_1 in usable_1 for token_2 in usable_2)


def _same_person(person_1, person_2):
    """Decide whether two normalised ancestors plausibly are the same person."""
    return (_years_compatible(person_1['Birth'], person_2['Birth'])
            and _years_compatible(person_1['Death'], person_2['Death'])
            and _names_similar(person_1['Surname'], person_2['Surname'])
            and _names_similar(person_1['Forename'], person_2['Forename']))


def get_common_ancestors(group, matches_to_ignore=[]):
    """Print clusters of look-alike ancestors shared between matches of a group.

    Every ``*.json`` file in ``pedigrees/<group>`` is one match; it maps an
    ancestor key to a record with ``Forename``, ``Surname`` and optional
    ``Birth`` / ``Death`` / ``Marriage...`` entries holding a ``Date`` string.

    Two ancestors from *different* matches are linked when

    * their birth years (if both are known) have a pair within 10 years,
    * their death years (if both are known) have a pair within 10 years,
    * some surname word pair and some forename word pair each reach a
      ``SequenceMatcher`` ratio of at least 0.75 (the placeholder words
      ``'Private'`` and ``''`` are ignored).

    Linked ancestors are merged into clusters, clusters that are contained
    in another cluster are dropped, and the rest are printed with the
    clusters spanning the most distinct matches first.

    Args:
        group: Name of the sub-folder of ``pedigrees/`` to analyse.
        matches_to_ignore: Match names that are not interesting on their
            own; a cluster is suppressed only when *all* of its members
            belong to these matches. The argument is only read.

    Returns:
        None. The result is written to standard output.
    """
    pedigrees = _load_group_pedigrees(group)

    # Built once: maps every ASCII punctuation character to a space.
    translator = str.maketrans(string.punctuation,
                               ' ' * len(string.punctuation))

    data = {
        match: {ancestor: _normalise_ancestor(record, translator)
                for ancestor, record in ancestors.items()}
        for match, ancestors in pedigrees.items()
    }

    # Link look-alike ancestors. Keys are (match, ancestor) tuples, so names
    # containing any particular separator text cannot corrupt them. Each
    # entry also contains itself.
    similar_matches = {}
    match_names = list(data)
    for position, match_1 in enumerate(match_names):
        # Only compare against later matches: each unordered pair once and
        # never a match against itself.
        later_matches = match_names[position + 1:]
        for ancestor_1, person_1 in data[match_1].items():
            for match_2 in later_matches:
                for ancestor_2, person_2 in data[match_2].items():
                    if not _same_person(person_1, person_2):
                        continue
                    key_1 = (match_1, ancestor_1)
                    key_2 = (match_2, ancestor_2)
                    similar_matches.setdefault(key_1, {key_1}).add(key_2)
                    similar_matches.setdefault(key_2, {key_2}).add(key_1)

    def has_any_date(key):
        person = data[key[0]][key[1]]
        return bool(person['Birth'] or person['Death'])

    def has_both_dates(key):
        person = data[key[0]][key[1]]
        return bool(person['Birth'] and person['Death'])

    # Merge neighbourhoods that share at least one fully dated ancestor.
    # Ancestors without any date are too weak to seed or extend a cluster.
    dated_keys = [key for key in similar_matches if has_any_date(key)]
    cleaned_similar_matches = {}
    for key_1 in dated_keys:
        cluster = set(similar_matches[key_1])
        for key_2 in dated_keys:
            shared = similar_matches[key_1] & similar_matches[key_2]
            if any(has_both_dates(member) for member in shared):
                cluster |= similar_matches[key_2]
        cleaned_similar_matches[key_1] = cluster

    # Drop every cluster that is contained in a cluster sorted after it
    # (smallest first), so that of several equal clusters exactly one is kept.
    by_size = sorted(cleaned_similar_matches,
                     key=lambda key: len(cleaned_similar_matches[key]))
    redundant = {
        key for position, key in enumerate(by_size)
        if any(cleaned_similar_matches[key] <= cleaned_similar_matches[other]
               for other in by_size[position + 1:])
    }
    clusters = {key: members
                for key, members in cleaned_similar_matches.items()
                if key not in redundant}

    def distinct_match_count(key):
        return len({match for match, _ in clusters[key]})

    for key in sorted(clusters, key=distinct_match_count, reverse=True):
        members = clusters[key]
        if all(match in matches_to_ignore for match, _ in members):
            continue

        print('COUNT', distinct_match_count(key))
        # Sorted so repeated runs print the members in the same order.
        for match, ancestor in sorted(members):
            print(match)
            print(ancestor, pedigrees[match][ancestor])
            print()
        print()
        print()
        print()

In [2]:
def search_by_location(group, location, matches_to_ignore=[]):
    """Print every ancestor in a group that has an event at ``location``.

    Each ``*.json`` file in ``pedigrees/<group>`` is loaded as one match. An
    ancestor is reported once for every field (for example ``Birth`` or
    ``Death``) whose ``Location`` string contains ``location`` as a
    case-sensitive substring.

    Args:
        group: Name of the sub-folder of ``pedigrees/`` to search.
        location: Text to look for inside ``Location`` values.
        matches_to_ignore: Match names to leave out. Only read.

    Returns:
        None. Hits are written to standard output as the match name, the
        full ancestor record and a blank line.
    """
    filepath = f'pedigrees/{group}'

    pedigrees = {}
    for file in os.listdir(filepath):
        # Skip stray entries (e.g. macOS ``.DS_Store``) that are not pedigrees.
        if not file.endswith('.json'):
            continue
        with open(f'{filepath}/{file}') as f:
            pedigrees[file.replace('.json', '')] = json.load(f)

    for match, ancestors in pedigrees.items():
        if match in matches_to_ignore:
            continue
        for record in ancestors.values():
            for value in record.values():
                # Only event dicts can carry a location. Plain strings
                # (names) and null events are skipped instead of crashing.
                if not isinstance(value, dict):
                    continue
                place = value.get('Location')
                if isinstance(place, str) and location in place:
                    print(match)
                    print(record)
                    print()

In [3]:
def search_by_surname(group, surname, matches_to_ignore=[]):
    """Print every ancestor in a group whose surname contains ``surname``.

    Each ``*.json`` file in ``pedigrees/<group>`` is loaded as one match.
    The comparison is a case-sensitive substring test against the record's
    ``Surname`` string; records without a string surname are skipped.

    Args:
        group: Name of the sub-folder of ``pedigrees/`` to search.
        surname: Text to look for inside ``Surname`` values.
        matches_to_ignore: Match names to leave out. Only read.

    Returns:
        None. Hits are written to standard output as the match name, the
        full ancestor record and a blank line.
    """
    filepath = f'pedigrees/{group}'

    pedigrees = {}
    for file in os.listdir(filepath):
        # Skip stray entries (e.g. macOS ``.DS_Store``) that are not pedigrees.
        if not file.endswith('.json'):
            continue
        with open(f'{filepath}/{file}') as f:
            pedigrees[file.replace('.json', '')] = json.load(f)

    for match, ancestors in pedigrees.items():
        if match in matches_to_ignore:
            continue
        for record in ancestors.values():
            family_name = record.get('Surname')
            # Same string-only rule as get_groups_with_surname.
            if isinstance(family_name, str) and surname in family_name:
                print(match)
                print(record)
                print()

In [4]:
def get_all_pedigrees():
    """Load every pedigree of every group under ``pedigrees/``.

    Each sub-folder of ``pedigrees/`` is a group, except ``tests``; each
    ``*.json`` file inside a group is one match. Anything else (plain files
    at the top level, ``.DS_Store`` and other non-JSON files inside a
    group) is ignored.

    Returns:
        dict: ``{group: {match: parsed_json}}`` where ``match`` is the file
        name with ``.json`` removed. A group folder without pedigree files
        maps to an empty dict.
    """
    filepath = 'pedigrees/'

    pedigrees = {}
    for group in sorted(os.listdir(filepath)):
        group_dir = os.path.join(filepath, group)
        # Only real folders are groups; listing a plain file would raise.
        if group in ['tests', '.DS_Store'] or not os.path.isdir(group_dir):
            continue
        pedigrees[group] = {}
        for file in sorted(os.listdir(group_dir)):
            # Skip stray entries (e.g. macOS ``.DS_Store``) inside the group.
            if not file.endswith('.json'):
                continue
            with open(os.path.join(group_dir, file)) as f:
                pedigrees[group][file.replace('.json', '')] = json.load(f)

    return pedigrees

In [5]:
def get_groups_with_location(location):
    """Return the groups in which some ancestor has an event at ``location``.

    A group qualifies when any field of any ancestor record is a dict whose
    ``Location`` value is a string containing ``location`` (case-sensitive
    substring test).

    Args:
        location: Text to look for inside ``Location`` values.

    Returns:
        set: Names of the qualifying groups.
    """
    pedigrees = get_all_pedigrees()

    def mentions_location(value):
        # Only event dicts can carry a location. Null events and plain
        # strings (names) are skipped instead of raising.
        if not isinstance(value, dict):
            return False
        place = value.get('Location')
        return isinstance(place, str) and location in place

    groups_with_location = set()
    for group, matches in pedigrees.items():
        # any() stops scanning a group at its first hit.
        if any(mentions_location(value)
               for ancestors in matches.values()
               for record in ancestors.values()
               for value in record.values()):
            groups_with_location.add(group)

    return groups_with_location

In [6]:
def get_groups_with_surname(surname):
    """Return the groups in which some ancestor's surname contains ``surname``.

    Only records whose ``Surname`` value is a string are considered; the
    comparison is a case-sensitive substring test.

    Args:
        surname: Text to look for inside ``Surname`` values.

    Returns:
        set: Names of the qualifying groups.
    """
    all_pedigrees = get_all_pedigrees()
    return {
        group_name
        for group_name, matches in all_pedigrees.items()
        for ancestors in matches.values()
        for record in ancestors.values()
        if 'Surname' in record
        and type(record['Surname']) is str
        and surname in record['Surname']
    }

In [11]:
def print_location_matches_by_group(location, matches_to_ignore=[]):
    """Print the location hits of every group that mentions ``location``.

    For each group returned by ``get_groups_with_location`` a
    ``GROUP: <NAME>`` header (upper-cased) and a blank line are printed,
    followed by the output of ``search_by_location`` and two blank lines.

    Args:
        location: Text to look for inside ``Location`` values.
        matches_to_ignore: Passed through unchanged to ``search_by_location``.
    """
    for group_name in get_groups_with_location(location):
        # Header plus the blank line that separates it from the hits.
        print('GROUP:', group_name.upper(), end='\n\n')
        search_by_location(group_name, location, matches_to_ignore)
        # Two blank lines between groups.
        print('\n')

In [13]:
def print_surname_matches_by_group(surname, matches_to_ignore=[]):
    """Print the surname hits of every group that contains ``surname``.

    For each group returned by ``get_groups_with_surname`` a
    ``GROUP: <NAME>`` header (upper-cased) and a blank line are printed,
    followed by the output of ``search_by_surname`` and two blank lines.

    Args:
        surname: Text to look for inside ``Surname`` values.
        matches_to_ignore: Passed through unchanged to ``search_by_surname``.
    """
    for group_name in get_groups_with_surname(surname):
        # Header plus the blank line that separates it from the hits.
        print('GROUP:', group_name.upper(), end='\n\n')
        search_by_surname(group_name, surname, matches_to_ignore)
        # Two blank lines between groups.
        print('\n')