# Summarize the genealogies of individuals

Compute simple statistics for each FarGen participant.

## Setup

Connect to the Neo4j DB.

In [91]:
from neo4j import GraphDatabase

# Bolt URL of the Neo4j database where the genealogy is stored.
neo4j_url = 'bolt://aebs-db:7687'

# Create the driver and open a single session; `session` is the object the
# query cells below call `.run()` on.
# NOTE(review): neither `session` nor `driver` is closed anywhere in the
# visible part of this notebook -- confirm whether a cleanup cell is needed.
driver = GraphDatabase.driver(neo4j_url)
session = driver.session()

Get the samplenames from a CSV file.

In [156]:
# CSV file with a header line followed by one `RIN,samplename` pair per line.
samplenames_fn = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/genealogy/fargen_rin_samplename.csv'

sample_list = []  # List of sample names.
rin_list = []  # List of RIN (genealogy) IDs.
output_data = dict()  # Data to write to CSV will be stored in this dictionary.

with open(samplenames_fn) as fid:
    # Discard the header. Using `next` instead of assigning to `_` avoids
    # rebinding the name IPython uses for the previous cell output.
    next(fid, None)
    for line in fid:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. an empty line at the end of the file);
            # they cannot be unpacked into the two expected fields.
            continue
        # Get the RIN and samplename from this line.
        rin, sample = line.split(',')
        rin_list.append(rin)
        # NOTE(review): if a RIN occurs on more than one line, only the last
        # sample name is kept here, while `rin_list` keeps every occurrence.
        output_data[rin] = {'sample': sample}
        sample_list.append(sample)

In [157]:
# Report how many RIN/sample lines were read from the CSV file.
print('Number of samples: {n}'.format(n=len(rin_list)))

Number of samples: 1534


Look up all probands in the database.

In [20]:
# Sanity check: ask the database for a Person node matching every RIN in the
# list and compare the number of rows returned with the number of RINs.
proband_query = 'UNWIND $inds AS x MATCH (proband:Person {ind: x}) RETURN proband.ind'
result = session.run(proband_query, inds=rin_list)
values = result.values()

n_found = len(values)
n_requested = len(rin_list)
print('Found {n} out of {m} probands.'.format(n=n_found, m=n_requested))

Found 1534 out of 1534 probands.


## Genealogy size

Below we compute the size of the genealogy. We calculate both the total number of individuals in each proband's tree and the number of distinct individuals (an ancestor may occur in multiple lineages). The proband is included in both numbers, so a proband that is also a founder will have a genealogy size of 1.

In [207]:
%%time
# For every RIN, follow `is_child` relationships zero or more steps upwards
# (`*0..` also matches the proband itself) and return, grouped by RIN, the
# number of matched end nodes and the number of distinct end nodes.
gen_size_query = 'UNWIND $inds AS x MATCH (p:Person {ind: x})-[:is_child*0..]->(a) RETURN x, count(a), count(DISTINCT a)'
results = session.run(gen_size_query, inds=rin_list)
gen_sizes = results.values()

CPU times: user 75.6 ms, sys: 0 ns, total: 75.6 ms
Wall time: 1min 10s


In [206]:
# Report how many rows the genealogy-size query returned.
print('Calculated the genealogy sizes for {n} probands.'.format(n=len(gen_sizes)))

Calculated the genealogy sizes for 1512 probands.


In [190]:
# Attach both genealogy sizes to each proband's entry in `output_data`.
for rin, n_total, n_distinct in gen_sizes:
    output_data[rin].update({'gen_size': n_total, 'gen_size_unique': n_distinct})

## Lineage lengths

For each proband, calculate the length of all paths to all founders. Note that probands with no ancestors in the genealogy will not yield a result here.

In [194]:
%%time
# Each returned row describes one path from a proband to a `Founder` node:
# the proband's RIN, the founder's `ind`, and the length of that path.
lineage_query = 'UNWIND $inds AS x MATCH path = (p:Person {ind: x})-[:is_child*]->(a:Founder) RETURN x, a.ind, length(path)'
results = session.run(lineage_query, inds=rin_list)
values = results.values()

CPU times: user 8min 32s, sys: 32.9 s, total: 9min 5s
Wall time: 11min 3s


The result is a list where each item is a path from one particular proband to one particular founder. Convert this into a dict with one entry per proband, holding the list of that proband's paths.

In [195]:
# Group the [founder, path length] pairs by proband.
path_sum_dict = {}
for proband, founder, n_steps in values:
    # `setdefault` creates the empty list the first time a proband is seen.
    path_sum_dict.setdefault(proband, []).append([founder, n_steps])

Calculate the shortest and longest paths and add this information to the `output_data` dictionary.

In [196]:
# Reduce each proband's list of [founder, path length] pairs to its shortest
# and longest path length and store both on the proband's `output_data` entry.
for rin, path_list in path_sum_dict.items():
    # Only the lengths are needed; the founder IDs are not used here.
    path_lengths = [path_len for founder, path_len in path_list]

    output_data[rin]['longest_lineage'] = max(path_lengths)
    output_data[rin]['shortest_lineage'] = min(path_lengths)

## Write results to CSV

In [214]:
# Destination of the per-individual summary CSV.
csv_path = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/genealogy/individuals_summary.csv'
with open(csv_path, 'w') as fid:
    # Column names, in the same order as the fields of every row.
    header = 'ind,sample,gen_size,gen_size_unique,shortest_lineage,longest_lineage'
    row_template = '{ind},{sample},{gen_size},{gen_size_unique},{shortest_lineage},{longest_lineage}'
    fid.write(header + '\n')

    # One row per individual stored in `output_data`.
    for ind, record in output_data.items():
        # Lineage lengths are absent for probands with no ancestors in the
        # genealogy; write those as zero.
        shortest_lineage = record.get('shortest_lineage')
        if shortest_lineage is None:
            shortest_lineage = 0
        longest_lineage = record.get('longest_lineage')
        if longest_lineage is None:
            longest_lineage = 0

        # The remaining fields are written as stored; a missing value is
        # rendered as the text "None".
        fid.write(row_template.format(
            ind=ind,
            sample=record.get('sample'),
            gen_size=record.get('gen_size'),
            gen_size_unique=record.get('gen_size_unique'),
            shortest_lineage=shortest_lineage,
            longest_lineage=longest_lineage,
        ) + '\n')