In [1]:
import pandas as pd
import numpy as np
import time
import argparse
from itertools import compress

In [2]:
def group_families(data_path,data_delimiter=' ',ID_header_1='ID1',ID_header_2='ID2'):
    '''
    Group IDs that are linked, directly or through a chain of pairs, into families.

    Parameters:
    data_path      = Path (or file-like object) of a delimited table with one ID pair per row
    data_delimiter = Column separator passed to pandas.read_csv
    ID_header_1    = Header of the column holding the first ID of each pair
    ID_header_2    = Header of the column holding the second ID of each pair

    Returns:
    DataFrame with a single column 'family'; each cell is a list of the IDs in one family.
    Families are ordered by the first row that mentions them, and members are ordered
    by the row-by-row expansion described below. An ID is listed once per family.

    Method:
    The first unused row seeds a family. In each round every unused row that mentions
    an ID added in the previous round is consumed in file order and contributes its
    IDs (first column, then second column). A family is complete when a round consumes
    no rows. Rows are found through an ID -> row index built once, so the table is not
    re-scanned for every family.
    '''

    # Indicate current task
    print('\nGrouping IDs in the same family\n',end='')

    # Read input
    data = pd.read_csv(data_path,sep=data_delimiter)[[ID_header_1,ID_header_2]]
    first_ids = data[ID_header_1].tolist()
    second_ids = data[ID_header_2].tolist()

    # For progress calculation
    tic = time.time()
    total_length = len(first_ids)

    # Index every ID to the positions of the rows that mention it
    rows_by_id = {}
    for position,pair in enumerate(zip(first_ids,second_ids)):
        for item in pair:
            rows_by_id.setdefault(item,[]).append(position)

    # Repeat grouping until every pair has been exhausted
    family_list = []
    consumed = [False]*total_length
    consumed_count = 0
    for seed in range(total_length):
        if consumed[seed]:
            continue

        # Start a new family from the first row that is still unused
        consumed[seed] = True
        consumed_count += 1
        family = []
        members = set()
        for item in (first_ids[seed],second_ids[seed]):
            if item not in members:
                members.add(item)
                family.append(item)
        frontier = list(family)

        # Repeatedly search for new members of family
        while frontier:

            # Unused rows touching the newest members, processed in file order
            pending = sorted({position
                              for item in frontier
                              for position in rows_by_id[item]
                              if not consumed[position]})
            frontier = []

            # Update list of IDs for current working family
            for position in pending:
                consumed[position] = True
                consumed_count += 1
                for item in (first_ids[position],second_ids[position]):
                    if item not in members:
                        members.add(item)
                        family.append(item)
                        frontier.append(item)

        family_list.append(family)

        # Calculate and display progress
        toc = time.time()
        if toc-tic > 1:
            print(f'\rProgress: {consumed_count}/{total_length} Family Count: {len(family_list)}',end='')
            tic = time.time()

    # Indicate completed task
    print(f'\rProgress: {consumed_count}/{total_length} Family Count: {len(family_list)}',end='')
    print('\nDone',end='')
    return pd.DataFrame({'family':family_list})

In [3]:
def obtain_relations(data_path,data_delimiter,ID_header_1,ID_header_2,relation_header,family_data=None):

    '''
    Build one square relationship matrix per family.

    Relationship Key:
    S = Self
    D = Duplicate or Monozygotic Twin
    P = Parent
    F = First Sibling (Relation by Parents)
    2 = 2nd Sibling (Relation by Single Parent ~25% shared genetics)
    3 = 3rd Sibling (Relation by ~12.5% shared genetics)

    Parameters:
    data_path       = Path (or file-like object) of the delimited pair table
    data_delimiter  = Column separator passed to pandas.read_csv
    ID_header_1     = Header of the column holding the first ID of each pair
    ID_header_2     = Header of the column holding the second ID of each pair
    relation_header = Header of the column holding the relation label; only its
                      first character is stored in the matrix
    family_data     = DataFrame whose first column holds one list of IDs per row.
                      When omitted, the module-level variable named family_data
                      is used, which is what the function always relied on.

    Returns:
    DataFrame with a single column 'relation'; each cell is a (n,n) array of
    single characters for a family of n IDs, ordered like that family's ID list.
    The diagonal is 'S', pairs without a row in the table stay ' '.

    Raises:
    NameError if family_data is neither passed nor defined at module level.
    '''

    # Indicate current task
    print('\nObtaining relationships in family\n',end='')

    # Resolve the family table explicitly instead of silently reading a global
    if family_data is None:
        family_data = globals().get('family_data')
        if family_data is None:
            raise NameError("name 'family_data' is not defined; pass family_data explicitly")

    # Read input
    data = pd.read_csv(data_path,sep=data_delimiter)

    # For progress calculation
    tic = time.time()
    total_length = len(family_data)

    # Obtain relationship matrix and relation for each family
    relations = []
    processed = 0
    for members in family_data.iloc[:,0]:
        processed += 1

        # Generate empty matrix with correct dimensions
        dim = len(members)
        matrix = np.full((dim,dim),' ',dtype='U')

        # Fill diagonal
        for i in range(0,dim):
            matrix[i,i] = 'S'

        # Find all relevant relation information (both IDs belong to this family)
        in_family = data[ID_header_1].isin(members) & data[ID_header_2].isin(members)
        relation_data = data.loc[in_family,:]

        # Update matrix with proper values
        for first,second,label in zip(relation_data[ID_header_1],
                                      relation_data[ID_header_2],
                                      relation_data[relation_header]):
            ID1_index = members.index(first)
            ID2_index = members.index(second)
            relation = label[0]
            matrix[ID1_index,ID2_index] = relation
            matrix[ID2_index,ID1_index] = relation
        relations.append(matrix)

        # Calculate and display progress
        toc = time.time()
        if toc-tic > 1:
            print(f'\rProgress: {processed}/{total_length}',end='')
            tic = time.time()

    # Indicate completed task
    print(f'\rProgress: {processed}/{total_length}',end='')
    print('\nDone',end='')
    return pd.DataFrame({'relation':relations})

In [4]:
def match_pheno_data(family_data,
    pheno_path,pheno_delimiter,pheno_ID_header,pheno_header,
    covar_path,covar_delimiter,covar_ID_header,covar_header):
    '''
    Look up a phenotype value and a covariate value for every ID of every family.

    Parameters:
    family_data     = DataFrame whose first column holds one list of IDs per row
    pheno_path      = Path (or file-like object) of the delimited phenotype table
    pheno_delimiter = Column separator of the phenotype table
    pheno_ID_header = Header of the ID column in the phenotype table
    pheno_header    = Header of the phenotype value column
    covar_path      = Path (or file-like object) of the delimited covariate table
    covar_delimiter = Column separator of the covariate table
    covar_ID_header = Header of the ID column in the covariate table
    covar_header    = Header of the covariate value column

    Returns:
    DataFrame with columns 'phenotype' and 'covariate'; each cell is a list
    aligned with the ID list of the same row of family_data.

    Missing data is encoded as -9:
    - ID absent from the phenotype table: both values are -9 (counted as Missing ID)
    - ID absent from the covariate table only: covariate is -9 (counted as Missing ID)
    - value present but NA: that value is -9 (counted as Missing Value)
    When an ID occurs several times in the phenotype table a notice is printed
    and the first occurrence is used for both tables.
    '''

    def first_value_by_id(table,id_header,value_header):
        # Map each ID to the value of its first row. Values are taken from the
        # NumPy array so they keep the same scalar types a DataFrame.at lookup returns.
        lookup = {}
        for key,value in zip(table[id_header].tolist(),table[value_header].to_numpy()):
            lookup.setdefault(key,value)
        return lookup

    # Indicate current task
    print('\nObtaining phenotype info\n',end='')

    # Read phenotype file
    pheno_data = pd.read_csv(pheno_path,sep=pheno_delimiter)[[pheno_ID_header,pheno_header]]

    # Read covariate file
    covar_data = pd.read_csv(covar_path,sep=covar_delimiter)[[covar_ID_header,covar_header]]

    # Build the lookups once instead of scanning both tables for every ID
    pheno_lookup = first_value_by_id(pheno_data,pheno_ID_header,pheno_header)
    covar_lookup = first_value_by_id(covar_data,covar_ID_header,covar_header)
    pheno_IDs = pheno_data[pheno_ID_header]
    repeated_IDs = set(pheno_IDs[pheno_IDs.duplicated()].tolist())

    # For progress calculation
    tic = time.time()
    total_length = len(family_data)

    # List phenotype value for each ID in a family
    pheno = []
    covar = []
    missing_ID = 0
    missing_value = 0
    processed = 0
    for members in family_data.iloc[:,0]:
        processed += 1
        temp_pheno = []
        temp_covar = []
        for ID in members:

            # ID unknown to the phenotype table: nothing to report for it
            if ID not in pheno_lookup:
                temp_pheno.append(-9)
                temp_covar.append(-9)
                missing_ID += 1
                continue

            if ID in repeated_IDs:
                print('\nMultiple values found, using first\n',end='')

            # Add value of phenotype onto list
            phenotype = pheno_lookup[ID]
            if pd.isna(phenotype):
                phenotype = -9
                missing_value += 1

            # Add value of covariate onto list; the ID may be absent from this table
            if ID in covar_lookup:
                covariate = covar_lookup[ID]
                if pd.isna(covariate):
                    covariate = -9
                    missing_value += 1
            else:
                covariate = -9
                missing_ID += 1

            temp_pheno.append(phenotype)
            temp_covar.append(covariate)

        # Rejoin to main list
        pheno.append(temp_pheno)
        covar.append(temp_covar)

        # Calculate and display progress
        toc = time.time()
        if toc-tic > 1:
            print(f'\rProgress: {processed}/{total_length} Missing ID: {missing_ID} Missing Value: {missing_value}',end='')
            tic = time.time()

    # Indicate completed task
    print(f'\rProgress: {processed}/{total_length} Missing ID: {missing_ID} Missing Value: {missing_value}',end='')
    print('\nDone',end='')
    return pd.DataFrame({'phenotype':pheno,'covariate':covar})

In [5]:
def choose_ID(family_data,pheno_data):
    '''
    Choose one individual to keep per family and list all others to ignore.

    Parameters:
    family_data = DataFrame whose first column holds one list of IDs per row
    pheno_data  = DataFrame sharing the index of family_data; its first column
                  holds the list of phenotype values and its second column the
                  list of covariate values, both aligned with the ID list

    Returns:
    List of two DataFrames: [0] has column 'Chosen_IID' with the kept ID of each
    family, [1] has column 'Ignore_IID' with every other family member.

    Selection rule:
    The ID with the highest phenotype value is kept. When several IDs share that
    value, the one with the highest covariate value among them is kept (the first
    one if they still tie). family_data is only read, never modified.
    '''

    # Indicate current task
    print('\nChoosing individual\n',end='')

    # For progress calculation
    tic = time.time()
    total_length = len(pheno_data)

    # Iterate through dataframe
    chosen = []
    ignore = []
    processed = 0
    for index,row in pheno_data.iterrows():
        processed += 1
        members = list(family_data.loc[index].iloc[0])
        pheno_values = list(row.iloc[0])
        covar_values = list(row.iloc[1])

        # Positions holding the highest phenotype value; compared element by
        # element so plain Python numbers work as well as NumPy scalars
        top_pheno = max(pheno_values) if pheno_values else None
        candidates = [i for i,value in enumerate(pheno_values) if value == top_pheno]

        if candidates:
            if len(candidates) > 1:
                # Break the tie with the highest covariate among the candidates
                top_covar = max(covar_values[i] for i in candidates)
                keep_position = next((i for i in candidates if covar_values[i] == top_covar),candidates[0])
            else:
                keep_position = candidates[0]

            # Keep exactly one ID; every other member of the family is ignored
            keep_ID = members[keep_position]
            chosen.append(keep_ID)
            ignore.extend(item for item in members if item != keep_ID)
        else:
            # Only reachable for an empty family or a maximum that equals nothing (NaN)
            print(pheno_values)
            print('Something went REALLY wrong.')

        # Calculate and display progress
        toc = time.time()
        if toc-tic > 1:
            print(f'\rProgress: {processed}/{total_length}',end='')
            tic = time.time()

    # Indicate completed task
    print(f'\rProgress: {processed}/{total_length}',end='')
    print('\nDone',end='')
    return [pd.DataFrame({'Chosen_IID':chosen}),pd.DataFrame({'Ignore_IID':ignore})]

In [6]:
# Run configuration: file locations, delimiters and column headers handed to
# the functions defined in the cells above.

# Pair table read by group_families and obtain_relations
data_path = '../intermediate_files/king_degree_2.kin0.reduced'
data_delimiter = ' '
# Headers of the two ID columns and of the relation label column in the pair table
ID_header_1 = 'ID1'
ID_header_2 = 'ID2'
relation_header = 'InfType'
# Phenotype table read by match_pheno_data: path, delimiter, ID column, value column
pheno_path = '../../regenie/regenie_input/PD_ltfh.txt'
pheno_delimiter = ' '
pheno_ID_header = 'IID'
pheno_header = 'ltfh'
# Covariate table read by match_pheno_data: path, delimiter, ID column, value column
covar_path = '../../regenie/regenie_input/PD_covar.txt'
covar_delimiter = ' '
covar_ID_header = 'IID'
covar_header = 'age'

In [7]:
# Group the ID pairs of the pair table into families (one list of IDs per row)
family_data = group_families(data_path,data_delimiter,ID_header_1,ID_header_2)


Grouping IDs in the same family
Progress: 39438/39438 Family Count: 31115
Done

In [8]:
# Build one relationship matrix per family; obtain_relations reads the
# family_data variable assigned in the previous cell, so that cell must run first
relation_data = obtain_relations(data_path,data_delimiter,ID_header_1,ID_header_2,relation_header)


Obtaining relationships in family
Progress: 31115/31115
Done

In [9]:
# Collect the phenotype and covariate value of every family member
# (missing entries are encoded as -9 by match_pheno_data)
pheno_data = match_pheno_data(family_data,
    pheno_path,pheno_delimiter,pheno_ID_header,pheno_header,
    covar_path,covar_delimiter,covar_ID_header,covar_header)


Obtaining phenotype info
Progress: 31115/31115 Missing ID: 28 Missing Value: 1
Done

In [10]:
# Pick the individual to keep per family; ID_data is [chosen IDs, ignored IDs]
ID_data = choose_ID(family_data,pheno_data)


Choosing individual
Progress: 31115/31115
Done

In [11]:
# Write the chosen IDs, one per line, without header or index column
ID_data[0].to_csv('../keep_IDs.txt',sep=' ',index=False,header=False)

In [12]:
# Write the ignored IDs, one per line, without header or index column
ID_data[1].to_csv('../remove_IDs.txt',sep=' ',index=False,header=False)