# Check Related Samples
- **Author(s)** - Frank Grenn
- **Quick Description:** Use chr1 genotypes to identify related male samples, then check whether the samples in each related pair are in the same chrY haplogroup. Also check the results from the KING tool.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Base directory; the two paths below and the .genome file read later are built from it.
WRKDIR = "/PATH/chrY"
# Directory holding the PLINK .fam file that is read for the sample list.
BFILEDIR = f"{WRKDIR}/y_male_only_bfiles"
# Directory the yhaplo haplogroup table is read from and the mismatch CSV is written to.
OUTDIR = f"{WRKDIR}/output_male_hemizygous_only_het_filter_run"



Prune SNPs   
`plink --bfile /PATH/chr1 --indep-pairwise 1000 10 0.02 --out /PATH/pruned_chr1_data`   
Extract pruned SNPs  
`plink --bfile /PATH/chr1 --extract /PATH/pruned_chr1_data.prune.in --make-bed --out /PATH/chr1_pruned`    
MAF filter and relatedness calculations  
`plink --bfile /PATH/chr1_pruned --maf 0.05 --genome --min 0.2 --out /PATH/chr1_pruned_genome`    


In [None]:
# Read the PLINK .genome relatedness table.
# The separator is a raw-string regex: '\s' inside a normal string literal is an
# invalid escape sequence and triggers a SyntaxWarning on recent Python versions.
# The ID columns are read as str so the '+' concatenation below cannot fail on
# IDs that happen to look numeric.
related = pd.read_table(
    f"{WRKDIR}/chr1_pruned_genome.genome",
    sep=r'\s+',
    dtype={'FID1': str, 'IID1': str, 'FID2': str, 'IID2': str},
)
# "<FID>_<IID>" keys, used later as merge keys against the haplogroup table's id column.
related['ID1_double'] = related['FID1'] + '_' + related['IID1']
related['ID2_double'] = related['FID2'] + '_' + related['IID2']
print(related.shape)#(2023323, 16)
print(related.head())

In [None]:
# Distinct values found in the RT column of the .genome table
# (presumably PLINK's relationship-type field -- confirm against the PLINK docs).
set(related['RT'])

In [None]:
# Read the sample list (.fam) of the chrY male-only PLINK fileset.
# The file has no header row. Split on *runs* of whitespace (r"\s+"): the
# previous single-character "\s" separator yields empty extra columns whenever
# fields are separated by more than one space, and was also a non-raw string
# containing an invalid escape sequence.
samples = pd.read_csv(
    f"{BFILEDIR}/chrY_male_hemizygous_only_het_filter_hg19_final.fam",
    sep=r"\s+",
    header=None,
)
# Name the six columns (fails loudly if the file does not have exactly six).
samples.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(samples.shape)
print(samples.head())

In [None]:
# Load the covariates CSV; its 'ID' and 'SEX' columns are used in later cells.
meta = pd.read_csv("/PATH/AMPPD_releasev2_covariates_Feb2021.csv")
# Shape first, then the first rows, each on its own line.
print(meta.shape, meta.head(), sep="\n")

In [None]:
# Inner-join the .fam sample list to the covariates (fid <-> ID); only samples
# present in both tables are kept.
meta_merge = samples.merge(meta, left_on='fid', right_on='ID')
print(meta_merge.shape)

In [None]:
# IDs of the merged samples whose covariate SEX code equals 1.
males = meta_merge.loc[meta_merge['SEX'] == 1, 'ID'].tolist()
print(len(males))

In [None]:
# Build "<ID>_<ID>" strings for every male ID; the haplogroup table's id column
# is filtered against these in a later cell.
males_double_id = ["_".join((sample_id, sample_id)) for sample_id in males]
print(len(males_double_id))
print(males_double_id[:10])

In [None]:
# Keep only pairs in which both members are in the male ID list.
# The shape is printed before filtering and after each side is filtered.
print(related.shape)
for fid_col in ('FID1', 'FID2'):
    related = related[related[fid_col].isin(males)]
    print(related.shape)

In [None]:
# Read the yhaplo haplogroup calls (headerless, whitespace-delimited).
# The separator regex is a raw string: "\s" in a normal string literal is an
# invalid escape sequence and triggers a SyntaxWarning on recent Python versions.
yhaplo = pd.read_csv(
    f"{OUTDIR}/output_yhaplo/haplogroups.chrY_male_hemizygous_only_het_filter_hg19_final.txt",
    sep=r"\s+",
    header=None,
)
yhaplo.columns = ['id', 'haplo_short', 'haplo_short_rep_snp', 'haplo_long']
print(yhaplo.shape)
print(yhaplo.head())

In [None]:
# A long-form haplogroup of exactly "A" is treated as "no haplogroup assigned",
# so those samples are dropped here.
yhaplo_pass = yhaplo.loc[yhaplo['haplo_long'].ne('A')]
print(yhaplo_pass.shape, yhaplo_pass.head(), sep="\n")

In [None]:
# Restrict the haplogroup calls to male samples.
# .copy() makes yhaplo_male an independent frame, so adding a column below does
# not raise SettingWithCopyWarning (assignment on a filtered slice).
yhaplo_male = yhaplo_pass[yhaplo_pass['id'].isin(males_double_id)].copy()
# Major haplogroup = first character of the long-form haplogroup string.
yhaplo_male['haplo_major'] = yhaplo_male['haplo_long'].str[0]
print(yhaplo_male.shape)
print(yhaplo_male.head())

In [None]:
# Attach the major haplogroup of each pair member to the relatedness table.
# Left joins keep every related pair; a member without a haplogroup call gets NaN.

# Lookup table for the first member of each pair.
id1_haplo_df = yhaplo_male[['id', 'haplo_major']].rename(
    columns={'id': 'id1', 'haplo_major': 'id1_haplo_major'}
)

print(related.shape)
related_1_haplo = related.merge(
    id1_haplo_df, how='left', left_on='ID1_double', right_on='id1'
)
print(related_1_haplo.shape)

# Lookup table for the second member of each pair.
id2_haplo_df = yhaplo_male[['id', 'haplo_major']].rename(
    columns={'id': 'id2', 'haplo_major': 'id2_haplo_major'}
)

related_haplos = related_1_haplo.merge(
    id2_haplo_df, how='left', left_on='ID2_double', right_on='id2'
)
print(related_haplos.shape)#(659923, 20)

# The helper key columns are no longer needed once both joins are done.
related_haplos = related_haplos.drop(columns=['ID1_double', 'ID2_double', 'id1', 'id2'])

In [None]:
# Preview the first five rows of the pair table with haplogroups attached.
print(related_haplos.head(n=5))

In [None]:
# Label each related pair by whether both members share a major haplogroup.
# Start with every pair as 'diff', then overwrite the matching ones in a single
# .loc[rows, column] assignment. The earlier chained form
# (df['col'].loc[mask] = ...) writes to a temporary Series and is a silent
# no-op under pandas Copy-on-Write.
# NOTE(review): the haplogroups were attached with left joins, so a pair with a
# missing (NaN) haplogroup compares unequal and stays 'diff'.
related_haplos['haplo_diff'] = 'diff'
related_haplos.loc[
    related_haplos['id1_haplo_major'] == related_haplos['id2_haplo_major'],
    'haplo_diff',
] = 'same'

In [None]:
# Summarise the pairs whose members share a major haplogroup:
# count (shape), then the first and last rows.
same_pairs = related_haplos[related_haplos['haplo_diff'] == 'same']
print(same_pairs.shape)
print(same_pairs.head())
print(same_pairs.tail())

In [None]:
# Display the pairs labelled as having different major haplogroups.
related_haplos.loc[related_haplos['haplo_diff'].eq('diff')]

In [None]:
# Display the mismatching pairs without the label column and the IID columns.
related_haplos[
    related_haplos['id1_haplo_major'].ne(related_haplos['id2_haplo_major'])
].drop(columns=['haplo_diff', 'IID1', 'IID2'])

In [None]:
# Write the pairs whose major haplogroups differ to CSV, without the label column.
# index=False is the documented way to omit the row index; the previous
# index=None only worked because None happens to be falsy.
# NOTE(review): a pair with a missing (NaN) haplogroup also satisfies '!=' and
# is therefore written out as a mismatch.
related_haplos.loc[
    related_haplos['id1_haplo_major'] != related_haplos['id2_haplo_major']
].drop(columns=['haplo_diff']).to_csv(f"{OUTDIR}/related_mismatches.csv", index=False)

## Check results from King tool

In [None]:
# Load the KING kinship table and add "<ID>_<ID>" keys for both pair members;
# these are the merge keys against the haplogroup table's id column.
king = pd.read_table("/PATH/king_all_chr.kin0")
for id_col in ('ID1', 'ID2'):
    king[id_col + '_double'] = king[id_col] + '_' + king[id_col]
print(king.shape, king.head(), sep="\n")

In [None]:
# Keep only KING pairs in which both members are in the male ID list.
# The shape is printed before filtering and after each side is filtered.
print(king.shape)
for id_col in ('ID1', 'ID2'):
    king = king[king[id_col].isin(males)]
    print(king.shape)

In [None]:
# Lowest KINSHIP value in the king table at this point; presumably a quick
# check of the kinship cutoff the table was generated with -- confirm.
min(king.KINSHIP)

In [None]:
# Attach the major haplogroup of each pair member to the KING table.
# The default inner join is used here, so pairs where either member has no
# haplogroup call are dropped.

# Lookup table for the first member of each pair.
id1_haplo_df = yhaplo_male[['id', 'haplo_major']].rename(
    columns={'id': 'id1', 'haplo_major': 'id1_haplo_major'}
)

print(king.shape)
related_1_haplo = king.merge(id1_haplo_df, left_on='ID1_double', right_on='id1')
print(related_1_haplo.shape)

# Lookup table for the second member of each pair.
id2_haplo_df = yhaplo_male[['id', 'haplo_major']].rename(
    columns={'id': 'id2', 'haplo_major': 'id2_haplo_major'}
)

related_haplos = related_1_haplo.merge(id2_haplo_df, left_on='ID2_double', right_on='id2')
print(related_haplos.shape)

# The helper key columns are no longer needed once both joins are done.
related_haplos = related_haplos.drop(columns=['ID1_double', 'ID2_double', 'id1', 'id2'])

In [None]:
# Label each KING pair by whether both members share a major haplogroup.
# Start with every pair as 'diff', then overwrite the matching ones in a single
# .loc[rows, column] assignment. The earlier chained form
# (df['col'].loc[mask] = ...) writes to a temporary Series and is a silent
# no-op under pandas Copy-on-Write.
related_haplos['haplo_diff'] = 'diff'
related_haplos.loc[
    related_haplos['id1_haplo_major'] == related_haplos['id2_haplo_major'],
    'haplo_diff',
] = 'same'

In [None]:
# Summarise the KING pairs whose members share a major haplogroup:
# count (shape), then the first and last rows.
same_pairs = related_haplos[related_haplos['haplo_diff'] == 'same']
print(same_pairs.shape)
print(same_pairs.head())
print(same_pairs.tail())

In [None]:
# Display the KING pairs labelled as having different major haplogroups.
related_haplos.loc[related_haplos['haplo_diff'].eq('diff')]