# Check The Variants Making Up Specific Y Chromosome Haplogroups
- **Author(s)** - Frank Grenn
- **Date Started** - April 2021
- **Quick Description:** Get the variants that make up every haplogroup that was associated with disease and found in 50 or more samples. Annotate these variants with ANNOVAR to determine whether or not they are coding.

In [None]:
import pandas as pd

In [None]:
# Base directory prepended to most input/output paths in this notebook.
# NOTE(review): '$PATH' looks like a redacted placeholder; Python does not
# expand it inside a string literal, so replace it with the real path before running.
WRKDIR = '$PATH/chrY'

## 1) Get Significant Haplogroups

In [None]:
# Load the haplogroup association table and preview its size and first rows.
combine_df = pd.read_csv(f"{WRKDIR}/meta_analyzed_full_haplo_cohort_count.csv")
print(combine_df.shape, combine_df.head(), sep="\n")

In [None]:
# Display the column names of the loaded table.
combine_df.columns

In [None]:
# Number of distinct values in the haplo_full column.
len(set(combine_df.haplo_full))

### Multiple Test Correction

In [None]:
# Keep rows whose p-value is below the corrected threshold 0.05/17.
# NOTE(review): 17 is presumably the number of tests being corrected for — confirm.
passes_corrected_threshold = combine_df["pval"] < 0.05 / 17
sig_full_haplo_multi = combine_df[passes_corrected_threshold]
print(sig_full_haplo_multi.shape, sig_full_haplo_multi.head(), sep="\n")

In [None]:
# (rows, columns) of the full, unfiltered table, for comparison with the filtered ones.
combine_df.shape

In [None]:
# Keep rows with an uncorrected p-value below 0.05; this is the set used by the rest of the notebook.
passes_nominal_threshold = combine_df["pval"] < 0.05
sig_full_haplo = combine_df[passes_nominal_threshold]
print(sig_full_haplo.shape, sig_full_haplo.head(), sep="\n")

In [None]:
# Number of distinct haplo_full values in the full table.
print(len(set(combine_df.haplo_full.tolist())))

In [None]:
# Number of distinct haplo_full values among the rows with pval < 0.05.
print(len(set(sig_full_haplo.haplo_full.tolist())))

## 2) Get Haplogroup SNP Names
Use the reference file that each haplogroup-calling tool relies on to look up the SNP names for each haplogroup.

In [None]:
# SNAPPY reference: tab-separated, first line skipped, no header row.
# The two columns are named haplogroup and SNP list after reading.
snappy_tree = pd.read_csv(
    f"{WRKDIR}/snappy/ref_files/y_hg_and_snps.sort",
    sep="\t",
    skiprows=1,
    header=None,
)
snappy_tree.columns = ['haplo', 'snps']
print(snappy_tree.shape, snappy_tree.head(), sep="\n")

In [None]:
# The input file was produced beforehand with:
#   cut -f 1,2 {WRKDIR}/yhaplo/yhaplo/input/isogg.2016.01.04.txt > temp.isogg.2016.rep.snps.txt
# NOTE(review): the first line of the file is consumed as a header and then
# overwritten by the column names below — confirm it is not a data row.
yhaplo_tree = pd.read_csv(
    f"{WRKDIR}/yhaplo/yhaplo/input/temp.isogg.2016.rep.snps.txt",
    sep="\t",
)
yhaplo_tree.columns = ['snps', 'haplo']

# Remove leading/trailing whitespace from both columns.
for text_column in ('haplo', 'snps'):
    yhaplo_tree[text_column] = yhaplo_tree[text_column].str.strip()

# Alternative reference that is not used here:
#yhaplo_tree = pd.read_table(f"{WRKDIR}/yhaplo/yhaplo/input/representative.SNPs.isogg.2015tree.txt",sep="\s+",header=None)
#yhaplo_tree.columns = ['haplo','snps']
print(yhaplo_tree.shape, yhaplo_tree.head(), sep="\n")

In [None]:
# Y-LineageTracker reference table; the lookups below read its Haplogroup and Mutation columns.
ltrack_tree = pd.read_csv(f"{WRKDIR}/../Y-LineageTracker/LineageTracker/Data/HaplogroupInfo.csv")
print(ltrack_tree.shape, ltrack_tree.head(), sep="\n")

In [None]:
# ISOGG SNP index, downloaded from https://isogg.org/tree/ISOGG_YDNA_SNP_Index.html
# The first line of the CSV is skipped; the second line supplies the column names.
snps = pd.read_csv("$PATH/SNP_Index_Human.csv",skiprows=1)

# Extra column: the subgroup name with every '~' removed.
subgroup_names = snps['Subgroup Name']
snps['Subgroup_no_tilde'] = subgroup_names.str.replace('~','')
print(snps.shape, snps.head(), sep="\n")

In [None]:
# Display the rows with pval < 0.05.
sig_full_haplo

In [None]:
# Preview the yhaplo reference table.
yhaplo_tree.head()

In [None]:
# Preview the SNAPPY reference table.
snappy_tree.head()

In [None]:
# Show the last rows of the Y-LineageTracker reference table.
ltrack_tree.tail()

In [None]:
def get_ltrack_snps(haplo):
    """Return the SNP names the Y-LineageTracker table lists for a haplogroup.

    Selects the rows of the module-level ``ltrack_tree`` whose ``Haplogroup``
    equals ``haplo`` and splits every ``Mutation`` entry on "/", since one
    entry may hold several "/"-separated names.

    Args:
        haplo: Haplogroup name to look up (exact match).

    Returns:
        Sorted list of unique SNP names. Empty when the haplogroup is absent
        or all of its ``Mutation`` values are missing.
    """
    # Missing Mutation values are dropped instead of crashing on ``.split``.
    mutations = ltrack_tree.loc[ltrack_tree.Haplogroup == haplo, 'Mutation'].dropna()
    snp_names = {name for entry in mutations for name in str(entry).split("/")}
    # Sorting makes the result reproducible from run to run.
    return sorted(snp_names)

In [None]:
def get_snappy_snps(haplo):
    """Return the SNP names the SNAPPY reference lists for a haplogroup.

    Selects the rows of the module-level ``snappy_tree`` whose ``haplo``
    equals ``haplo`` and splits every ``snps`` entry on ",". Every matching
    row is split, not only the case where exactly one distinct row matches.

    Args:
        haplo: Haplogroup name to look up (exact match).

    Returns:
        Sorted list of unique SNP names. Empty when the haplogroup is absent
        or its ``snps`` values are missing.
    """
    matching = snappy_tree.loc[snappy_tree.haplo == haplo, 'snps'].dropna()
    # De-duplicate the raw entries while keeping file order, for the debug print.
    raw_entries = list(dict.fromkeys(matching))
    print(raw_entries)

    snp_names = sorted(
        {name for entry in raw_entries for name in str(entry).split(',')}
    )
    if raw_entries:
        print(snp_names)
    return snp_names

In [None]:
def get_yhaplo_snps(haplo):
    """Return the SNP names the yhaplo reference lists for a haplogroup.

    Selects the rows of the module-level ``yhaplo_tree`` whose ``haplo``
    equals ``haplo`` and splits every ``snps`` entry on "/". Every matching
    row is split, not only the case where exactly one distinct row matches.

    Args:
        haplo: Haplogroup name to look up (exact match).

    Returns:
        Sorted list of unique SNP names. Empty when the haplogroup is absent
        or its ``snps`` values are missing.
    """
    matching = yhaplo_tree.loc[yhaplo_tree.haplo == haplo, 'snps'].dropna()
    # De-duplicate the raw entries while keeping file order, for the debug print.
    raw_entries = list(dict.fromkeys(matching))
    print(raw_entries)

    snp_names = sorted(
        {name for entry in raw_entries for name in str(entry).split('/')}
    )
    if raw_entries:
        print(snp_names)
    return snp_names

In [None]:
# Preview the rows that the SNP lookup loop iterates over.
sig_full_haplo.head()

In [None]:
# Build snps_df: one row per (SNP name, haplogroup, tool) for every row of
# sig_full_haplo, using the lookup function that matches the row's tool.
tool_lookups = {
    'snappy': get_snappy_snps,
    'yhaplo': get_yhaplo_snps,
    'ltrack': get_ltrack_snps,
}

snp_frames = []
for _, row in sig_full_haplo.iterrows():
    print(row)
    current_haplo = row.haplo_full
    tool_snps = []

    # Same guard as before: look up SNPs only when the first row recorded for
    # this haplogroup has a non-missing p-value.
    first_pval_present = (
        sig_full_haplo.loc[sig_full_haplo.haplo_full == current_haplo, 'pval']
        .notna()
        .iloc[0]
    )
    lookup = tool_lookups.get(row.tool)
    if first_pval_present and lookup is not None:
        tool_snps = lookup(current_haplo)
        if not tool_snps:
            # No hit: retry with a trailing '~' appended to the haplogroup name.
            tool_snps = lookup(current_haplo + '~')

    print(tool_snps)

    tool_snp_df = pd.DataFrame(data={'snp': tool_snps})
    tool_snp_df['haplo'] = row.haplo_full
    tool_snp_df['tool'] = row.tool
    print(tool_snp_df.shape)
    print(tool_snp_df.head())

    # Empty frames contribute no rows, so they are left out of the concat.
    if not tool_snp_df.empty:
        snp_frames.append(tool_snp_df)

# DataFrame.append was removed in pandas 2.0; concatenate once and
# de-duplicate once instead of on every iteration.
if snp_frames:
    snps_df = pd.concat(snp_frames).drop_duplicates()
else:
    snps_df = pd.DataFrame(columns=['snp', 'haplo', 'tool'])

In [None]:
# (rows, columns) of the rows that were looked up.
sig_full_haplo.shape

In [None]:
# (rows, columns) of the collected SNP/haplogroup/tool table.
snps_df.shape

In [None]:
# Preview the collected SNP/haplogroup/tool table.
snps_df.head()

In [None]:
# Preview the ISOGG SNP index that snps_df is merged with next.
snps.head()

In [None]:
# Attach ISOGG index information to every tool SNP by matching the tool's SNP
# name to the ISOGG 'Name' column. Inner join: SNPs missing from the index are dropped.
merge_snps_df = snps_df.merge(snps, how='inner', left_on='snp', right_on='Name')
print(merge_snps_df.shape, merge_snps_df.head(), sep="\n")

In [None]:
# Compare the SNP names before and after the merge; the symmetric difference
# lists the names that appear in only one of the two tables.
merged_snp_names = set(merge_snps_df.snp)
tool_snp_names = set(snps_df.snp)
print(len(merged_snp_names))
print(len(tool_snp_names))
print(merged_snp_names ^ tool_snp_names)

In [None]:
# Show the last rows of the merged table.
merge_snps_df.tail()

In [None]:
# Rename every column of the merged table by position.
# NOTE(review): this assumes the merged frame has exactly these 11 columns in
# this order (snp, haplo, tool from snps_df, then the ISOGG index columns, then
# Subgroup_no_tilde) — confirm against the ISOGG CSV header.
merge_snps_df.columns = ['snp','tool_haplo_name','tool','Name','isogg_subgroup_name','alt_snp_names','rsid','hg19_bp','hg38_bp','mutation','isogg_subgroup_name_no_tilde']
#merge_snps_df.to_csv(f"{WRKDIR}/sig_haplo_variants_newest.csv",index=None)

In [None]:
# Compare haplogroup names before and after the merge; the symmetric
# difference lists haplogroups that appear in only one of the two tables.
tool_haplos = set(snps_df['haplo'])
merged_haplos = set(merge_snps_df['tool_haplo_name'])
print(len(tool_haplos))
print(len(merged_haplos))
print(len(set(merge_snps_df['isogg_subgroup_name_no_tilde'])))
print(tool_haplos ^ merged_haplos)

In [None]:
# Compare the tool SNP names with the ISOGG names that survived the merge.
tool_snp_set = set(snps_df['snp'])
isogg_name_set = set(merge_snps_df['Name'])
print(len(tool_snp_set))
print(len(isogg_name_set))
print(tool_snp_set ^ isogg_name_set)

## 3) Annotate with ANNOVAR
Some values may need to be reformatted first (for example, positions that contain `..`).

In [None]:
# Some *_bp values are strings containing ".." rather than a single position.
# Show the rows whose hg38 position contains a literal ".." (regex=False).
print(merge_snps_df[merge_snps_df['hg38_bp'].str.contains("..",regex=False)])

In [None]:
# For positions containing "..", keep only the text before the first "..".
# NOTE(review): presumably these are "start..end" ranges — confirm.
for bp_column in ('hg19_bp', 'hg38_bp'):
    # na=False keeps the mask strictly boolean even if a position is missing.
    has_range = merge_snps_df[bp_column].str.contains("..", regex=False, na=False)
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    merge_snps_df.loc[has_range, bp_column] = (
        merge_snps_df.loc[has_range, bp_column].str.split(r"\.\.").str[0]
    )

In [None]:
# Build the ANNOVAR input table with columns: chr, start, end, ref, alt, snp.
# An explicit copy is taken so the column assignments below do not write into
# a slice of merge_snps_df (which triggers SettingWithCopyWarning).
avinput_hg19 = merge_snps_df[['hg19_bp', 'mutation', 'snp']].copy()

# 'mutation' is split on "->": the first part is used as ref, the second as alt.
alleles = avinput_hg19['mutation'].str.split('->')
avinput_hg19['chr'] = 'Y'
avinput_hg19['ref'] = alleles.str[0]
avinput_hg19['alt'] = alleles.str[1]
avinput_hg19['start'] = avinput_hg19['hg19_bp']

# End position is adjusted for ref alleles with more than one nucleotide.
avinput_hg19['end'] = avinput_hg19['start'].astype('int64') + avinput_hg19['ref'].str.len() - 1

# Replace an allele that is exactly '*' with '-' for ANNOVAR syntax.
avinput_hg19['ref'] = avinput_hg19['ref'].replace('*', '-')
avinput_hg19['alt'] = avinput_hg19['alt'].replace('*', '-')

avinput_hg19 = avinput_hg19[['chr', 'start', 'end', 'ref', 'alt', 'snp']]

print(avinput_hg19.head())
print(avinput_hg19.tail())
# NOTE(review): the file is written with a header row; confirm that the
# downstream ANNOVAR run is expected to receive one.
avinput_hg19.drop_duplicates().to_csv(f"{WRKDIR}/annotation/sig_snp_hg19_new.avinput",index=None,sep="\t")

```bash
table_annovar.pl $PATH/sig_snp_hg19_new.avinput $ANNOVAR_DATA/hg19/ -buildver hg19 --thread 16  -out $PATH/sig_snp_hg19_new.annovar  -remove -protocol refGene,avsnp150  -operation g,f  -nastring .
```

In [None]:
# Read the ANNOVAR multianno output (tab-separated) and drop exact duplicate rows.
annotation_hg19 = pd.read_csv(
    f"{WRKDIR}/annotation/sig_snp_hg19_new.annovar.hg19_multianno.txt",
    sep="\t",
).drop_duplicates()

# Rebuild the "REF->ALT" string so it can be matched against the 'mutation' column later.
annotation_hg19['mut'] = annotation_hg19['Ref']+'->'+annotation_hg19['Alt']

# Distinct functional categories present in the annotation.
for category_column in ('Func.refGene', 'ExonicFunc.refGene'):
    print(set(annotation_hg19[category_column].tolist()))
print(annotation_hg19.shape, annotation_hg19.head(), sep="\n")

## 4) Combine Annotation Results With SNP Data

In [None]:
# Join the SNP table to the ANNOVAR annotation on hg19 position plus the
# "REF->ALT" mutation string (inner join, the pandas default).
# hg19_bp holds strings (it is processed with .str methods earlier), while the
# annotation's Start column may be parsed as integers. pandas raises a
# ValueError when asked to merge string keys with integer keys, so both
# position keys are compared as strings. The source frames are not modified.
snp_side = merge_snps_df.assign(hg19_bp=merge_snps_df['hg19_bp'].astype(str))
annotation_side = annotation_hg19.assign(Start=annotation_hg19['Start'].astype(str))
merge_snp_info = pd.merge(
    left=snp_side,
    right=annotation_side,
    left_on=['hg19_bp', 'mutation'],
    right_on=['Start', 'mut'],
)
print(merge_snp_info.shape)
# Distinct positions before the merge (both inputs) and after it.
print(len(set(merge_snps_df['hg19_bp'].tolist())))
print(len(set(annotation_hg19['Start'].tolist())))
print(len(set(merge_snp_info['Start'].tolist())))
print(merge_snp_info.head())

In [None]:
# Save the full annotated table (written without the index column).
merge_snp_info.to_csv(f"{WRKDIR}/sig_haplo_variants_annotated_newest.csv",index=None)

In [None]:
# For every variant (same Start, End, Ref, Alt), gather the values of the
# per-tool columns into one comma-joined string that is repeated on each row
# of that variant.
variant_key = ['Start', 'End', 'Ref', 'Alt']
joined_columns = (
    ('tool_snp_names', 'snp'),
    ('tool_haplo_names', 'tool_haplo_name'),
    ('tool_names', 'tool'),
)
for joined_name, source_name in joined_columns:
    merge_snp_info[joined_name] = (
        merge_snp_info.groupby(variant_key)[source_name].transform(','.join)
    )

In [None]:
# Keep the annotation columns plus the comma-joined tool columns, then rename
# avsnp150 -> rsid and isogg_subgroup_name -> isogg_haplo_name.
kept_columns = [
    'Chr', 'Start', 'End', 'Ref', 'Alt', 'avsnp150',
    'Func.refGene', 'Gene.refGene', 'GeneDetail.refGene', 'ExonicFunc.refGene',
    'isogg_subgroup_name', 'tool_haplo_names', 'tool_snp_names', 'tool_names',
]
merge_reduced = merge_snp_info[kept_columns].rename(
    columns={'avsnp150': 'rsid', 'isogg_subgroup_name': 'isogg_haplo_name'}
)
print(merge_reduced.shape)

# Rows that became identical after the column reduction are collapsed.
merge_reduced = merge_reduced.drop_duplicates()
print(merge_reduced.shape)

In [None]:
# Distinct Func.refGene categories in the reduced table.
set(merge_reduced['Func.refGene'])

In [None]:
# Number of distinct isogg_haplo_name values in the reduced table.
print(len(set(merge_reduced.isogg_haplo_name.tolist())))

In [None]:
# Save the reduced, de-duplicated table (written without the index column).
merge_reduced.to_csv(f"{WRKDIR}/sig_haplo_variants_annotated_reduced_newest.csv",index=None)

In [None]:
# Rows whose Func.refGene value is exactly 'exonic'.
merge_reduced[merge_reduced['Func.refGene'] == 'exonic']

In [None]:
# Rows whose Func.refGene value is exactly 'ncRNA_exonic'.
merge_reduced[merge_reduced['Func.refGene'] == 'ncRNA_exonic']

In [None]:
# Final (rows, columns) of the reduced table.
merge_reduced.shape