# Setup Files for Terra Gene Expression Analysis
- **Author(s)** - Frank Grenn
- **Date Started** - March 2021
- **Quick Description:** Collect sample data for use on Terra when obtaining AMPPD expression data

In [None]:
import pandas as pd


In [None]:
# Root working directory; the two directories below are derived from it.
WRKDIR = "$PATH/chrY"
# Directory holding the male-only chrY bfiles.
BFILEDIR = WRKDIR + "/y_male_only_bfiles"
# Directory holding the outputs of the hemizygous-only, het-filtered run.
OUTDIR = WRKDIR + "/output_male_hemizygous_only_het_filter_run"

## Read fam file

In [None]:
# Load the .fam sample table: whitespace-delimited, no header row.
# The separator is written as a raw string because "\s" inside a normal
# string literal is an invalid escape sequence (warned about by newer Pythons).
samples = pd.read_csv(
    f"{BFILEDIR}/chrY_male_hemizygous_only_het_filter_hg19_final.fam",
    sep=r"\s+",
    header=None,
)
# Name the six positional columns of the .fam file.
samples.columns = ['fid','iid','pid','mid','sex','pheno']
print(samples.shape)
print(samples.head())

## Identify genetic carriers

In [None]:
# Load the AMP-PD covariates table and show its dimensions and first rows.
meta = pd.read_csv("$PATH/AMPPD_releasev2_covariates_Feb2021.csv")
print(meta.shape, meta.head(), sep="\n")

In [None]:
# Keep only the rows whose ID contains 'PP-' (presumably the PPMI cohort).
# An explicit copy is taken so that columns added to ppmi_meta afterwards are
# written to an independent frame rather than to a slice of `meta`, which
# would trigger pandas' SettingWithCopyWarning.
ppmi_meta = meta[meta.ID.str.contains('PP-')].copy()

In [None]:
# PATNO is the second '-'-separated token of the ID.
ppmi_meta['PATNO'] = ppmi_meta['ID'].str.split('-').str.get(1)

In [None]:
# Display the (rows, columns) of the 'PP-' subset.
ppmi_meta.shape

In [None]:
# Display the (rows, columns) of the rows whose PATNO is exactly five characters long.
ppmi_meta.loc[ppmi_meta['PATNO'].str.len().eq(5)].shape

In [None]:
# IDs of the 'PP-' samples whose PATNO has exactly five characters; this list
# is what the notebook later uses to flag genetic carriers.
# NOTE(review): the five-character rule is kept exactly as in the original code.
gc_ids = ppmi_meta.loc[ppmi_meta.PATNO.str.len() == 5, 'ID'].tolist()
print(len(gc_ids))
# Preview the first ten IDs; the earlier slice [1:10] silently skipped element 0.
print(gc_ids[:10])

In [None]:
# Inner-join the .fam samples to the covariates: samples.fid matches meta.ID.
meta_merge = samples.merge(meta, left_on='fid', right_on='ID')
print(meta_merge.shape)

In [None]:
# Display the column names of the merged sample/covariate table.
meta_merge.columns

In [None]:
# Start with every sample flagged as a non-carrier (0).
meta_merge['genetic_carrier'] = 0

In [None]:
# Set the flag to 1 for every sample whose ID is in the gc_ids list.
meta_merge.loc[meta_merge['ID'].isin(gc_ids), 'genetic_carrier'] = 1

In [None]:
# Display the first rows of the merged table.
meta_merge.head()

In [None]:
# Display the (rows, columns) of the samples flagged genetic_carrier == 0.
meta_merge.loc[meta_merge['genetic_carrier'].eq(0)].shape

In [None]:
# Display the (rows, columns) of the samples flagged genetic_carrier == 1.
meta_merge.loc[meta_merge['genetic_carrier'].eq(1)].shape

## Get ancestry data

In [None]:
# Load the genetic-ancestry PCA table and show its dimensions and first rows.
anc = pd.read_csv("$PATH/euro_king_pca_v2.5_July2021/genetic_ancestry_all_pca.csv")
print(anc.shape, anc.head(), sep="\n")

In [None]:
# Attach the InfPop column from the ancestry table, matching meta_merge.fid
# against the ancestry table's '#FID' column (inner join).
meta_merge = meta_merge.merge(anc[['#FID','InfPop']], left_on="fid", right_on="#FID")
print(meta_merge.shape, meta_merge.head(), sep="\n")

## Yhaplo Data

In [None]:
# Original author's note: just get yhaplo data for now, because the first
# character of the haplogroup is the same for all samples between the yhaplo
# and snappy tools.
#
# The file is whitespace-delimited with no header row. The separator is a raw
# string because "\s" in a normal string literal is an invalid escape sequence.
yhaplo = pd.read_csv(
    f"{OUTDIR}/output_yhaplo/haplogroups.chrY_male_hemizygous_only_het_filter_hg19_final.txt",
    sep=r"\s+",
    header=None,
)
yhaplo.columns = ['id','haplo_short','haplo_short_rep_snp','haplo_long']
print(yhaplo.shape)
print(yhaplo.head())

In [None]:
# Major haplogroup = first character of the long haplogroup string.
yhaplo['haplo_major'] = yhaplo['haplo_long'].str.get(0)
# Keep only the first half (floor of length / 2) of each id string.
# NOTE(review): presumably the id is the sample ID repeated twice with a
# separator - confirm against the yhaplo output.
yhaplo['id'] = yhaplo['id'].map(lambda doubled: doubled[:len(doubled) // 2])
print(yhaplo.shape, yhaplo.head(), sep="\n")

In [None]:
# Keep the join key plus the two haplogroup columns, prefixed with the tool name.
yhaplo = yhaplo[['id','haplo_major','haplo_long']].rename(
    columns={'haplo_major': 'yhaplo_haplo_major', 'haplo_long': 'yhaplo_haplo'}
)

## Snappy Data

In [None]:
# Tab-separated snappy matches, no header row.
snappy = pd.read_table(f"{OUTDIR}/chrY_hgs_snappy_matches.out", header=None)
snappy.columns = ['id','haplo','haplo_score','info_alleles']

# Some samples (e.g. "PD-PDNZ095VCJ") carry extra text in the "haplo" column,
# such as "B2a1a M109,M152/Page60,P32,P50"; keep only the token before the
# first space ("B2a1a").
snappy['haplo'] = snappy['haplo'].str.split(" ").str.get(0)
# Major haplogroup = first character of the cleaned haplogroup.
snappy['haplo_major'] = snappy['haplo'].str.get(0)
print(snappy.shape, snappy.head(), sep="\n")

In [None]:
# Keep the join key plus the two haplogroup columns, prefixed with the tool name.
snappy = snappy[['id','haplo_major','haplo']].rename(
    columns={'haplo_major': 'snappy_haplo_major', 'haplo': 'snappy_haplo'}
)

## Y-LineageTracker Data

In [None]:
# Tab-separated Y-LineageTracker lineage result (first row is the header).
# An earlier version of this cell read
# f"{OUTDIR}/output_ltracker/ltrack_out.hapresult.hg" with sep="\s+" instead.
ltrack = pd.read_csv(f"{OUTDIR}/output_ltracker/ltrack_hg19.lineageresult.txt", sep="\t")
print(ltrack.shape, ltrack.head(), sep="\n")

In [None]:
# Major haplogroup = first character of the Haplogroup string.
ltrack['haplo_major'] = ltrack['Haplogroup'].str.get(0)
# Join key = first half (floor of length / 2) of each SampleID string.
# NOTE(review): presumably SampleID is the sample ID repeated twice with a
# separator - confirm against the Y-LineageTracker output.
ltrack['id'] = ltrack['SampleID'].map(lambda doubled: doubled[:len(doubled) // 2])
print(ltrack.shape, ltrack.head(), sep="\n")

In [None]:
# Keep the join key plus the two haplogroup columns, prefixed with the tool name.
ltrack = ltrack[['id','haplo_major','Haplogroup']].rename(
    columns={'haplo_major': 'ltrack_haplo_major', 'Haplogroup': 'ltrack_haplo'}
)
print(ltrack.head())

## Merge

In [None]:
# Inner-join the yhaplo calls onto the sample metadata (ID <-> id).
meta_merge_yhaplo = meta_merge.merge(yhaplo, left_on='ID', right_on='id')
print(meta_merge_yhaplo.shape)

In [None]:
# Inner-join the snappy calls (ID <-> id). Both sides already carry an 'id'
# column here, so pandas suffixes them as 'id_x' and 'id_y'.
meta_merge_yhaplo_snappy = meta_merge_yhaplo.merge(snappy, left_on='ID', right_on='id')
print(meta_merge_yhaplo_snappy.shape)

In [None]:
# Inner-join the Y-LineageTracker calls (ID <-> id) to complete the table.
meta_merge_all = meta_merge_yhaplo_snappy.merge(ltrack, left_on='ID', right_on='id')
print(meta_merge_all.shape)

In [None]:
# Display the first rows of the fully merged table.
meta_merge_all.head()

In [None]:
# Remove the join-key columns left over from the merges, plus PHENO and '#FID'.
meta_merge_all = meta_merge_all.drop(['id_x','id_y','id','PHENO','#FID'], axis="columns")
print(meta_merge_all.shape)

In [None]:
# Write the merged metadata table without the row index. `index` is a boolean
# parameter, so pass False explicitly instead of relying on None being falsy.
meta_merge_all.to_csv(f"{OUTDIR}/chrY_meta.csv", index=False)