# AMP-PD ChrY HG38 to HG19
- **Author(s)** - Frank Grenn
- **Date Started** - February  2021
- **Quick Description:** Lift over the AMP-PD chrY PLINK binaries from hg38 to hg19 for use in the chrY haplogroup caller tools.

#### 1) Try with UCSC liftover tool
#### 2) Try with crossmap tool
#### 3) Look at results

In [None]:
import pandas as pd

In [None]:

# Root working directory for the chrY analysis.
# NOTE(review): "$PATH" is a literal part of this string (nothing in this cell expands it) --
# presumably a placeholder for the real location; confirm before running.
WRKDIR = "$PATH/chrY"
# Folder that the cells below read the male-only chrY PLINK files and liftover tables from.
BFILEDIR = WRKDIR + "/y_male_only_bfiles"

# 1) UCSC Liftover Tool

## Make BED file for UCSC Liftover Tool

In [None]:
# Read the hg38 variant table (.bim) of the het-filtered, male-only chrY fileset.
# (An earlier version of this cell read the unfiltered chrY.bim instead.)
hg38_bim_file = f"{BFILEDIR}/chrY_male_hemizygous_only_het_filter.bim"
hg38 = pd.read_csv(hg38_bim_file, header=None, sep="\t")
# In this cell 'pos' labels the 4th column, which later cells use as the variant coordinate;
# 'idk' labels the 3rd column (genetic distance per the PLINK .bim format -- TODO confirm).
hg38.columns = ['chr','snp','idk','pos','alt','ref']
# Sanity check: table dimensions, then the first rows.
for preview in (hg38.shape, hg38.head()):
    print(preview)

In [None]:
# Distinct chromosome labels present in the hg38 .bim table.
set(hg38['chr'].tolist())

In [None]:
# Build the 4-column BED table (chr, start, end, snp) that is fed to the liftover tools.
# Every interval is one base wide: start is the .bim coordinate and end is start + 1.
# NOTE(review): BED intervals are conventionally 0-based/half-open while .bim coordinates are
# 1-based; a later cell writes the lifted 'start' back as the hg19 coordinate, so the two
# steps have to be changed together if this convention is revisited -- confirm.
hg38_bed = pd.DataFrame({
    'chr': 'chrY',
    'start': hg38['pos'],
    'end': hg38['pos'] + 1,
    'snp': hg38['snp'],
})
for preview in (hg38_bed.shape, hg38_bed.head()):
    print(preview)

In [None]:
# Save the BED table as tab-separated text with no header row and no index column.
bed_out_file = f"{BFILEDIR}/chrY_positions.bed"
hg38_bed.to_csv(bed_out_file, sep="\t", header=False, index=False)

## Liftover the HG38 Coordinates to HG19

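Use the UCSC web tool here: https://genome.ucsc.edu/cgi-bin/hgLiftOver

Submit `chrY_positions.bed`, then save the list of variants that succeeded (`liftover_success.bed`) and the list that failed (`liftover_fail.txt`) in the liftover.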

In [None]:
# Variants that the UCSC tool lifted successfully (headerless, tab-separated BED).
lift_pass_file = f"{BFILEDIR}/liftover_success.bed"
lift_pass = pd.read_csv(lift_pass_file, header=None, sep='\t')
lift_pass.columns = ['chr','start','end','snp']
for preview in (lift_pass.shape, lift_pass.head()):
    print(preview)

Remove the lines containing 'Deleted' from the failed liftover file so that it can be read as a plain BED table:  
`grep -v Deleted liftover_fail.txt > liftover_fail.bed`

In [None]:

# Variants that failed the UCSC liftover, read from the grep-cleaned BED file.
lift_fail_file = f"{BFILEDIR}/liftover_fail.bed"
lift_fail = pd.read_csv(lift_fail_file, header=None, sep='\t')
lift_fail.columns = ['chr','start','end','snp']
for preview in (lift_fail.shape, lift_fail.head()):
    print(preview)

## Identify Failed Variants

In [None]:
# Write the IDs of the failed variants, one per line (no header, no index), for plink --exclude.
exclude_file = f"{BFILEDIR}/variants_to_exclude.txt"
lift_fail.loc[:, ['snp']].to_csv(exclude_file, header=False, index=False)

## Remove the Failed Variants From Plink Binary Files

```plink --bfile chrY_male_hemizygous_only_het_filter --exclude variants_to_exclude.txt --make-bed --out chrY_male_hemizygous_only_het_filter_hg19```

In [None]:
#check bim line counts. difference should be the number of lines in lift_fail df above
# Each `!` line is a notebook shell escape; `wc -l` prints the number of lines in the file,
# first for the original .bim and then for the one written by the plink --exclude step.


!wc -l {BFILEDIR}/chrY_male_hemizygous_only_het_filter.bim
!wc -l {BFILEDIR}/chrY_male_hemizygous_only_het_filter_hg19.bim

## Now Convert the HG38 Positions to HG19 in the Files

In [None]:
# Load the .bim of the fileset left after excluding the variants that failed liftover.
# Column naming in this cell: 'pos' is the 3rd .bim column and 'bp' the 4th.
temp_hg19_file = f"{BFILEDIR}/chrY_male_hemizygous_only_het_filter_hg19.bim"
temp_hg19 = pd.read_csv(temp_hg19_file, header=None, sep="\t")
temp_hg19.columns = ['chr','snp','pos','bp','alt','ref']
for preview in (temp_hg19.shape, temp_hg19.head()):
    print(preview)

In [None]:
# Dimensions of temp_hg19 once fully duplicated rows are dropped.
temp_hg19.drop_duplicates().shape

In [None]:
# Dimensions of lift_pass once fully duplicated rows are dropped.
lift_pass.drop_duplicates().shape

In [None]:
# Dimensions of lift_pass as loaded, for comparison with the de-duplicated count.
lift_pass.shape

In [None]:
# Attach the lifted BED columns to each remaining .bim row by variant ID (inner join on 'snp').
# Both tables have a 'chr' column, so the merge yields 'chr_x' (from the .bim) and 'chr_y'
# (from the liftover output). Fully identical rows are then dropped.
joined_on_snp = temp_hg19.merge(lift_pass, on='snp')
merged_pass = joined_on_snp.drop_duplicates()
for preview in (merged_pass.shape, merged_pass.head()):
    print(preview)

In [None]:
# Assemble the hg19 .bim table: chromosome from the liftover output ('chr_y'), the variant ID,
# the original 3rd .bim column ('pos'), the lifted 'start' as the new coordinate, and the alleles.
new_hg19_bim = merged_pass[['chr_y','snp','pos','start','alt','ref']]

# This file is overwritten in place, so it must still list exactly the same variants, in the
# same order, as the table it was read from (temp_hg19). Otherwise the .bim no longer lines up
# with the rest of the PLINK fileset. A variant missing from lift_pass or mapped more than once
# would change the rows, so stop instead of silently writing a mismatched file.
snps_before = temp_hg19['snp'].reset_index(drop=True)
snps_after = new_hg19_bim['snp'].reset_index(drop=True)
if not snps_after.equals(snps_before):
    raise ValueError(
        f"lifted .bim has {len(snps_after)} rows but the source .bim has {len(snps_before)} "
        "(or the variant order changed); refusing to overwrite the .bim"
    )

new_hg19_bim.to_csv(f"{BFILEDIR}/chrY_male_hemizygous_only_het_filter_hg19.bim",sep="\t",header = None, index=None)

## Check For Other Chromosomes And Filter

In [None]:
# Which chromosome labels did the liftover assign? Anything besides chrY means some
# variants were lifted onto a different chromosome.
set(merged_pass['chr_y'].tolist())

`plink --allow-extra-chr --bfile chrY_male_hemizygous_only_het_filter_hg19 --chr chrY --make-bed --out chrY_male_hemizygous_only_het_filter_hg19_final`

In [None]:
# Reload the chrY-only .bim written by plink and reset its chromosome label to 'chrY'
# (the original note says plink's output uses 24 for this chromosome).
final_bim_file = f"{BFILEDIR}/chrY_male_hemizygous_only_het_filter_hg19_final.bim"
final = pd.read_csv(final_bim_file, header=None, sep="\t")
final.columns = ['chr','snp','pos','bp','alt','ref']
# Show the labels and first rows before the relabel ...
print(set(final['chr'].tolist()))
print(final.head())
# ... overwrite the whole column with the constant label, then show the result.
final['chr'] = 'chrY'
print(final.head())



In [None]:
# Overwrite the final .bim with the relabelled table (tab-separated, no header, no index).
final_bim_out = f"{BFILEDIR}/chrY_male_hemizygous_only_het_filter_hg19_final.bim"
final.to_csv(final_bim_out, sep="\t", header=False, index=False)

# 2) Crossmap Tool

Get the chain file from http://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/ (UCSC) or ftp://ftp.ensembl.org/pub/assembly_mapping/homo_sapiens/ (Ensembl).

Use the BED file made in section 1 above:

```bash
module load crossmap
crossmap bed ../hg38ToHg19.over.chain.gz chrY_positions.bed > chrY_crossmap_hg19.bed
grep -v Unmap chrY_crossmap_hg19.bed > chrY_crossmap_hg19_pass.bed
```

In [None]:
# Load the CrossMap output for the UCSC chain file, after the 'Unmap' lines were removed.
# Each row pairs the hg38 BED fields with the hg19 fields, separated by an 'arrow' column.
# NOTE(review): the "$PATH" in this path is passed to pandas as written -- presumably a
# placeholder for the real directory; confirm.
crossmap = pd.read_csv("$PATH/chrY_crossmap_hg19_pass.bed", sep="\t", header=None)
crossmap.columns = [
    'hg38_chr', 'hg38_start', 'hg38_end', 'hg38_snp',
    'arrow',
    'hg19_chr', 'hg19_start', 'hg19_end', 'hg19_snp',
]
for preview in (crossmap.shape, crossmap.head()):
    print(preview)

In [None]:
# Distinct hg19 chromosome labels produced by CrossMap with the UCSC chain file.
set(crossmap['hg19_chr'].to_list())

Try again with the Ensembl chain file:

```bash
module load crossmap
crossmap bed ../GRCh38_to_GRCh37.chain.gz chrY_positions.bed > chrY_crossmap_ensembl_hg19.bed
grep -v Unmap chrY_crossmap_ensembl_hg19.bed > chrY_crossmap_ensembl_hg19_pass.bed
```

In [None]:
# Load the CrossMap output for the Ensembl chain file, after the 'Unmap' lines were removed.
# The layout matches the UCSC-chain table: hg38 fields, an 'arrow' column, then hg19 fields.
# NOTE(review): the "$PATH" in this path is passed to pandas as written -- presumably a
# placeholder for the real directory; confirm.
crossmap_ensembl = pd.read_csv("$PATH/chrY_crossmap_ensembl_hg19_pass.bed", sep="\t", header=None)
crossmap_ensembl.columns = [
    'hg38_chr', 'hg38_start', 'hg38_end', 'hg38_snp',
    'arrow',
    'hg19_chr', 'hg19_start', 'hg19_end', 'hg19_snp',
]
for preview in (crossmap_ensembl.shape, crossmap_ensembl.head()):
    print(preview)

In [None]:
# Distinct hg19 chromosome labels produced by CrossMap with the Ensembl chain file.
set(crossmap_ensembl['hg19_chr'].to_list())

Check how much the liftover results from the two different chain files overlap.

In [None]:
# Pair the two CrossMap result tables by the original (hg38) variant ID.
merge_liftover = crossmap.merge(crossmap_ensembl, on='hg38_snp')
# Row count of the join, then the count after dropping fully identical rows.
print(merge_liftover.shape)
deduped_overlap = merge_liftover.drop_duplicates()
print(deduped_overlap.shape)

In [None]:
# Right join of the de-duplicated tables on the full hg19 result (chr, start, end, snp):
# every Ensembl-chain row is kept and matched to a UCSC-chain row only when the lifted
# result is identical.
hg19_key_columns = ['hg19_chr','hg19_start','hg19_end','hg19_snp']
ucsc_unique = crossmap.drop_duplicates()
ensembl_unique = crossmap_ensembl.drop_duplicates()
merge_liftover_results = ucsc_unique.merge(ensembl_unique, on=hg19_key_columns, how='right')
print(merge_liftover_results.shape)

# 3) Look at Liftover Results

### Compare UCSC and Crossmap Results


In [None]:
# Size of each de-duplicated input, then of their inner join on variant ID.
for table in (temp_hg19, crossmap):
    print(table.drop_duplicates().shape)
test = temp_hg19.merge(crossmap, left_on='snp', right_on='hg19_snp')
print(test.drop_duplicates().shape)

In [None]:
# First rows of the variant-ID join between temp_hg19 and the CrossMap table.
test.head()

In [None]:
# Count variants for which temp_hg19 and CrossMap agree on ID, chromosome AND coordinate.
# The coordinate key on the temp_hg19 side must be 'bp' (the 4th .bim column). This notebook
# names the 3rd .bim column 'pos', which is not a base-pair coordinate, so joining 'pos'
# against 'hg19_start' would not compare coordinates at all.
# NOTE(review): this comparison is only meaningful once temp_hg19 has been re-read from the
# rewritten hg19 .bim (lifted coordinates and 'chrY'-style labels) -- confirm cell order.
test2 = pd.merge(
    left = temp_hg19,
    right = crossmap,
    left_on = ['snp','chr','bp'],
    right_on = ['hg19_snp','hg19_chr','hg19_start'],
    how = "inner",
)
print(test2.shape)
print(test2.drop_duplicates().shape)

In [None]:
# Dimensions of the temp_hg19 rows whose chromosome label is anything other than 'chrY'.
temp_hg19.loc[temp_hg19['chr'].ne('chrY')].shape

In [None]:
# Dimensions of the CrossMap rows whose hg19 chromosome is anything other than 'chrY'.
crossmap.loc[crossmap['hg19_chr'].ne('chrY')].shape

### Plot Variants

In [None]:
# Re-read the hg19 .bim from disk and collect the variants whose chromosome is not 'chrY'.
temp_hg19_file = f"{BFILEDIR}/chrY_male_hemizygous_only_het_filter_hg19.bim"
temp_hg19 = pd.read_csv(temp_hg19_file, header=None, sep="\t")
temp_hg19.columns = ['chr','snp','pos','bp','alt','ref']
print(temp_hg19.shape)
print(temp_hg19.head())
print(set(temp_hg19['chr'].to_list()))
# Rows labelled with any chromosome other than chrY, and their variant IDs.
bad_chr = temp_hg19.loc[temp_hg19['chr'].ne('chrY')]
print(bad_chr.shape)
bad_chr_vars = bad_chr['snp'].to_list()

In [None]:
# Re-read the final chrY-only hg19 .bim; its variant IDs are the "passed" set.
final_bim_file = f"{BFILEDIR}/chrY_male_hemizygous_only_het_filter_hg19_final.bim"
final = pd.read_csv(final_bim_file, header=None, sep="\t")
final.columns = ['chr','snp','pos','bp','alt','ref']
for preview in (final.shape, final.head()):
    print(preview)
pass_vars = final['snp'].to_list()

In [None]:
# Load the original (pre-liftover) .bim so every input variant can be given an outcome label.
allchrY_file = f"{BFILEDIR}/chrY_male_hemizygous_only_het_filter.bim"
allchrY = pd.read_csv(allchrY_file, header=None, sep="\t")
allchrY.columns = ['chr','snp','pos','bp','alt','ref']
print(allchrY.shape)
# Every variant starts out labelled as a success; 'lift_code' holds a text label
# (not a numeric code).
allchrY['lift_code'] = 'liftover success'



In [None]:
# A variant counts as failed when it is in neither the passed set nor the non-chrY set.
in_pass_set = allchrY['snp'].isin(pass_vars)
in_bad_chr_set = allchrY['snp'].isin(bad_chr_vars)
fail_vars = allchrY.loc[~(in_pass_set | in_bad_chr_set), 'snp'].to_list()
print(len(fail_vars))

In [None]:
# Relabel the variants listed in fail_vars.
failed_rows = allchrY['snp'].isin(fail_vars)
allchrY.loc[failed_rows, 'lift_code'] = 'liftover failed'

In [None]:
# Relabel the variants listed in bad_chr_vars.
non_chrY_rows = allchrY['snp'].isin(bad_chr_vars)
allchrY.loc[non_chrY_rows, 'lift_code'] = 'liftover to non-chrY'

In [None]:
# Scatter of each variant's original 'bp' coordinate against its liftover outcome label.
allchrY.plot(kind='scatter', x='bp', y='lift_code')

### MAF of failed liftover variants

`plink --bfile chrY_male_hemizygous_only_het_filter --freq --out chrY_male_hemizygous_only_het_filter`

In [None]:
# Load the allele-frequency report written by `plink --freq` (whitespace-delimited, with header).
# The separator is a raw string so that "\s+" reaches pandas as a regular expression;
# in a normal string literal "\s" is an invalid escape sequence and triggers a warning.
mafs = pd.read_table(f"{BFILEDIR}/chrY_male_hemizygous_only_het_filter.frq", sep=r"\s+")
print(mafs.shape)
print(mafs.head())

In [None]:
# Summary statistics over all variants in the frequency table.
mafs.describe()

In [None]:
# Frequency rows for the variants that ended up in the final hg19 chrY fileset.
pass_mafs = mafs.loc[mafs['SNP'].isin(pass_vars)]

In [None]:
# Dimensions of the frequency rows for passed variants.
pass_mafs.shape

In [None]:
# Summary statistics for the passed variants only.
pass_mafs.describe()

In [None]:
# Frequency rows for the variants listed in fail_vars.
fail_mafs = mafs.loc[mafs['SNP'].isin(fail_vars)]

In [None]:
# Dimensions of the frequency rows for failed variants.
fail_mafs.shape

In [None]:
# Summary statistics for the failed variants only, for comparison with the passed set.
fail_mafs.describe()

In [None]:
# First rows of the failed-variant frequency table.
fail_mafs.head()

In [None]:
# First rows of the full frequency table.
mafs.head()

In [None]:
# Dimensions of the frequency rows whose MAF value is missing.
mafs.loc[mafs['MAF'].isna()].shape

In [None]:
# The frequency rows whose MAF value is missing.
mafs.loc[mafs['MAF'].isna()]