# Check Chromosome Y Reference Data Variant Locations
- **Author(s)** - Frank Grenn
- **Date Started** - May 2022
- **Quick Description:** Quick check of which areas of the genome are covered by certain Y chromosome variant datasets (e.g., whether they cover pseudoautosomal regions or heterochromatic regions).

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Base directory for every input and output path in this notebook.
# NOTE(review): "$PATH" is a literal substring here (Python does not expand
# it) -- presumably a placeholder to be replaced with a real directory.
WRKDIR = "$PATH/chrY"

# Chromosome Y windows as [start, end] position pairs, per genome build.
# PAR* presumably denotes the pseudoautosomal regions named in the notebook
# description -- confirm against the reference source used.
PAR1_hg38 = [10_001, 2_781_479]
PAR2_hg38 = [56_887_903, 57_217_415]
LONG_ARM_hg38 = [26_600_001, 57_227_415]
PAR1_hg37 = [10_001, 2_649_520]
PAR2_hg37 = [59_034_050, 59_363_566]

# Catch-all window used below when a plot should not be restricted to a PAR.
TEMP_FULL = [1, 60_000_000]

In [None]:
def scatter_plot_region(df, region, pos_col, cat_col):
    """Scatter-plot the rows of ``df`` whose position falls inside ``region``.

    Prints the shape of ``df``, then the shape of the filtered subset, and
    finally draws ``cat_col`` (y axis) against ``pos_col`` (x axis) for the
    subset using the pandas plotting API.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding at least the ``pos_col`` and ``cat_col`` columns.
    region : sequence of two numbers
        ``[start, end]`` window; both ends are treated as part of the region.
    pos_col : str
        Name of the column with the positions to filter and plot on x.
    cat_col : str
        Name of the column plotted on y.
    """
    print(df.shape)

    # Inclusive on both ends: the windows passed in are written as first/last
    # coordinates, so a variant sitting exactly on a boundary belongs to the
    # region (a strict comparison would silently drop it).
    in_region = df[pos_col].between(region[0], region[1])
    region_df = df.loc[in_region]
    print(region_df.shape)

    region_df.plot.scatter(x=pos_col, y=cat_col)

## Check Y-LineageTracker Reference


In [None]:
# Load the haplogroup table shipped with Y-LineageTracker.
ltrack_csv = f"{WRKDIR}/../Y-LineageTracker/LineageTracker/Data/HaplogroupInfo.csv"
ltrack_haplos = pd.read_csv(ltrack_csv)

# Tag every row with the first character of its haplogroup name and with a
# dataset label used when the sources are combined further down.
ltrack_haplos = ltrack_haplos.assign(
    haplo_major=ltrack_haplos['Haplogroup'].str.get(0),
    dataset='Y-LineageTracker Ref',
)

for preview in (ltrack_haplos.shape, ltrack_haplos.head()):
    print(preview)

In [None]:
# Largest, then smallest, Build37 position in the Y-LineageTracker table.
ltrack_build37 = ltrack_haplos['Build37']
print(ltrack_build37.max())
print(ltrack_build37.min())

In [None]:
# For each build, plot the wide TEMP_FULL window first, then PAR1, then PAR2.
for build_col, build_regions in (
    ('Build38', (TEMP_FULL, PAR1_hg38, PAR2_hg38)),
    ('Build37', (TEMP_FULL, PAR1_hg37, PAR2_hg37)),
):
    for build_region in build_regions:
        scatter_plot_region(ltrack_haplos, build_region, build_col, 'haplo_major')

## Check ISOGG Download

In [None]:
# Load the ISOGG SNP index export.
isogg_csv = f"{WRKDIR}/SNP_Index_Human.csv"
isogg = pd.read_csv(isogg_csv)

# Add the first character of the subgroup name plus a dataset label.
isogg = isogg.assign(
    haplo_major=isogg['Subgroup Name'].str.get(0),
    dataset='ISOGG Ref',
)

for preview in (isogg.shape, isogg.head()):
    print(preview)


In [None]:

# Work on a copy so the raw ISOGG table stays untouched.
isogg_37 = isogg.copy()

# Entries that cannot be parsed as numbers become NaN and are dropped below.
build37_numeric = pd.to_numeric(isogg_37['Build 37 Number'], errors='coerce')
isogg_37['Build 37 Number'] = build37_numeric
print(isogg_37.shape)

isogg_37 = isogg_37.dropna(subset=['Build 37 Number'])
print(isogg_37.shape)
# Optional cast to a nullable integer dtype, currently disabled:
#isogg_37 = isogg_37.astype({'Build 37 Number':'Int64'})

# Show the PAR windows next to the observed position range for comparison.
for par_window in (PAR1_hg37, PAR2_hg37):
    print(par_window)
build37_positions = isogg_37['Build 37 Number']
print(build37_positions.max())
print(build37_positions.min())

In [None]:

# Work on a copy so the raw ISOGG table stays untouched.
isogg_38 = isogg.copy()

# Entries that cannot be parsed as numbers become NaN and are filtered out.
build38_numeric = pd.to_numeric(isogg_38['Build 38 Number'], errors='coerce')
isogg_38['Build 38 Number'] = build38_numeric
print(isogg_38.shape)

has_build38 = isogg_38['Build 38 Number'].notna()
isogg_38 = isogg_38[has_build38]
print(isogg_38.shape)

# Show the PAR windows next to the observed position range for comparison.
for par_window in (PAR1_hg38, PAR2_hg38):
    print(par_window)
build38_positions = isogg_38['Build 38 Number']
print(build38_positions.max())
print(build38_positions.min())

In [None]:
# ISOGG positions per build: wide TEMP_FULL window, then PAR1, then PAR2.
scatter_plot_region(isogg_37, TEMP_FULL, 'Build 37 Number', 'haplo_major')
scatter_plot_region(isogg_37, PAR1_hg37, 'Build 37 Number', 'haplo_major')
# Third Build 37 plot targets PAR2 (it previously repeated the PAR1 window).
scatter_plot_region(isogg_37, PAR2_hg37, 'Build 37 Number', 'haplo_major')
scatter_plot_region(isogg_38, TEMP_FULL, 'Build 38 Number', 'haplo_major')
scatter_plot_region(isogg_38, PAR1_hg38, 'Build 38 Number', 'haplo_major')
scatter_plot_region(isogg_38, PAR2_hg38, 'Build 38 Number', 'haplo_major')

## Check AMP-PD Data

In [None]:
# AMP-PD variant table from before the liftover step (file has no header row,
# so the six column names are supplied here).
amppd_38_bim = f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_before_liftover.bim"
amppd_38 = pd.read_table(
    amppd_38_bim,
    header=None,
    names=['chr', 'snp', 'pos', 'bp', 'ref', 'alt'],
)
amppd_38 = amppd_38.assign(dataset='AMP-PD')

for preview in (amppd_38.shape, amppd_38.head()):
    print(preview)

In [None]:
# Largest base-pair position in the pre-liftover AMP-PD table (shown as the cell result).
amppd_38['bp'].max()

In [None]:
# AMP-PD variant table on hg19 (file has no header row, so the six column
# names are supplied here).
amppd_19_bim = f"{WRKDIR}/y_male_only_bfiles/chrY_male_hemizygous_only_het_filter_hg19_final.bim"
amppd_19 = pd.read_table(
    amppd_19_bim,
    header=None,
    names=['chr', 'snp', 'pos', 'bp', 'ref', 'alt'],
)
amppd_19 = amppd_19.assign(dataset='AMP-PD')

for preview in (amppd_19.shape, amppd_19.head()):
    print(preview)

In [None]:
# AMP-PD positions per build: wide TEMP_FULL window, then PAR1, then PAR2.
scatter_plot_region(amppd_19, TEMP_FULL, 'bp', 'chr')
scatter_plot_region(amppd_19, PAR1_hg37, 'bp', 'chr')
# Third hg19 plot targets PAR2 (it previously repeated the PAR1 window).
scatter_plot_region(amppd_19, PAR2_hg37, 'bp', 'chr')
scatter_plot_region(amppd_38, TEMP_FULL, 'bp', 'chr')
scatter_plot_region(amppd_38, PAR1_hg38, 'bp', 'chr')
scatter_plot_region(amppd_38, PAR2_hg38, 'bp', 'chr')

## Check UKBB Data

In [None]:
# UKBB variant table (file has no header row, so the six column names are
# supplied here).
ukbb_bim = f"{WRKDIR}/y_ukbb/chrY_male_only.bim"
ukbb = pd.read_table(
    ukbb_bim,
    header=None,
    names=['chr', 'snp', 'pos', 'bp', 'ref', 'alt'],
)
ukbb = ukbb.assign(dataset='UKBB')

for preview in (ukbb.shape, ukbb.head()):
    print(preview)

In [None]:
# UKBB positions: wide TEMP_FULL window, then PAR1, then PAR2.
scatter_plot_region(ukbb, TEMP_FULL, 'bp', 'chr')
scatter_plot_region(ukbb, PAR1_hg37, 'bp', 'chr')
# Third plot targets PAR2 (it previously repeated the PAR1 window).
scatter_plot_region(ukbb, PAR2_hg37, 'bp', 'chr')

## Check NeuroX Data

In [None]:
# NeuroX variant table (file has no header row, so the six column names are
# supplied here).
neurox_bim = f"{WRKDIR}/y_neurox/neurox_chrY_male_only.bim"
neurox = pd.read_table(
    neurox_bim,
    header=None,
    names=['chr', 'snp', 'pos', 'bp', 'ref', 'alt'],
)
neurox = neurox.assign(dataset='NeuroX')

for preview in (neurox.shape, neurox.head()):
    print(preview)

In [None]:
# NeuroX positions: wide TEMP_FULL window, then PAR1, then PAR2.
scatter_plot_region(neurox, TEMP_FULL, 'bp', 'chr')
scatter_plot_region(neurox, PAR1_hg37, 'bp', 'chr')
# Third plot targets PAR2 (it previously repeated the PAR1 window).
scatter_plot_region(neurox, PAR2_hg37, 'bp', 'chr')

## Make figure with all datasets

In [None]:
# Collect the Build37/hg19 position and dataset label from every source,
# renaming each position column to the shared name 'Build37'.
position_frames = [
    ltrack_haplos[['Build37', 'dataset']],
    isogg_37[['Build 37 Number', 'dataset']].rename(columns={'Build 37 Number': 'Build37'}),
    amppd_19[['bp', 'dataset']].rename(columns={'bp': 'Build37'}),
    neurox[['bp', 'dataset']].rename(columns={'bp': 'Build37'}),
    ukbb[['bp', 'dataset']].rename(columns={'bp': 'Build37'}),
]

# Report the running shape after each source is added, as a row-count check.
running_rows = 0
for frame in position_frames:
    running_rows += len(frame)
    print((running_rows, frame.shape[1]))

# DataFrame.append was removed in pandas 2.0; a single concat stacks all
# sources at once. ignore_index gives the combined table unique row labels
# instead of repeating each source's own labels.
plot_df = pd.concat(position_frames, ignore_index=True)
print(plot_df.head())

In [None]:
fig = plt.figure(figsize=(12, 6), dpi=80)

# Switch on seaborn's default theme for the axes drawn below.
sns.set()

# One row of points per dataset, coloured by dataset. The legend is disabled
# because the y tick labels already name each dataset; to show it instead,
# drop legend=False and call
# plt.legend(bbox_to_anchor=(1.25, 1), borderaxespad=0).
sns_plot = sns.scatterplot(
    data=plot_df,
    x="Build37",
    y="dataset",
    hue="dataset",
    linewidth=0,
    legend=False,
)

# Shade both hg37 PAR windows in gray, spanning from the
# 'Y-LineageTracker Ref' row to the 'UKBB' row.
for par_window in (PAR1_hg37, PAR2_hg37):
    sns_plot.fill_between(y1='Y-LineageTracker Ref', y2='UKBB', x=par_window, color='gray', alpha=0.4)

# Tilt the x tick labels and print positions as plain numbers rather than
# in scientific notation.
locs, labels = plt.xticks()
plt.setp(labels, rotation=30)
plt.ticklabel_format(style='plain', axis='x')

plt.xlabel("Chromosome Y Position (hg19)")
plt.ylabel("Dataset")
plt.title("Y Chromosome Variant Positions in Datasets", fontsize=16)

# Write the image before plt.show(): some backends release the figure once
# it has been shown, so saving first keeps the file output reliable.
fig.savefig(f"{WRKDIR}/y_variant_dataset_positions.png", bbox_inches="tight")
plt.show()