# Plot Y Chromosome Gene Blood Expression From AMP-PD Data with Different Y Chromosome Haplogroups
- **Author(s)** - Frank Grenn
- **Date Started** - August 2021
- **Quick Description:** Get AMP-PD expression data, plot with haplogroup and case/control status

In [None]:

import os

import pandas as pd
import mygene

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [None]:
# Root directory that every input/output path in this notebook is built from.
# NOTE: "$PATH" is a literal placeholder string -- Python does not expand shell
# variables inside string literals -- so replace it with the real working
# directory before running the cells below.
WRKDIR = "$PATH"


## AMP-PD Gene Expression Data

In [None]:
# Load the chrY featureCounts table and index its rows by the "Geneid" column.
amp_counts = pd.read_csv(
    f"{WRKDIR}/chrY/expression/amppd_chrY_featureCounts.csv"
).set_index("Geneid")

# Sanity check: overall dimensions, then the top-left 5x5 corner of the table.
print(amp_counts.shape)
print(amp_counts.iloc[:5, :5])

## Sample Data


In [None]:
# Sample metadata table; later cells rely on its 'fid', 'pheno' and
# 'yhaplo_haplo_major' columns.
meta = pd.read_csv(
    f"{WRKDIR}/chrY/output_male_hemizygous_only_het_filter_run/chrY_meta.csv"
)

# Print the dimensions followed by the first rows.
print(meta.shape, meta.head(), sep="\n")

## Plot for a gene

In [None]:
# Gene to plot. The value is used as a row label of amp_counts (its "Geneid"
# index), so it has to match that index exactly. The active ID carries a
# version suffix (".7"); the unversioned form is kept commented out for
# reference. The plot title in the last cell labels this gene as SRY.
#gene = "ENSG00000184895"
gene = "ENSG00000184895.7"

In [None]:
# Display the selected gene's row of counts as a one-column DataFrame.
amp_counts.loc[gene].to_frame()

In [None]:
# Summary statistics for the selected gene's counts.
amp_counts.loc[gene].describe()

In [None]:
# Attach the selected gene's counts to the metadata: each metadata row's 'fid'
# is matched against the index of the per-gene counts column (default inner
# join), and that column is then renamed from the gene ID to 'counts'.
meta_quant = meta.merge(
    amp_counts.loc[gene,].to_frame(),
    left_on='fid',
    right_index=True,
).rename(columns={gene: 'counts'})

print(meta_quant.shape)
print(meta_quant.head())

In [None]:
# Tally rows per major haplogroup to decide which groups are worth plotting.
meta_quant["yhaplo_haplo_major"].value_counts()

In [None]:
# Major haplogroups to include in the plot; passed to haplogroup_violin_plot
# in the last cell. Presumably picked from the value_counts() output above --
# adjust if the sample counts per haplogroup change.
haplos = ['R','I','J','E','G']

In [None]:
def haplogroup_violin_plot(df, title, haplogroups, save_file = None, combine_all=True):
    """Plot expression counts per Y haplogroup as violins with overlaid points.

    One violin (with an inner box) and one dodged strip of points is drawn per
    haplogroup and per ``pheno`` level.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``yhaplo_haplo_major`` (x axis), ``counts``
        (y axis) and ``pheno`` (hue). The caller's frame is not modified.
    title : str
        Title placed above the plot.
    haplogroups : sequence of str
        Values of ``yhaplo_haplo_major`` to draw; they are plotted in sorted
        order. The caller's sequence is not modified.
    save_file : str or None, optional
        When given, the figure is written to this path.
    combine_all : bool, optional
        When true, an extra ``'All Haplogroups'`` category is added that pools
        every row belonging to one of ``haplogroups``.

    Returns
    -------
    None
    """
    # Copy so the extra category never leaks into the caller's list.
    plot_groups = list(haplogroups)

    if combine_all:
        # Duplicate the rows of the requested haplogroups under one pooled
        # label so they get their own violin next to the per-group ones.
        pooled = df[df.yhaplo_haplo_major.isin(haplogroups)].assign(
            yhaplo_haplo_major='All Haplogroups'
        )
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to stack the two frames.
        df = pd.concat([df, pooled], ignore_index=True)
        plot_groups.append('All Haplogroups')

    group_order = sorted(plot_groups)

    #plot
    plt.figure(figsize=(7, 5))
    sns.set()

    sns_plot = sns.violinplot(palette="tab10",x='yhaplo_haplo_major', y='counts', data=df,inner = "box",order=group_order,hue='pheno',split=False)
    sns_plot = sns.stripplot(palette="tab10",linewidth=1,x='yhaplo_haplo_major', y='counts', data=df,color=".4",order=group_order,hue='pheno',dodge=True)

    plt.xlabel("Y Chromosome Haplogroup")
    plt.ylabel("Counts")
    plt.title(title)

    # Both plot calls register legend entries for every hue level, so only the
    # first two handles are kept to avoid showing each level twice.
    # NOTE(review): the labels assume the first two `pheno` levels are control
    # then case -- confirm against how `pheno` is coded in the metadata.
    handles, _ = sns_plot.get_legend_handles_labels()
    plt.legend(handles[0:2], ['control', 'case'], bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)

    # Save before showing: some backends discard the figure once plt.show()
    # returns, which would leave an empty image on disk.
    if save_file is not None:
        sns_plot.get_figure().savefig(save_file,bbox_inches='tight')
    plt.show()

In [None]:
# Draw and save the plot for the selected gene, including the pooled
# 'All Haplogroups' category.
haplogroup_violin_plot(
    meta_quant,
    f"SRY ({gene}) Expression",
    haplos,
    save_file=f"{WRKDIR}/chrY/expression/amp_sry_counts_plot.png",
    combine_all=True,
)