# Plot Y Chromosome Gene Blood Expression From AMP-PD Data with Different Y Chromosome Haplogroups
- **Author(s)** - Frank Grenn
- **Quick Description:** Get AMP-PD expression data, plot with haplogroup and case/control status

In [None]:

import os

import pandas as pd
import mygene

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [None]:
# Root directory under which every input and output path in this notebook is
# built. "/PATH" looks like a placeholder — set it to the real working
# directory before running.
WRKDIR = "/PATH"


## AMP-PD Gene Expression Data

In [None]:
# Load the AMP-PD chrY counts table and use its "Geneid" column as the index.
amp_counts = pd.read_csv(
    f"{WRKDIR}/chrY/expression/amppd_chrY_featureCounts.csv"
).set_index("Geneid")

# Quick look: overall dimensions, then the top-left 5 x 5 corner.
print(amp_counts.shape)
print(amp_counts.iloc[:5, :5])

## Sample Data


In [None]:
# Sample metadata table (chrY_meta.csv) from the chrY analysis output folder.
meta = pd.read_csv(
    f"{WRKDIR}/chrY/output_male_hemizygous_only_het_filter_run/chrY_meta.csv"
)

# Sanity check: dimensions followed by the first rows.
print(meta.shape, meta.head(), sep="\n")

## Get list of genes to plot

In [None]:
# Differential-expression results table for haplogroup G (edgeR output CSV).
results = pd.read_csv(
    f"{WRKDIR}/chrY/expression/amppd_haplogroup_G_diff_exp_edgeR_results.csv"
)

# Sanity check: dimensions followed by the first rows.
print(results.shape, results.head(), sep="\n")

In [None]:
# Keep rows with FDR < 0.05 and logFC > 2, then take the values of the first
# column of the results table as the list of genes to plot.
genes_to_plot = (
    results.loc[results["FDR"].lt(0.05) & results["logFC"].gt(2)]
    .iloc[:, 0]
    .tolist()
)
print(len(genes_to_plot))

In [None]:
# Show the selected gene identifiers.
print(genes_to_plot)

## Plot for multiple genes

In [None]:
# Rows and columns of the subplot grid. The plotting loop draws at most
# rows * cols genes from genes_to_plot, so size the grid to fit that list.
rows = 2
cols = 2

# Haplogroups to include in the plots
haplos = ['R','I','J','E','G']

In [None]:


# Draw a rows x cols grid of violin + strip plots, one panel per gene, showing
# counts by major Y haplogroup with phenotype as the hue, then save the figure.

# Apply the seaborn theme once, before any axes exist, so that every panel
# (including the first) is created with the same style.
sns.set()

fig = plt.figure(figsize=(12, 8), dpi=80)
fig.subplots_adjust(hspace=0.5, wspace=0.5)

subplot_count = 0
# The grid only has rows * cols slots; genes beyond that are not plotted.
for gene in genes_to_plot[:rows * cols]:
    print(gene)

    # Entries of genes_to_plot are matched as substrings of the amp_counts
    # index labels; the first matching label is the one that gets plotted.
    matching_ids = [g for g in amp_counts.index if gene in g]
    if not matching_ids:
        print(f"{gene} not found in amp_counts; skipping")
        continue
    full_gene = matching_ids[0]
    print(full_gene)

    # Select exactly the row that is renamed to 'counts' below, so a substring
    # that hits several index labels cannot add extra columns to the merge.
    gene_counts = amp_counts.loc[[full_gene]].transpose()
    print(gene_counts.shape)

    # meta['fid'] is matched against the index of the transposed counts.
    meta_quant = pd.merge(left=meta, left_on='fid', right=gene_counts, right_index=True)
    meta_quant = meta_quant.rename(columns={full_gene: 'counts'})
    print(meta_quant.shape)
    print(meta_quant.head())

    # Panels are numbered from 1 and only advance when a gene is drawn.
    subplot_count += 1
    ax = fig.add_subplot(rows, cols, subplot_count)

    haplo_order = sorted(haplos)
    sns.violinplot(palette="tab10", x='yhaplo_haplo_major', y='counts', data=meta_quant,
                   inner="box", order=haplo_order, hue='pheno', split=False, ax=ax)
    sns.stripplot(palette="tab10", linewidth=1, x='yhaplo_haplo_major', y='counts',
                  data=meta_quant, color=".4", order=haplo_order, hue='pheno',
                  dodge=True, ax=ax)

    ax.set_xlabel("Y Chromosome Haplogroup")
    ax.set_ylabel("Counts")
    ax.set_title(f"{full_gene}")

    # Both plot calls contribute legend entries; keep only the first two
    # handles so each phenotype is listed once.
    # NOTE(review): the labels are hard-coded and assume the first hue level
    # of 'pheno' is control and the second is case — confirm against the
    # coding used in the metadata.
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(handles[:2], ['control', 'case'], bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)

fig.suptitle("Genes Highly Expressed in AMP-PD Samples with Major Haplogroup G")

# Save through the figure handle before showing it. bbox_inches='tight' keeps
# the legends, which are anchored outside the axes, inside the saved image.
fig.savefig(f"{WRKDIR}/chrY/expression/haplo_g_amppd_counts_violin_multiplot.png", bbox_inches='tight')
plt.show()




In [None]:
# Inspect the amp_counts row for a single gene ID.
# NOTE(review): presumably SRY, going by the "SRY (...)" plot title used at
# the end of this notebook — confirm.
amp_counts.loc["ENSG00000184895.7",]

In [None]:



# Grid of genotype-vs-expression violin + strip plots for one variant in one
# cell type, one panel per gene, titled with the eQTL p-value when exactly one
# matching eQTL row is found.
#
# NOTE(review): this cell reads names that are not defined anywhere in the
# visible notebook (genos, variant, celltype, cistrans, processing_output,
# output_path, combined_violin_plot_output, ctname). It looks like it was
# carried over from an eQTL notebook — confirm those are defined before
# running it, or remove the cell.

# Apply the seaborn theme before any axes are created so all panels match.
sns.set()

fig = plt.figure(figsize=(10, 14), dpi=80)
fig.subplots_adjust(hspace=0.5, wspace=0.5)

# Rows of genos whose FID equals the variant, transposed; the first transposed
# row becomes the column names and the 'FID' row itself is then dropped.
# This does not depend on the gene, so it is built once.
var_geno = genos[genos.FID == variant].T
var_geno.columns = var_geno.iloc[0, :]
var_geno = var_geno.drop('FID', axis=0)

# Expression table for the cell type; also gene-independent, so read once.
exp_table = pd.read_table(f"{processing_output}/{celltype}_0025_15.txt")
genes_with_expression = set(exp_table.GENES)

# eQTL results are only needed once a gene is found in the expression table,
# so the file is read lazily and then reused.
eqtl_output = None

subplot_count = 0
# The grid only has rows * cols slots; genes beyond that are not plotted.
for gene in genes_to_plot[:rows * cols]:
    if gene not in genes_with_expression:
        print(f"{gene} not in {celltype} data")
        continue

    if eqtl_output is None:
        eqtl_output = pd.read_table(f"{output_path}/{cistrans}_{celltype}_0025_15.txt")

    # Default panel title; extended with the p-value when exactly one eQTL
    # row matches this variant/gene pair.
    subplot_title = celltype
    eqtl_row = eqtl_output[(eqtl_output.snps == variant) & (eqtl_output.gene == gene)]
    if len(eqtl_row) == 1:
        print(eqtl_row)
        pval = eqtl_row.pvalue.values[0]
        subplot_title = f"{celltype} \n eQTL pvalue:{pval}"
        print(f"has pvalue:{pval}")

    print(f"{gene} in {celltype} data")

    # Same reshape as for the genotypes: select the gene's rows, transpose,
    # promote the first row to column names, drop the 'GENES' row.
    exp_data = exp_table[exp_table.GENES == gene].T
    exp_data.columns = exp_data.iloc[0, :]
    exp_data = exp_data.drop('GENES', axis=0)

    # Join genotype and expression on their shared index and make both numeric.
    merged = pd.merge(left=var_geno, right=exp_data, left_index=True, right_index=True).astype('float64')

    # Panels are numbered from 1 and only advance when a gene is drawn.
    subplot_count += 1
    ax = fig.add_subplot(rows, cols, subplot_count)

    sns.violinplot(x=variant, y=gene, data=merged, inner="box", ax=ax)
    sns.stripplot(x=variant, y=gene, data=merged, color=".4", ax=ax)
    ax.set_xlabel(f"{variant} genotype")
    ax.set_ylabel(gene)
    ax.set_title(f"{subplot_title}")

# The title and file name use the last gene visited by the loop.
fig.suptitle(f"{variant} Genotypes and {gene} Expression")

# Save through the figure handle before showing it.
fig.savefig(f"{combined_violin_plot_output}/{variant}_{cistrans}_eQTL_{gene}_{ctname}.png")
plt.show()





In [None]:
# For each selected gene, report how many amp_counts rows its ID matches
# (substring match against the index labels) and preview the matched counts.
for g in genes_to_plot:
    print(g)

    matched = amp_counts[[g in gene_id for gene_id in amp_counts.index]]
    print(matched.shape)
    print(matched.transpose().iloc[:5, :5])

In [None]:
# Gene ID(s) examined individually in the cells that follow.
genes = ["ENSG00000184895.7"]

# Those cells index amp_counts and label the plot with a scalar `gene`.
# Define it here so they use this ID rather than whatever value `gene` was
# left holding by the earlier plotting loops.
gene = genes[0]

In [None]:
# The amp_counts row for `gene`, converted to a one-column DataFrame.
amp_counts.loc[gene,].to_frame()

In [None]:
# Summary statistics of the amp_counts row for `gene`.
amp_counts.loc[gene,].describe()

In [None]:
# Attach the counts for `gene` to the sample metadata: meta['fid'] is matched
# against the index of the one-column counts frame, and that column is then
# renamed to 'counts' for plotting.
meta_quant = pd.merge(
    left=meta,
    right=amp_counts.loc[gene,].to_frame(),
    left_on='fid',
    right_index=True,
).rename(columns={gene: 'counts'})

# Sanity check: dimensions followed by the first rows.
print(meta_quant.shape, meta_quant.head(), sep="\n")

In [None]:
# Count rows per major haplogroup to see which groups are worth plotting.
meta_quant.yhaplo_haplo_major.value_counts()

In [None]:
# Major haplogroups to include in the single-gene plot.
haplos = ['R','I','J','E','G']

In [None]:
def haplogroup_violin_plot(df, title, haplogroups, save_file = None, combine_all=True):
    """Draw violin + strip plots of counts per major Y haplogroup, split by phenotype.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'yhaplo_haplo_major' (x axis), 'counts'
        (y axis) and 'pheno' (hue). The caller's frame is not modified.
    title : str
        Title placed above the axes.
    haplogroups : sequence of str
        Haplogroups to show; they are plotted in sorted order. Rows of ``df``
        in other haplogroups are not drawn. The sequence is not modified.
    save_file : str or None, optional
        If given, the figure is written to this path.
    combine_all : bool, optional
        If True, rows belonging to ``haplogroups`` are duplicated under the
        label 'All Haplogroups' so a pooled group is drawn alongside the
        individual haplogroups.

    Returns
    -------
    None
    """
    if combine_all:
        # Duplicate the rows of the requested haplogroups under one pooled
        # label. assign() returns a new frame, so nothing is written into a
        # filtered view of the caller's data.
        pooled = df[df.yhaplo_haplo_major.isin(haplogroups)].assign(
            yhaplo_haplo_major='All Haplogroups'
        )
        # DataFrame.append was removed in pandas 2.0; concat is its replacement.
        df = pd.concat([df, pooled], ignore_index=True)
        haplogroups = list(haplogroups) + ['All Haplogroups']

    # Apply the seaborn theme before the figure and axes are created.
    sns.set()
    fig, ax = plt.subplots(figsize=(7, 5))

    haplo_order = sorted(haplogroups)
    sns.violinplot(palette="tab10", x='yhaplo_haplo_major', y='counts', data=df,
                   inner="box", order=haplo_order, hue='pheno', split=False, ax=ax)
    sns.stripplot(palette="tab10", linewidth=1, x='yhaplo_haplo_major', y='counts',
                  data=df, color=".4", order=haplo_order, hue='pheno', dodge=True, ax=ax)

    ax.set_xlabel("Y Chromosome Haplogroup")
    ax.set_ylabel("Counts")
    ax.set_title(title)

    # Both plot calls contribute legend entries; keep only the first two
    # handles so each phenotype is listed once.
    # NOTE(review): the labels are hard-coded and assume the first hue level
    # of 'pheno' is control and the second is case — confirm against the
    # coding used in the metadata.
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(handles[:2], ['control', 'case'], bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)

    # Save through the figure handle before showing it. bbox_inches='tight'
    # keeps the legend, which is anchored outside the axes, in the image.
    if save_file is not None:
        fig.savefig(save_file, bbox_inches='tight')
    plt.show()

In [None]:
# Plot meta_quant's counts by haplogroup for the haplogroups in `haplos`,
# including the pooled 'All Haplogroups' group (combine_all=True), and save
# the figure as a PNG under WRKDIR.
haplogroup_violin_plot(meta_quant,f"SRY ({gene}) Expression", haplos, save_file = f"{WRKDIR}/chrY/expression/amp_sry_counts_plot.png",combine_all=True)