# Annotate gene expression files with ancestry information (mtDNA haplogroup, global nuclear ancestry), mitonuclear DNA discordance, and available phenotypes.

In [1]:
import pandas as pd
#pd.set_option("display.max_rows", None, "display.max_columns", None)

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy import stats
import statistics
import statsmodels.api as sm
from itertools import permutations
#import pdfkit as pdf

import seaborn as sns; sns.set(style="ticks", color_codes=True)

from Bio import SeqIO
import sys

In [2]:
%%bash

# Download GTEx v8 normalized counts (gene TPMs).
wget https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
# The notebook reads the uncompressed ".gct" file, so decompress the download
# (-f overwrites a stale copy left by an earlier run).
gunzip -f GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz

## Import gene TPM files (from GTEx portal).

Detailed analyses by GTEx: https://gtexportal.org/home/documentationPage

In [None]:
# Load the GTEx v8 gene TPM table as tab-separated text, skipping the first two
# lines of the file (presumably the .gct version/dimension preamble - confirm).
df_tpm_v8 = pd.read_csv(
    "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct",
    sep="\t",
    skiprows=2,
)
# Tag the frame with the name of the variable that holds it.
df_tpm_v8.name = 'df_tpm_v8'

## Wrangle gene TPMs (from GTEx Portal)

Keep protein-coding genes in mitochondrial DNA (Gene name, short GTEx ID, and long GTEx ID).

Annotate the normalized counts (TPM) with population (AfAm/EuAm), tissue (e.g. Muscle - Skeletal), and mtDNA haplogroup (e.g. L).

In [4]:
# The list of mtDNA protein-coding genes.
# The 13 mtDNA protein-coding genes, named as in the "Description" column
# that the wrangling functions filter on.
list_mtdna = [
    'MT-ND1',
    'MT-ND2',
    'MT-CO1',
    'MT-CO2',
    'MT-ATP8',
    'MT-ATP6',
    'MT-CO3',
    'MT-ND3',
    'MT-ND4L',
    'MT-ND4',
    'MT-CYB',
    'MT-ND5',
    'MT-ND6',
]

In [None]:
def wrangle_counts(counts):
    """Reshape a wide gene-by-sample TPM table into long format for mtDNA genes.

    Parameters
    ----------
    counts : pandas.DataFrame
        Wide table with 'Name' and 'Description' columns; every other column
        is treated as one sample.

    Returns
    -------
    pandas.DataFrame
        One row per (gene, sample) with columns 'Gene', 'GTEX_ID' and 'TPM'.
        Only rows whose 'Description' is in the module-level ``list_mtdna``
        are kept.
    """
    # Keep mtDNA protein-coding genes only.
    is_mt_gene = counts['Description'].isin(list_mtdna)
    # Move sample IDs from column headers into a 'GTEX_ID' column.
    long_tpm = pd.melt(
        counts.loc[is_mt_gene],
        id_vars=['Name', 'Description'],
        var_name='GTEX_ID',
        value_name='TPM',
    )
    # Rename the identifier columns, then discard the Ensembl gene IDs.
    long_tpm = long_tpm.rename(columns={'Name': 'ENS_ID', 'Description': 'Gene'})
    long_tpm = long_tpm.drop(columns=['ENS_ID'])
    return long_tpm

def get_info():
    """Load the per-sample phenotype table and label self-reported race.

    Reads "../data/secure/phenotypes_v8.txt" (tab-separated, first line used
    as header) and replaces its column names with
    GTEX_ID, short_ID, self_rep_race, mtDNA_haplo and Tissue.

    Returns
    -------
    pandas.DataFrame
        The table with 'self_rep_race' recoded from 2/3 to 'AfAm'/'EuAm'.

    Raises
    ------
    KeyError
        If 'self_rep_race' contains a code other than 2 or 3.
    """
    race_labels = {2: 'AfAm', 3: 'EuAm'}
    info = pd.read_table("../data/secure/phenotypes_v8.txt")
    info.columns = ['GTEX_ID', 'short_ID', 'self_rep_race', 'mtDNA_haplo', 'Tissue']
    # Recode the numeric race codes as population labels.
    info['self_rep_race'] = list(map(race_labels.__getitem__, info['self_rep_race']))
    return info

def ann_counts(counts):
    """Attach sample annotations to the long-format mtDNA TPM table.

    Parameters
    ----------
    counts : pandas.DataFrame
        Wide TPM table, passed straight to ``wrangle_counts``.

    Returns
    -------
    pandas.DataFrame
        Inner join of ``wrangle_counts(counts)`` and ``get_info()`` on
        'GTEX_ID'; samples missing from either side are dropped.
    """
    tpm_long = wrangle_counts(counts)
    sample_info = get_info()
    return tpm_long.merge(sample_info, on=['GTEX_ID'])
    

# Build the annotated long-format mtDNA TPM table from the GTEx v8 TPM matrix.
df_tpm_v8_ann = ann_counts(df_tpm_v8)
# Tag the frame with the name of the variable that holds it.
df_tpm_v8_ann.name = "df_tpm_v8_ann"

## Remove the gene ID prefix `"MT-"`

In [8]:
# Remove the gene ID prefix "MT-".
def rem_mt_prefix(df):
    """Return the 'Gene' values of ``df`` with the "MT-" prefix removed.

    Parameters
    ----------
    df : pandas.DataFrame (or any mapping with a 'Gene' entry)
        Its 'Gene' values must be prefixed names of the 13 mtDNA
        protein-coding genes, e.g. 'MT-ND1'.

    Returns
    -------
    list of str
        Short gene names, in the same order as ``df['Gene']``.

    Raises
    ------
    KeyError
        For any value that is not one of the 13 prefixed names; this includes
        names that have already been stripped.
    """
    stems = ('ND1', 'ND2', 'CO1', 'CO2', 'ATP8', 'ATP6', 'CO3',
             'ND3', 'ND4L', 'ND4', 'ND5', 'CYB', 'ND6')
    # Prefixed name -> short name lookup table.
    short_names = {'MT-' + stem: stem for stem in stems}
    stripped = []
    for gene in df['Gene']:
        stripped.append(short_names[gene])
    return stripped


# Replace the prefixed gene names (e.g. 'MT-ND1') with their short forms.
# NOTE: re-running this cell raises KeyError, because rem_mt_prefix only
# accepts the prefixed names.
df_tpm_v8_ann['Gene'] = rem_mt_prefix(df_tpm_v8_ann)

# Mitonuclear discordance
- Annotate dataframe with global ancestry estimates.
- Estimate mitonuclear discordance using mtDNA haplogroup and global ancestry values.

In [19]:
def get_globalAnc(K):
    """Load the global-ancestry table for one K and add mitonuclear discordance.

    Parameters
    ----------
    K : str
        Label spliced into the file name, e.g. 'K2' reads
        "../data/2021-04-09/mitonuclearDiscordance_globalK2_GTExv8.txt".

    Returns
    -------
    pandas.DataFrame
        The table with 'subject' renamed to 'short_ID', 'self_rep' renamed to
        'self_rep_race' (recoded 2 -> 'AfAm', 3 -> 'EuAm'), and a new
        'mitonucl_discord' column computed by ``recalc_mnd``.

    Notes
    -----
    ``recalc_mnd`` is looked up when this function runs, so the result depends
    on which definition of ``recalc_mnd`` is current in the session.
    """
    race_labels = {2: 'AfAm', 3: 'EuAm'}
    path = "".join(("../data/2021-04-09/mitonuclearDiscordance_global", K, "_GTExv8.txt"))
    ancestry = pd.read_table(path, index_col=0)
    ancestry = ancestry.rename(columns={'subject': 'short_ID', 'self_rep': 'self_rep_race'})
    # Recode the numeric race codes; any other code raises KeyError.
    ancestry['self_rep_race'] = list(map(race_labels.__getitem__, ancestry['self_rep_race']))
    ancestry['mitonucl_discord'] = recalc_mnd(ancestry)
    return ancestry

def ann_globalAnc(tpm,K):
    """Join a TPM table with the global-ancestry table for the given K.

    Parameters
    ----------
    tpm : pandas.DataFrame
        Table carrying 'short_ID' and 'self_rep_race' columns.
    K : str
        Label forwarded to ``get_globalAnc`` (e.g. 'K2').

    Returns
    -------
    pandas.DataFrame
        Inner join of ``tpm`` and ``get_globalAnc(K)`` on
        ['short_ID', 'self_rep_race'].
    """
    ancestry = get_globalAnc(K)
    return pd.merge(left=tpm, right=ancestry, on=['short_ID', 'self_rep_race'])
    
def recalc_mnd(ann):
    """Compute mitonuclear discordance (MND) for every row of ``ann``.

    For each row, MND is ``1 - f``, where ``f`` is the global ancestry
    fraction in the column matching the row's 'mt_ancestry' label:

    * 'African'           -> 'global_af'
    * 'European'          -> 'global_eu'
    * 'Asian' or 'NatAm'  -> 'global_as'

    Any other label yields the sentinel value 447, so that a problem stands
    out in the output. The original code wrote ``mnd == 447`` (a comparison),
    so the sentinel was never stored: an unrecognised label either raised
    NameError or silently reused the previous row's value.

    Parameters
    ----------
    ann : pandas.DataFrame
        Must contain 'mt_ancestry' and the ancestry columns required by the
        labels actually present. A column is only read when a row needs it.

    Returns
    -------
    list
        One MND value per row, in row order.

    Notes
    -----
    The notebook later redefines ``recalc_mnd`` with different column names;
    whichever definition ran last is the one other functions call.
    """
    # mt_ancestry label -> column holding the matching nuclear ancestry fraction.
    matching_column = {
        'African': 'global_af',
        'European': 'global_eu',
        'Asian': 'global_as',
        'NatAm': 'global_as',
    }
    # If this value shows up downstream, something is wrong with the labels.
    unknown_sentinel = 447

    col_mnd = []
    for i, val in enumerate(ann['mt_ancestry']):
        column = matching_column.get(val)
        if column is None:
            mnd = unknown_sentinel
        else:
            # Positional lookup, so the frame's index labels do not matter.
            mnd = 1 - ann[column].iloc[i]
        col_mnd.append(mnd)
    return col_mnd
            

# Global ancestry table for K2, kept as a stand-alone frame.
globalK2 = get_globalAnc('K2')

# Annotate the TPM table with K2 global ancestry and mitonuclear discordance.
# ann_globalAnc calls get_globalAnc('K2') itself, so the file is read a second time.
df_tpm_v8_ann_K2 = ann_globalAnc(df_tpm_v8_ann,'K2')
# Tag the frame with the name of the variable that holds it.
df_tpm_v8_ann_K2.name = "df_tpm_v8_ann_K2"

#recalc_mnd(globalK2,'K2')

## Number of samples per tissue.

In [46]:
def tissue_counts():
    """Count samples per tissue, split by self-reported population.

    Reads the module-level ``df_tpm_v8_ann_K2`` and reduces it to unique
    (short_ID, self_rep_race, Tissue) combinations before counting, so each
    individual contributes at most once per tissue.

    The table is restricted to the tissues in the module-level ``list_tissue``.
    NOTE(review): ``list_tissue`` is not defined in the visible part of this
    notebook; it must exist in the session before this function is called.

    Returns
    -------
    pandas.DataFrame
        One row per tissue with float columns 'AfAm', 'EuAm' and 'Total',
        sorted by 'Total' in descending order.

    Raises
    ------
    KeyError
        If a tissue in ``list_tissue`` has no samples in either group.
    """
    data = df_tpm_v8_ann_K2[['short_ID','self_rep_race','Tissue']].drop_duplicates()
    groups = ['AfAm','EuAm']
    # One {tissue: n_samples} dict per population group.
    # (DataFrame.append, used previously, no longer exists in pandas >= 2.0.)
    per_group = [
        data.loc[data['self_rep_race'] == group, 'Tissue'].value_counts().to_dict()
        for group in groups
    ]
    # Rows are groups, columns are tissues; tissues absent from a group are NaN.
    out = pd.DataFrame(per_group, index=groups)
    # Filter tissues; cast to float so the dtype does not depend on whether
    # any count is missing.
    out = out[list_tissue].astype(float)
    # Transpose so that tissues become rows.
    out = out.transpose()
    # Add a column with the total number of samples per tissue.
    out['Total'] = out['AfAm'] + out['EuAm']
    # Sort by total, largest first.
    return out.sort_values('Total', ascending=False)

# Display the per-tissue sample counts (AfAm, EuAm, Total), largest total first.
tissue_counts()

Unnamed: 0,AfAm,EuAm,Total
Muscle - Skeletal,85.0,580.0,665.0
Whole Blood,79.0,550.0,629.0
Artery - Tibial,75.0,471.0,546.0
Nerve - Tibial,66.0,431.0,497.0
Esophagus - Muscularis,54.0,379.0,433.0
Heart - Left Ventricle,42.0,321.0,363.0
Heart - Atrial Appendage,41.0,311.0,352.0


## Number of samples per population.

In [47]:
# Keep one row per (individual, population) pair, then tally individuals per
# self-reported population and show the tally as a one-column table.
(
    df_tpm_v8_ann_K2[['short_ID','self_rep_race']]
    .drop_duplicates()['self_rep_race']
    .value_counts()
    .to_frame()
)

Unnamed: 0,self_rep_race
EuAm,688
AfAm,101


# ALTERNATIVE: Remake the global ancestry, mitohaplo merge.

In [None]:
# Input mt haplogroup info for GTEx samples.
def get_haplo():
    """Load mtDNA haplogroup calls and label each with a population of origin.

    Reads the header-less, tab-separated file "GTEx_v8_mtDNA_haplogroups.txt"
    and names its columns short_ID, mt_haplo and pop.

    Returns
    -------
    pandas.DataFrame
        The table with 'pop' recoded (3 -> 'EuAm', 2 -> 'AfAm') and a new
        'mt_haplo_pop' column derived from 'mt_haplo' via the lookup below.
        Haplogroups R, M and N are labelled 'Eurasian?'.

    Raises
    ------
    KeyError
        If 'pop' holds a code other than 2 or 3, or 'mt_haplo' holds a
        haplogroup missing from the lookup table.
    """
    pop_labels = {3: 'EuAm', 2: 'AfAm'}
    # Haplogroup letter -> population label.
    haplo_origin = {
        'L': 'African',
        'H': 'Eurasian', 'U': 'Eurasian', 'T': 'Eurasian', 'J': 'Eurasian',
        'K': 'Eurasian', 'I': 'Eurasian', 'V': 'Eurasian', 'W': 'Eurasian',
        'X': 'Eurasian',
        'C': 'Native American', 'B': 'Native American', 'A': 'Native American',
        'Z': 'Asian', 'F': 'Asian',
        'R': 'Eurasian?', 'M': 'Eurasian?', 'N': 'Eurasian?',
    }
    df_haplo = pd.read_table("GTEx_v8_mtDNA_haplogroups.txt", header=None)
    df_haplo.columns = ['short_ID', 'mt_haplo', 'pop']
    df_haplo['pop'] = list(map(pop_labels.__getitem__, df_haplo['pop']))
    df_haplo['mt_haplo_pop'] = list(map(haplo_origin.__getitem__, df_haplo['mt_haplo']))
    return df_haplo

# Input global ancestry estimated for K=2 using ADMIXTURE.
def get_global(
    ids_path="/Users/edmundo/Documents/GitHub/mitonuclear_gtex/mitonucl/data/2021-04-09/NATMERG-WGS-ALL_biall_nopal_filt_common_pruned_keep2.ids",
    q_path="/Users/edmundo/Documents/GitHub/mitonuclear_gtex/mitonucl/data/2021-04-09/NATMERG-WGS-ALL_biall_nopal_filt_common_pruned_keep2.2.Q",
):
    """Pair each sample ID with its two global-ancestry fractions.

    The two input files are matched by row position: line i of the ID file
    belongs to line i of the Q file.

    Parameters
    ----------
    ids_path : str, optional
        Header-less file with one sample ID per line. Defaults to the path
        originally hard-coded in this notebook.
    q_path : str, optional
        Header-less, space-separated file with two ancestry fractions per
        line. Defaults to the path originally hard-coded in this notebook.

    Returns
    -------
    pandas.DataFrame
        Columns 'short_ID', 'anc_afr' and 'anc_eur', one row per sample.
        NOTE(review): the code assumes the first Q column is African ancestry
        and the second European; confirm against the ADMIXTURE run.

    Raises
    ------
    ValueError
        If the two files have different numbers of rows. Previously the
        surplus rows were dropped silently by the inner merge.
    """
    df_ids = pd.read_table(ids_path, header=None)
    df_ids.columns = ['short_ID']
    df_Q = pd.read_table(q_path, header=None, sep=' ')
    df_Q.columns = ['anc_afr','anc_eur']
    if len(df_ids) != len(df_Q):
        raise ValueError(
            "ID file has {} rows but Q file has {} rows; they must match "
            "row for row.".format(len(df_ids), len(df_Q))
        )
    # Positional join: both frames carry a fresh 0..n-1 index.
    df_glob = pd.concat(
        [df_ids.reset_index(drop=True), df_Q.reset_index(drop=True)],
        axis=1,
    )
    return df_glob
    
# Compute mitonuclear DNA discordance.
def recalc_mnd(ann):
    """Compute mitonuclear discordance (MND) for every row of ``ann``.

    MND is one minus the nuclear ancestry fraction matching the row's
    'mt_haplo_pop' label: 'African' uses 'anc_afr' and 'Eurasian' uses
    'anc_eur'. Every other label (including 'Eurasian?', 'Asian' and
    'Native American') yields the string "Unknown".

    Parameters
    ----------
    ann : pandas.DataFrame
        Must contain 'mt_haplo_pop' plus the ancestry columns required by the
        labels actually present.

    Returns
    -------
    list
        One entry per row, in row order: a number, or the string "Unknown".
    """
    # mt haplogroup origin -> column with the matching nuclear ancestry.
    matching_fraction = {'African': 'anc_afr', 'Eurasian': 'anc_eur'}
    col_mnd = []
    for row, origin in enumerate(ann['mt_haplo_pop']):
        if origin in matching_fraction:
            # Positional lookup, so the frame's index labels do not matter.
            col_mnd.append(1 - ann[matching_fraction[origin]].iloc[row])
        else:
            col_mnd.append("Unknown")
    return col_mnd

# Merge these annotations into one file.
def join_global_haplo():
    """Combine global ancestry, mtDNA haplogroups and mitonuclear discordance.

    Returns
    -------
    pandas.DataFrame
        Inner join of ``get_global()`` and ``get_haplo()`` on 'short_ID',
        with an added 'mnd' column computed by ``recalc_mnd``.
    """
    ancestry = get_global()
    haplogroups = get_haplo()
    df_ann = ancestry.merge(haplogroups, on='short_ID')
    # Discordance is derived from the merged ancestry and haplogroup columns.
    return df_ann.assign(mnd=recalc_mnd(df_ann))


### Input the GTEx TPM file. ###
# Do not use. Old and missing haplogroups.
###df_tpm = pd.read_table('../results/2021-04/gtexportal_v8_tpm_ann.mt', sep='\t')[['Gene','GTEX_ID','TPM','short_ID','Tissue']]

# Import GTEx v8 gene reads (in TPM):
#df_tpm = pd.read_table("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct", skiprows=[0,1])
#df_tpm.to_csv("gtexportal_v8_tpm_ann.mt.tab", sep='\t')
# Load the locally saved copy of the GTEx v8 gene TPM table.
# NOTE(review): the commented-out to_csv call that writes this file does not
# pass index=False, so the file may carry an extra unnamed index column - verify.
df_tpm = pd.read_csv("gtexportal_v8_tpm_ann.mt.tab", sep='\t')

# Wrangle the GTEx Portal v8 "gene TPMs" file.
def wrangle_tpm( counts ):
    """Reshape a wide gene-by-sample TPM table into long format for mtDNA genes.

    Parameters
    ----------
    counts : pandas.DataFrame
        Wide table with 'Name' and 'Description' columns; every other column
        is treated as one sample whose header is the sample ID.

    Returns
    -------
    pandas.DataFrame
        One row per (gene, sample) with columns 'Gene', 'GTEX_ID', 'TPM' and
        'short_ID'. Only the 13 mtDNA protein-coding genes are kept.
        'short_ID' is the first two dash-separated fields of 'GTEX_ID'.
    """
    # The list of mtDNA protein-coding genes.
    list_mtdna = [ 'MT-ND1','MT-ND2','MT-CO1','MT-CO2','MT-ATP8','MT-ATP6','MT-CO3','MT-ND3','MT-ND4L','MT-ND4','MT-CYB','MT-ND5','MT-ND6' ]
    # Keep mtDNA protein-coding genes.
    counts_mt = counts[counts['Description'].isin(list_mtdna)]
    # Move sample IDs from column headers into rows.
    counts_mt = counts_mt.melt(id_vars=['Name','Description'], var_name='GTEX_ID', value_name='TPM')
    # Change header names.
    counts_mt = counts_mt.rename(columns={'Name':'ENS_ID','Description':'Gene'})
    # Get rid of the Ensembl gene IDs.
    counts_mt = counts_mt.drop(columns=['ENS_ID'])
    # Derive the short ID from this table's own sample IDs. The previous code
    # read the global `df_wrangled`, which is undefined (or stale) here.
    counts_mt['short_ID'] = ["-".join(x.split('-')[:2]) for x in counts_mt['GTEX_ID']]
    return counts_mt

# Remove the gene ID prefix "MT-".
def remove_mt_prefix(df):
    """Return the 'Gene' values of ``df`` with the "MT-" prefix removed.

    Parameters
    ----------
    df : pandas.DataFrame (or any mapping with a 'Gene' entry)
        Its 'Gene' values must be prefixed names of the 13 mtDNA
        protein-coding genes, e.g. 'MT-CO1'.

    Returns
    -------
    list of str
        Short gene names, in the same order as ``df['Gene']``.

    Raises
    ------
    KeyError
        For any value that is not one of the 13 prefixed names.
    """
    # (prefixed name, short name) pairs for the mtDNA protein-coding genes.
    pairs = (
        ('MT-ND1', 'ND1'), ('MT-ND2', 'ND2'), ('MT-CO1', 'CO1'),
        ('MT-CO2', 'CO2'), ('MT-ATP8', 'ATP8'), ('MT-ATP6', 'ATP6'),
        ('MT-CO3', 'CO3'), ('MT-ND3', 'ND3'), ('MT-ND4L', 'ND4L'),
        ('MT-ND4', 'ND4'), ('MT-ND5', 'ND5'), ('MT-CYB', 'CYB'),
        ('MT-ND6', 'ND6'),
    )
    lookup = dict(pairs)
    return list(map(lookup.__getitem__, df['Gene']))

# Join the TPM and annotations of global ancestry, mt haplogroup, and mitonuclear discordance.
def join_tpm_ann(counts):
    """Join mtDNA TPMs with ancestry, haplogroup and discordance annotations.

    Parameters
    ----------
    counts : pandas.DataFrame
        Wide TPM table, passed straight to ``wrangle_tpm``.

    Returns
    -------
    pandas.DataFrame
        Inner join of ``wrangle_tpm(counts)`` and ``join_global_haplo()`` on
        whatever columns the two tables share, with the "MT-" prefix removed
        from 'Gene'.
    """
    # Global ancestry, mt haplogroup and discordance annotations.
    annotations = join_global_haplo()
    tpm_long = wrangle_tpm(counts)
    # No explicit key: pandas joins on the columns common to both tables.
    merged = tpm_long.merge(annotations)
    merged['Gene'] = remove_mt_prefix(merged)
    return merged


#get_haplo()
#get_global()
#join_global_haplo()
#df_tpm_ann = join_tpm_ann()
#df_tpm_ann['Gene'] = remove_mt_prefix(df_tpm_ann)
#df_wrangled = wrangle_tpm(df_tpm)
# Build the annotated long-format mtDNA TPM table (ancestry, mt haplogroup and
# mitonuclear discordance) from the locally saved TPM table.
df_tpm_ann = join_tpm_ann(df_tpm)

In [None]:
#df_tpm.to_csv("gtexportal_v8_tpm_ann.mt.tab", sep='\t')