In [1]:
## Daniel Marten 
## Significance Values for Mean Count and TPMs

import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import statistics
import sys
import os
import re
import scipy
import statsmodels
import statsmodels.api as sm
import statsmodels.stats.weightstats as sm_stats
import seaborn as sns

from statsmodels.formula.api import ols as formula_OLS


from cmapPy.pandasGEXpress.parse_gct import parse as tpm_parser




In [2]:
# Groupings of the major tissue categories used

ectoDerm = [
    "Nerve_Tibial",
    "Skin Not Sun Exposed (Suprapubic)",
    "Skin_Sun_Exposed_Lower_leg",
    "Breast - Mammary Tissue",
    "Pituitary",
    "Adrenal Gland",
    "Brain - Cerebellum",
    "Brain - Cortex",
    "Cervix - Endocervix",
    "Minor Salivary Gland",
    ]


brainDerm = [
    "Brain - Nucleus accumbens (basal ganglia)",
    "Brain - Caudate (basal ganglia)",
    "Brain - Cerebellar Hemisphere",
    "Brain - Frontal Cortex (BA9)",
    "Brain - Hypothalamus",
    "Brain - Putamen (basal ganglia)",
    "Brain - Hippocampus",
    "Brain - Anterior cingulate cortex (BA24)",
    "Brain - Amygdala",
    "Brain - Spinal cord (cervical c-1)",
    "Brain - Substantia nigra",
    "Brain - Cerebellum",
    "Brain - Cortex",]

mesoDerm = [
    "Muscle - Skeletal",
    "Whole Blood",
    "Adipose - Subcutaneous",
    "Cells - Cultured fibroblasts",
    "Adipose - Visceral (Omentum)",
    "Esophagus - Muscularis",
    "Artery - Aorta",
    "Heart - Left Ventricle",
    "Heart - Atrial Appendage",
    "Spleen",
    "Prostate",
    "Artery - Coronary",
    "Artery - Tibial",
    "Cells - EBV-transformed lymphocytes",
    "Vagina",
    "Uterus",
    "Kidney - Cortex",
    "Bladder",
    "Cervix - Ectocervix",
    "Fallopian Tube",
    "Kidney - Medulla",]

endoDerm = [
    "Thyroid",
    "Lung",
    "Esophagus - Mucosa",
    "Colon - Transverse",
    "Esophagus - Gastroesophageal Junction",
    "Stomach",
    "Colon - Sigmoid",
    "Pancreas",
    "Liver",
    "Small Intestine - Terminal Ileum"]

germline = [
    'Testis',
    'Ovary'
]

tissuecats = {'Brain':[brainDerm],
              'Ecto':[ectoDerm],
              'Meso':[mesoDerm],
              'Endo':[endoDerm],
              'Testis':['Testis'],
              'Ovary':['Ovary']} 

In [3]:
# Regular expression work for matching
for dermkey in tissuecats.keys():
    tissuecats[dermkey].append([re.sub(r'\W+', '', xi).lower().replace('_','') for xi in tissuecats[dermkey][0]])

In [4]:
# Read in mean counts of all genes-or-controls in all 54 tissues 
df_meancount = pd.read_csv('meancount_54tissues_withORFseqsJoined_29241total_21436genes_47removed_7805controls.tsv',sep='\t',index_col='Name')
df_meancount.columns # and display all column names

  df_meancount = pd.read_csv('meancount_54tissues_withORFseqsJoined_29241total_21436genes_47removed_7805controls.tsv',sep='\t',index_col='Name')


Index(['adiposesubcutaneous', 'adiposevisceralomentum', 'adrenalgland',
       'arteryaorta', 'arterycoronary', 'arterytibial', 'bladder',
       'brainamygdala', 'brainanteriorcingulatecortexba24',
       'braincaudatebasalganglia', 'braincerebellarhemisphere',
       'braincerebellum', 'braincortex', 'brainfrontalcortexba9',
       'brainhippocampus', 'brainhypothalamus',
       'brainnucleusaccumbensbasalganglia', 'brainputamenbasalganglia',
       'brainspinalcordcervicalc1', 'brainsubstantianigra',
       'breastmammarytissue', 'cellsculturedfibroblasts',
       'cellsebvtransformedlymphocytes', 'cervixectocervix',
       'cervixendocervix', 'colonsigmoid', 'colontransverse',
       'esophagusgastroesophagealjunction', 'esophagusmucosa',
       'esophagusmuscularis', 'fallopiantube', 'heartatrialappendage',
       'heartleftventricle', 'kidneycortex', 'kidneymedulla', 'liver', 'lung',
       'minorsalivarygland', 'muscleskeletal', 'nervetibial', 'pancreas',
       'pituitary', 'pr

In [5]:
# Drop columns not used in groupings for significance counts
# - makes pandas 'melt' logic difficult

df_meancount= df_meancount.drop(['evoera', 'annotation', 'ps', 'description', 'plength',
       'gapgene', 'chr', 'oldlongtranscriptstarthg38',
       'oldlongtranscriptendhg38', 'strand', 'cdsstarthg38', 'cdsendhg38',
       'proteinsequence', 'cdssequence', 'inold', 'genestart', 'genestop',
       'innew', 'evoera38', 'evoera538', 'annotation38', 'brain',
       'ecto', 'meso', 'endo', 'ORF_Control_Set', 'ORF_Gene_Length',
       'ORF_Nucleic_Seq', 'ORF_AA_Seq', 'ORF_Plength', 'Joined_Plength','geneid',
       'Joined_Pseq', 'Joined_GeneSeq'],axis=1)
df_meancount

Unnamed: 0_level_0,adiposesubcutaneous,adiposevisceralomentum,adrenalgland,arteryaorta,arterycoronary,arterytibial,bladder,brainamygdala,brainanteriorcingulatecortexba24,braincaudatebasalganglia,...,smallintestineterminalileum,spleen,stomach,thyroid,uterus,vagina,wholeblood,ovary,testis,evoera5
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Baz_Hs_1,111.23101,102.85601,92.99423,57.33489,74.23024,48.84502,40.02540,31.48850,24.58174,13.71341,...,34.56696,143.61683,22.86498,125.09515,28.85724,45.87489,162.84939,12.36130,6.97662,5-Primate
Baz_Hs_10,177.68898,163.43796,500.49319,256.47366,271.80059,243.99274,318.44142,268.11229,291.90454,345.83109,...,308.93844,380.62067,362.45395,357.40362,345.79863,327.91935,59.59042,830.07000,201.40222,5-Primate
Baz_Hs_103,35.51803,35.50075,48.26199,36.34178,36.78436,41.45156,40.78227,38.05410,42.33698,39.54385,...,29.40110,32.84427,29.19821,34.65814,36.20726,32.81738,9.08867,35.52709,32.54088,5-Primate
Baz_Hs_108,171.90412,14.81687,18.29058,212.37097,221.88249,199.52621,34.83342,15.83672,3.95681,25.68333,...,6.20162,8.73152,109.97821,29.36997,36.59781,138.86082,1.44878,56.54748,1494.23209,5-Primate
Baz_Hs_112,68.48757,54.71876,30.09576,45.70655,46.41960,48.74929,40.97488,26.73542,29.43152,28.15127,...,45.96444,27.92372,31.67508,65.19148,56.65446,56.96237,13.41577,62.93445,49.51862,5-Primate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vdp2013_S4_994,41.25872,9.46195,116.29349,13.61857,21.68078,14.82259,2.75110,86.05548,134.37962,60.13233,...,2.38709,2.00565,1.62410,4.41251,3.75358,3.82522,2.79571,9.81327,9.45471,5-Primate
vdp2013_S4_995,114.22039,101.54883,68.93886,91.20602,93.59158,116.57940,115.27335,27.66570,30.40872,39.47657,...,82.83554,53.80745,71.18307,94.48100,99.45430,190.27711,28.26805,99.02971,77.27178,5-Primate
vdp2013_S4_997,44.13762,33.59868,27.36927,34.07429,35.58481,41.66631,36.37751,34.32911,49.21922,32.17122,...,41.68812,28.70325,37.42835,31.26090,30.04364,36.78778,20.00996,23.59714,28.65766,5-Primate
vdp2013_S4_998,251.08933,211.76876,259.63283,181.18146,197.17140,179.48804,215.75188,104.99212,124.67759,152.83093,...,278.27528,301.76628,157.19595,280.34296,270.41003,201.19458,111.39997,242.41815,298.21205,5-Primate


In [6]:
# melt_df - dataframe produced by pd.melt()
melt_df = pd.melt(frame=df_meancount,id_vars = ['evoera5'],var_name='Tissue',value_name='Mean(Count)',ignore_index=False)
melt_df


Unnamed: 0_level_0,evoera5,Tissue,Mean(Count)
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Baz_Hs_1,5-Primate,adiposesubcutaneous,111.23101
Baz_Hs_10,5-Primate,adiposesubcutaneous,177.68898
Baz_Hs_103,5-Primate,adiposesubcutaneous,35.51803
Baz_Hs_108,5-Primate,adiposesubcutaneous,171.90412
Baz_Hs_112,5-Primate,adiposesubcutaneous,68.48757
...,...,...,...
vdp2013_S4_994,5-Primate,testis,9.45471
vdp2013_S4_995,5-Primate,testis,77.27178
vdp2013_S4_997,5-Primate,testis,28.65766
vdp2013_S4_998,5-Primate,testis,298.21205


In [7]:
len(list(melt_df.Tissue.unique())) # check that we have 54 unique tissues

54

In [8]:
tissuecats['Testis'][1] = ['testis']
tissuecats['Ovary'][1] = ['ovary']

In [9]:
# Create empty lists that will become the dataframe  

dfResults = pd.DataFrame()

left = []
right = []
floatvals = []
pvals = []
meanvals = []
lenvals = []

eragroups = ['1-Ancient', '2-Metazoa', '3-Chordate', '4-Mammal', '5-Primate','6-Intergenic ORF Control', '7-Intergenic Non-ORF Control']
tissueCatKeys = ['Brain', 'Ecto', 'Meso', 'Endo', 'Testis', 'Ovary']

In [10]:
melt_df.evoera5.unique() # ensure the correct number of evolutionary categories - 5 eras and two controls 

array(['5-Primate', '3-Chordate', '2-Metazoa', '1-Ancient', '4-Mammal',
       '6-Intergenic ORF Control', '7-Intergenic Non-ORF Control'],
      dtype=object)

In [11]:
def sig_table(eras_input,tissue_input,tissue_cats_in,incoming_df,era_col,tissue_col,verbose=False):
    
    
    # Create empty lists that will become the dataframe  

    dfResults = pd.DataFrame()

    left = []
    right = []
    floatvals = []
    pvals = []
    meanvals = []
    lenvals = []
    
    for x1_idx in range(len(eras_input)):
        for x2 in eras_input[x1_idx:]:
            x1 = eras_input[x1_idx]
            # now x1 and x2 are every combination of era labels
            for y1_idx in range(len(tissue_input)):
                for y2 in tissue_input[y1_idx:]:
                    y1 = tissue_input[y1_idx]
                    #print(x1,x2)
                    #print(y1,y2)
                    # now y1 and y2 are every combination of tissue group labels
                    if verbose:
                        print(f'comparing ({x1},{y1}) to ({x2},{y2})')
                    if (x1==x2) and (y1==y2):
                        if verbose:
                            print('Duplicate - pass')
                    else:
                        xinput = incoming_df[(incoming_df[era_col]==x1)&(incoming_df.Tissue.isin(tissue_cats_in[y1][1]))]['Mean(Count)']
                        x_mean,x_median,x_std,x_stderr,x_count = xinput.mean(),xinput.median(),xinput.std(),xinput.sem(),len(xinput)
                        
                        yinput = incoming_df[(incoming_df[era_col]==x2)&(incoming_df.Tissue.isin(tissue_cats_in[y2][1]))]['Mean(Count)']
                        y_mean,y_median,y_std,y_stderr,y_count = yinput.mean(),yinput.median(),yinput.std(),yinput.sem(),len(yinput)

                        statsboi = scipy.stats.mannwhitneyu(x=xinput,y=yinput,nan_policy='omit')

                        left.append((x1,y1,x_mean,x_median,x_std,x_stderr,x_count))
                        right.append((x2,y2,y_mean,y_median,y_std,y_stderr,y_count))
                        floatvals.append(statsboi[0])
                        pvals.append(statsboi[1])

                
    dfResults['Left Era'] = [tup[0] for tup in left]
    dfResults['Left Tissue_Group'] = [tup[1] for tup in left]
    dfResults['Left Mean'] = [tup[2] for tup in left]
    dfResults['Left Median'] = [tup[3] for tup in left]
    dfResults['Left STDev'] = [tup[4] for tup in left]
    dfResults['Left STDerr'] = [tup[5] for tup in left]
    dfResults['Left Count'] = [tup[-1] for tup in left]

    dfResults['Right Era'] = [tup[0] for tup in right]
    dfResults['Right Tissue_Group'] = [tup[1] for tup in right]
    dfResults['Right Mean'] = [tup[2] for tup in right]
    dfResults['Right Median'] = [tup[3] for tup in right]
    dfResults['Right STDev'] = [tup[4] for tup in right]
    dfResults['Right STDerr'] = [tup[5] for tup in right]
    dfResults['Right Count'] = [tup[-1] for tup in right]

    dfResults['Floats'] = floatvals
    dfResults['pvals'] = pvals
    dfResults.sort_values(by='pvals',inplace=True)
    dfResults.reset_index(drop=True,inplace=True)
    
    display(dfResults)
    
    return dfResults

    
    

In [12]:
# Create dataframe with meancount significance results
df_mc_sig = sig_table(eragroups,tissueCatKeys,tissuecats,melt_df,'evoera5','Tissue')

Unnamed: 0,Left Era,Left Tissue_Group,Left Mean,Left Median,Left STDev,Left STDerr,Left Count,Right Era,Right Tissue_Group,Right Mean,Right Median,Right STDev,Right STDerr,Right Count,Floats,pvals
0,7-Intergenic Non-ORF Control,Testis,0.921392,0.037800,17.767772,0.284914,3889,7-Intergenic Non-ORF Control,Ovary,0.178024,0.004340,1.409193,0.022597,3889,1.150296e+07,0.000000
1,2-Metazoa,Brain,1721.461516,328.441070,9558.063627,55.849289,29289,7-Intergenic Non-ORF Control,Ecto,0.233230,0.003820,7.013826,0.035566,38890,1.128950e+09,0.000000
2,2-Metazoa,Brain,1721.461516,328.441070,9558.063627,55.849289,29289,7-Intergenic Non-ORF Control,Brain,0.211228,0.004040,3.910108,0.017390,50557,1.468557e+09,0.000000
3,2-Metazoa,Ovary,1890.167011,309.665330,9386.738232,197.758023,2253,6-Intergenic ORF Control,Ovary,0.438971,0.005585,4.369805,0.069830,3916,8.702134e+06,0.000000
4,2-Metazoa,Testis,1796.827462,624.776470,4070.370209,85.753788,2253,6-Intergenic ORF Control,Ovary,0.438971,0.005585,4.369805,0.069830,3916,8.799924e+06,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,2-Metazoa,Meso,1761.753076,299.176330,5841.652393,26.856256,47313,3-Chordate,Testis,1450.461392,228.582480,5144.033564,129.248706,1584,3.734991e+07,0.825292
542,6-Intergenic ORF Control,Endo,0.392231,0.004260,7.835031,0.039593,39160,6-Intergenic ORF Control,Ovary,0.438971,0.005585,4.369805,0.069830,3916,7.656610e+07,0.880365
543,4-Mammal,Endo,455.631198,5.001175,4178.938264,50.491779,6850,4-Mammal,Ovary,423.404540,5.318960,2221.758628,84.889034,685,2.352850e+06,0.901406
544,2-Metazoa,Brain,1721.461516,328.441070,9558.063627,55.849289,29289,2-Metazoa,Endo,1569.664639,328.970630,5154.494631,34.340412,22530,3.300875e+08,0.930641


In [13]:
# View homemade BH corrections to compare against
#df_results_old = dfResults.copy()
df_homemade = df_mc_sig.sort_values(by='pvals').copy()
df_homemade['rank'] = [rankval for rankval in range(1,546+1)]
df_homemade['adjp'] = [None]*546
for xirow,yirow in df_homemade.iterrows():
    adjp = (yirow['pvals']/yirow['rank'])*546
    df_homemade.loc[xirow,'adjp'] = adjp

display(df_homemade)

Unnamed: 0,Left Era,Left Tissue_Group,Left Mean,Left Median,Left STDev,Left STDerr,Left Count,Right Era,Right Tissue_Group,Right Mean,Right Median,Right STDev,Right STDerr,Right Count,Floats,pvals,rank,adjp
0,7-Intergenic Non-ORF Control,Testis,0.921392,0.037800,17.767772,0.284914,3889,7-Intergenic Non-ORF Control,Ovary,0.178024,0.004340,1.409193,0.022597,3889,1.150296e+07,0.000000,1,0.0
224,1-Ancient,Ecto,2709.178059,930.761210,11192.376669,29.049069,148450,2-Metazoa,Endo,1569.664639,328.970630,5154.494631,34.340412,22530,2.015659e+09,0.000000,2,0.0
223,1-Ancient,Brain,2191.355570,738.423280,5878.216542,13.380853,192985,2-Metazoa,Brain,1721.461516,328.441070,9558.063627,55.849289,29289,3.287988e+09,0.000000,3,0.0
222,1-Ancient,Brain,2191.355570,738.423280,5878.216542,13.380853,192985,2-Metazoa,Meso,1761.753076,299.176330,5841.652393,26.856256,47313,5.317841e+09,0.000000,4,0.0
221,1-Ancient,Brain,2191.355570,738.423280,5878.216542,13.380853,192985,2-Metazoa,Endo,1569.664639,328.970630,5154.494631,34.340412,22530,2.521216e+09,0.000000,5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,2-Metazoa,Meso,1761.753076,299.176330,5841.652393,26.856256,47313,3-Chordate,Testis,1450.461392,228.582480,5144.033564,129.248706,1584,3.734991e+07,0.825292,542,0.831382
542,6-Intergenic ORF Control,Endo,0.392231,0.004260,7.835031,0.039593,39160,6-Intergenic ORF Control,Ovary,0.438971,0.005585,4.369805,0.069830,3916,7.656610e+07,0.880365,543,0.885229
543,4-Mammal,Endo,455.631198,5.001175,4178.938264,50.491779,6850,4-Mammal,Ovary,423.404540,5.318960,2221.758628,84.889034,685,2.352850e+06,0.901406,544,0.90472
544,2-Metazoa,Brain,1721.461516,328.441070,9558.063627,55.849289,29289,2-Metazoa,Endo,1569.664639,328.970630,5154.494631,34.340412,22530,3.300875e+08,0.930641,545,0.932349


In [14]:
# corrected pvals from statsmodels
fixedpvals = statsmodels.stats.multitest.multipletests(df_mc_sig['pvals'],method='fdr_bh')[1]

# scipy.stats.false_discovery_control(dfResults['pvals'],method='bh')

df_homemade['Adjusted_Pvals_BH'] = fixedpvals
df_homemade['Adjusted_Pvals < 0.05'] = [fp<0.05 for fp in fixedpvals]
df_homemade

Unnamed: 0,Left Era,Left Tissue_Group,Left Mean,Left Median,Left STDev,Left STDerr,Left Count,Right Era,Right Tissue_Group,Right Mean,Right Median,Right STDev,Right STDerr,Right Count,Floats,pvals,rank,adjp,Adjusted_Pvals_BH,Adjusted_Pvals < 0.05
0,7-Intergenic Non-ORF Control,Testis,0.921392,0.037800,17.767772,0.284914,3889,7-Intergenic Non-ORF Control,Ovary,0.178024,0.004340,1.409193,0.022597,3889,1.150296e+07,0.000000,1,0.0,0.000000,True
224,1-Ancient,Ecto,2709.178059,930.761210,11192.376669,29.049069,148450,2-Metazoa,Endo,1569.664639,328.970630,5154.494631,34.340412,22530,2.015659e+09,0.000000,2,0.0,0.000000,True
223,1-Ancient,Brain,2191.355570,738.423280,5878.216542,13.380853,192985,2-Metazoa,Brain,1721.461516,328.441070,9558.063627,55.849289,29289,3.287988e+09,0.000000,3,0.0,0.000000,True
222,1-Ancient,Brain,2191.355570,738.423280,5878.216542,13.380853,192985,2-Metazoa,Meso,1761.753076,299.176330,5841.652393,26.856256,47313,5.317841e+09,0.000000,4,0.0,0.000000,True
221,1-Ancient,Brain,2191.355570,738.423280,5878.216542,13.380853,192985,2-Metazoa,Endo,1569.664639,328.970630,5154.494631,34.340412,22530,2.521216e+09,0.000000,5,0.0,0.000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,2-Metazoa,Meso,1761.753076,299.176330,5841.652393,26.856256,47313,3-Chordate,Testis,1450.461392,228.582480,5144.033564,129.248706,1584,3.734991e+07,0.825292,542,0.831382,0.831382,False
542,6-Intergenic ORF Control,Endo,0.392231,0.004260,7.835031,0.039593,39160,6-Intergenic ORF Control,Ovary,0.438971,0.005585,4.369805,0.069830,3916,7.656610e+07,0.880365,543,0.885229,0.885229,False
543,4-Mammal,Endo,455.631198,5.001175,4178.938264,50.491779,6850,4-Mammal,Ovary,423.404540,5.318960,2221.758628,84.889034,685,2.352850e+06,0.901406,544,0.90472,0.904720,False
544,2-Metazoa,Brain,1721.461516,328.441070,9558.063627,55.849289,29289,2-Metazoa,Endo,1569.664639,328.970630,5154.494631,34.340412,22530,3.300875e+08,0.930641,545,0.932349,0.932349,False


In [15]:
# Final results table with generated p-values
df_mc_sig['Adjusted_Pvals_BH'] = fixedpvals
df_mc_sig['Adjusted_Pvals < 0.05'] = [fp<0.05 for fp in fixedpvals]
df_mc_sig

Unnamed: 0,Left Era,Left Tissue_Group,Left Mean,Left Median,Left STDev,Left STDerr,Left Count,Right Era,Right Tissue_Group,Right Mean,Right Median,Right STDev,Right STDerr,Right Count,Floats,pvals,Adjusted_Pvals_BH,Adjusted_Pvals < 0.05
0,7-Intergenic Non-ORF Control,Testis,0.921392,0.037800,17.767772,0.284914,3889,7-Intergenic Non-ORF Control,Ovary,0.178024,0.004340,1.409193,0.022597,3889,1.150296e+07,0.000000,0.000000,True
1,2-Metazoa,Brain,1721.461516,328.441070,9558.063627,55.849289,29289,7-Intergenic Non-ORF Control,Ecto,0.233230,0.003820,7.013826,0.035566,38890,1.128950e+09,0.000000,0.000000,True
2,2-Metazoa,Brain,1721.461516,328.441070,9558.063627,55.849289,29289,7-Intergenic Non-ORF Control,Brain,0.211228,0.004040,3.910108,0.017390,50557,1.468557e+09,0.000000,0.000000,True
3,2-Metazoa,Ovary,1890.167011,309.665330,9386.738232,197.758023,2253,6-Intergenic ORF Control,Ovary,0.438971,0.005585,4.369805,0.069830,3916,8.702134e+06,0.000000,0.000000,True
4,2-Metazoa,Testis,1796.827462,624.776470,4070.370209,85.753788,2253,6-Intergenic ORF Control,Ovary,0.438971,0.005585,4.369805,0.069830,3916,8.799924e+06,0.000000,0.000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,2-Metazoa,Meso,1761.753076,299.176330,5841.652393,26.856256,47313,3-Chordate,Testis,1450.461392,228.582480,5144.033564,129.248706,1584,3.734991e+07,0.825292,0.831382,False
542,6-Intergenic ORF Control,Endo,0.392231,0.004260,7.835031,0.039593,39160,6-Intergenic ORF Control,Ovary,0.438971,0.005585,4.369805,0.069830,3916,7.656610e+07,0.880365,0.885229,False
543,4-Mammal,Endo,455.631198,5.001175,4178.938264,50.491779,6850,4-Mammal,Ovary,423.404540,5.318960,2221.758628,84.889034,685,2.352850e+06,0.901406,0.904720,False
544,2-Metazoa,Brain,1721.461516,328.441070,9558.063627,55.849289,29289,2-Metazoa,Endo,1569.664639,328.970630,5154.494631,34.340412,22530,3.300875e+08,0.930641,0.932349,False


In [16]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    pass
    # display(df_mc_sig) # uncomment and remove 'pass' to print entirety of results
    
    

In [17]:
# Rename to fit scheme 
df_mc_sig['Left Era'] = df_mc_sig['Left Era'].replace('2-Metazoa','2-Metazoan')
df_mc_sig['Right Era'] = df_mc_sig['Right Era'].replace('2-Metazoa','2-Metazoan')

In [18]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(dfResults)

In [19]:
df_mc_sig.to_csv('mc_significance_mannwhitneyu_bhcorrection_meancounts_intragroup_sametissue_stdev_stderr_metazoan_withcounts.tsv',sep='\t')
df_mc_sig

Unnamed: 0,Left Era,Left Tissue_Group,Left Mean,Left Median,Left STDev,Left STDerr,Left Count,Right Era,Right Tissue_Group,Right Mean,Right Median,Right STDev,Right STDerr,Right Count,Floats,pvals,Adjusted_Pvals_BH,Adjusted_Pvals < 0.05
0,7-Intergenic Non-ORF Control,Testis,0.921392,0.037800,17.767772,0.284914,3889,7-Intergenic Non-ORF Control,Ovary,0.178024,0.004340,1.409193,0.022597,3889,1.150296e+07,0.000000,0.000000,True
1,2-Metazoan,Brain,1721.461516,328.441070,9558.063627,55.849289,29289,7-Intergenic Non-ORF Control,Ecto,0.233230,0.003820,7.013826,0.035566,38890,1.128950e+09,0.000000,0.000000,True
2,2-Metazoan,Brain,1721.461516,328.441070,9558.063627,55.849289,29289,7-Intergenic Non-ORF Control,Brain,0.211228,0.004040,3.910108,0.017390,50557,1.468557e+09,0.000000,0.000000,True
3,2-Metazoan,Ovary,1890.167011,309.665330,9386.738232,197.758023,2253,6-Intergenic ORF Control,Ovary,0.438971,0.005585,4.369805,0.069830,3916,8.702134e+06,0.000000,0.000000,True
4,2-Metazoan,Testis,1796.827462,624.776470,4070.370209,85.753788,2253,6-Intergenic ORF Control,Ovary,0.438971,0.005585,4.369805,0.069830,3916,8.799924e+06,0.000000,0.000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,2-Metazoan,Meso,1761.753076,299.176330,5841.652393,26.856256,47313,3-Chordate,Testis,1450.461392,228.582480,5144.033564,129.248706,1584,3.734991e+07,0.825292,0.831382,False
542,6-Intergenic ORF Control,Endo,0.392231,0.004260,7.835031,0.039593,39160,6-Intergenic ORF Control,Ovary,0.438971,0.005585,4.369805,0.069830,3916,7.656610e+07,0.880365,0.885229,False
543,4-Mammal,Endo,455.631198,5.001175,4178.938264,50.491779,6850,4-Mammal,Ovary,423.404540,5.318960,2221.758628,84.889034,685,2.352850e+06,0.901406,0.904720,False
544,2-Metazoan,Brain,1721.461516,328.441070,9558.063627,55.849289,29289,2-Metazoan,Endo,1569.664639,328.970630,5154.494631,34.340412,22530,3.300875e+08,0.930641,0.932349,False


In [20]:
### SIGNIFICANCE FOR TPMs

In [21]:
# Read in GTEx data - for all 54 tissues with some additional appended information
# as downloaded from GTEx v8 set , no controls or unannotated genes
gtex_full = pd.read_csv('marten_gtex_withera_forsigwork_grch38.tsv',sep='\t',index_col='ENSP')
gtex_full

Unnamed: 0_level_0,Description,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,Brain - Anterior cingulate cortex (BA24),...,Gap_Gene?,Chr,Gene_Start_hg38,Gene_End_hg38,Strand,CDS_Start_hg38,CDS_End_hg38,Protein_Sequence,CDS_Sequence,Era-5
ENSP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSP00000000233,ARF5,155.033000,130.395000,104.215000,173.917000,167.389000,169.550000,144.037000,93.188000,132.819000,...,Not_Gap_Gene,chr7,127588344.0,127591705.0,+,127588498.0,127591299.0,MGLTVSALFSRIFGKKQMRILMVGLDAAGKTTILYKLKLGEIVTTI...,ATGGGCCTCACCGTGTCCGCGCTCTTTTCGCGGATCTTCGGGAAGA...,1-Ancient
ENSP00000000412,M6PR,54.179700,46.653400,49.364500,53.353600,51.712000,56.593000,62.392600,11.360100,13.774700,...,Not_Gap_Gene,chr12,8940364.0,8949955.0,-,8941817.0,8946404.0,MFPFYSCWRTGLLLLLLAVAVRESWQTEEKTCDLVGEKGKESEKEL...,ATGTTCCCTTTCTACAGCTGCTGGAGGACTGGACTGCTACTACTAC...,1-Ancient
ENSP00000000442,ESRRA,34.075500,33.017900,34.561200,24.257400,27.962400,26.555800,46.109100,13.058700,17.573900,...,Not_Gap_Gene,chr11,64305577.0,64316738.0,+,64307179.0,64315966.0,MSSQVVGIEPLYIKAEPASPDSPKGSSETETEPPVALAPGPAPTRC...,ATGTCCAGCCAGGTGGTGGGCATTGAGCCTCTCTACATCAAGGCAG...,1-Ancient
ENSP00000001008,FKBP4,24.843200,34.593100,34.090000,40.999300,40.903100,52.456600,50.941200,22.969000,32.425800,...,Not_Gap_Gene,chr12,2794952.0,2805423.0,+,2795139.0,2803258.0,MTAEEMKATESGAQSAPLPMEGVDISPKQDEGVLKVIKREGTGTEM...,ATGACAGCCGAGGAGATGAAGGCGACCGAGAGCGGGGCGCAGTCGG...,1-Ancient
ENSP00000001146,CYP26B1,35.635300,26.576700,3.870610,10.873600,9.130600,5.292380,4.654000,4.021700,4.125860,...,Not_Gap_Gene,chr2,72129237.0,72148038.0,-,72132226.0,72147834.0,MLFEGLDLVSALATLAACLVSVTLLLAVSQQLWQLRWAATRDKSCK...,ATGCTCTTTGAGGGCTTGGATCTGGTGTCGGCGCTGGCCACCCTCG...,1-Ancient
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSP00000492767,TRIM43B,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,Not_Gap_Gene,chr2,95477163.0,95482714.0,-,95477163.0,95482714.0,MDSDFSHAFQKELTCVICLNYLVDPVTICCGHSFCRPCLCLSWEEA...,ATGGACTCAGACTTCTCACATGCCTTCCAGAAGGAACTCACCTGTG...,1-Ancient
ENSP00000492773,FAM217A,0.137665,0.074539,0.092826,0.063639,0.055228,0.068421,0.080345,0.011690,0.018658,...,Not_Gap_Gene,chr6,4068695.0,4087027.0,-,4068695.0,4087027.0,MPRLTQRVAVSPEYLLLPPRHGGCDKISWLPEGKVFSNVHREKISS...,ATGCCCAGATTAACACAGAGGGTTGCTGTGTCTCCTGAATACCTTC...,3-Chordate
ENSP00000492787,RP11-321E2.2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,Not_Gap_Gene,chr5,17518366.0,17518963.0,+,17518366.0,17518963.0,METGRQTGVSAEMLAMPRGLKGSKKDGIPEDLDGNLEAPRDQEGEL...,ATGGAGACCGGCAGGCAAACAGGTGTGTCTGCTGAGATGCTCGCCA...,1-Ancient
ENSP00000492790,ZNF705E,0.275570,0.150565,0.083835,0.190292,0.189638,0.251090,0.180014,0.050442,0.045997,...,Not_Gap_Gene,chr11,71814044.0,71821548.0,-,71816511.0,71821476.0,MHSLKKVTFEDVAIDFTQEEWAMMDTSKRKLYRDVMLENISHLVSL...,ATGCATTCACTAAAGAAAGTGACTTTTGAAGATGTAGCTATTGACT...,1-Ancient


In [22]:
# Drop columns to make melting step clear
gtex_parsed = gtex_full.drop(['Description','ENST', 'ENSG', 'PS', 'Gene_ID', 'Description_2',
       'Plength', 'Gap_Gene?', 'Chr', 'Gene_Start_hg38', 'Gene_End_hg38',
       'Strand', 'CDS_Start_hg38', 'CDS_End_hg38', 'Protein_Sequence',
       'CDS_Sequence','Ensembl_gene_length'],axis=1)

gtex_parsed_melty = pd.melt(frame=gtex_parsed,id_vars = ['Era-5',],var_name='Tissue',value_name='Mean(Count)',ignore_index=False)
gtex_parsed_melty

Unnamed: 0_level_0,Era-5,Tissue,Mean(Count)
ENSP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSP00000000233,1-Ancient,Adipose - Subcutaneous,155.033000
ENSP00000000412,1-Ancient,Adipose - Subcutaneous,54.179700
ENSP00000000442,1-Ancient,Adipose - Subcutaneous,34.075500
ENSP00000001008,1-Ancient,Adipose - Subcutaneous,24.843200
ENSP00000001146,1-Ancient,Adipose - Subcutaneous,35.635300
...,...,...,...
ENSP00000492767,1-Ancient,Whole Blood,0.000000
ENSP00000492773,3-Chordate,Whole Blood,0.000000
ENSP00000492787,1-Ancient,Whole Blood,0.000000
ENSP00000492790,1-Ancient,Whole Blood,0.018846


In [23]:
# some correction for tissue keys, as re-used from above 
tissuecats['Testis'] = [['Testis'],['testis']]
tissuecats['Ovary'] = [['Ovary'],['ovary']]

In [24]:
# similar regular expression fixes as done for meancounts
gtex_parsed_melty['tissue_alt'] = [re.sub(r'\W+', '', xi).lower().replace('_','') for xi in gtex_parsed_melty['Tissue']]
                                   
                                   

In [25]:
gtex_parsed_melty

Unnamed: 0_level_0,Era-5,Tissue,Mean(Count),tissue_alt
ENSP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSP00000000233,1-Ancient,Adipose - Subcutaneous,155.033000,adiposesubcutaneous
ENSP00000000412,1-Ancient,Adipose - Subcutaneous,54.179700,adiposesubcutaneous
ENSP00000000442,1-Ancient,Adipose - Subcutaneous,34.075500,adiposesubcutaneous
ENSP00000001008,1-Ancient,Adipose - Subcutaneous,24.843200,adiposesubcutaneous
ENSP00000001146,1-Ancient,Adipose - Subcutaneous,35.635300,adiposesubcutaneous
...,...,...,...,...
ENSP00000492767,1-Ancient,Whole Blood,0.000000,wholeblood
ENSP00000492773,3-Chordate,Whole Blood,0.000000,wholeblood
ENSP00000492787,1-Ancient,Whole Blood,0.000000,wholeblood
ENSP00000492790,1-Ancient,Whole Blood,0.018846,wholeblood


In [26]:
gtex_parsed_melty['Era-5'].value_counts()

Era-5
1-Ancient     788778
2-Metazoa     119448
3-Chordate     81756
4-Mammal       28566
5-Primate       9018
Name: count, dtype: int64

In [27]:
# Swap order since GTEx tissues still have capitalization and hyphens
gtex_tissuecats = {}

for key in tissuecats.keys():
    gtex_tissuecats[key] = [tissuecats[key][1],tissuecats[key][0]]
    

In [28]:
# Set up and run generation of significance table for TPMs

tissuegroups_tpm = ['Brain','Ecto','Meso','Endo','Ovary','Testis']
eragroups_tpm = ['1-Ancient', '2-Metazoa', '3-Chordate', '4-Mammal', '5-Primate']

dfResults_gtex = sig_table(eragroups_tpm,tissuegroups_tpm,gtex_tissuecats,gtex_parsed_melty,'Era-5','Tissue',verbose=False)



Unnamed: 0,Left Era,Left Tissue_Group,Left Mean,Left Median,Left STDev,Left STDerr,Left Count,Right Era,Right Tissue_Group,Right Mean,Right Median,Right STDev,Right STDerr,Right Count,Floats,pvals
0,1-Ancient,Brain,20.332593,5.667190,69.262859,0.158945,189891,1-Ancient,Ecto,30.384303,8.753520,202.703716,0.633917,102249,8.738335e+09,0.000000
1,2-Metazoa,Brain,16.242300,3.131010,68.331004,0.402952,28756,4-Mammal,Ecto,19.468488,0.058241,503.038155,8.266546,3703,7.861194e+07,0.000000
2,2-Metazoa,Brain,16.242300,3.131010,68.331004,0.402952,28756,4-Mammal,Brain,10.679382,0.036751,47.888222,0.577470,6877,1.482604e+08,0.000000
3,2-Metazoa,Meso,21.319736,3.191015,85.789090,0.398043,46452,3-Chordate,Meso,18.508134,0.792550,196.433798,1.101650,31794,8.610120e+08,0.000000
4,2-Metazoa,Ecto,24.763020,4.259640,462.521130,3.716978,15484,3-Chordate,Meso,18.508134,0.792550,196.433798,1.101650,31794,3.008316e+08,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,5-Primate,Ecto,14.120358,0.000000,206.150915,6.029454,1169,5-Primate,Ovary,12.779064,0.000000,80.940396,6.263356,167,9.830000e+04,0.872487
281,4-Mammal,Brain,10.679382,0.036751,47.888222,0.577470,6877,4-Mammal,Ovary,15.505443,0.040223,91.825431,3.992410,529,1.811828e+06,0.874555
282,5-Primate,Brain,5.584760,0.000000,33.623289,0.721623,2171,5-Primate,Endo,15.014556,0.000000,114.359411,2.798425,1670,1.815634e+06,0.926941
283,2-Metazoa,Ovary,26.507377,4.419200,107.840107,2.292914,2212,4-Mammal,Testis,57.434706,3.189880,418.339022,18.188653,529,5.843210e+05,0.963272


In [29]:
# Add BH corrected methods for gtex stats (same method as above)

fixedpvals_gtex = statsmodels.stats.multitest.multipletests(dfResults_gtex['pvals'],method='fdr_bh')[1]
#scipy.stats.false_discovery_control(dfResults_gtex['pvals'],method='bh',axis=0)

dfResults_gtex['Adjusted_Pvals_BH'] = fixedpvals_gtex
dfResults_gtex['Adjusted_Pvals < 0.05'] = [fp<0.05 for fp in fixedpvals_gtex]

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    pass
    # display(dfResults_gtex.sort_values(by=['Left Era','Left Tissue_Group','Right Era']))
    # uncomment the above and remove 'pass' to print entirety of outputs
    
    
 

In [30]:
# Similar metazoa->metazoan fix 

dfResults_gtex['Left Era'] = dfResults_gtex['Left Era'].replace('2-Metazoa','2-Metazoan')
dfResults_gtex['Right Era'] = dfResults_gtex['Right Era'].replace('2-Metazoa','2-Metazoan')

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    pass
    #     display(dfResults_gtex.sort_values(by=['Left Era','Left Tissue_Group','Right Era']))

In [31]:
dfResults_gtex.to_csv('marten_gtex_significance_mannwhitneyu_bhcorrection_meancounts_intragroup_sametissue_stdev_stderr_metazoan_withcounts_20240209.tsv',sep='\t')
dfResults_gtex # old: 20231019



Unnamed: 0,Left Era,Left Tissue_Group,Left Mean,Left Median,Left STDev,Left STDerr,Left Count,Right Era,Right Tissue_Group,Right Mean,Right Median,Right STDev,Right STDerr,Right Count,Floats,pvals,Adjusted_Pvals_BH,Adjusted_Pvals < 0.05
0,1-Ancient,Brain,20.332593,5.667190,69.262859,0.158945,189891,1-Ancient,Ecto,30.384303,8.753520,202.703716,0.633917,102249,8.738335e+09,0.000000,0.000000,True
1,2-Metazoan,Brain,16.242300,3.131010,68.331004,0.402952,28756,4-Mammal,Ecto,19.468488,0.058241,503.038155,8.266546,3703,7.861194e+07,0.000000,0.000000,True
2,2-Metazoan,Brain,16.242300,3.131010,68.331004,0.402952,28756,4-Mammal,Brain,10.679382,0.036751,47.888222,0.577470,6877,1.482604e+08,0.000000,0.000000,True
3,2-Metazoan,Meso,21.319736,3.191015,85.789090,0.398043,46452,3-Chordate,Meso,18.508134,0.792550,196.433798,1.101650,31794,8.610120e+08,0.000000,0.000000,True
4,2-Metazoan,Ecto,24.763020,4.259640,462.521130,3.716978,15484,3-Chordate,Meso,18.508134,0.792550,196.433798,1.101650,31794,3.008316e+08,0.000000,0.000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,5-Primate,Ecto,14.120358,0.000000,206.150915,6.029454,1169,5-Primate,Ovary,12.779064,0.000000,80.940396,6.263356,167,9.830000e+04,0.872487,0.883859,False
281,4-Mammal,Brain,10.679382,0.036751,47.888222,0.577470,6877,4-Mammal,Ovary,15.505443,0.040223,91.825431,3.992410,529,1.811828e+06,0.874555,0.883859,False
282,5-Primate,Brain,5.584760,0.000000,33.623289,0.721623,2171,5-Primate,Endo,15.014556,0.000000,114.359411,2.798425,1670,1.815634e+06,0.926941,0.933492,False
283,2-Metazoan,Ovary,26.507377,4.419200,107.840107,2.292914,2212,4-Mammal,Testis,57.434706,3.189880,418.339022,18.188653,529,5.843210e+05,0.963272,0.966664,False


In [32]:
# EXTRA CODE: verify that p-values of 0 are < min=2.225e-308
sys.float_info

sys.float_info(max=1.7976931348623157e+308, max_exp=1024, max_10_exp=308, min=2.2250738585072014e-308, min_exp=-1021, min_10_exp=-307, dig=15, mant_dig=53, epsilon=2.220446049250313e-16, radix=2, rounds=1)