In [1]:
## Daniel Marten
## Mean Count Construction

import numpy as np
import pandas as pd
import sklearn 
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import statistics
import sys
import os
import statsmodels.api as sm
import statsmodels.stats.weightstats as sm_stats
import seaborn as sns
import scipy
import scipy.stats as scistats

from statsmodels.formula.api import ols as formula_OLS

from cmapPy.pandasGEXpress.parse_gct import parse as tpm_parser

from collections import Counter
import re
import math
import warnings
import pandas as pd
import qtl.io as io
import qtl.norm as norm







In [2]:
# Create PS dictionaries for our five-era configuration
def rev_dict(input_dict):
    """Invert a ``key -> iterable of members`` mapping into ``member -> key``.

    Every element of every value is turned into a key of the result that
    points back at the key it was listed under. When the same member is
    listed under several keys, the key visited last (in ``input_dict``
    iteration order) wins.
    """
    return {
        member: group
        for group, members in input_dict.items()
        for member in members
    }

# Five-era configuration: era label -> list of the integer PS values it covers.
# The upper bound of each range is exclusive, so the eras tile 0..31 without gaps.
five_era = dict([
    ('1-Ancient', [*range(0, 4)]),
    ('2-Metazoa', [*range(4, 8)]),
    ('3-Chordate', [*range(8, 18)]),
    ('4-Mammal', [*range(18, 23)]),
    ('5-Primate', [*range(23, 32)]),
])

# Reverse lookup: PS value -> era label
five_rev_era = rev_dict(five_era)


In [3]:
# Load the tidy (melted) table: one row per (Name, tissue) pair carrying
# annotation, era, PS, tissue and mean count.
# It is the melted version of the prior output
# combined_controls_orfs_norfs_set1_victor_genes_normalized_meaned_counts_GRCh38_29241total_47UGremoved.gct.gz
# (as modified in RStudio).
df_new = pd.read_csv(
    r'/Users/marten/Downloads/marten_binned_meancounts_21436genes_47removed_metafix_7805controls.csv',
    index_col='Name',
)
# Drop the leftover CSV row counter and the 'bin' column
df_new = df_new.drop(columns=['Unnamed: 0', 'bin'])

# Keep the control rows aside, split by control type
original_nonorf = df_new.loc[df_new['annotation'] == 'norf']
original_orf = df_new.loc[df_new['annotation'] == 'orf']
df_new

Unnamed: 0_level_0,annotation,evo_era,updated_PS,tissue,mean(count)
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Baz_Hs_103,Unannotated With Overlap,humans,31.0,Adipose_Subcutaneous,35.51803
Baz_Hs_108,Unannotated With Overlap,primates,24.0,Adipose_Subcutaneous,171.90412
Baz_Hs_10,Unannotated With Overlap,humans,31.0,Adipose_Subcutaneous,177.68898
Baz_Hs_112,Unannotated With Overlap,primates,25.0,Adipose_Subcutaneous,68.48757
Baz_Hs_113,Unannotated With Overlap,humans,31.0,Adipose_Subcutaneous,0.37962
...,...,...,...,...,...
vdp2013_S4_327,Unannotated No Overlap,ancient,1.0,Whole_Blood,0.00000
vdp2013_S4_605,Unannotated No Overlap,humans,31.0,Whole_Blood,34.86733
vdp2013_S4_609,Unannotated No Overlap,ancient,1.0,Whole_Blood,5.36554
vdp2013_S4_619,Unannotated No Overlap,primates,26.0,Whole_Blood,0.12277


In [4]:
# Rows per annotation category within a single tissue (Whole_Blood)
df_new.loc[df_new['tissue'] == 'Whole_Blood', 'annotation'].value_counts()

annotation
annotated                   19334
orf                          3916
norf                         3889
Unannotated With Overlap     1811
Unannotated No Overlap        291
Name: count, dtype: int64

In [5]:
# Count the intergenic controls (annotation 'orf' or 'norf') from the data itself
# rather than re-typing the totals shown by the previous cell, so the figure
# cannot go stale when the input table changes. The Whole_Blood slice is used
# as the single-tissue view, as in the neighbouring cells.
whole_blood_annotations = df_new.loc[df_new['tissue'] == 'Whole_Blood', 'annotation']
print('Control Count as: ', int(whole_blood_annotations.isin(['orf', 'norf']).sum()))

Control Count as:  7805


In [6]:
# Count the unannotated genes (with and without overlap) from the data itself
# rather than re-typing the totals shown by an earlier cell, so the figure
# cannot go stale when the input table changes. The Whole_Blood slice is used
# as the single-tissue view, as in the neighbouring cells.
whole_blood_annotations = df_new.loc[df_new['tissue'] == 'Whole_Blood', 'annotation']
unannotated_labels = ['Unannotated With Overlap', 'Unannotated No Overlap']
print('UG count as: ', int(whole_blood_annotations.isin(unannotated_labels).sum()))

UG count as:  2102


In [7]:
# (rows, columns) of the single-tissue slice
df_new.loc[df_new['tissue'] == 'Whole_Blood'].shape

(29241, 5)

In [8]:
# Reshape from tidy/long form to wide form: one row per gene-or-control,
# one column per tissue holding its mean(count).
df3 = (
    df_new
    .pivot_table(values='mean(count)', index=df_new.index, columns='tissue')
    .reset_index()
    .set_index('Name')
)
df3

tissue,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Bladder,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,...,Skin_Not_Sun_Exposed_Suprapubic,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Baz_Hs_1,111.23101,102.85601,92.99423,57.33489,74.23024,48.84502,40.02540,31.48850,24.58174,13.71341,...,25.95816,34.84595,34.56696,143.61683,22.86498,6.97662,125.09515,28.85724,45.87489,162.84939
Baz_Hs_10,177.68898,163.43796,500.49319,256.47366,271.80059,243.99274,318.44142,268.11229,291.90454,345.83109,...,281.90650,323.64230,308.93844,380.62067,362.45395,201.40222,357.40362,345.79863,327.91935,59.59042
Baz_Hs_103,35.51803,35.50075,48.26199,36.34178,36.78436,41.45156,40.78227,38.05410,42.33698,39.54385,...,36.93875,37.40587,29.40110,32.84427,29.19821,32.54088,34.65814,36.20726,32.81738,9.08867
Baz_Hs_108,171.90412,14.81687,18.29058,212.37097,221.88249,199.52621,34.83342,15.83672,3.95681,25.68333,...,189.19227,149.53612,6.20162,8.73152,109.97821,1494.23209,29.36997,36.59781,138.86082,1.44878
Baz_Hs_112,68.48757,54.71876,30.09576,45.70655,46.41960,48.74929,40.97488,26.73542,29.43152,28.15127,...,60.36087,57.14526,45.96444,27.92372,31.67508,49.51862,65.19148,56.65446,56.96237,13.41577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vdp2013_S4_994,41.25872,9.46195,116.29349,13.61857,21.68078,14.82259,2.75110,86.05548,134.37962,60.13233,...,10.26367,22.44065,2.38709,2.00565,1.62410,9.45471,4.41251,3.75358,3.82522,2.79571
vdp2013_S4_995,114.22039,101.54883,68.93886,91.20602,93.59158,116.57940,115.27335,27.66570,30.40872,39.47657,...,125.65639,123.91386,82.83554,53.80745,71.18307,77.27178,94.48100,99.45430,190.27711,28.26805
vdp2013_S4_997,44.13762,33.59868,27.36927,34.07429,35.58481,41.66631,36.37751,34.32911,49.21922,32.17122,...,43.89385,49.42956,41.68812,28.70325,37.42835,28.65766,31.26090,30.04364,36.78778,20.00996
vdp2013_S4_998,251.08933,211.76876,259.63283,181.18146,197.17140,179.48804,215.75188,104.99212,124.67759,152.83093,...,207.48316,208.79061,278.27528,301.76628,157.19595,298.21205,280.34296,270.41003,201.19458,111.39997


In [9]:
# Row names of the wide table; the loop is a leftover inspection hook that
# currently does nothing for matching names.
unique_prots = df3.index.tolist()
for up in unique_prots:
    if 'Hs_JP' not in up:
        continue
    # previously used for testing
    # print(up)

In [10]:
# Sanity check: number of rows in each annotation category, using the
# Whole_Blood rows only so each Name is counted once.
dfmap = df_new.copy()
dfmap = dfmap.loc[dfmap['tissue'].eq('Whole_Blood')]
dfmap['annotation'].value_counts()

annotation
annotated                   19334
orf                          3916
norf                         3889
Unannotated With Overlap     1811
Unannotated No Overlap        291
Name: count, dtype: int64

In [11]:
# Copy the per-Name 'evo_era' and 'annotation' labels from dfmap onto the wide table.
#
# One vectorised lookup per column replaces the previous row-by-row
# `.loc` assignment inside `iterrows`, which is very slow on tens of thousands of rows.
# `.loc` with a list of labels still raises KeyError if any df3 row name is
# missing from dfmap, and `.to_numpy()` makes the assignment positional, so a
# duplicated Name in dfmap fails loudly with a length mismatch instead of
# being silently mis-assigned.
for label_col in ('evo_era', 'annotation'):
    df3[label_col] = dfmap.loc[df3.index, label_col].to_numpy()

In [12]:
# Per-gene metadata (generated by a separate, earlier file) that gets
# appended to the count table.
phylo_df = pd.read_csv(
    r'/Users/marten/ug-gc/marten_completeGRCh38_21436genes_47UGremoved_gene_transcript_cds_metadata_mashup_old_new_diffFix_20231005.tsv',
    sep='\t',
    index_col='Name',
)
# Working copy that later cells extend with extra columns
updated_mapping = phylo_df.copy()
# Untouched copy kept around in case it is needed for inspection later
phylo_original = phylo_df.copy()

In [13]:
# Original PS / sequence table, read directly from a gs:// location.
# No index_col is given, so pandas chooses the index itself.
original_phylo = pd.read_csv(
    r'gs://ug-wphu/gtex_analysis/victor_2149+Ens89/Hs_Ens89+2149_PS_seq_etc_hg38.txt',
    sep='\t',
)


In [14]:
# Attach a 'geneid' to every row of updated_mapping by looking the row name up
# in original_phylo; rows with no match keep geneid = None.
#
# Only KeyError (label not present in original_phylo's index) is treated as
# "no match". The previous bare `except:` also swallowed KeyboardInterrupt and
# any real data problem, which made a systematic lookup failure
# indistinguishable from a legitimately missing gene.
# NOTE(review): original_phylo is read without index_col, so whether its index
# actually holds these names depends on the file layout — confirm that the
# lookups can succeed at all.
updated_mapping['geneid'] = None
for xi, yi in updated_mapping.iterrows():
    try:
        updated_mapping.loc[xi, 'geneid'] = original_phylo.loc[xi].Name
    except KeyError:
        updated_mapping.loc[xi, 'geneid'] = None

In [15]:
# Left-join the metadata onto the wide count table by row name; every df3 row
# is kept, with metadata columns filled in where the name is found.
df4 = df3.join(updated_mapping, how='left')
df4

Unnamed: 0_level_0,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Bladder,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,...,Protein_Sequence,CDS_Sequence,in_old,Gene_Start,Gene_Stop,in_new,evoera_38,evoera5_38,annotation_38,geneid
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Baz_Hs_1,111.23101,102.85601,92.99423,57.33489,74.23024,48.84502,40.02540,31.48850,24.58174,13.71341,...,MTDTENHDSSPSSTSTCCPPITAGMQLKDSLGPGSNCPLWTLRPLH...,ATGACAGACACTGAAAATCACGACTCATCCCCCTCCAGCACCTCTA...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,
Baz_Hs_10,177.68898,163.43796,500.49319,256.47366,271.80059,243.99274,318.44142,268.11229,291.90454,345.83109,...,MLVATGQCSRCFMFTFSTFSFNCHNSEVDSVRDRLPQDHSAPANSM...,ATGCTGGTGGCAACAGGGCAGTGTAGCAGGTGCTTCATGTTCACCT...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,
Baz_Hs_103,35.51803,35.50075,48.26199,36.34178,36.78436,41.45156,40.78227,38.05410,42.33698,39.54385,...,MLGAFRSGPQPLPEPRARCVPQPGLLWALTRRRESPLVTPGLNLEE...,ATGCTGGGGGCTTTCCGGTCGGGGCCGCAGCCGCTTCCGGAGCCGC...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,
Baz_Hs_108,171.90412,14.81687,18.29058,212.37097,221.88249,199.52621,34.83342,15.83672,3.95681,25.68333,...,MLAEIHPKAGLQSLQFIMELLYWLLEGGDSEDKEDATGNVEMKNIQ...,ATGTTGGCTGAAATTCATCCCAAGGCTGGTCTGCAAAGTCTGCAAT...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,
Baz_Hs_112,68.48757,54.71876,30.09576,45.70655,46.41960,48.74929,40.97488,26.73542,29.43152,28.15127,...,MRSREAGPKLRRIQEPANGSPGAVSETGGYREERLSDAEIMGKLLA...,ATGCGAAGCAGAGAGGCAGGACCAAAATTGAGGCGAATCCAGGAAC...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vdp2013_S4_994,41.25872,9.46195,116.29349,13.61857,21.68078,14.82259,2.75110,86.05548,134.37962,60.13233,...,MLYTHNTEFNLKRQICFVPQCKTFVSLCFVKQTQENWYTCTSWVLY...,ATGCTTTATACACATAATACTGAATTTAACCTCAAGAGGCAAATCT...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,
vdp2013_S4_995,114.22039,101.54883,68.93886,91.20602,93.59158,116.57940,115.27335,27.66570,30.40872,39.47657,...,MREWLSIRNMRIKCEIFSCSVKPMSANCISCRMKNATCWLSMRLRN,ATGAGAGAATGGCTCAGCATCAGAAACATGAGAATCAAATGCGAGA...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,
vdp2013_S4_997,44.13762,33.59868,27.36927,34.07429,35.58481,41.66631,36.37751,34.32911,49.21922,32.17122,...,MFAYKGSSYHVSNTSNSINPTPKLASNPVGRYCMIKCLII,ATGTTTGCATATAAGGGAAGTAGTTATCATGTTAGTAATACCTCTA...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,
vdp2013_S4_998,251.08933,211.76876,259.63283,181.18146,197.17140,179.48804,215.75188,104.99212,124.67759,152.83093,...,MLLVQGQHQNEEGLTRHLLSSSFTLSLPTPSFPLPHKVPMCLYPPL...,ATGCTGTTGGTTCAAGGACAACACCAGAATGAAGAGGGTCTCACAA...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,


In [16]:
# Rows that have a PS value, ordered by PS
df_described = df4.loc[df4['PS'].notna()].sort_values('PS')

In [17]:
# Annotation status of the genes-or-controls that have no PS value
df4.loc[df4['PS'].isna(), 'annotation'].value_counts()

annotation
orf     3916
norf    3889
Name: count, dtype: int64

In [18]:
# Total number of control rows (annotation 'orf' or 'norf') in df4, computed
# from the table instead of re-typing numbers from the previous output so the
# figure stays correct if the input changes.
print('Total Control as: ', int(df4['annotation'].isin(['orf', 'norf']).sum()))

Total Control as:  7805


In [19]:
# Build df4_dropped: a copy of df4 without any unannotated gene whose PS is
# missing. Each removed row name is printed, so an empty printout means
# nothing was removed.
unannotated_kinds = ('Unannotated With Overlap', 'Unannotated No Overlap')

df4_dropped = df4.copy()

for df4name, df4row in df4.iterrows():
    is_unannotated = df4row['annotation'] in unannotated_kinds
    # a missing PS shows up as the string 'nan' once stringified
    if is_unannotated and str(df4row['PS']) == 'nan':
        df4_dropped.drop(df4name, inplace=True)
        print('Exception', df4name)

In [20]:
# Number of rows removed by the PS filter above (0 means every unannotated gene has a PS)
len(df4) - len(df4_dropped)

0

In [21]:
# Rows per annotation category in the joined table
df4.loc[:, 'annotation'].value_counts()

annotation
annotated                   19334
orf                          3916
norf                         3889
Unannotated With Overlap     1811
Unannotated No Overlap        291
Name: count, dtype: int64

In [23]:
# Moving forward
# Assign every row a five-era label in 'evo_era_5':
#   * rows whose PS is a key of five_rev_era get the matching era label;
#   * controls ('orf' / 'norf') have no usable PS, so their annotation status
#     is used as their evolutionary category instead;
#   * anything else is set to None and reported.

# NOTE: this is an alias, not a copy — the loop below also adds the
# 'evo_era_5' column to df4_dropped.
df5 = df4_dropped

controllist = ['orf','norf']

for controlname, controlrow in df5.iterrows():
    try:
        df5.loc[controlname, 'evo_era_5'] = five_rev_era[controlrow['PS']]
    except KeyError:
        # Only a failed era lookup (PS missing/NaN or outside the configured
        # ranges) lands here; the previous bare `except:` also hid
        # KeyboardInterrupt and unrelated errors.
        if controlrow['annotation'] in controllist:
            df5.loc[controlname, 'evo_era_5'] = df5.loc[controlname, 'annotation']
        else:
            df5.loc[controlname, 'evo_era_5'] = None
            print('Exception - not found')

# The older era column is superseded by evo_era_5
df5 = df5.drop('evo_era', axis=1)

In [24]:
# Display df5 to inspect the result of the previous cell
df5

Unnamed: 0_level_0,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Bladder,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,...,CDS_Sequence,in_old,Gene_Start,Gene_Stop,in_new,evoera_38,evoera5_38,annotation_38,geneid,evo_era_5
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Baz_Hs_1,111.23101,102.85601,92.99423,57.33489,74.23024,48.84502,40.02540,31.48850,24.58174,13.71341,...,ATGACAGACACTGAAAATCACGACTCATCCCCCTCCAGCACCTCTA...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate
Baz_Hs_10,177.68898,163.43796,500.49319,256.47366,271.80059,243.99274,318.44142,268.11229,291.90454,345.83109,...,ATGCTGGTGGCAACAGGGCAGTGTAGCAGGTGCTTCATGTTCACCT...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate
Baz_Hs_103,35.51803,35.50075,48.26199,36.34178,36.78436,41.45156,40.78227,38.05410,42.33698,39.54385,...,ATGCTGGGGGCTTTCCGGTCGGGGCCGCAGCCGCTTCCGGAGCCGC...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate
Baz_Hs_108,171.90412,14.81687,18.29058,212.37097,221.88249,199.52621,34.83342,15.83672,3.95681,25.68333,...,ATGTTGGCTGAAATTCATCCCAAGGCTGGTCTGCAAAGTCTGCAAT...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate
Baz_Hs_112,68.48757,54.71876,30.09576,45.70655,46.41960,48.74929,40.97488,26.73542,29.43152,28.15127,...,ATGCGAAGCAGAGAGGCAGGACCAAAATTGAGGCGAATCCAGGAAC...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vdp2013_S4_994,41.25872,9.46195,116.29349,13.61857,21.68078,14.82259,2.75110,86.05548,134.37962,60.13233,...,ATGCTTTATACACATAATACTGAATTTAACCTCAAGAGGCAAATCT...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate
vdp2013_S4_995,114.22039,101.54883,68.93886,91.20602,93.59158,116.57940,115.27335,27.66570,30.40872,39.47657,...,ATGAGAGAATGGCTCAGCATCAGAAACATGAGAATCAAATGCGAGA...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate
vdp2013_S4_997,44.13762,33.59868,27.36927,34.07429,35.58481,41.66631,36.37751,34.32911,49.21922,32.17122,...,ATGTTTGCATATAAGGGAAGTAGTTATCATGTTAGTAATACCTCTA...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate
vdp2013_S4_998,251.08933,211.76876,259.63283,181.18146,197.17140,179.48804,215.75188,104.99212,124.67759,152.83093,...,ATGCTGTTGGTTCAAGGACAACACCAGAATGAAGAGGGTCTCACAA...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate


In [25]:
# Give the two control categories numbered, human-readable labels
df5['evo_era_5'] = df5['evo_era_5'].replace(
    to_replace=dict(
        orf='6-Intergenic ORF Control',
        norf='7-Intergenic Non-ORF Control',
    )
)

In [26]:
## now we start some Germ layer configurations

In [27]:
# Add one placeholder (all-None) column per tissue category; the actual mean
# scores are filled in by a later cell.
for group_column in ('Brain', 'Ecto', 'Meso', 'Endo'):
    df5[group_column] = [None] * len(df5)

# THIS IS THE OLD SETUP!!!!!!
#
# Tissue membership lists, one per tissue category.
# The names are written in two styles (display style such as "Brain - Cortex"
# and underscore style such as "Nerve_Tibial"). Later cells normalise every
# name (strip non-word characters, lowercase, remove underscores) before
# matching it against the df5 column names, so both styles resolve the same way.

# Ectoderm group.
# NOTE: "Brain - Cerebellum" and "Brain - Cortex" are listed here AND in
# brainDerm below, so they contribute to both group means.
ectoDerm = [
    "Nerve_Tibial",
    "Skin Not Sun Exposed (Suprapubic)",
    "Skin_Sun_Exposed_Lower_leg",
    "Breast - Mammary Tissue",
    "Pituitary",
    "Adrenal Gland",
    "Brain - Cerebellum",
    "Brain - Cortex",
    "Cervix - Endocervix",
    "Minor Salivary Gland",
    ]

# Brain group: all "Brain - ..." tissues
brainDerm = [
    "Brain - Nucleus accumbens (basal ganglia)",
    "Brain - Caudate (basal ganglia)",
    "Brain - Cerebellar Hemisphere",
    "Brain - Frontal Cortex (BA9)",
    "Brain - Hypothalamus",
    "Brain - Putamen (basal ganglia)",
    "Brain - Hippocampus",
    "Brain - Anterior cingulate cortex (BA24)",
    "Brain - Amygdala",
    "Brain - Spinal cord (cervical c-1)",
    "Brain - Substantia nigra",
    "Brain - Cerebellum",
    "Brain - Cortex",]

# Mesoderm group
mesoDerm = [
    "Muscle - Skeletal",
    "Whole Blood",
    "Adipose - Subcutaneous",
    "Cells - Cultured fibroblasts",
    "Adipose - Visceral (Omentum)",
    "Esophagus - Muscularis",
    "Artery - Aorta",
    "Heart - Left Ventricle",
    "Heart - Atrial Appendage",
    "Spleen",
    "Prostate",
    "Artery - Coronary",
    "Artery - Tibial",
    "Cells - EBV-transformed lymphocytes",
    "Vagina",
    "Uterus",
    "Kidney - Cortex",
    "Bladder",
    "Cervix - Ectocervix",
    "Fallopian Tube",
    "Kidney - Medulla",]

# Endoderm group
endoDerm = [
    "Thyroid",
    "Lung",
    "Esophagus - Mucosa",
    "Colon - Transverse",
    "Esophagus - Gastroesophageal Junction",
    "Stomach",
    "Colon - Sigmoid",
    "Pancreas",
    "Liver",
    "Small Intestine - Terminal Ileum"]

# Germline tissues are kept as individual tissues rather than averaged
germline = [
    'Testis',
    'Ovary'
]

# Group name -> alphabetically sorted member tissues. The two germline
# tissues are stored as plain strings (not lists), each standing for itself.
group_dict = {
    group_label: sorted(member_tissues)
    for group_label, member_tissues in (
        ('EctoDerm', ectoDerm),
        ('Brain', brainDerm),
        ('EndoDerm', endoDerm),
        ('MesoDerm', mesoDerm),
    )
}
group_dict['Testis'] = 'Testis'
group_dict['Ovary'] = 'Ovary'

In [28]:
# Write out tissue definitions
# (group_dict serialised as JSON into the current working directory)

import json

# Serialise before opening the file, so a serialisation error leaves no
# truncated file behind.
json_tissue_group = json.dumps(group_dict)

with open("tissue_group_definitions_20231121.json", "w") as outfile:
    outfile.write(json_tissue_group)


In [29]:
# df5.columns

In [30]:
# Renaming dictionary: every current df5 column name -> its "squished" form
# (non-word characters removed, lowercased, underscores removed).
allcolumns = df5.columns

# All tissue names from every group, de-duplicated. Built via a set, so the
# order of this list is not meaningful.
megaDerm = list(set(ectoDerm + mesoDerm + endoDerm + brainDerm + germline))

dict_renaming = {
    col: re.sub(r'\W+', '', col).lower().replace('_', '')
    for col in df5.columns
}

In [31]:
import json

In [32]:
# Persist the original -> squished column-name mapping as JSON.
# NOTE(review): absolute, user-specific output path — will fail on any other machine.
with open('/Users/marten/dict_renaming.json', 'w') as f:
    json.dump(dict_renaming, f)

In [33]:
# Apply the renaming: from here on every column name is squished and
# lowercase, on purpose.
df5 = df5.rename(dict_renaming, axis='columns')
df5

Unnamed: 0_level_0,adiposesubcutaneous,adiposevisceralomentum,adrenalgland,arteryaorta,arterycoronary,arterytibial,bladder,brainamygdala,brainanteriorcingulatecortexba24,braincaudatebasalganglia,...,innew,evoera38,evoera538,annotation38,geneid,evoera5,brain,ecto,meso,endo
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Baz_Hs_1,111.23101,102.85601,92.99423,57.33489,74.23024,48.84502,40.02540,31.48850,24.58174,13.71341,...,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate,,,,
Baz_Hs_10,177.68898,163.43796,500.49319,256.47366,271.80059,243.99274,318.44142,268.11229,291.90454,345.83109,...,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate,,,,
Baz_Hs_103,35.51803,35.50075,48.26199,36.34178,36.78436,41.45156,40.78227,38.05410,42.33698,39.54385,...,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate,,,,
Baz_Hs_108,171.90412,14.81687,18.29058,212.37097,221.88249,199.52621,34.83342,15.83672,3.95681,25.68333,...,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate,,,,
Baz_Hs_112,68.48757,54.71876,30.09576,45.70655,46.41960,48.74929,40.97488,26.73542,29.43152,28.15127,...,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vdp2013_S4_994,41.25872,9.46195,116.29349,13.61857,21.68078,14.82259,2.75110,86.05548,134.37962,60.13233,...,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate,,,,
vdp2013_S4_995,114.22039,101.54883,68.93886,91.20602,93.59158,116.57940,115.27335,27.66570,30.40872,39.47657,...,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate,,,,
vdp2013_S4_997,44.13762,33.59868,27.36927,34.07429,35.58481,41.66631,36.37751,34.32911,49.21922,32.17122,...,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate,,,,
vdp2013_S4_998,251.08933,211.76876,259.63283,181.18146,197.17140,179.48804,215.75188,104.99212,124.67759,152.83093,...,True,4-Primate,5-Primate,Unannotated With Overlap,,5-Primate,,,,


In [34]:
# germ_dict maps each layer key to a list of two lists:
#   [0] the tissue names as written in the group lists (capitals, punctuation)
#   [1] the same names squished to lowercase alphanumerics, matching the
#       renamed df5 columns
germ_dict = {
    layer_key: [
        layer_tissues,
        [re.sub(r'\W+', '', xi).lower().replace('_', '') for xi in layer_tissues],
    ]
    for layer_key, layer_tissues in (
        ('ecto', ectoDerm),
        ('meso', mesoDerm),
        ('endo', endoDerm),
        ('brain', brainDerm),
        ('germline', germline),
    )
}

In [35]:
# Get mean scores!
# For each tissue category, average each row's mean counts over the member
# tissue columns (squished names in germ_dict[...][1]). Missing values are
# skipped, as in the per-row Series.mean() used before.
#
# A single row-wise DataFrame.mean per category replaces four `.loc` writes per
# row inside `iterrows`, which was extremely slow on ~29k rows. The columns
# now come out as float64 instead of object; values can differ from the loop
# version only at floating-point rounding level.
for group_name in ('brain', 'ecto', 'meso', 'endo'):
    df5[group_name] = df5[germ_dict[group_name][1]].mean(axis=1)

In [36]:
# Snapshot df5 before ORF information is attached
df5_old = df5.copy()

# Read in complete ORF information: keep only the four needed columns under
# descriptive names, restricted to control set 1.
complete_orf_info = (
    pd.read_csv(
        '/Users/marten/ug-gc/Notebooks/grch38_intergenic_controls_combined_ORFs_non_ORFs_all_sets_38888_with_sequences_withGRCh37CrossRemoval.tsv',
        sep='\t',
        index_col='Name',
    )
    [['Control_Set', 'Length', 'J', 'K']]
    .rename(columns={
        'Control_Set': 'ORF_Control_Set',
        'Length': 'ORF_Gene_Length',
        'J': 'ORF_Nucleic_Seq',
        'K': 'ORF_AA_Seq',
    })
    .loc[lambda orf_table: orf_table['ORF_Control_Set'] == 1]
)
# Length of each amino-acid string, counted exactly as stored
complete_orf_info['ORF_Plength'] = list(map(len, complete_orf_info['ORF_AA_Seq']))
complete_orf_info

Unnamed: 0_level_0,ORF_Control_Set,ORF_Gene_Length,ORF_Nucleic_Seq,ORF_AA_Seq,ORF_Plength
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,1,165,AATAGGATGGAAATAGCTGAGATCAGACATCTCCTTTTCAGAGTGG...,NRMEIAEIRHLLFRVENEVYN*LESMKVRKLHIFKELHTRDKLKLS...,55
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:188174-188317:+orf_0_control_set_1,1,144,ATGTTGCTGGGAAGACCCCCAAGTCCCTCTTCTGCATCGTCCTCGG...,MLLGRPPSPSSASSSGSGLVLTHTGKSFSFSWEGQDGQGMVNIWCWA*,48
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:274832-275053:+orf_0_control_set_1,1,222,ATGACAGAAGTAATTCCTGAGTTGCTTCTGAAACCAGAGCTTCCCT...,MTEVIPELLLKPELPSEPLACQMASWRALTHFSPSAIAAHSFQLLK...,74
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:389564-389698:-orf_1_control_set_1,1,135,ATGAAATGCCCTAATCAGAAGTCAGCTCCCCTCATGACCCTGGAGG...,MKCPNQKSAPLMTLEGILPTHRLFLILGLCCYLGLAPCHSQFGD*,45
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:520403-520710.norf_segment:520419-520571:+norf_1_control_set_1,1,153,CACAAGAGAGAATGTGAGCCAAGCAGCTTAGGGTTTAGGCAAGGCT...,HKRECEPSSLGFRQGFCLQETLGYEG*F*P*WAEPTGGI*GSAKLQ...,51
...,...,...,...,...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:26461204-26462129.norf_segment:26461401-26461700:-norf_1_control_set_1,1,300,CAGTTAATAGCAATGACCAATTCTTTTTGAGAAGGGCATAGTAGAG...,QLIAMTNSF*EGHSRALNQIQSFSLLIVVSTSMYYQNYYFFFCQCF...,100
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26588359-26588487:+orf_0_control_set_1,1,129,ATGAGTTTCCCAATTGGGGAATCACCTATAAATAATCTTGAAATAA...,MSFPIGESPINNLEISLKLGNLSSVLRAVNTIVLCSLSCLVG*,43
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26623039-26623182:-orf_0_control_set_1,1,144,ATGCTGTGCAGACCACGGCCTCCGCAGAGGATCCCCTCACCCAGGC...,MLCRPRPPQRIPSPRPQGLPSCSLRDEIFNNAVSVSPQCHPHKKVPA*,48
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26627091-26627363:-orf_1_control_set_1,1,273,ATGCCGCGAGCACCCGCCTCCCCGCGCTCTGCTGGCCTCCACACCC...,MPRAPASPRSAGLHTLGPPFLSQGQRGVLEAPELHGSLGGRRSGVR...,91


In [37]:
# Annotate ORF information onto the outgoing table, then build three "Joined_*"
# columns that hold, per row, whichever sequence information is available:
#   * gene rows (plength > 0) use the gene's protein sequence, length and CDS;
#   * control rows (ORF_Plength > 0) use the ORF amino-acid sequence, length
#     and nucleic sequence;
#   * rows with neither are reported and get missing values.

df_bloated = df5.join(complete_orf_info)

jp = []  # joined protein / amino-acid sequence
jl = []  # joined protein length
jg = []  # joined nucleotide sequence

for xi, yi in df_bloated.iterrows():
    if yi['plength'] > 0:
        jp.append(yi['proteinsequence'])
        jl.append(yi['plength'])
        jg.append(yi['cdssequence'])
    elif yi['ORF_Plength'] > 0:
        jp.append(yi['ORF_AA_Seq'])
        jl.append(yi['ORF_Plength'])
        jg.append(yi['ORF_Nucleic_Seq'])
    else:
        # Keep the lists aligned with the rows. Previously nothing was
        # appended here, so a single such row made the lists shorter than the
        # frame and the column assignments below failed with a length mismatch.
        print('WARNING', xi)
        jp.append(np.nan)
        jl.append(np.nan)
        jg.append(np.nan)

df_bloated['Joined_Plength'] = jl
df_bloated['Joined_Pseq'] = jp
df_bloated['Joined_GeneSeq'] = jg

df5 = df_bloated.copy()

In [38]:
# Write the full df5 table (all of its current columns) as a tab-separated
# file in the current working directory.
df5.to_csv('joined_54tissue_6group_df5_21436genes_47removed_7805orf_geneid.tsv',sep='\t')



In [39]:
# All tissues except the two germline ones
non_germ_tissues = list(megaDerm)
for germline_tissue in ('Testis', 'Ovary'):
    non_germ_tissues.remove(germline_tissue)

In [40]:
# Squished (lowercase, alphanumeric-only) names of the non-germline tissues,
# i.e. the form used for the df5 column names
non_germ_tissues_lower = [
    re.sub(r'\W+', '', tissue_name).replace('_', '').lower()
    for tissue_name in non_germ_tissues
]

In [41]:
# df5 without the individual non-germline tissue columns
df5_tissueless = df5.drop(columns=non_germ_tissues_lower)

In [43]:
# Table without the individual tissue columns: metadata, the four group means,
# testis and ovary, and the ORF / joined sequence columns.
df5_t6 = df5.loc[:, [
    'annotation', 'ps', 'geneid', 'description',
    'plength', 'gapgene', 'chr',
    'oldlongtranscriptstarthg38', 'oldlongtranscriptendhg38',
    'strand', 'cdsstarthg38', 'cdsendhg38',
    'proteinsequence', 'cdssequence',
    'inold', 'genestart', 'genestop', 'innew',
    'evoera38', 'evoera538', 'annotation38', 'evoera5',
    'brain', 'ecto', 'meso', 'endo', 'testis', 'ovary',
    'ORF_Control_Set', 'ORF_Gene_Length',
    'ORF_Nucleic_Seq', 'ORF_AA_Seq', 'ORF_Plength',
    'Joined_Plength', 'Joined_Pseq', 'Joined_GeneSeq',
]]

In [44]:
# Column order: non-germline tissues alphabetically first, followed by every
# remaining df5 column in its existing order.
df5_54 = df5.loc[:, [*sorted(non_germ_tissues_lower), *df5_tissueless.columns]]

In [45]:
# Number of repeated column names in df5_t6 (0 means all names are unique)
int(df5_t6.columns.duplicated().sum())

0

In [46]:
# De-duplicate the tissue names (expected: 54), squish them to the column-name
# form, and sort alphabetically
unique_tissues = list(set(megaDerm))
unique_tissues_lower = [
    re.sub(r'\W+', '', xi).lower().replace('_', '')
    for xi in unique_tissues
]
unique_tissues_lower.sort()

In [47]:
# Separately, write the tissue-first table (df5_54) as a tab-separated file
# in the current working directory.

df5_54.to_csv('meancount_54tissues_withORFseqsJoined_29241total_21436genes_47removed_7805controls.tsv',sep='\t')



In [48]:
# FIND SMALLEST NONZERO OF THE 54 TISSUE TABLE FOR EXPORT
#
# Per column, take the minimum over strictly positive values only. The earlier
# approach (replace 0 with 1, then builtin min) capped the answer at 1 whenever
# a column had no value below 1, and builtin min gives order-dependent results
# if a NaN is present.
# NOTE(review): assumes the first 54 columns of df5_54 are the tissue columns —
# confirm against the column order built for df5_54.
mins54 = [
    tissue_values[tissue_values > 0].min()
    for _, tissue_values in df5_54.iloc[:, :54].items()
]
# nanmin: a column with no positive value contributes NaN and is ignored
smallest_nonzero_tissue = np.nanmin(mins54)
smallest_nonzero_tissue

0.00023

In [49]:
# Write out the table WITHOUT the individual tissue columns (df5_t6): just the
# tissue-category columns plus metadata, tab-separated, into the current
# working directory.

df5_t6.to_csv('meancount_6groups_oldPS_withORFseqsJoined_29241total_21436genes_47removed_7805controls.tsv',sep='\t')



In [50]:
# Find the smallest nonzero value in the 6 major tissue categories
#
# Per column, take the minimum over strictly positive values only. The earlier
# approach (replace 0 with 1, then builtin min) capped the answer at 1 whenever
# a column had no value below 1, and builtin min gives order-dependent results
# if a NaN is present.

to_check_6germ = ['brain','ecto','meso','endo','ovary','testis']

mins6 = []
for xii in to_check_6germ:
    # to_numeric: the group-mean columns may be stored with object dtype
    group_values = pd.to_numeric(df5_t6[xii])
    mins6.append(group_values[group_values > 0].min())

# nanmin: a column with no positive value contributes NaN and is ignored
smallest_nonzero_germ = np.nanmin(mins6)
smallest_nonzero_germ

1.5714285714285715e-05

In [51]:
## DON'T PLOT HERE, SEPARATE NOTEBOOK FOR IT 