In [1]:
# Daniel Marten
# Normalization and Merge Samples on GTEx Outputs

import pandas as pd
import qtl.io as io
import qtl.norm as norm
import csv
from Bio.Seq import Seq



In [2]:
# Code to merge samples 

def merge_samples(paths, strip_dummy_gene=False, only_canon=False):

    dfs_combo = io.read_gct(paths[0])
    for part_num in range(len(paths))[1:]:
        df_new = io.read_gct(paths[part_num]).drop("Description", axis=1)
        dfs_combo = dfs_combo.join(df_new,how="left")
        
    # combined 
    if strip_dummy_gene:
        dfs_combo = dfs_combo[~dfs_combo.index.str.startswith("ENSG")]
    if only_canon:
        dfs_combo = dfs_combo[dfs_combo.index.str.startswith("ENSG")]
        
    return dfs_combo

In [3]:
# Intput of controls to keep 
set_001 = pd.read_csv('grch38_intergenic_controls_combined_ORFs_non_ORFs_set_1_7805_with_sequences_withGRCh37CrossRemoval.tsv',sep='\t')
index_to = set_001.Name

In [4]:
# Input names of paths to read in, in two different formats 
# From GCP Buckets 

new_endpath_names = [
  "marten_set1_0_3000.gene_reads.gct.gz",
  "marten_set2_3001_6000.gene_reads.gct.gz",
  "marten_set3_6001_9000.gene_reads.gct.gz",
  "marten_set4_9001_12000.gene_reads.gct.gz",
  "marten_set5_12001_15000.gene_reads.gct.gz",
  "marten_set6_15001_17382.gene_reads.gct.gz",
]
old_endpath_names = ["samples_1-5000_1x50.gene_reads.gct.gz",
                "5001_10000_51-100x100.gene_reads.gct.gz",
                "10000-15000_101-150x100.gene_reads.gct.gz",
                "15001-17382_151-174.gene_reads.gct.gz"]

victor_genes = ["gs://ug-marten/gtex-june2023/GRCh38_Jul14/Victors/june23_cn_dm_victors/" + xn for xn in new_endpath_names]
controls = ["gs://ug-marten/gtex-june2023/GRCh38/Combined_Controls_Deduplicated/june23_cn_dm_control/" + xn for xn in new_endpath_names]



In [5]:
# Read in per-sample metadata , for tissue types
sample_metadata = pd.read_csv("gs://ug-wphu/gtex_analysis/victor_2149+Ens89/gtex_samples_metadata.tsv", sep='\t')

In [6]:
# Run the merge_samples step on all Intergenic ORF and non-ORF Controls
controls_df = merge_samples(controls, strip_dummy_gene=True)
controls_df['Control_Set'] = [xi.split('_')[-1] for xi in controls_df.index]
controls_df['Status'] = [xi_s.split(':')[-1].split('_')[0][1:] for xi_s in controls_df.index]


In [7]:
# Print control set
controls_df

Unnamed: 0_level_0,Description,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,...,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O,Control_Set,Status
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:35726-35854:-orf_6_control_set_4,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,4,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:5560...,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,norf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:59864-60013:-orf_8_control_set_5,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,0,0,5,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:66632-67997.norf_segment:67741-67905:+norf_3_control_set_2,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:6663...,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,2,norf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:108680-109563.norf_segment:108819-108959:+norf_9_control_set_5,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:1086...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5,norf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26623039-26623182:-orf_0_control_set_1,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,orf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26627091-26627363:-orf_1_control_set_1,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,orf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:56881824-56882003:-orf_0_control_set_1,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,orf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:57070946-57071187.norf_segment:57070959-57071090:-norf_2_control_set_2,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:5707...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,norf


In [8]:
# Create data frame of controls that are indeed intended to be kept 
# this includes additional filtering apart from just set#1
df001 = controls_df.loc[index_to]
df001

Unnamed: 0_level_0,Description,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,...,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O,Control_Set,Status
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:5560...,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,norf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:188174-188317:+orf_0_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,7,1,4,2,3,2,6,7,5,...,2,4,2,1,4,3,1,1,1,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:274832-275053:+orf_0_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:389564-389698:-orf_1_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:520403-520710.norf_segment:520419-520571:+norf_1_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:5204...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,norf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:26461204-26462129.norf_segment:26461401-26461700:-norf_1_control_set_1,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:2646...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,norf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26588359-26588487:+orf_0_control_set_1,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,orf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26623039-26623182:-orf_0_control_set_1,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,orf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26627091-26627363:-orf_1_control_set_1,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,orf


In [9]:
# Complete dataframe of annotated and unannotated genes 
victor_genes_df = merge_samples(victor_genes, strip_dummy_gene=True)

In [10]:
# Group by tissue, a bit time-intensive
samples_by_tissue = sample_metadata.groupby('tissue_id').agg(set)


In [11]:
# Set appropriate annotation status for known genes
# annotated if starts with ENSP
# unannotated if not 
df001['annotation'] = df001.Status
victor_genes_df['annotation'] = "unannotated"
for xi,yi in victor_genes_df.iterrows():
    if yi["Description"][:4] == 'ENSP':
        victor_genes_df.loc[xi,'annotation'] = 'annotated'
victor_genes_df['annotation'].value_counts()


annotation
annotated      19334
unannotated     2149
Name: count, dtype: int64

In [12]:
# New, large data frame of controls (df_001) and annotated+unannotated (victor_genes_df)
# cs for complete set 
cs = pd.concat([df001,victor_genes_df])
cs

Unnamed: 0_level_0,Description,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,...,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O,Control_Set,Status,annotation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:5560...,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,norf,norf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:188174-188317:+orf_0_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,7,1,4,2,3,2,6,7,5,...,4,2,1,4,3,1,1,1,orf,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:274832-275053:+orf_0_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,orf,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:389564-389698:-orf_1_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,orf,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:520403-520710.norf_segment:520419-520571:+norf_1_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:5204...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,norf,norf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vdp2013_S4_995_nt,vdp2013_S4_995_nt,144,42,113,66,45,68,58,185,79,...,59,190,79,287,145,97,120,,,unannotated
vdp2013_S4_997_nt,vdp2013_S4_997_nt,40,57,25,32,27,31,29,48,55,...,40,28,19,34,31,29,58,,,unannotated
vdp2013_S4_998_nt,vdp2013_S4_998_nt,201,65,192,178,65,101,218,255,249,...,165,197,221,124,195,62,264,,,unannotated
vdp2013_S4_999_nt,vdp2013_S4_999_nt,24,12,55,13,9,59,18,17,34,...,19,45,34,15,43,27,26,,,unannotated


In [13]:
# EXCLUDE 47 UNANNOTATED GENES WHICH WERE NOT IN AMIR KARGER'S MOST RECENT RELEASE
# 47 UGs which did not map properly
exclusion = pd.read_csv('unannotated_47_to_exclude.tsv',sep='\t')
exclusion['dropper'] = [xi + '_nt' for xi in exclusion.Name]
cs = cs.drop(exclusion.dropper)
cs

Unnamed: 0_level_0,Description,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,...,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O,Control_Set,Status,annotation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:5560...,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,norf,norf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:188174-188317:+orf_0_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,7,1,4,2,3,2,6,7,5,...,4,2,1,4,3,1,1,1,orf,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:274832-275053:+orf_0_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,orf,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:389564-389698:-orf_1_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,orf,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:520403-520710.norf_segment:520419-520571:+norf_1_control_set_1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:5204...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,norf,norf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vdp2013_S4_995_nt,vdp2013_S4_995_nt,144,42,113,66,45,68,58,185,79,...,59,190,79,287,145,97,120,,,unannotated
vdp2013_S4_997_nt,vdp2013_S4_997_nt,40,57,25,32,27,31,29,48,55,...,40,28,19,34,31,29,58,,,unannotated
vdp2013_S4_998_nt,vdp2013_S4_998_nt,201,65,192,178,65,101,218,255,249,...,165,197,221,124,195,62,264,,,unannotated
vdp2013_S4_999_nt,vdp2013_S4_999_nt,24,12,55,13,9,59,18,17,34,...,19,45,34,15,43,27,26,,,unannotated


In [14]:
# Make sure there are only 54 tissues 
tissues = [x for x in samples_by_tissue["entity:sample_id"].index]
normalized_tissue_dfs = []
meaned_tissue_dfs = []
len(tissues)

54

In [15]:
# Code to get normalized counts per tissue
# This runs Deseq2's normalization method on each tissue counts and then outputs
# To normalize differences between samples per tissue 
# Outputs one version that is normalized everything, for individual samples 
# Outputs another that gets the mean count for each tissue among all samples 
# the latter is what we work with anyways 
# and mean-ing is not done until after normalization 

for tissue in tissues:
    print(tissue)
    new_df = pd.DataFrame()
    new_df = norm.deseq2_normalized_counts(cs[list((samples_by_tissue["entity:sample_id"][tissue]))])
    normalized_tissue_dfs.append(new_df)
    # 
    sep_df = pd.DataFrame()
    sep_df = norm.deseq2_normalized_counts(cs[list((samples_by_tissue["entity:sample_id"][tissue]))])
    sep_df[tissue] = sep_df.mean(axis=1) # additional step
    sep_df = sep_df[[tissue]]
    meaned_tissue_dfs.append(sep_df)

    

Adipose_Subcutaneous
Adipose_Visceral_Omentum
Adrenal_Gland
Artery_Aorta
Artery_Coronary
Artery_Tibial
Bladder
Brain_Amygdala
Brain_Anterior_cingulate_cortex_BA24
Brain_Caudate_basal_ganglia
Brain_Cerebellar_Hemisphere
Brain_Cerebellum
Brain_Cortex
Brain_Frontal_Cortex_BA9
Brain_Hippocampus
Brain_Hypothalamus
Brain_Nucleus_accumbens_basal_ganglia
Brain_Putamen_basal_ganglia
Brain_Spinal_cord_cervical_c-1
Brain_Substantia_nigra
Breast_Mammary_Tissue
Cells_Cultured_fibroblasts
Cells_EBV-transformed_lymphocytes
Cervix_Ectocervix
Cervix_Endocervix
Colon_Sigmoid
Colon_Transverse
Esophagus_Gastroesophageal_Junction
Esophagus_Mucosa
Esophagus_Muscularis
Fallopian_Tube
Heart_Atrial_Appendage
Heart_Left_Ventricle
Kidney_Cortex
Kidney_Medulla
Liver
Lung
Minor_Salivary_Gland
Muscle_Skeletal
Nerve_Tibial
Ovary
Pancreas
Pituitary
Prostate
Skin_Not_Sun_Exposed_Suprapubic
Skin_Sun_Exposed_Lower_leg
Small_Intestine_Terminal_Ileum
Spleen
Stomach
Testis
Thyroid
Uterus
Vagina
Whole_Blood


In [16]:
meaned_tissue_dfs[-5]

Unnamed: 0_level_0,Testis
Name,Unnamed: 1_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,0.143352
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:188174-188317:+orf_0_control_set_1,6.242778
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:274832-275053:+orf_0_control_set_1,0.010114
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:389564-389698:-orf_1_control_set_1,0.019024
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:520403-520710.norf_segment:520419-520571:+norf_1_control_set_1,0.000000
...,...
vdp2013_S4_995_nt,77.271776
vdp2013_S4_997_nt,28.657662
vdp2013_S4_998_nt,298.212047
vdp2013_S4_999_nt,46.252045


In [17]:
# Normalized DF with information for all individual tissue samples 
normalized_combined_df = pd.DataFrame()
normalized_combined_df = normalized_tissue_dfs[0].join(normalized_tissue_dfs[1:], how='left').round(5)
normalized_combined_df

Unnamed: 0_level_0,GTEX-R55C-1626-SM-48FEG,GTEX-XPVG-2726-SM-4B66W,GTEX-1B8SG-0226-SM-7939Q,GTEX-QLQW-1226-SM-2S1Q9,GTEX-145ME-1926-SM-5MR6T,GTEX-1HFI6-0326-SM-ADEI5,GTEX-ZVE2-0326-SM-57WFC,GTEX-1F6I4-0226-SM-9MQMA,GTEX-1MUQO-0526-SM-E9J31,GTEX-1PPGY-0326-SM-DTXFD,...,GTEX-ZTPG-0006-SM-4YCFG,GTEX-QMR6-0005-SM-32PKY,GTEX-1E1VI-0005-SM-ARU6H,GTEX-13111-0005-SM-5NQ7Z,GTEX-ZXG5-0005-SM-57WCN,GTEX-WRHK-0005-SM-3MJF5,GTEX-TMMY-0005-SM-33HBN,GTEX-PLZ6-0006-SM-33HBZ,GTEX-RVPV-0006-SM-2TF6Q,GTEX-XLM4-0005-SM-4AT4P
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,0.00000,1.10598,1.37007,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.91007
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:188174-188317:+orf_0_control_set_1,2.25580,2.21195,6.85037,0.00000,3.20424,0.00000,2.15011,3.23058,3.75983,4.99539,...,2.24974,0.00000,1.50885,6.85273,7.16948,3.82292,1.83134,8.79783,2.80297,7.28057
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:274832-275053:+orf_0_control_set_1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:389564-389698:-orf_1_control_set_1,0.00000,0.00000,0.00000,0.00000,1.06808,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:520403-520710.norf_segment:520419-520571:+norf_1_control_set_1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vdp2013_S4_995_nt,68.80198,123.86942,197.29051,118.08887,112.14850,116.55325,112.88099,107.68586,125.95423,85.63528,...,43.86985,30.47347,34.70365,18.06629,37.15092,35.04343,25.63880,31.13077,22.42374,19.11149
vdp2013_S4_997_nt,27.06963,51.98092,46.58248,43.50643,50.19980,33.65270,34.40183,40.92063,43.86466,38.53587,...,15.74815,0.00000,21.12396,16.82034,34.54384,14.01737,20.14477,42.63563,22.42374,19.11149
vdp2013_S4_998_nt,260.54521,261.01055,324.70730,233.07014,291.58609,235.56889,269.83933,220.75601,285.12027,201.95652,...,56.24339,266.64283,102.60209,109.64371,103.63151,88.56432,130.02536,76.47342,89.69498,168.36311
vdp2013_S4_999_nt,19.17432,28.75540,42.47226,52.82923,30.97435,31.19031,44.07734,32.30576,40.73147,11.41804,...,19.68519,0.00000,43.75677,14.32844,31.93676,46.51220,54.20776,84.59449,39.24155,17.29135


In [18]:
# DF with information by tissue 
mean_combined_df = meaned_tissue_dfs[0].join(meaned_tissue_dfs[1:], how='left').round(5)
mean_combined_df

Unnamed: 0_level_0,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Bladder,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,...,Skin_Not_Sun_Exposed_Suprapubic,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,0.13836,0.17175,0.26830,0.07284,0.13015,0.11775,0.12869,0.53660,0.43887,0.60124,...,0.17479,0.19805,0.20274,0.13685,0.21237,0.14335,0.12440,0.10187,0.13732,0.05420
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:188174-188317:+orf_0_control_set_1,4.86616,3.80680,3.94178,3.89285,4.24839,3.92053,4.05539,1.87525,1.94112,2.19779,...,6.42494,6.15759,4.99554,6.49090,3.31793,6.24278,6.42749,6.82664,5.37229,3.29182
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:274832-275053:+orf_0_control_set_1,0.04477,0.01229,0.00000,0.00797,0.01523,0.00113,0.00000,0.00000,0.03869,0.00000,...,0.01044,0.00609,0.00443,0.03881,0.00291,0.01011,0.01224,0.01396,0.00000,0.04289
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:389564-389698:-orf_1_control_set_1,0.02227,0.02639,0.01760,0.00697,0.01329,0.00778,0.04006,0.04281,0.07684,0.06839,...,0.01540,0.01371,0.03487,0.00358,0.02535,0.01902,0.02594,0.01509,0.02363,0.00428
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:520403-520710.norf_segment:520419-520571:+norf_1_control_set_1,0.00000,0.00153,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00205,0.00173,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00557,0.00106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vdp2013_S4_995_nt,114.22039,101.54883,68.93886,91.20602,93.59158,116.57940,115.27335,27.66570,30.40872,39.47657,...,125.65639,123.91386,82.83554,53.80745,71.18307,77.27178,94.48100,99.45430,190.27711,28.26805
vdp2013_S4_997_nt,44.13762,33.59868,27.36927,34.07429,35.58481,41.66631,36.37751,34.32911,49.21922,32.17122,...,43.89385,49.42956,41.68812,28.70325,37.42835,28.65766,31.26090,30.04364,36.78778,20.00996
vdp2013_S4_998_nt,251.08933,211.76876,259.63283,181.18146,197.17140,179.48804,215.75188,104.99212,124.67759,152.83093,...,207.48316,208.79061,278.27528,301.76628,157.19595,298.21205,280.34296,270.41003,201.19458,111.39997
vdp2013_S4_999_nt,31.92265,29.03936,12.65225,26.10016,27.81389,35.18813,33.32910,3.94804,3.67438,4.75083,...,18.43340,18.48695,40.75387,30.63705,18.92013,46.25205,21.98046,39.52910,26.82853,28.07457


In [19]:
# adding annotation and evolutionary era information 
normalized_combined_df['annotation'] = cs['annotation']
normalized_combined_df['evo_era'] = cs['annotation']

In [20]:
normalized_combined_df

Unnamed: 0_level_0,GTEX-R55C-1626-SM-48FEG,GTEX-XPVG-2726-SM-4B66W,GTEX-1B8SG-0226-SM-7939Q,GTEX-QLQW-1226-SM-2S1Q9,GTEX-145ME-1926-SM-5MR6T,GTEX-1HFI6-0326-SM-ADEI5,GTEX-ZVE2-0326-SM-57WFC,GTEX-1F6I4-0226-SM-9MQMA,GTEX-1MUQO-0526-SM-E9J31,GTEX-1PPGY-0326-SM-DTXFD,...,GTEX-1E1VI-0005-SM-ARU6H,GTEX-13111-0005-SM-5NQ7Z,GTEX-ZXG5-0005-SM-57WCN,GTEX-WRHK-0005-SM-3MJF5,GTEX-TMMY-0005-SM-33HBN,GTEX-PLZ6-0006-SM-33HBZ,GTEX-RVPV-0006-SM-2TF6Q,GTEX-XLM4-0005-SM-4AT4P,annotation,evo_era
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,0.00000,1.10598,1.37007,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.91007,norf,norf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:188174-188317:+orf_0_control_set_1,2.25580,2.21195,6.85037,0.00000,3.20424,0.00000,2.15011,3.23058,3.75983,4.99539,...,1.50885,6.85273,7.16948,3.82292,1.83134,8.79783,2.80297,7.28057,orf,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:274832-275053:+orf_0_control_set_1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,orf,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:389564-389698:-orf_1_control_set_1,0.00000,0.00000,0.00000,0.00000,1.06808,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,orf,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:520403-520710.norf_segment:520419-520571:+norf_1_control_set_1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,norf,norf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vdp2013_S4_995_nt,68.80198,123.86942,197.29051,118.08887,112.14850,116.55325,112.88099,107.68586,125.95423,85.63528,...,34.70365,18.06629,37.15092,35.04343,25.63880,31.13077,22.42374,19.11149,unannotated,unannotated
vdp2013_S4_997_nt,27.06963,51.98092,46.58248,43.50643,50.19980,33.65270,34.40183,40.92063,43.86466,38.53587,...,21.12396,16.82034,34.54384,14.01737,20.14477,42.63563,22.42374,19.11149,unannotated,unannotated
vdp2013_S4_998_nt,260.54521,261.01055,324.70730,233.07014,291.58609,235.56889,269.83933,220.75601,285.12027,201.95652,...,102.60209,109.64371,103.63151,88.56432,130.02536,76.47342,89.69498,168.36311,unannotated,unannotated
vdp2013_S4_999_nt,19.17432,28.75540,42.47226,52.82923,30.97435,31.19031,44.07734,32.30576,40.73147,11.41804,...,43.75677,14.32844,31.93676,46.51220,54.20776,84.59449,39.24155,17.29135,unannotated,unannotated


In [21]:
# Further adding information 
mean_combined_df['annotation'] = cs['annotation']
mean_combined_df

Unnamed: 0_level_0,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Bladder,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,...,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood,annotation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,0.13836,0.17175,0.26830,0.07284,0.13015,0.11775,0.12869,0.53660,0.43887,0.60124,...,0.19805,0.20274,0.13685,0.21237,0.14335,0.12440,0.10187,0.13732,0.05420,norf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:188174-188317:+orf_0_control_set_1,4.86616,3.80680,3.94178,3.89285,4.24839,3.92053,4.05539,1.87525,1.94112,2.19779,...,6.15759,4.99554,6.49090,3.31793,6.24278,6.42749,6.82664,5.37229,3.29182,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:274832-275053:+orf_0_control_set_1,0.04477,0.01229,0.00000,0.00797,0.01523,0.00113,0.00000,0.00000,0.03869,0.00000,...,0.00609,0.00443,0.03881,0.00291,0.01011,0.01224,0.01396,0.00000,0.04289,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:389564-389698:-orf_1_control_set_1,0.02227,0.02639,0.01760,0.00697,0.01329,0.00778,0.04006,0.04281,0.07684,0.06839,...,0.01371,0.03487,0.00358,0.02535,0.01902,0.02594,0.01509,0.02363,0.00428,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:520403-520710.norf_segment:520419-520571:+norf_1_control_set_1,0.00000,0.00153,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00173,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00557,0.00106,norf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vdp2013_S4_995_nt,114.22039,101.54883,68.93886,91.20602,93.59158,116.57940,115.27335,27.66570,30.40872,39.47657,...,123.91386,82.83554,53.80745,71.18307,77.27178,94.48100,99.45430,190.27711,28.26805,unannotated
vdp2013_S4_997_nt,44.13762,33.59868,27.36927,34.07429,35.58481,41.66631,36.37751,34.32911,49.21922,32.17122,...,49.42956,41.68812,28.70325,37.42835,28.65766,31.26090,30.04364,36.78778,20.00996,unannotated
vdp2013_S4_998_nt,251.08933,211.76876,259.63283,181.18146,197.17140,179.48804,215.75188,104.99212,124.67759,152.83093,...,208.79061,278.27528,301.76628,157.19595,298.21205,280.34296,270.41003,201.19458,111.39997,unannotated
vdp2013_S4_999_nt,31.92265,29.03936,12.65225,26.10016,27.81389,35.18813,33.32910,3.94804,3.67438,4.75083,...,18.48695,40.75387,30.63705,18.92013,46.25205,21.98046,39.52910,26.82853,28.07457,unannotated


In [22]:
# Writing out - more information to be appended later 
# large file

# code for when this file is exported
# uncomment lines below to export
# normalized_combined_df.to_csv('combined_controls_orfs_norfs_set1_victor_genes_normalized_counts_GRCh38_29241total_47UGremoved.gct.gz',sep='\t')





In [23]:
# Writing out - more information to be appended later 
# more manageable file 
mean_combined_df.to_csv('combined_controls_orfs_norfs_set1_victor_genes_normalized_meaned_counts_GRCh38_29241total_47UGremoved.gct.gz',sep='\t')
# NOTE: this above file is then annotated with:
# gene annotation, phylostratigraphic, and evolutionary category information. 

In [24]:
# FURTHER PROCESSING IN (THIS) JUPYTER NOTEBOOK: ADD EVO_ERA_CAT AND SPLIT TO TISSUE GROUPS (but not germ)
# import pandas as pd

# Read in phylogenetic information from outside table, to annotate our information with 
phylo_df = pd.read_csv('/Users/marten/ug-gc/marten_completeGRCh38_21436genes_47UGremoved_gene_transcript_cds_metadata_mashup_old_new_diffFix_20231005.tsv',sep='\t',index_col='Name')

In [25]:
# Evolutionary era mapping information 

mapping_file = pd.read_csv('gs://ug-wphu/gtex_vm_files/victor_data/gene_and_control_annotation_mapping_with_evo_era_10222021.tsv',sep='\t')
mapping_file = mapping_file[~mapping_file['annotation'].isin(['original_iorf_controls','random_controls','new_iorf_controls'])]
mapping_file.set_index('Name',inplace=True)
mapping_file

Unnamed: 0_level_0,annotation,evolutionary_era_cat
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Baz_Hs_103,unannotated,humans
Baz_Hs_108,unannotated,primates
Baz_Hs_10,unannotated,humans
Baz_Hs_112,unannotated,primates
Baz_Hs_113,unannotated,humans
...,...,...
vdp2013_S4_995,unannotated,humans
vdp2013_S4_997,unannotated,humans
vdp2013_S4_998,unannotated,humans
vdp2013_S4_999,unannotated,humans


In [26]:
# Initialize columns 
mean_combined_df['evo_era'] = None
mean_combined_df['updated_PS'] = None


In [27]:
# Annotated evolutionary era and updated PS number information on to the mean_combined table

for gene_name,rowinfo in mean_combined_df.iterrows():
    newname = gene_name.split('_nt')[0]
    try :
        newname_info = mapping_file.loc[newname]
        mean_combined_df.loc[gene_name,'evo_era'] = newname_info['evolutionary_era_cat']
        mean_combined_df.loc[gene_name,'updated_PS'] = phylo_df.loc[newname,'PS']
    except:
        pass
    

In [28]:
# melt-ed eventual output of mean_combined table
eventual_output = mean_combined_df.melt(id_vars=['annotation','evo_era','updated_PS'],ignore_index=False,value_name='mean(count)',var_name='tissue').reset_index()
# Sort but don't drop
eventual_output = eventual_output.sort_values(by=['tissue','Name','annotation'])
# Output to tsv, to be read and binned in rstudio
eventual_output.to_csv('gs://ug-marten/gtex-june2023/GRCh38/Combined_Controls_Deduplicated/marten_meancounts_bytissue_melted_evoera_annotation_21436Genes_47UGremoved_7805control_updated.tsv',sep='\t',index=False)



In [29]:
# Construct a protein mapping df

vg_df = victor_genes_df['annotation'].to_frame()
pd_df = df001['annotation'].to_frame()
pd001 = pd.concat([vg_df,pd_df])
pd001

# annotation mapping 

protein_annotation_mapping_df = pd001.copy() # victor_genes_df['annotation'].append([df001['annotation']]).to_frame()
protein_annotation_mapping_df

Unnamed: 0_level_0,annotation
Name,Unnamed: 1_level_1
Baz_Hs_103_nt,unannotated
Baz_Hs_108_nt,unannotated
Baz_Hs_10_nt,unannotated
Baz_Hs_112_nt,unannotated
Baz_Hs_113_nt,unannotated
...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:26461204-26462129.norf_segment:26461401-26461700:-norf_1_control_set_1,norf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26588359-26588487:+orf_0_control_set_1,orf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26623039-26623182:-orf_0_control_set_1,orf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26627091-26627363:-orf_1_control_set_1,orf


In [30]:
# Adding annotation information based on name while ignoring _nt

protein_annotation_mapping_df['evolutionary_era_cat'] = protein_annotation_mapping_df['annotation']
for xi,yi in protein_annotation_mapping_df.iterrows():
    shortname = xi.split('_nt')[0]
    try:
        protein_annotation_mapping_df.loc[xi,'evolutionary_era_cat'] = mapping_file.loc[shortname,'evolutionary_era_cat']
    except:
        pass # ug 


In [31]:
protein_annotation_mapping_df

Unnamed: 0_level_0,annotation,evolutionary_era_cat
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Baz_Hs_103_nt,unannotated,humans
Baz_Hs_108_nt,unannotated,primates
Baz_Hs_10_nt,unannotated,humans
Baz_Hs_112_nt,unannotated,primates
Baz_Hs_113_nt,unannotated,humans
...,...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:26461204-26462129.norf_segment:26461401-26461700:-norf_1_control_set_1,norf,norf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26588359-26588487:+orf_0_control_set_1,orf,orf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26623039-26623182:-orf_0_control_set_1,orf,orf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26627091-26627363:-orf_1_control_set_1,orf,orf


In [32]:
# Drop things that are to be excluded 
# already excluded
# protein_annotation_mapping_df = protein_annotation_mapping_df.drop([p_id.replace('_nt','') for p_id in exclusion.dropper])
# protein_annotation_mapping_df

In [33]:
# output with protein annotation , to be read in later in Rstudio
protein_annotation_mapping_df.to_csv('gs://ug-marten/gtex-june2023/GRCh38/Combined_Controls_Deduplicated/marten_combined_set1_victor_genes_annotation_evoera_update_mapping_21436Genes_47UGremoved_7805control_20231005.tsv',sep='\t')

