In [1]:
## Daniel Marten
## Creating GRCh38 BED files for controls

import warnings
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import random



In [2]:
# Nam_controls: downloaded 'bam' files from Nam
# 10 sets of GRCh38 controls, each with a pair of Intergenic ORF and Non-Intergenic ORFs
# Make new dataframes and compress to FIVE control sets moving forward

norf_dfs = []
orf_dfs = []

for gtfidx in range(10):
    # Reading in all GRCh38 ORFs
    orf_path = f'/Users/marten/Downloads/nam_controls_marten_June_2023/GRCh38_Ens89/sample-ORFs/GRCh38.Ens89.dna_rm.chromosome.all.intergenic_gt122.orfs.sample{gtfidx}.bed'
    norf_path = f'/Users/marten/Downloads/nam_controls_marten_June_2023/GRCh38_Ens89/sample-nORFs/GRCh38.Ens89.dna_rm.chromosome.all.intergenic_gt122.norf_sample{gtfidx}.bed'
    new_orf = pd.read_csv(orf_path,sep='\t').reset_index()
    new_norf = pd.read_csv(norf_path,sep='\t').reset_index()
    
    # Formatting
    col_renames = {'level_0':'Chr',
               'level_1':'Gene_Start_hg38',
               'level_2':'Gene_End_hg38',
               'level_3':'Name',
               '# All coordinates are 0-indexed, coordinates in ID column are end-inclusive, and coordinates in start/end columns are end-exclusive (as is conventional)':'Strand'
              }
    
    new_orf = new_orf.rename(columns=col_renames)
    new_orf = new_orf.set_index('Name').drop('level_4',axis=1)
    new_orf['Status'] = [f'orf_{gtfidx}']*new_orf.shape[0]
    new_orf['Control_Set'] = (gtfidx//2)+1 # THIS REDUCES THE 10 CONTROL GROUPS TO 5
    #new_orf['annotation'] = ['intergenics']*new_orf.shape[0]
    
    new_norf = new_norf.rename(columns=col_renames)
    new_norf = new_norf.set_index('Name').drop('level_4',axis=1)
    new_norf['Status'] = [f'norf_{gtfidx}']*new_norf.shape[0]
    new_norf['Control_Set'] = (gtfidx//2)+1 # THIS REDUCES THE 10 CONTROL GROUPS TO 5
    #new_norf['annotation'] = ['intergenics']*new_norf.shape[0]

    orf_dfs.append(new_orf)
    norf_dfs.append(new_norf)

In [3]:
raw_bed = pd.read_csv(f'/Users/marten/Downloads/nam_controls_marten_June_2023/GRCh38_Ens89/sample-ORFs/GRCh38.Ens89.dna_rm.chromosome.all.intergenic_gt122.orfs.sample0.bed',
                     sep='\t').reset_index()
raw_bed

Unnamed: 0,level_0,level_1,level_2,level_3,level_4,"# All coordinates are 0-indexed, coordinates in ID column are end-inclusive, and coordinates in start/end columns are end-exclusive (as is conventional)"
0,chr1,188174,188318,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,+
1,chr1,274832,275054,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,+
2,chr1,4879673,4880000,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,+
3,chr1,4935350,4935473,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,+
4,chr1,9453457,9453667,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,+
...,...,...,...,...,...,...
1995,chrY,21505338,21505467,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,0,-
1996,chrY,22446066,22446234,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,0,-
1997,chrY,24193329,24193533,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,0,-
1998,chrY,26623039,26623183,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,0,-


In [4]:
# Properly formatted dataframe of ALL control sequences - 40,000 of them 
control_list = orf_dfs + norf_dfs
control_df = pd.concat(control_list)
control_df

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Strand,Status,Control_Set
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:188174-188317:+,chr1,188174,188318,+,orf_0,1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:274832-275053:+,chr1,274832,275054,+,orf_0,1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:4879673-4879999:+,chr1,4879673,4880000,+,orf_0,1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:4935350-4935472:+,chr1,4935350,4935473,+,orf_0,1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:9453457-9453666:+,chr1,9453457,9453667,+,orf_0,1
...,...,...,...,...,...,...
GRCh38.Ens89.dna_rm.chr8.intergenic_gt122:48902094-48904717.norf_segment:48903937-48904062:-,chr8,48903937,48904063,-,norf_9,5
GRCh38.Ens89.dna_rm.chr2.intergenic_gt122:2747860-2748261.norf_segment:2748014-2748175:-,chr2,2748014,2748176,-,norf_9,5
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55862538-55863365.norf_segment:55862998-55863138:-,chr1,55862998,55863139,-,norf_9,5
GRCh38.Ens89.dna_rm.chr4.intergenic_gt122:117634080-117635194.norf_segment:117634743-117635027:-,chr4,117634743,117635028,-,norf_9,5


In [5]:
# User providing some annotations and length
### HEIRARCHY
# there are duplicates of many Intergenic ORFs between the different control sets, which we do not want
# remove: duplicate ORFs between control sets 
# remove: any intergenic control sequences that overlap annotated or unannotated genes 
# heirarchy: annotated>unannotated>Igen-ORF>Igen-Non-ORF
###
# annotations for sorting later
control_df['user_length'] = control_df.Gene_End_hg38 - control_df.Gene_Start_hg38
control_df['removal'] = False
control_df['heirarchy'] = 3
control_df.loc[control_df['Status'].str.contains('norf'),'heirarchy'] = 4
control_df

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Strand,Status,Control_Set,user_length,removal,heirarchy
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:188174-188317:+,chr1,188174,188318,+,orf_0,1,144,False,3
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:274832-275053:+,chr1,274832,275054,+,orf_0,1,222,False,3
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:4879673-4879999:+,chr1,4879673,4880000,+,orf_0,1,327,False,3
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:4935350-4935472:+,chr1,4935350,4935473,+,orf_0,1,123,False,3
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:9453457-9453666:+,chr1,9453457,9453667,+,orf_0,1,210,False,3
...,...,...,...,...,...,...,...,...,...
GRCh38.Ens89.dna_rm.chr8.intergenic_gt122:48902094-48904717.norf_segment:48903937-48904062:-,chr8,48903937,48904063,-,norf_9,5,126,False,4
GRCh38.Ens89.dna_rm.chr2.intergenic_gt122:2747860-2748261.norf_segment:2748014-2748175:-,chr2,2748014,2748176,-,norf_9,5,162,False,4
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55862538-55863365.norf_segment:55862998-55863138:-,chr1,55862998,55863139,-,norf_9,5,141,False,4
GRCh38.Ens89.dna_rm.chr4.intergenic_gt122:117634080-117635194.norf_segment:117634743-117635027:-,chr4,117634743,117635028,-,norf_9,5,285,False,4


In [6]:
# HERE: remove duplicate instances of intergenic ORFs
# and instances of control sequences overlapping 
# decide by: heirarchy, then length, then whichever occurs first 
# keep Intergenic, longest, first-starting

fixed_list = []
dupl_list = []
removed_list = []
gi = 'Gene_Start_hg38'
ge = 'Gene_End_hg38'

total_dups = 0
total_overlaps = 0
by_heirarchy = 0
by_length = 0
punt = 0

for unique_chrom in sorted(control_df.Chr.unique()):
    print(unique_chrom)
    by_chrom_df = control_df[control_df.Chr==unique_chrom]
    
    by_chrom_df = by_chrom_df.sort_values(by=['Chr',gi,ge])
    
    dupls = by_chrom_df[by_chrom_df.duplicated(['Chr','Gene_Start_hg38','Gene_End_hg38'],keep='first')]
    dupl_list.append(dupls)
    
    by_chrom_df_dd = by_chrom_df[~by_chrom_df.duplicated(['Chr','Gene_Start_hg38','Gene_End_hg38'],keep='first')]
    
    total_dups += dupls.shape[0]
    
    for xi,yi in by_chrom_df_dd.iterrows():
        
        index_start = yi[gi]
        index_end = yi[ge]
        
        query_new = by_chrom_df_dd[(by_chrom_df_dd[gi]>(index_start-1))&(by_chrom_df_dd[ge]<(index_end+1000000))]
        
        for xii,yii in query_new.iterrows():
            
            query_start = yii[gi]
            query_end = yii[ge]
            
            case_1 = query_start in range(index_start,index_end+1)
            case_2 = query_end in range(index_start,index_end+1)
            case_3 = index_start in range(int(query_start),int(query_end)+1)
            
            if (xi!=xii):
                if any([case_1,case_2,case_3]):
                    # instance of overlap!
                        total_overlaps += 1
                        if yii['heirarchy'] > yi['heirarchy']:
                            by_chrom_df_dd.loc[xii,'removal'] = True
                            by_heirarchy += 1
                        elif yii['heirarchy'] < yi['heirarchy']:
                            by_chrom_df_dd.loc[xi,'removal'] = True
                            by_heirarchy += 1
                        else:
                            if yii['user_length']>yi['user_length']:
                                by_chrom_df_dd.loc[xi,'removal'] = True
                                by_length += 1
                            elif yii['user_length']<yi['user_length']:
                                by_chrom_df_dd.loc[xii,'removal'] = True
                                by_length += 1
                            else:
                                # Punt - slang to defer decision 
                                # decide which to keep based on which with earliest start
                                punt += 1
                                if yii[gi] > yi[gi]:
                                    by_chrom_df_dd.loc[xii,'removal'] = True
                                else:
                                    by_chrom_df_dd.loc[xi,'removal'] = True
     
    removed_list.append(by_chrom_df_dd[by_chrom_df_dd.removal])
    
    by_chrom_df_dd = by_chrom_df_dd[~by_chrom_df_dd.removal]
    fixed_list.append(by_chrom_df_dd)
    
print('Total duplicates removed (instances): ',total_dups, '(note that first occurence is kept)') # note that the first occurence is kept 
print('Total non-duplicate overlaps observed: ',total_overlaps)
print('Overlaps decided by heirarchy: ',by_heirarchy)
print('Overlaps decided by length: ',by_length)
print('Overlaps where the first occuring one is taken: ',punt)

fixed_df = pd.concat(fixed_list)

fixed_df.shape
            
            
    
    

chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr20
chr21
chr22
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrX
chrY
Total duplicates removed (instances):  108 (note that first occurence is kept)
Total non-duplicate overlaps observed:  532
Overlaps decided by heirarchy:  260
Overlaps decided by length:  267
Overlaps where the first occuring one is taken:  5


(39370, 9)

In [7]:
# Print list of all removed controls 
internal_removal = pd.concat(dupl_list + removed_list)
internal_removal

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Strand,Status,Control_Set,user_length,removal,heirarchy
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:381772-381948:-,chr1,381772,381949,-,orf_5,3,177,False,3
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:389564-389698:-,chr1,389564,389699,-,orf_5,3,135,False,3
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:9286319-9286465:-,chr1,9286319,9286466,-,orf_9,5,147,False,3
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:30588657-30588800:+,chr1,30588657,30588801,+,orf_8,5,144,False,3
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:82185132-82185320:-,chr1,82185132,82185321,-,orf_2,2,189,False,3
...,...,...,...,...,...,...,...,...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:376685-376912:-,chrY,376685,376913,-,orf_0,1,228,True,3
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:9803333-9804623.norf_segment:9803717-9803839:+,chrY,9803717,9803840,+,norf_5,3,123,True,4
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:14131327-14132345.norf_segment:14131415-14131543:-,chrY,14131415,14131544,-,norf_3,2,129,True,4
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:14853739-14853882:+,chrY,14853739,14853883,+,orf_8,5,144,True,3


In [8]:
# SANITY CHECK: looking at duplicates and making sure they're removed
dupl_list[-5] # "random" removed duplicates by chromosome 

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Strand,Status,Control_Set,user_length,removal,heirarchy
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GRCh38.Ens89.dna_rm.chr7.intergenic_gt122.orf:49124942-49125067:-,chr7,49124942,49125068,-,orf_8,5,126,False,3
GRCh38.Ens89.dna_rm.chr7.intergenic_gt122.orf:128215691-128216032:-,chr7,128215691,128216033,-,orf_9,5,342,False,3


In [9]:
control_df[(control_df.Chr=='chr7') & (control_df.Gene_Start_hg38==128215691)] # how the duplicate looks in original

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Strand,Status,Control_Set,user_length,removal,heirarchy
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GRCh38.Ens89.dna_rm.chr7.intergenic_gt122.orf:128215691-128216032:-,chr7,128215691,128216033,-,orf_6,4,342,False,3
GRCh38.Ens89.dna_rm.chr7.intergenic_gt122.orf:128215691-128216032:-,chr7,128215691,128216033,-,orf_9,5,342,False,3


In [10]:
fixed_df[(fixed_df.Chr=='chr7') & (fixed_df.Gene_Start_hg38==128215691)] # how the duplicate looks in final

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Strand,Status,Control_Set,user_length,removal,heirarchy
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GRCh38.Ens89.dna_rm.chr7.intergenic_gt122.orf:128215691-128216032:-,chr7,128215691,128216033,-,orf_6,4,342,False,3


In [11]:
# SANITY CHECK - making sure removed things are actually removed 
removed_list[len(removed_list)//2] # chosen chromosome of removed samples 

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Strand,Status,Control_Set,user_length,removal,heirarchy
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GRCh38.Ens89.dna_rm.chr20.intergenic_gt122:1846342-1846867.norf_segment:1846384-1846506:-,chr20,1846384,1846507,-,norf_2,2,123,True,4
GRCh38.Ens89.dna_rm.chr20.intergenic_gt122:5280890-5281982.norf_segment:5281709-5281846:+,chr20,5281709,5281847,+,norf_1,1,138,True,4
GRCh38.Ens89.dna_rm.chr20.intergenic_gt122.orf:7137549-7137689:-,chr20,7137549,7137690,-,orf_8,5,141,True,3
GRCh38.Ens89.dna_rm.chr20.intergenic_gt122:22286316-22288224.norf_segment:22287159-22287422:+,chr20,22287159,22287423,+,norf_5,3,264,True,4
GRCh38.Ens89.dna_rm.chr20.intergenic_gt122:23162495-23163862.norf_segment:23163020-23163271:-,chr20,23163020,23163272,-,norf_4,3,252,True,4
GRCh38.Ens89.dna_rm.chr20.intergenic_gt122:23162495-23163862.norf_segment:23163399-23163527:-,chr20,23163399,23163528,-,norf_7,4,129,True,4
GRCh38.Ens89.dna_rm.chr20.intergenic_gt122.orf:63064440-63064715:+,chr20,63064440,63064716,+,orf_1,1,276,True,3


In [12]:
control_df[(control_df.Chr=='chr20') & (control_df.Gene_Start_hg38==22287159)] # sequences removed 

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Strand,Status,Control_Set,user_length,removal,heirarchy
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GRCh38.Ens89.dna_rm.chr20.intergenic_gt122:22286316-22288224.norf_segment:22287159-22287422:+,chr20,22287159,22287423,+,norf_5,3,264,False,4


In [13]:
fixed_df[(fixed_df.Chr=='chr20') & (fixed_df.Gene_Start_hg38==63064716)] 
# checking to make sure a random sequence is actually removed 

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Strand,Status,Control_Set,user_length,removal,heirarchy
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [14]:
# Use the same code to check that, when ran again, no further sequences are removed 

gi = 'Gene_Start_hg38'
ge = 'Gene_End_hg38'
    
for chrom_id in sorted(fixed_df.Chr.unique()):

    print(chrom_id)

    query_df = fixed_df[fixed_df['Chr']==chrom_id]

    for xi,yi in query_df.iterrows():
        index_start = yi[gi]
        index_end = yi[ge]

        query_new = query_df[(query_df[gi]>(index_start-1000000))&(query_df[ge]<(index_end+1000000))]

        for xii,yii in query_new.iterrows():
            query_start = yii[gi]
            query_end = yii[ge]

            if xi!=xii:
                if query_start in range(index_start,index_end+1) or query_end in range(index_start,index_end+1):
                    print(xi,yi)
                    print(xii,yii) 
                    print('Exception(DANGER)')



                elif index_start in range(int(query_start),int(query_end)+1):
                    print(xi,yi)
                    print(xii,yii) 
                    print('Exception(DANGER)')

chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr20
chr21
chr22
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrX
chrY


In [15]:
# internal duplicates taken care of, now time to do the external (annotated and unannotated genes)
phylo_df = pd.read_csv('/Users/marten/ug-gc/marten_completeGRCh38_21436genes_47UGremoved_gene_transcript_cds_metadata_mashup_old_new_diffFix_20231005.tsv',sep='\t',index_col='Name')
phylo_df

Unnamed: 0_level_0,PS,Description,Plength,Gap_Gene?,Chr,OldLong_Transcript_Start_hg38,OldLong_Transcript_End_hg38,Strand,CDS_Start_hg38,CDS_End_hg38,Protein_Sequence,CDS_Sequence,in_old,Gene_Start,Gene_Stop,in_new,evoera_38,evoera5_38,annotation_38
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Baz_Hs_1,31,ENST00000308604.5_18272_457,52,Not_Gap_Gene,chr2,,,-,111429413,111429572,MTDTENHDSSPSSTSTCCPPITAGMQLKDSLGPGSNCPLWTLRPLH...,ATGACAGACACTGAAAATCACGACTCATCCCCCTCCAGCACCTCTA...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap
Baz_Hs_10,31,ENST00000411630.2_23991_594,59,Not_Gap_Gene,chr4,,,+,52713673,52713853,MLVATGQCSRCFMFTFSTFSFNCHNSEVDSVRDRLPQDHSAPANSM...,ATGCTGGTGGCAACAGGGCAGTGTAGCAGGTGCTTCATGTTCACCT...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap
Baz_Hs_103,31,ENST00000499346.2_27384_333,84,Not_Gap_Gene,chr5,,,-,128082767,128083022,MLGAFRSGPQPLPEPRARCVPQPGLLWALTRRRESPLVTPGLNLEE...,ATGCTGGGGGCTTTCCGGTCGGGGCCGCAGCCGCTTCCGGAGCCGC...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap
Baz_Hs_108,24,ENST00000501177.3_12701_390,84,Not_Gap_Gene,chr16,,,-,54919086,54925136,MLAEIHPKAGLQSLQFIMELLYWLLEGGDSEDKEDATGNVEMKNIQ...,ATGTTGGCTGAAATTCATCCCAAGGCTGGTCTGCAAAGTCTGCAAT...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap
Baz_Hs_112,25,ENST00000503704.1_24073_293,52,Not_Gap_Gene,chr4,,,-,82900166,82900435,MRSREAGPKLRRIQEPANGSPGAVSETGGYREERLSDAEIMGKLLA...,ATGCGAAGCAGAGAGGCAGGACCAAAATTGAGGCGAATCCAGGAAC...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vdp2013_S4_994,31,HF584391,69,Not_Gap_Gene,chrX,,,-,143628143,143628353,MLYTHNTEFNLKRQICFVPQCKTFVSLCFVKQTQENWYTCTSWVLY...,ATGCTTTATACACATAATACTGAATTTAACCTCAAGAGGCAAATCT...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap
vdp2013_S4_995,31,HF583960,46,Not_Gap_Gene,chr10,,,+,104020518,104021665,MREWLSIRNMRIKCEIFSCSVKPMSANCISCRMKNATCWLSMRLRN,ATGAGAGAATGGCTCAGCATCAGAAACATGAGAATCAAATGCGAGA...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap
vdp2013_S4_997,31,HF548108,40,Not_Gap_Gene,chr6,,,+,70860637,70860760,MFAYKGSSYHVSNTSNSINPTPKLASNPVGRYCMIKCLII,ATGTTTGCATATAAGGGAAGTAGTTATCATGTTAGTAATACCTCTA...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap
vdp2013_S4_998,31,HF583700,82,Not_Gap_Gene,chr12,,,+,50099104,50099353,MLLVQGQHQNEEGLTRHLLSSSFTLSLPTPSFPLPHKVPMCLYPPL...,ATGCTGTTGGTTCAAGGACAACACCAGAATGAAGAGGGTCTCACAA...,True,,,True,4-Primate,5-Primate,Unannotated With Overlap


In [16]:
# Set one column with length information
# Unannotated Genes do not have 'oldest longest transcript' information
# so set their CDS_Start and CDS_End information there instead

for xi,yi in phylo_df.iterrows():
    if str(yi['OldLong_Transcript_Start_hg38'] == 'nan'):
        phylo_df.loc[xi,'OldLong_Transcript_Start_hg38'] = phylo_df.loc[xi,'CDS_Start_hg38']
        phylo_df.loc[xi,'OldLong_Transcript_End_hg38'] = phylo_df.loc[xi,'CDS_End_hg38']
    if 'ENSP' in xi[:4]:
        phylo_df.loc[xi,'Status'] = 'annotated'
    else:
        phylo_df.loc[xi,'Status'] = 'unannotated'
    phylo_df.loc[xi,'Control_Set'] = 'victorgenes' # known annotated & unannotated genes
    

In [17]:
# Rename this to 'Gene Start' and 'Gene End'
phylo_df = phylo_df.rename(columns={'OldLong_Transcript_Start_hg38':'Gene_Start_hg38','OldLong_Transcript_End_hg38':'Gene_End_hg38'})



In [18]:
# Code to: remove control sequences which overlap known annotated and unannotated genes

ug_cleaned_dfs = []
ugag_removal_list = []

known_overlap = 0
ug_overlap = 0
ag_overlap = 0

for unique_chrom in sorted(fixed_df.Chr.unique()):
    print(unique_chrom)
    by_chrom_df = fixed_df[fixed_df.Chr==unique_chrom]
    by_chrom_df_dd = by_chrom_df.sort_values(by=[gi,ge])
    
    phylo_by_chr = phylo_df[phylo_df.Chr==unique_chrom].sort_values(by=[gi,ge])
    
    total_dups += dupls.shape[0]
    
    for xi,yi in by_chrom_df_dd.iterrows():
        
        index_start = yi[gi]
        index_end = yi[ge]
        
        query_new = phylo_by_chr[(phylo_by_chr[gi]>(index_start-1000000))&(phylo_by_chr[ge]<(index_end+1000000))]
        
        for xii,yii in query_new.iterrows():
            
            query_start = yii[gi]
            query_end = yii[ge]
            
            case_1 = query_start in range(index_start,index_end+1)
            case_2 = query_end in range(index_start,index_end+1)
            case_3 = index_start in range(int(query_start),int(query_end)+1)
            
            if (xi!=xii):
                if any([case_1,case_2,case_3]):
                    by_chrom_df_dd.loc[xi,'removal']=True
                    known_overlap += 1
                    
                    if yii['Status']=='annotated':
                        ag_overlap += 1
                    elif yii['Status']=='unannotated':
                        ug_overlap += 1
                    else:
                        raise Exception('Exception')

     
    ugag_removal_list.append(by_chrom_df_dd[by_chrom_df_dd.removal])
    by_chrom_df_dd = by_chrom_df_dd[~by_chrom_df_dd.removal]
    ug_cleaned_dfs.append(by_chrom_df_dd)
    
print('Total ORF/NORFs removed due to overlap with known annotated&unannoted genes: ',known_overlap)
print('For UG: ',ug_overlap)
print('For Ensembl: ',ag_overlap)

df2 = pd.concat(ug_cleaned_dfs)

df2.shape   

chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr20
chr21
chr22
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrX
chrY
Total ORF/NORFs removed due to overlap with known annotated&unannoted genes:  201
For UG:  200
For Ensembl:  1


(39177, 9)

In [19]:
# Run the same code again to ensure that nothing is removed the second time 

for chrom_id in df2.Chr.unique():

    print(chrom_id)

    query_df = df2[df2['Chr']==chrom_id]

    for xi,yi in query_df.iterrows():
        index_start = yi[gi]
        index_end = yi[ge]
        
        p_by_c = phylo_df[phylo_df.Chr==chrom_id]
        query_new = p_by_c[(p_by_c[gi]>(index_start-1000000))&(p_by_c[ge]<(index_end+1000000))]

        for xii,yii in query_new.iterrows():
            query_start = yii[gi]
            query_end = yii[ge]

            if xi!=xii:
                if query_start in range(index_start,index_end+1) or query_end in range(index_start,index_end+1):
                    print(xi,yi)
                    print(xii,yii) 
                    raise Exception('DANGER')



                elif index_start in range(int(query_start),int(query_end)+1):
                    raise Exception('DANGER')

chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr20
chr21
chr22
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrX
chrY


In [20]:
# a sequence that should've been removed 
ugag_removal_list[len(ugag_removal_list)//3].iloc[2]

Chr                   chr17
Gene_Start_hg38    83055202
Gene_End_hg38      83055523
Strand                    -
Status               norf_0
Control_Set               1
user_length             321
removal                True
heirarchy                 4
Name: GRCh38.Ens89.dna_rm.chr17.intergenic_gt122:83054956-83057066.norf_segment:83055202-83055522:-, dtype: object

In [21]:
# making sure that it is actually removed 
df2[(df2.Chr=='chr17')&(df2.Gene_Start_hg38==83055202)]

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Strand,Status,Control_Set,user_length,removal,heirarchy
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [22]:
# code to turn the DFs as we have them formatted into properly formatted BEDs with the control set in their name

def bedify(input_df):
    ret_df = df2.reset_index()
    newnames = [ret_df.loc[xi,'Name']+ret_df.loc[xi,'Status']+'_control_set_'+str(ret_df.loc[xi,'Control_Set']) for xi in ret_df.index] 
    ret_df['new_index'] = newnames
    ret_df_2 = ret_df[['Chr','Gene_Start_hg38','Gene_End_hg38','new_index','Strand']]
    ret_df_2['zeroes'] = 0
    ret_df_2 = ret_df_2[['Chr','Gene_Start_hg38','Gene_End_hg38','new_index','zeroes','Strand']]
    ret_df_2.new_index[1]
    return ret_df_2
    

In [23]:
# Export entire set
bedify(df2).to_csv(r'grch38_bed_complete_allsets.bed',sep='\t',index=False,header=False)
# Create one dataframe per set of GRCh38 Controls
df_by_set = {}

In [24]:
df2[~df2.Status.str.contains('norf')].iloc[::2000]

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Strand,Status,Control_Set,user_length,removal,heirarchy
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:35726-35854:-,chr1,35726,35855,-,orf_6,4,129,False,3
GRCh38.Ens89.dna_rm.chr10.intergenic_gt122.orf:78457018-78457278:+,chr10,78457018,78457279,+,orf_1,1,261,False,3
GRCh38.Ens89.dna_rm.chr13.intergenic_gt122.orf:19295575-19295712:+,chr13,19295575,19295713,+,orf_1,1,138,False,3
GRCh38.Ens89.dna_rm.chr15.intergenic_gt122.orf:96791710-96791832:+,chr15,96791710,96791833,+,orf_4,3,123,False,3
GRCh38.Ens89.dna_rm.chr2.intergenic_gt122.orf:1612891-1613037:-,chr2,1612891,1613038,-,orf_2,2,147,False,3
GRCh38.Ens89.dna_rm.chr20.intergenic_gt122.orf:24084104-24084259:-,chr20,24084104,24084260,-,orf_4,3,156,False,3
GRCh38.Ens89.dna_rm.chr4.intergenic_gt122.orf:12441312-12441443:-,chr4,12441312,12441444,-,orf_3,2,132,False,3
GRCh38.Ens89.dna_rm.chr5.intergenic_gt122.orf:121017800-121017922:+,chr5,121017800,121017923,+,orf_6,4,123,False,3
GRCh38.Ens89.dna_rm.chr7.intergenic_gt122.orf:55593255-55593437:+,chr7,55593255,55593438,+,orf_0,1,183,False,3
GRCh38.Ens89.dna_rm.chr9.intergenic_gt122.orf:62847254-62847424:+,chr9,62847254,62847425,+,orf_1,1,171,False,3


In [25]:
# For each control set, write an output BED file for the ORF and Non-ORF sets

for control_set in sorted(df2.Control_Set.unique()):
    # print(control_set)
    df_temp = df2[df2.Control_Set == control_set].sort_values(by=['Chr','Gene_Start_hg38','Gene_End_hg38'])
    df_temp_orf = df_temp[~df_temp.Status.str.contains('norf')]
    df_temp_norf = df_temp[df_temp.Status.str.contains('norf')]
    print(f'count check: {df_temp_orf.shape[0]} + {df_temp_norf.shape[0]} == {df_temp.shape[0]} ? {df_temp_orf.shape[0] + df_temp_norf.shape[0] == df_temp.shape[0]} ')
    df_by_set[control_set] = {'orf':bedify(df_temp_orf),'norf':bedify(df_temp_norf)}
    bedify(df_temp_orf).to_csv(f'grch38_orf_{control_set}.bed',sep='\t',index=False,header=False,encoding="utf-8")
    bedify(df_temp_norf).to_csv(f'grch38_norf_{control_set}.bed',sep='\t',index=False,header=False,encoding="utf-8")
    

count check: 3943 + 3915 == 7858 ? True 
count check: 3947 + 3902 == 7849 ? True 
count check: 3929 + 3902 == 7831 ? True 
count check: 3924 + 3901 == 7825 ? True 
count check: 3911 + 3903 == 7814 ? True 


In [26]:
# Remove heirarchy and write out removed sequences due to Annotated and Unannotated Genes
pd.concat(ugag_removal_list).drop('heirarchy',axis=1).sort_values(by=["Chr",'Gene_Start_hg38','Gene_End_hg38']).to_csv('removed_sequences_genic_overlap.tsv',sep='\t')



In [27]:
# Write out sequences removed due to ORF-NORF overlap
pd.concat(removed_list).drop('heirarchy',axis=1).sort_values(by=["Chr",'Gene_Start_hg38','Gene_End_hg38']).to_csv('removed_sequences_orf_norf_overlap.tsv',sep='\t')




In [28]:
# Write out ORFs removed due to duplication
pd.concat(dupl_list).drop('heirarchy',axis=1).sort_values(by=["Chr",'Gene_Start_hg38','Gene_End_hg38']).to_csv('removed_sequences_duplicates.tsv',sep='\t')
