In [1]:
# Daniel Marten 
# Create final GRCh37 files 

import pandas as pd
import qtl.io as io
import qtl.norm as norm
import csv
from Bio.Seq import Seq



In [2]:
# Merge all GRCh37 Outputs 

norf_dfs = []
orf_dfs = []

for gtfidx in range(10):
    # Reading in all GRCh37 ORFs
    orf_path = f'/Users/marten/ug-gc/bam-gtf/grch37/sample-ORFs/GRCh37.Ens87.dna_rm.chromosome.all.intergenic_gt122.orfs.sample{gtfidx}.bed'
    norf_path = f'/Users/marten/ug-gc/bam-gtf/grch37/sample-nORFs/GRCh37.Ens87.dna_rm.chromosome.all.intergenic_gt122.norf_sample{gtfidx}.bed'
    new_orf = pd.read_csv(orf_path,sep='\t').reset_index()
    new_norf = pd.read_csv(norf_path,sep='\t').reset_index()
    
    # Formatting
    col_renames = {'level_0':'Chr',
               'level_1':'Gene_Start_hg37',
               'level_2':'Gene_End_hg37',
               'level_3':'Name',
               '# All coordinates are 0-indexed, coordinates in ID column are end-inclusive, and coordinates in start/end columns are end-exclusive (as is conventional)':'Strand'
              }
    
    new_orf = new_orf.rename(columns=col_renames)
    new_orf = new_orf.set_index('Name').drop('level_4',axis=1)
    new_orf['Status'] = [f'orf_{gtfidx}']*new_orf.shape[0]
    new_orf['Control_Set'] = (gtfidx//2)+1
    #new_orf['annotation'] = ['intergenics']*new_orf.shape[0]
    
    new_norf = new_norf.rename(columns=col_renames)
    new_norf = new_norf.set_index('Name').drop('level_4',axis=1)
    new_norf['Status'] = [f'norf_{gtfidx}']*new_norf.shape[0]
    new_norf['Control_Set'] = (gtfidx//2)+1
    #new_norf['annotation'] = ['intergenics']*new_norf.shape[0]

    orf_dfs.append(new_orf)
    norf_dfs.append(new_norf)

mega_37 = pd.concat(orf_dfs + norf_dfs)

In [3]:
# code to turn the DFs as we have them formatted into properly formatted BEDs with the control set in their name

def bedify_37(input_df):
    ret_df = input_df.reset_index()
    #newnames = [ret_df.loc[xi,'Name']+ret_df.loc[xi,'Status']+'_control_set_'+str(ret_df.loc[xi,'Control_Set']) for xi in ret_df.index] 
    ret_df['new_index'] = ret_df.Name
    ret_df_2 = ret_df[['Chr','Gene_Start_hg37','Gene_End_hg37','new_index','Strand']]
    ret_df_2['zeroes'] = 0
    ret_df_2 = ret_df_2[['Chr','Gene_Start_hg37','Gene_End_hg37','new_index','zeroes','Strand']].sort_values(by=['Chr','Gene_Start_hg37','Gene_End_hg37'])
    ret_df_2.new_index[1]
    return ret_df_2


In [4]:
mega_37 # this is the truthset , and in native GRCh37 coordinates

Unnamed: 0_level_0,Chr,Gene_Start_hg37,Gene_End_hg37,Strand,Status,Control_Set
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:266925-267119:+,chr1,266925,267120,+,orf_0,1
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:378478-378666:+,chr1,378478,378667,+,orf_0,1
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:5044699-5044833:+,chr1,5044699,5044834,+,orf_0,1
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:5089004-5089129:+,chr1,5089004,5089130,+,orf_0,1
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:10695965-10696429:+,chr1,10695965,10696430,+,orf_0,1
...,...,...,...,...,...,...
GRCh37.Ens87.dna_rm.chr17.intergenic_gt122:70014154-70015328.norf_segment:70014220-70014384:-,chr17,70014220,70014385,-,norf_9,5
GRCh37.Ens87.dna_rm.chr8.intergenic_gt122:35800949-35801423.norf_segment:35801165-35801347:-,chr8,35801165,35801348,-,norf_9,5
GRCh37.Ens87.dna_rm.chr2.intergenic_gt122:4297079-4297567.norf_segment:4297285-4297452:-,chr2,4297285,4297453,-,norf_9,5
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:46912094-46913297.norf_segment:46912814-46912942:-,chr1,46912814,46912943,-,norf_9,5


In [5]:
# Read in BED of GRCh37->38 liftover outputs 
post = pd.read_csv('grch37liftover_38888.bed',sep='\t',header=None)
post # these coordinates are liftedover 

Unnamed: 0,0,1,2,3,4,5
0,chr1,42671,42896,GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:4250...,0,+
1,chr1,58253,58619,GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:5560...,0,-
2,chr1,59878,60067,GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:5920...,0,-
3,chr1,105843,105990,GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:1057...,0,+
4,chr1,107280,107418,GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:1065...,0,+
...,...,...,...,...,...,...
38883,chrY,56881824,56882004,GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:...,0,-
38884,chrY,57063496,57063664,GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:...,0,+
38885,chrY,57193560,57193719,GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:...,0,+
38886,chrY,57206930,57207179,GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:...,0,+


In [6]:
# Rename and reformat these , as appropriate , to make merging and formatting easier
post = post.rename(columns = {0:'Chr',
               1:'Gene_Start_hg87',
               2:'Gene_End_hg38',
               3:'Name',
               4:'Quality',
               5:'Strand'
              })
post

Unnamed: 0,Chr,Gene_Start_hg87,Gene_End_hg38,Name,Quality,Strand
0,chr1,42671,42896,GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:4250...,0,+
1,chr1,58253,58619,GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:5560...,0,-
2,chr1,59878,60067,GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:5920...,0,-
3,chr1,105843,105990,GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:1057...,0,+
4,chr1,107280,107418,GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:1065...,0,+
...,...,...,...,...,...,...
38883,chrY,56881824,56882004,GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:...,0,-
38884,chrY,57063496,57063664,GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:...,0,+
38885,chrY,57193560,57193719,GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:...,0,+
38886,chrY,57206930,57207179,GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:...,0,+


In [7]:
# REMOVE controls which mapped to non-canonical chromosomes 
chr_cannon = []
for chri in post.Chr.unique():
    print(chri)
    if len(chri)<7:
        chr_cannon.append(chri)
    else:
        print('dropped!')
        
post = post[post.Chr.isin(chr_cannon)]

chr1
chr10
chr11
chr12
chr13
chr14
chr14_GL000009v2_random
dropped!
chr15
chr15_KI270850v1_alt
dropped!
chr16
chr17
chr18
chr19
chr2
chr20
chr21
chr22
chr22_KI270879v1_alt
dropped!
chr3
chr4
chr4_GL000008v2_random
dropped!
chr5
chr6
chr7
chr7_KI270803v1_alt
dropped!
chr8
chr8_KI270821v1_alt
dropped!
chr9
chrUn_KI270742v1
dropped!
chrX
chrY


In [8]:
# String version of control version 
mega_37['Control_Str'] = [str(control_name) for control_name in mega_37['Control_Set']]

In [9]:
# Create new index containing control set information , as well as our new grouped control sets 
mega_37['New_Names'] = mega_37.index + mega_37['Status']+'_control_set_'+mega_37['Control_Str']
mega_37_nn = mega_37.set_index('New_Names')

In [10]:
# Filter mega_37_nn (new names) to ONLY those which appear in the filtered liftover file 
native37 = mega_37_nn.join(post[['Name']].set_index('Name'),how='inner')
native37.drop(['Control_Str'],axis=1).sort_values(by=['Chr','Gene_Start_hg37','Control_Set','Status'])

Unnamed: 0,Chr,Gene_Start_hg37,Gene_End_hg37,Strand,Status,Control_Set
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:42504-43241.norf_segment:42671-42895:+norf_1_control_set_1,chr1,42671,42896,+,norf_1,1
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:58253-58618:-norf_1_control_set_1,chr1,58253,58619,-,norf_1,1
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:59208-60377.norf_segment:59878-60066:-norf_0_control_set_1,chr1,59878,60067,-,norf_0,1
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:105742-106428.norf_segment:105843-105989:+norf_9_control_set_5,chr1,105843,105990,+,norf_9,5
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:106531-107479.norf_segment:107280-107417:+norf_3_control_set_2,chr1,107280,107418,+,norf_3,2
...,...,...,...,...,...,...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59027971-59028150:-orf_0_control_set_1,chrY,59027971,59028151,-,orf_0,1
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59209645-59209812:+orf_9_control_set_5,chrY,59209645,59209813,+,orf_9,5
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59339711-59339869:+orf_0_control_set_1,chrY,59339711,59339870,+,orf_0,1
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59353081-59353329:+orf_5_control_set_3,chrY,59353081,59353330,+,orf_5,3


In [11]:
# Create native length from coordinates 
native37['Length_from_Coordinates'] = native37.Gene_End_hg37 - native37.Gene_Start_hg37

In [12]:
# Sort values and the unecessary Control_Str columns
native37 = native37.drop(['Control_Str'],axis=1).sort_values(by=['Chr','Gene_Start_hg37','Control_Set','Status'])
native37

Unnamed: 0,Chr,Gene_Start_hg37,Gene_End_hg37,Strand,Status,Control_Set,Length_from_Coordinates
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:42504-43241.norf_segment:42671-42895:+norf_1_control_set_1,chr1,42671,42896,+,norf_1,1,225
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:58253-58618:-norf_1_control_set_1,chr1,58253,58619,-,norf_1,1,366
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:59208-60377.norf_segment:59878-60066:-norf_0_control_set_1,chr1,59878,60067,-,norf_0,1,189
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:105742-106428.norf_segment:105843-105989:+norf_9_control_set_5,chr1,105843,105990,+,norf_9,5,147
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:106531-107479.norf_segment:107280-107417:+norf_3_control_set_2,chr1,107280,107418,+,norf_3,2,138
...,...,...,...,...,...,...,...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59027971-59028150:-orf_0_control_set_1,chrY,59027971,59028151,-,orf_0,1,180
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59209645-59209812:+orf_9_control_set_5,chrY,59209645,59209813,+,orf_9,5,168
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59339711-59339869:+orf_0_control_set_1,chrY,59339711,59339870,+,orf_0,1,159
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59353081-59353329:+orf_5_control_set_3,chrY,59353081,59353330,+,orf_5,3,249


In [13]:
# Create dictionary for later DFs to be turned into BEDs
df_by_set = {}

In [14]:
# for each control set value, create ORF and NORF dfs as keyed by control set 

for control_set in sorted(native37.Control_Set.unique()):
    # print(control_set)
    df_temp = native37[native37.Control_Set == control_set].sort_values(by=['Chr','Gene_Start_hg37','Gene_End_hg37']).reset_index(drop=False).rename(columns={'index':'Name'})
    df_temp_orf = df_temp[~df_temp.Status.str.contains('norf')]
    df_temp_norf = df_temp[df_temp.Status.str.contains('norf')]
    print(f'quick maths: {df_temp_orf.shape[0]} + {df_temp_norf.shape[0]} == {df_temp.shape[0]} ? {df_temp_orf.shape[0] + df_temp_norf.shape[0] == df_temp.shape[0]} ')
    df_by_set[control_set] = {'orf':bedify_37(df_temp_orf),'norf':bedify_37(df_temp_norf)}
    bedify_37(df_temp_orf).to_csv(f'grch37_intergenic_controls_ORFs_{control_set}.bed',sep='\t',index=False,header=False,encoding="utf-8")
    bedify_37(df_temp_norf).to_csv(f'grch37_intergenic_controls_non_ORFs_{control_set}.bed',sep='\t',index=False,header=False,encoding="utf-8")
    bedify_37(df_temp).to_csv(f'grch37_intergenic_controls_combined_ORFs_non_ORFs_{control_set}.bed',sep='\t',index=False,header=False,encoding="utf-8")

quick maths: 3924 + 3892 == 7816 ? True 
quick maths: 3904 + 3868 == 7772 ? True 
quick maths: 3896 + 3871 == 7767 ? True 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ret_df_2['zeroes'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ret_df_2['zeroes'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ret_df_2['zeroes'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

quick maths: 3877 + 3882 == 7759 ? True 
quick maths: 3857 + 3877 == 7734 ? True 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ret_df_2['zeroes'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ret_df_2['zeroes'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ret_df_2['zeroes'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

In [15]:
# New DF temp , renamed, to be ready for further analysis
df_temp = native37.sort_values(by=['Chr','Gene_Start_hg37','Gene_End_hg37']).reset_index(drop=False).rename(columns={'index':'Name'})

In [16]:
## START FURTHER/SEPARATE WORK 
## Create output file, appended with DNA and Amino Acid sequences of these ORFs 

In [17]:
# Separately, fastas had been generated using bedtools and a downloaded version of the GRCh37 genome
# https://bedtools.readthedocs.io/en/latest/content/tools/getfasta.html
# for all GRCh37 controls

fastas = pd.read_csv(r'fasta_grch37.txt',sep='\t',header=None)
fastas_2 = fastas.rename(columns={0:'Name',1:'J'})#.set_index('Name')
fastas_2.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
38843    False
38844    False
38845    False
38846    False
38847    False
Length: 38848, dtype: bool

In [18]:
first_names = []
for xi in fastas_2.Name:
    xi = xi.split('::')[0]
    first_names.append(xi)
fastas_2['Name'] = first_names
fastas_named = fastas_2.set_index('Name')
fastas_named

Unnamed: 0_level_0,J
Name,Unnamed: 1_level_1
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:42504-43241.norf_segment:42671-42895:+norf_1_control_set_1,CAATCAGGAAGGAGTTGTGGTAGGAAGTCTGTGCTGTTGAATGTAC...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:58253-58618:-norf_1_control_set_1,GAAAGGGAATTGGGAAGAGCAATGCCCAGTGAAAAAGAAGAAATAA...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:59208-60377.norf_segment:59878-60066:-norf_0_control_set_1,CACTCCAAGACAAACACCATTTCAGTAGCAATATGAATTTCAGTAG...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:105742-106428.norf_segment:105843-105989:+norf_9_control_set_5,AATTGACATATCCTGATTTCTTCCATAGCTTGGATCTTGACCTAGA...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:106531-107479.norf_segment:107280-107417:+norf_3_control_set_2,CCTTACACTTAGCTATAAAGGAGTGGAAAACACAAAGATGAGTAAC...
...,...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59027971-59028150:-orf_0_control_set_1,TTAACACTGCTTCCTTCTCCCATCGAGAAGTAAAGCCCAGGTTCTG...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59209645-59209812:+orf_9_control_set_5,ATGTTCGTTTTCTGTGAAGCAGGCCTTTTCTGTTACTCTCCATGCA...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59339711-59339869:+orf_0_control_set_1,ATGGGGGACAGCCTTGCAGGGAGGTTGGTGAAGTCTATTTGGACCT...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59353081-59353329:+orf_5_control_set_3,ATGCCTGAGCCCCCTGCACACAAGGAGCCAGGAGTAATCAGGGCAG...


In [19]:
# Create relevant dataframe , appended with all FASTAs 
df_relevant = native37.join(fastas_named)

In [20]:
# If negative stranded , then pass change to reverse_complement
# so that we ensure all ORFs start with Methionine/ATG, if they would 
for xi,yi in df_relevant.iterrows():
    if yi['Strand'] == '-':
        df_relevant.loc[xi,'J'] = Seq(yi['J']).reverse_complement().__str__()

In [21]:
# brief check: show that all negative stranded ORFs start with "ATG"
plusorf37 = df_relevant[(df_relevant.Strand=='-')&(df_relevant.Status.str[:3]=='orf')]
display(plusorf37)

sum(plusorf37[plusorf37.J.str[:3].isin(['ATG','atg'])].Chr.value_counts())

Unnamed: 0,Chr,Gene_Start_hg37,Gene_End_hg37,Strand,Status,Control_Set,Length_from_Coordinates,J
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:132308-132529:-orf_7_control_set_4,chr1,132308,132530,-,orf_7,4,222,ATGACAGGTGCAAATACCTTCCCACCAAAGCCCTTGTTGCCCTCTG...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:330011-330244:-orf_6_control_set_4,chr1,330011,330245,-,orf_6,4,234,ATGGGGACAGCTTCCCCTTTGCCTGTACAGGTGGAGAAGCCGAGGA...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:340419-340610:-orf_8_control_set_5,chr1,340419,340611,-,orf_8,5,192,ATGGAATCTGTTCCTCCTCCAAAACGGAATTTGGTCACCCTTAAAT...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:610602-610811:-orf_9_control_set_5,chr1,610602,610812,-,orf_9,5,210,ATGCTGCTGAATATACCTGAGTACATAGTAAGACATTTGTTTGGTA...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:658672-658893:-orf_4_control_set_3,chr1,658672,658894,-,orf_4,3,222,ATGACAGGTGCAAATACCTTCCCACCAAAGCCCTTGTTGCCCTCTG...
...,...,...,...,...,...,...,...,...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:28722098-28722334:-orf_2_control_set_2,chrY,28722098,28722335,-,orf_2,2,237,ATGAACAAAAATTATTTAAGTTACATATGCTCTTTTTTAAAAGTTT...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:28769186-28769329:-orf_0_control_set_1,chrY,28769186,28769330,-,orf_0,1,144,ATGCTGTGCAGACCACGGCCTCCGCAGAGGATCCCCTCACCCAGGC...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:28772721-28773188:-orf_1_control_set_1,chrY,28772721,28773189,-,orf_1,1,468,ATGTCTTCCATTTACTTGGCCGGCAAAGTGGAAGAGCAGCACCTGC...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59027971-59028150:-orf_0_control_set_1,chrY,59027971,59028151,-,orf_0,1,180,ATGGTAAAATTCATTGTGATCCTGACCTCTGATGCTGTCAGCCACA...


9696

In [22]:
# Now that Nucleic Acid was sanity checked, do the same with Amino Acid sequence
df_relevant['K'] = [Seq(dna).translate().__str__() for dna in df_relevant['J']]

df_relevant

Unnamed: 0,Chr,Gene_Start_hg37,Gene_End_hg37,Strand,Status,Control_Set,Length_from_Coordinates,J,K
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:42504-43241.norf_segment:42671-42895:+norf_1_control_set_1,chr1,42671,42896,+,norf_1,1,225,CAATCAGGAAGGAGTTGTGGTAGGAAGTCTGTGCTGTTGAATGTAC...,QSGRSCGRKSVLLNVH*SMIP*IIHNKKKD*NSFFKKKAQKLI*VL...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:58253-58618:-norf_1_control_set_1,chr1,58253,58619,-,norf_1,1,366,CAAAAAGGATGTTCAGAATTATAGTTTTGTATAGAAGTGGGAAAGA...,QKGCSEL*FCIEVGKNFEILQVE*ETTQCG*MANNTFKKSESEGRP...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:59208-60377.norf_segment:59878-60066:-norf_0_control_set_1,chr1,59878,60067,-,norf_0,1,189,TCCTGTTAAAACTAAGATACAATATATAAACCAAGCACTCTAGGAC...,SC*N*DTIYKPSTLGLSS*K*YFIL*EQFQNKSN*CLNYFVIFGDS...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:105742-106428.norf_segment:105843-105989:+norf_9_control_set_5,chr1,105843,105990,+,norf_9,5,147,AATTGACATATCCTGATTTCTTCCATAGCTTGGATCTTGACCTAGA...,N*HILISSIAWILT*REI*KC*LEPEGCHFHC*SSFMDHELEK*LQQHG
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:106531-107479.norf_segment:107280-107417:+norf_3_control_set_2,chr1,107280,107418,+,norf_3,2,138,CCTTACACTTAGCTATAAAGGAGTGGAAAACACAAAGATGAGTAAC...,PYT*L*RSGKHKDE*LHFSKD*SYTNNTKV*TNNDEMTKAECFLFG
...,...,...,...,...,...,...,...,...,...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59027971-59028150:-orf_0_control_set_1,chrY,59027971,59028151,-,orf_0,1,180,ATGGTAAAATTCATTGTGATCCTGACCTCTGATGCTGTCAGCCACA...,MVKFIVILTSDAVSHRKGQNNLRLKPCLSQSSGSPNMVSRWSLLLR...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59209645-59209812:+orf_9_control_set_5,chrY,59209645,59209813,+,orf_9,5,168,ATGTTCGTTTTCTGTGAAGCAGGCCTTTTCTGTTACTCTCCATGCA...,MFVFCEAGLFCYSPCNALVVALKRKISSQFLQSYKLLGKKEKSKSE...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59339711-59339869:+orf_0_control_set_1,chrY,59339711,59339870,+,orf_0,1,159,ATGGGGGACAGCCTTGCAGGGAGGTTGGTGAAGTCTATTTGGACCT...,MGDSLAGRLVKSIWTCWVGAPGDILVGGSVDPWQQVWCSGRHWVGW...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59353081-59353329:+orf_5_control_set_3,chrY,59353081,59353330,+,orf_5,3,249,ATGCCTGAGCCCCCTGCACACAAGGAGCCAGGAGTAATCAGGGCAG...,MPEPPAHKEPGVIRADPLGHGDFWIVKLALWGPRPSNVGGSGFGLL...


In [23]:
# Same patterm of checking, make sure ALL ORFs end with '*'
# sum of that == shape, as shown printed 
# same check down with starting with 'M' if desired
plusorf37 = df_relevant[df_relevant.Status.str[:3]=='orf']
display(plusorf37)

sum(plusorf37[plusorf37.K.str[-1]=='*'].Chr.value_counts())

Unnamed: 0,Chr,Gene_Start_hg37,Gene_End_hg37,Strand,Status,Control_Set,Length_from_Coordinates,J,K
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:132308-132529:-orf_7_control_set_4,chr1,132308,132530,-,orf_7,4,222,ATGACAGGTGCAAATACCTTCCCACCAAAGCCCTTGTTGCCCTCTG...,MTGANTFPPKPLLPSGSSPELSPLSVGHPLLELEIGVGGAKAIISN...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:266925-267119:+orf_0_control_set_1,chr1,266925,267120,+,orf_0,1,195,ATGGTAAATCATTTTCTACCAAAAGAAAGAAATGTCTTGTCTATTC...,MVNHFLPKERNVLSIQVLLYLKVFLVGEQVVRKSYFIRTFSLTIIQ...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:330011-330244:-orf_6_control_set_4,chr1,330011,330245,-,orf_6,4,234,ATGGGGACAGCTTCCCCTTTGCCTGTACAGGTGGAGAAGCCGAGGA...,MGTASPLPVQVEKPRTGSGNRRPRRRCPLHCTYPGPVPALIMQLFQ...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:340419-340610:-orf_8_control_set_5,chr1,340419,340611,-,orf_8,5,192,ATGGAATCTGTTCCTCCTCCAAAACGGAATTTGGTCACCCTTAAAT...,MESVPPPKRNLVTLKLLNPKQYVVFIFTSLWHLMIRPLLSSLLPFF...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122.orf:378478-378666:+orf_0_control_set_1,chr1,378478,378667,+,orf_0,1,189,ATGCCAGCTTGCTTGCCCCTGTTGGATTCAGCAGAGGGAGATAGGC...,MPACLPLLDSAEGDRPCHTCGVCQSFLLAILGSADTWATVSPSLSL...
...,...,...,...,...,...,...,...,...,...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59027971-59028150:-orf_0_control_set_1,chrY,59027971,59028151,-,orf_0,1,180,ATGGTAAAATTCATTGTGATCCTGACCTCTGATGCTGTCAGCCACA...,MVKFIVILTSDAVSHRKGQNNLRLKPCLSQSSGSPNMVSRWSLLLR...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59209645-59209812:+orf_9_control_set_5,chrY,59209645,59209813,+,orf_9,5,168,ATGTTCGTTTTCTGTGAAGCAGGCCTTTTCTGTTACTCTCCATGCA...,MFVFCEAGLFCYSPCNALVVALKRKISSQFLQSYKLLGKKEKSKSE...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59339711-59339869:+orf_0_control_set_1,chrY,59339711,59339870,+,orf_0,1,159,ATGGGGGACAGCCTTGCAGGGAGGTTGGTGAAGTCTATTTGGACCT...,MGDSLAGRLVKSIWTCWVGAPGDILVGGSVDPWQQVWCSGRHWVGW...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59353081-59353329:+orf_5_control_set_3,chrY,59353081,59353330,+,orf_5,3,249,ATGCCTGAGCCCCCTGCACACAAGGAGCCAGGAGTAATCAGGGCAG...,MPEPPAHKEPGVIRADPLGHGDFWIVKLALWGPRPSNVGGSGFGLL...


19458

In [24]:
# Write to CSV to share with collaborators
df_relevant.to_csv(r'grch37_intergenic_controls_combined_ORFs_non_ORFs_all_control_sets_with_sequences.tsv',sep='\t')
    

In [25]:
df_relevant

Unnamed: 0,Chr,Gene_Start_hg37,Gene_End_hg37,Strand,Status,Control_Set,Length_from_Coordinates,J,K
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:42504-43241.norf_segment:42671-42895:+norf_1_control_set_1,chr1,42671,42896,+,norf_1,1,225,CAATCAGGAAGGAGTTGTGGTAGGAAGTCTGTGCTGTTGAATGTAC...,QSGRSCGRKSVLLNVH*SMIP*IIHNKKKD*NSFFKKKAQKLI*VL...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:58253-58618:-norf_1_control_set_1,chr1,58253,58619,-,norf_1,1,366,CAAAAAGGATGTTCAGAATTATAGTTTTGTATAGAAGTGGGAAAGA...,QKGCSEL*FCIEVGKNFEILQVE*ETTQCG*MANNTFKKSESEGRP...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:59208-60377.norf_segment:59878-60066:-norf_0_control_set_1,chr1,59878,60067,-,norf_0,1,189,TCCTGTTAAAACTAAGATACAATATATAAACCAAGCACTCTAGGAC...,SC*N*DTIYKPSTLGLSS*K*YFIL*EQFQNKSN*CLNYFVIFGDS...
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:105742-106428.norf_segment:105843-105989:+norf_9_control_set_5,chr1,105843,105990,+,norf_9,5,147,AATTGACATATCCTGATTTCTTCCATAGCTTGGATCTTGACCTAGA...,N*HILISSIAWILT*REI*KC*LEPEGCHFHC*SSFMDHELEK*LQQHG
GRCh37.Ens87.dna_rm.chr1.intergenic_gt122:106531-107479.norf_segment:107280-107417:+norf_3_control_set_2,chr1,107280,107418,+,norf_3,2,138,CCTTACACTTAGCTATAAAGGAGTGGAAAACACAAAGATGAGTAAC...,PYT*L*RSGKHKDE*LHFSKD*SYTNNTKV*TNNDEMTKAECFLFG
...,...,...,...,...,...,...,...,...,...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59027971-59028150:-orf_0_control_set_1,chrY,59027971,59028151,-,orf_0,1,180,ATGGTAAAATTCATTGTGATCCTGACCTCTGATGCTGTCAGCCACA...,MVKFIVILTSDAVSHRKGQNNLRLKPCLSQSSGSPNMVSRWSLLLR...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59209645-59209812:+orf_9_control_set_5,chrY,59209645,59209813,+,orf_9,5,168,ATGTTCGTTTTCTGTGAAGCAGGCCTTTTCTGTTACTCTCCATGCA...,MFVFCEAGLFCYSPCNALVVALKRKISSQFLQSYKLLGKKEKSKSE...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59339711-59339869:+orf_0_control_set_1,chrY,59339711,59339870,+,orf_0,1,159,ATGGGGGACAGCCTTGCAGGGAGGTTGGTGAAGTCTATTTGGACCT...,MGDSLAGRLVKSIWTCWVGAPGDILVGGSVDPWQQVWCSGRHWVGW...
GRCh37.Ens87.dna_rm.chrY.intergenic_gt122.orf:59353081-59353329:+orf_5_control_set_3,chrY,59353081,59353330,+,orf_5,3,249,ATGCCTGAGCCCCCTGCACACAAGGAGCCAGGAGTAATCAGGGCAG...,MPEPPAHKEPGVIRADPLGHGDFWIVKLALWGPRPSNVGGSGFGLL...


In [26]:
# If desired, output only ORF Set #1 (does not contain norf)
df_relevant_orf_1 = df_relevant[(df_relevant.Control_Set==1)&(~df_relevant.Status.str.contains('norf'))]
df_relevant_orf_1.to_csv(r'grch37_intergenic_controls_ORFs_set_1_with_sequences.tsv',sep='\t')


In [27]:
# If desired, output only NORF Set #1
df_relevant_norf_1 = df_relevant[(df_relevant.Control_Set==1)&(df_relevant.Status.str.contains('norf'))]
df_relevant_norf_1.to_csv(r'grch37_intergenic_controls_non_ORFs_set_1_with_sequences.tsv',sep='\t')
