In [1]:
# Daniel Marten
# Custom notebook to remove controls that don't map cleanly to GRCh37

import pandas as pd
import qtl.io as io
import qtl.norm as norm
import csv
from Bio.Seq import Seq



In [2]:
# Version of the BED file read in 
bed_001 = pd.read_csv(r'grch38_bed_complete_allsets.bed',sep='\t',header=None)
bed_001

Unnamed: 0,0,1,2,3,4,5
0,chr1,35726,35855,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,-
1,chr1,57850,58015,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:5560...,0,-
2,chr1,59864,60014,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,0,-
3,chr1,67741,67906,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:6663...,0,+
4,chr1,108819,108960,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:1086...,0,+
...,...,...,...,...,...,...
39172,chrY,26623039,26623183,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,0,-
39173,chrY,26627091,26627364,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,0,-
39174,chrY,56881824,56882004,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,0,-
39175,chrY,57070959,57071091,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:5707...,0,-


In [3]:
sum(bed_001.duplicated([0,1])) # no duplications

0

In [4]:
# Read in result of listing our complete GRCh38 BED to GRCh37 with default settings
# with: https://genome.ucsc.edu/cgi-bin/hgLiftOver
bed_to_keep = pd.read_csv(r'/Users/marten/Downloads/hglft_genome_32ff1_a88a10.bed',sep='\t',header=None)
bed_to_keep 

Unnamed: 0,0,1,2,3,4,5
0,chr1,35726,35855,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,1,-
1,chr1,57850,58015,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:5560...,1,-
2,chr1,59864,60014,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...,1,-
3,chr1,67741,67906,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:6663...,1,+
4,chr1,108819,108960,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:1086...,1,+
...,...,...,...,...,...,...
39022,chrY,28769186,28769330,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,1,-
39023,chrY,28773238,28773511,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,1,-
39024,chrY,59027971,59028151,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...,1,-
39025,chrY,59217108,59217240,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:5707...,1,-


In [5]:
sum(bed_to_keep.duplicated([0,1])) # three existing controls were lifted over to same location

3

In [6]:
# Proper renaming
bed_001.rename(columns={0:'Chr',1:'Gene_Start_hg38',2:'Gene_End_hg38',3:'Name',4:'Quality',5:'Strand'},inplace=True)

In [7]:
# Further renaming for GRCh37
bed_to_keep.rename(columns={0:'Chr',1:'Gene_Start_GRCh37_Lift',2:'Gene_End_GRCh37_Lift',3:'Name',4:'Quality',5:'Strand'},inplace=True)

In [8]:
chr_unique = bed_to_keep.Chr.unique() # unique chromosomes

In [9]:
# Removing NONCANONICAL Chromosome controls, since they don't map cleanly
chr_cannon = []
for chri in chr_unique:
    print(chri)
    if len(chri)<7:
        chr_cannon.append(chri)
    else:
        print('dropped!')

chr1
chr2
chr5
chr19
chr16
chr9
chr1_gl000192_random
dropped!
chr10
chrUn_gl000228
dropped!
chr11
chr11_gl000202_random
dropped!
chr12
chrUn_gl000212
dropped!
chr21
chr13
chrUn_gl000234
dropped!
chr14
chr15
chr17
chr17_gl000204_random
dropped!
chr18
chr20
chrY
chr7_gl000195_random
dropped!
chrUn_gl000220
dropped!
chr17_gl000205_random
dropped!
chr4_gl000193_random
dropped!
chrUn_gl000217
dropped!
chrUn_gl000244
dropped!
chrUn_gl000241
dropped!
chrUn_gl000237
dropped!
chrUn_gl000233
dropped!
chrUn_gl000232
dropped!
chrUn_gl000229
dropped!
chrUn_gl000224
dropped!
chr4
chrUn_gl000240
dropped!
chrUn_gl000235
dropped!
chr22
chr3
chr6
chr7
chr8
chrUn_gl000211
dropped!
chrX


In [10]:
chr_cannon # list of canonical chromosomes

['chr1',
 'chr2',
 'chr5',
 'chr19',
 'chr16',
 'chr9',
 'chr10',
 'chr11',
 'chr12',
 'chr21',
 'chr13',
 'chr14',
 'chr15',
 'chr17',
 'chr18',
 'chr20',
 'chrY',
 'chr4',
 'chr22',
 'chr3',
 'chr6',
 'chr7',
 'chr8',
 'chrX']

In [11]:
len(chr_cannon) # 22 and two sex chrs

24

In [12]:
bed_to_keep = bed_to_keep[bed_to_keep.Chr.isin(chr_cannon)] # only keep ones which map cleanly to canonical

In [13]:
# Miniature check of their names
bed_to_keep_mini = bed_to_keep[['Name']]
bed_to_keep_mini

Unnamed: 0,Name
0,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...
1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:5560...
2,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...
3,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:6663...
4,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:1086...
...,...
39022,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...
39023,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...
39024,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...
39025,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:5707...


In [14]:
# To remove duplicates - only keep the first instance of each name 
bed_to_keep_mini = bed_to_keep_mini[~bed_to_keep_mini.duplicated(['Name'],keep='first')]

In [15]:
bed_to_keep_mini

Unnamed: 0,Name
0,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...
1,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:5560...
2,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:...
3,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:6663...
4,GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:1086...
...,...
39022,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...
39023,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...
39024,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:...
39025,GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:5707...


In [16]:
bedo = bed_001.set_index('Name').join(bed_to_keep_mini.set_index('Name'),how='inner')

In [17]:
bedo.dropna() # visual of table with no NaN values 

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Quality,Strand
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:35726-35854:-orf_6_control_set_4,chr1,35726,35855,0,-
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,chr1,57850,58015,0,-
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:59864-60013:-orf_8_control_set_5,chr1,59864,60014,0,-
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:66632-67997.norf_segment:67741-67905:+norf_3_control_set_2,chr1,67741,67906,0,+
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:108680-109563.norf_segment:108819-108959:+norf_9_control_set_5,chr1,108819,108960,0,+
...,...,...,...,...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26623039-26623182:-orf_0_control_set_1,chrY,26623039,26623183,0,-
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26627091-26627363:-orf_1_control_set_1,chrY,26627091,26627364,0,-
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:56881824-56882003:-orf_0_control_set_1,chrY,56881824,56882004,0,-
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:57070946-57071187.norf_segment:57070959-57071090:-norf_2_control_set_2,chrY,57070959,57071091,0,-


In [18]:
# Checking no duplicate start sites 
sum(bedo.duplicated(['Chr','Gene_Start_hg38']))

0

In [19]:
## TESTING
from collections import Counter

Counter(list(bed_to_keep[bed_to_keep.duplicated(['Name'],keep=False)].Name.value_counts()))

Counter({2: 44, 3: 5, 7: 1})

In [20]:
bedo['Length'] = bedo.Gene_End_hg38 - bedo.Gene_Start_hg38

In [21]:
bedo

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Quality,Strand,Length
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:35726-35854:-orf_6_control_set_4,chr1,35726,35855,0,-,129
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,chr1,57850,58015,0,-,165
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:59864-60013:-orf_8_control_set_5,chr1,59864,60014,0,-,150
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:66632-67997.norf_segment:67741-67905:+norf_3_control_set_2,chr1,67741,67906,0,+,165
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:108680-109563.norf_segment:108819-108959:+norf_9_control_set_5,chr1,108819,108960,0,+,141
...,...,...,...,...,...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26623039-26623182:-orf_0_control_set_1,chrY,26623039,26623183,0,-,144
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26627091-26627363:-orf_1_control_set_1,chrY,26627091,26627364,0,-,273
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:56881824-56882003:-orf_0_control_set_1,chrY,56881824,56882004,0,-,180
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:57070946-57071187.norf_segment:57070959-57071090:-norf_2_control_set_2,chrY,57070959,57071091,0,-,132


In [22]:
# code to turn the DFs as we have them formatted into properly formatted BEDs with the control set in their name
def bedify(input_df):
    ret_df = input_df.reset_index()
    #newnames = [ret_df.loc[xi,'Name']+ret_df.loc[xi,'Status']+'_control_set_'+str(ret_df.loc[xi,'Control_Set']) for xi in ret_df.index] 
    ret_df['new_index'] = ret_df.Name # already included at this point in pipeline
    ret_df_2 = ret_df[['Chr','Gene_Start_hg38','Gene_End_hg38','new_index','Strand']]
    ret_df_2['zeroes'] = 0
    ret_df_2 = ret_df_2[['Chr','Gene_Start_hg38','Gene_End_hg38','new_index','zeroes','Strand']]
    ret_df_2.new_index[1]
    return ret_df_2

In [23]:
bedo['Control_Set'] = [xi[-1] for xi in bedo.index.to_series()]

In [24]:
bedo['Status'] = [xi.split(':')[-1].split('_')[0][1:] for xi in bedo.index.to_series()]

In [25]:
df_by_set = {}

In [26]:
bedo

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Quality,Strand,Length,Control_Set,Status
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:35726-35854:-orf_6_control_set_4,chr1,35726,35855,0,-,129,4,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,chr1,57850,58015,0,-,165,1,norf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:59864-60013:-orf_8_control_set_5,chr1,59864,60014,0,-,150,5,orf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:66632-67997.norf_segment:67741-67905:+norf_3_control_set_2,chr1,67741,67906,0,+,165,2,norf
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:108680-109563.norf_segment:108819-108959:+norf_9_control_set_5,chr1,108819,108960,0,+,141,5,norf
...,...,...,...,...,...,...,...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26623039-26623182:-orf_0_control_set_1,chrY,26623039,26623183,0,-,144,1,orf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26627091-26627363:-orf_1_control_set_1,chrY,26627091,26627364,0,-,273,1,orf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:56881824-56882003:-orf_0_control_set_1,chrY,56881824,56882004,0,-,180,1,orf
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:57070946-57071187.norf_segment:57070959-57071090:-norf_2_control_set_2,chrY,57070959,57071091,0,-,132,2,norf


In [27]:
# For each control set, only output MERGED version of ORF and Non-ORF with controls removed which don't map cleeanly
for control_set in sorted(bedo.Control_Set.unique()):
    # print(control_set)
    df_temp = bedo[bedo.Control_Set == control_set].sort_values(by=['Chr','Gene_Start_hg38','Gene_End_hg38'])
    df_temp_orf = df_temp[~df_temp.Status.str.contains('norf')]
    df_temp_norf = df_temp[df_temp.Status.str.contains('norf')]
    print(f'quick check: {df_temp_orf.shape[0]} + {df_temp_norf.shape[0]} == {df_temp.shape[0]} ? {df_temp_orf.shape[0] + df_temp_norf.shape[0] == df_temp.shape[0]} ')
    df_by_set[control_set] = {'orf':bedify(df_temp_orf),'norf':bedify(df_temp_norf)}
    bedify(df_temp).to_csv(f'grch38_intergenic_controls_combined_ORFs_non_ORFs_{control_set}_withGRCh37CrossRemoval.bed',sep='\t')


quick check: 3916 + 3889 == 7805 ? True 
quick check: 3915 + 3883 == 7798 ? True 
quick check: 3882 + 3875 == 7757 ? True 
quick check: 3895 + 3882 == 7777 ? True 
quick check: 3877 + 3874 == 7751 ? True 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ret_df_2['zeroes'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ret_df_2['zeroes'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ret_df_2['zeroes'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

In [28]:
# Also export one version with all ORFs and Non-ORFs from all control sets included 

df_temp = bedo.sort_values(by=['Chr','Gene_Start_hg38','Gene_End_hg38'])
bedify(df_temp).to_csv(f'grch38_intergenic_controls_combined_ORFs_non_ORFs_all_control_sets_withGRCh37CrossRemoval.bed',sep='\t',index=False,header=False,encoding="utf-8")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ret_df_2['zeroes'] = 0


In [29]:
# BEGIN second part of this notebook , generating genetic sequences for all of the controls 
# or genetic sequences (or reverse complement of them when appropriate, then amino acid sequences
# Naming convention as this is a very sophomoric attempt at this
# FASTAS derived from bedtools getfasta: https://bedtools.readthedocs.io/en/latest/content/tools/getfasta.html
# and a downloaded version of GRCh38

fastas = pd.read_csv('grch38_control_fastas.txt',sep='\t',header=None)
fastas_2 = fastas.rename(columns={0:'Name',1:'J'})#.set_index('Name')

first_names = []
for xi in fastas_2.Name:
    xi = xi.split('::')[0]
    first_names.append(xi)
fastas_2['Name'] = first_names
fastas_named = fastas_2.set_index('Name')
fastas_named

Unnamed: 0_level_0,J
Name,Unnamed: 1_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:35726-35854:-orf_6_control_set_4,TCAGCAGCATCGGGGGTCAGGAAAGACTTCACGAAGCCATAAATGC...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,AAAACTAATCCCACATATAAACCCCTATGATAATTTCAGTTTGTCC...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:59864-60013:-orf_8_control_set_5,TTAATTATCACTCACACTCCAAGACAAACACCATTTCAGTAGCAAT...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:66632-67997.norf_segment:67741-67905:+norf_3_control_set_2,CTGGCAGATAGCAACTTACAAAGATGCCCCAACAATACCTCCTTGT...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:108680-109563.norf_segment:108819-108959:+norf_9_control_set_5,TTTCCAGTTATATATCTGGTAGAGATGAGGCCATTGATAGGAATGG...
...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26623039-26623182:-orf_0_control_set_1,TCACGCTGGCACCTTCTTGTGGGGATGGCATTGAGGACTAACACTC...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26627091-26627363:-orf_1_control_set_1,TTATGGTAAATGGTGCAAGCAGTGGCAATGGGAATGGACTGCACCC...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:56881824-56882003:-orf_0_control_set_1,TTAACACTGCTTCCTTCTCCCATCGAGAAGTAAAGCCCAGGTTCTG...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:57070946-57071187.norf_segment:57070959-57071090:-norf_2_control_set_2,TATTCCATGCAGGTATCTTTCAATTGTAAATCATTACTGCAAAAGA...


In [30]:
df38 = bedo.join(fastas_named)
df38

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Quality,Strand,Length,Control_Set,Status,J
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:35726-35854:-orf_6_control_set_4,chr1,35726,35855,0,-,129,4,orf,TCAGCAGCATCGGGGGTCAGGAAAGACTTCACGAAGCCATAAATGC...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,chr1,57850,58015,0,-,165,1,norf,AAAACTAATCCCACATATAAACCCCTATGATAATTTCAGTTTGTCC...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:59864-60013:-orf_8_control_set_5,chr1,59864,60014,0,-,150,5,orf,TTAATTATCACTCACACTCCAAGACAAACACCATTTCAGTAGCAAT...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:66632-67997.norf_segment:67741-67905:+norf_3_control_set_2,chr1,67741,67906,0,+,165,2,norf,CTGGCAGATAGCAACTTACAAAGATGCCCCAACAATACCTCCTTGT...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:108680-109563.norf_segment:108819-108959:+norf_9_control_set_5,chr1,108819,108960,0,+,141,5,norf,TTTCCAGTTATATATCTGGTAGAGATGAGGCCATTGATAGGAATGG...
...,...,...,...,...,...,...,...,...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26623039-26623182:-orf_0_control_set_1,chrY,26623039,26623183,0,-,144,1,orf,TCACGCTGGCACCTTCTTGTGGGGATGGCATTGAGGACTAACACTC...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26627091-26627363:-orf_1_control_set_1,chrY,26627091,26627364,0,-,273,1,orf,TTATGGTAAATGGTGCAAGCAGTGGCAATGGGAATGGACTGCACCC...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:56881824-56882003:-orf_0_control_set_1,chrY,56881824,56882004,0,-,180,1,orf,TTAACACTGCTTCCTTCTCCCATCGAGAAGTAAAGCCCAGGTTCTG...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:57070946-57071187.norf_segment:57070959-57071090:-norf_2_control_set_2,chrY,57070959,57071091,0,-,132,2,norf,TATTCCATGCAGGTATCTTTCAATTGTAAATCATTACTGCAAAAGA...


In [31]:
# Correct for reverse complements
for xi,yi in df38.iterrows():
    if yi['Strand'] == '-':
        df38.loc[xi,'J'] = Seq(yi['J']).reverse_complement().__str__()

In [32]:
# brief check to assure everything starts with 'ATG' as DNA
# Check that the count here is equal to the number of rows displayed
df38plus = df38[(df38.Strand=='+')&(df38.Status.str[:3]=='orf')]
display(df38plus)
sum(df38plus[df38plus.J.str[:3].isin(['ATG','atg'])].Chr.value_counts())

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Quality,Strand,Length,Control_Set,Status,J
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:188174-188317:+orf_0_control_set_1,chr1,188174,188318,0,+,144,1,orf,ATGTTGCTGGGAAGACCCCCAAGTCCCTCTTCTGCATCGTCCTCGG...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:274832-275053:+orf_0_control_set_1,chr1,274832,275054,0,+,222,1,orf,ATGACAGAAGTAATTCCTGAGTTGCTTCTGAAACCAGAGCTTCCCT...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:363585-363872:+orf_5_control_set_3,chr1,363585,363873,0,+,288,3,orf,ATGCTCCAAGCACATCCCACGGGGAGGACCATGAACAACTCAGCTG...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:384407-384607:+orf_3_control_set_2,chr1,384407,384608,0,+,201,2,orf,ATGTGGTCTTATTCCCATAACTGGGGCCAGATGATCCCACCCTGGG...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:789783-790127:+orf_2_control_set_2,chr1,789783,790128,0,+,345,2,orf,ATGGAATGGAAAGGACTCGAGTGGGATGGAATGGAGTGGAATGGAC...
...,...,...,...,...,...,...,...,...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26127501-26127659:+orf_3_control_set_2,chrY,26127501,26127660,0,+,159,2,orf,ATGAATGCTGATTGCCTGTACTCAACAGGTTTTCTGAAGAACTCAT...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26202219-26202383:+orf_0_control_set_1,chrY,26202219,26202384,0,+,165,1,orf,ATGTTCAAATTATGTGGACCTTCCTGCCCACCCCCCGCAACCAAAA...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26588067-26588210:+orf_8_control_set_5,chrY,26588067,26588211,0,+,144,5,orf,ATGCATTTGCATTCACGGCAGTGGGGTGTGATGGTGTTTGGTCCCT...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26588359-26588487:+orf_0_control_set_1,chrY,26588359,26588488,0,+,129,1,orf,ATGAGTTTCCCAATTGGGGAATCACCTATAAATAATCTTGAAATAA...


9887

In [33]:
# Then return the amino acid sequence for these controls, 'K'
df38['K'] = [Seq(dna).translate().__str__() for dna in df38['J']]
df38

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Quality,Strand,Length,Control_Set,Status,J,K
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:35726-35854:-orf_6_control_set_4,chr1,35726,35855,0,-,129,4,orf,ATGGGTGCTGAAGCTCCCACGCCTGCCTGTGAAAATGGAGTCCTCT...,MGAEAPTPACENGVLSHLGEPGAAPRRMHLWLREVFPDPRCC*
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,chr1,57850,58015,0,-,165,1,norf,AATAGGATGGAAATAGCTGAGATCAGACATCTCCTTTTCAGAGTGG...,NRMEIAEIRHLLFRVENEVYN*LESMKVRKLHIFKELHTRDKLKLS...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:59864-60013:-orf_8_control_set_5,chr1,59864,60014,0,-,150,5,orf,ATGAAAGTGATATTTCATCTTATAAGAGCAATTCCAAAACAAAAGC...,MKVIFHLIRAIPKQKQLMSELLCHIWRFLLLLKFILLLKWCLSWSV...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:66632-67997.norf_segment:67741-67905:+norf_3_control_set_2,chr1,67741,67906,0,+,165,2,norf,CTGGCAGATAGCAACTTACAAAGATGCCCCAACAATACCTCCTTGT...,LADSNLQRCPNNTSLCLDSHHYPLPFSVFISAPKRDLYVKYCYTSA...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:108680-109563.norf_segment:108819-108959:+norf_9_control_set_5,chr1,108819,108960,0,+,141,5,norf,TTTCCAGTTATATATCTGGTAGAGATGAGGCCATTGATAGGAATGG...,FPVIYLVEMRPLIGMGRRSPFILMTQHG*TLSDYHCTLLYFQH*RCQ
...,...,...,...,...,...,...,...,...,...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26623039-26623182:-orf_0_control_set_1,chrY,26623039,26623183,0,-,144,1,orf,ATGCTGTGCAGACCACGGCCTCCGCAGAGGATCCCCTCACCCAGGC...,MLCRPRPPQRIPSPRPQGLPSCSLRDEIFNNAVSVSPQCHPHKKVPA*
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26627091-26627363:-orf_1_control_set_1,chrY,26627091,26627364,0,-,273,1,orf,ATGCCGCGAGCACCCGCCTCCCCGCGCTCTGCTGGCCTCCACACCC...,MPRAPASPRSAGLHTLGPPFLSQGQRGVLEAPELHGSLGGRRSGVR...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:56881824-56882003:-orf_0_control_set_1,chrY,56881824,56882004,0,-,180,1,orf,ATGGTAAAATTCATTGTGATCCTGACCTCTGATGCTGTCAGCCACA...,MVKFIVILTSDAVSHRKGQNNLRLKPCLSQSSGSPNMVSRWSLLLR...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:57070946-57071187.norf_segment:57070959-57071090:-norf_2_control_set_2,chrY,57070959,57071091,0,-,132,2,norf,GTTGGCACATCCTAAATTCACGACAGTCATGACTCAAGCACTCTTG...,VGTS*IHDSHDSSTLVLLLFLISYKDKICLLQ**FTIERYLHGI


In [34]:
# Status of which (ORF or NORF) and original control group it is 
for xi,yi in df38.iterrows():
    df38.loc[xi,'Status'] = xi.split(':')[-1].split('_')[0][1:]+'_'+xi.split(':')[-1].split('_')[1]

In [35]:
df38

Unnamed: 0_level_0,Chr,Gene_Start_hg38,Gene_End_hg38,Quality,Strand,Length,Control_Set,Status,J,K
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:35726-35854:-orf_6_control_set_4,chr1,35726,35855,0,-,129,4,orf_6,ATGGGTGCTGAAGCTCCCACGCCTGCCTGTGAAAATGGAGTCCTCT...,MGAEAPTPACENGVLSHLGEPGAAPRRMHLWLREVFPDPRCC*
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:55608-59054.norf_segment:57850-58014:-norf_0_control_set_1,chr1,57850,58015,0,-,165,1,norf_0,AATAGGATGGAAATAGCTGAGATCAGACATCTCCTTTTCAGAGTGG...,NRMEIAEIRHLLFRVENEVYN*LESMKVRKLHIFKELHTRDKLKLS...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122.orf:59864-60013:-orf_8_control_set_5,chr1,59864,60014,0,-,150,5,orf_8,ATGAAAGTGATATTTCATCTTATAAGAGCAATTCCAAAACAAAAGC...,MKVIFHLIRAIPKQKQLMSELLCHIWRFLLLLKFILLLKWCLSWSV...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:66632-67997.norf_segment:67741-67905:+norf_3_control_set_2,chr1,67741,67906,0,+,165,2,norf_3,CTGGCAGATAGCAACTTACAAAGATGCCCCAACAATACCTCCTTGT...,LADSNLQRCPNNTSLCLDSHHYPLPFSVFISAPKRDLYVKYCYTSA...
GRCh38.Ens89.dna_rm.chr1.intergenic_gt122:108680-109563.norf_segment:108819-108959:+norf_9_control_set_5,chr1,108819,108960,0,+,141,5,norf_9,TTTCCAGTTATATATCTGGTAGAGATGAGGCCATTGATAGGAATGG...,FPVIYLVEMRPLIGMGRRSPFILMTQHG*TLSDYHCTLLYFQH*RCQ
...,...,...,...,...,...,...,...,...,...,...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26623039-26623182:-orf_0_control_set_1,chrY,26623039,26623183,0,-,144,1,orf_0,ATGCTGTGCAGACCACGGCCTCCGCAGAGGATCCCCTCACCCAGGC...,MLCRPRPPQRIPSPRPQGLPSCSLRDEIFNNAVSVSPQCHPHKKVPA*
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:26627091-26627363:-orf_1_control_set_1,chrY,26627091,26627364,0,-,273,1,orf_1,ATGCCGCGAGCACCCGCCTCCCCGCGCTCTGCTGGCCTCCACACCC...,MPRAPASPRSAGLHTLGPPFLSQGQRGVLEAPELHGSLGGRRSGVR...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122.orf:56881824-56882003:-orf_0_control_set_1,chrY,56881824,56882004,0,-,180,1,orf_0,ATGGTAAAATTCATTGTGATCCTGACCTCTGATGCTGTCAGCCACA...,MVKFIVILTSDAVSHRKGQNNLRLKPCLSQSSGSPNMVSRWSLLLR...
GRCh38.Ens89.dna_rm.chrY.intergenic_gt122:57070946-57071187.norf_segment:57070959-57071090:-norf_2_control_set_2,chrY,57070959,57071091,0,-,132,2,norf_2,GTTGGCACATCCTAAATTCACGACAGTCATGACTCAAGCACTCTTG...,VGTS*IHDSHDSSTLVLLLFLISYKDKICLLQ**FTIERYLHGI


In [36]:
# below cells are relevant export steps

In [37]:
df38_orf = df38[(df38.Control_Set=='1')&(~df38.Status.str.contains('norf'))]
df38_orf.to_csv(r'grch38_intergenic_controls_ORFs_set_1_with_sequences_withGRCh37CrossRemoval.tsv',sep='\t')



In [38]:
df38_norf = df38[(df38.Control_Set=='1')&(df38.Status.str.contains('norf'))]
df38_norf.to_csv(r'grch38_intergenic_controls_non_ORFs_set_1_with_sequences_withGRCh37CrossRemoval.tsv',sep='\t')



In [39]:
df38.to_csv(r'grch38_intergenic_controls_combined_ORFs_non_ORFs_all_sets_38888_with_sequences_withGRCh37CrossRemoval.tsv',sep='\t')


In [40]:
df38[df38.Control_Set=='1'].to_csv(r'grch38_intergenic_controls_combined_ORFs_non_ORFs_set_1_7805_with_sequences_withGRCh37CrossRemoval.tsv',sep='\t')
