**Purpose**

Read in codon record table and synonymous variant record table, filter both to keep records with full sequence contexts, records for codons with >1 degeneracy, and records without STOP codons in the sequence context

# Setup

In [2]:
import pandas as pd
import importlib.util
from textwrap import wrap

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# Load the project-local `codon_context_variables` module by file path
# (relative to this notebook's working directory) rather than via a plain
# `import`; it is bound to the short alias `ccv` used throughout.
ccv_spec = importlib.util.spec_from_file_location("codon_context_variables", 
                                                  "../codon_context_variables.py")
# Create an empty module object from the spec, then run the file's code in it.
ccv = importlib.util.module_from_spec(ccv_spec)
ccv_spec.loader.exec_module(ccv)

## Files - read

In [4]:
rnas_codon_cp3_filename = "../../data/0_data_processing/rna_stability_exports/RNAStability_v10.5.1_hg38_filterDups_noOverlaps_CP3.tsv"

In [5]:
transcript_anno_filename = "../../data/0_data_processing/transcripts/RNAStability_v10.5.1_transcript_entrez_summary.tsv"

In [5]:
rnas_variant_syn_filename = "../../data/0_data_processing/rna_stability_exports/RNAStability_v10.5.1_hg38_filterDups_noOverlaps_synonymous.tsv"

## Files - write

In [15]:
rnas_codon_cp3_noedge_syn_noSTOP_filename = \
    "../../data/0_data_processing/rna_stability_exports/RNAStability_v10.5.1_hg38_filterDups_noOverlaps_noEdge_wSyn_noSTOP_CP3.tsv"

In [16]:
rnas_codon_cp3_noedge_syn_noSTOP_nostruct_filename = \
    "../../data/0_data_processing/rna_stability_exports/RNAStability_v10.5.1_hg38_filterDups_noOverlaps_noEdge_wSyn_noSTOP_noStruct_CP3.tsv"

In [17]:
transcript_anno_selected_0a_filename = "../../data/0_data_processing/transcripts/RNAStability_v10.5.1_transcript_entrez_selected_0a.tsv"

In [None]:
# Common path prefix for the per-amino-acid-subclass synonymous variant exports.
root = "../../data/0_data_processing/rna_stability_exports/byAminoAcid_sub/RNAStability_v10.5.1_hg38_filterDups_noOverlaps_synonymous_noEdge_noSTOP_CP3_"

# One entry per amino acid subclass, each holding the two output paths:
# sequence-context columns and all remaining (non-sequence) columns.
rnas_variant_syn_out = {
    amin: {
        "seq_filename": root + "seqCol_AminoAcid" + amin + ".tsv",
        "nonseq_filename": root + "nonseqCol_AminoAcid" + amin + ".tsv",
    }
    for amin in ccv.aa_sub_with_syn_noSTOP
}
rnas_variant_syn_out

## Functions

In [15]:
def filter_contexts_for_codons (r, codons_omit=ccv.codons_STOP) :
    """Return True when the record's sequence context has no in-frame codon from `codons_omit`.

    Intended for row-wise use, e.g. ``df.apply(filter_contexts_for_codons, axis=1)``.

    Parameters
    ----------
    r : pandas.Series or mapping
        A single record providing "WindowPosition", "CodonPosition" and
        "REF_Sequence".
    codons_omit : iterable of str
        Codons that disqualify a record (defaults to ``ccv.codons_STOP``).

    Returns
    -------
    bool
        True if none of the in-frame codons of "REF_Sequence" is in
        `codons_omit`, False otherwise.

    Raises
    ------
    ValueError
        If the (WindowPosition, CodonPosition) pair is not one of the supported
        configurations. Previously this case printed a message and then failed
        with an UnboundLocalError.
    """
    window_pos = r["WindowPosition"]
    codon_pos = r["CodonPosition"]
    seq = r["REF_Sequence"]

    # Offset of the first complete in-frame codon within the window.
    # NOTE(review): offsets assume WindowPosition is the 1-based index of the
    # site within REF_Sequence -- confirm against the export definition.
    if window_pos % 3 == 0 and codon_pos == 3:
        frame_offset = 0
    elif window_pos == 51 and codon_pos == 2:
        frame_offset = 1
    elif window_pos == 51 and codon_pos == 1:
        frame_offset = 2
    else:
        raise ValueError(
            "Not an expected configuration: "
            f"WindowPosition={window_pos!r}, CodonPosition={codon_pos!r}"
        )

    # Fixed-stride slicing yields the same triplets as textwrap.wrap(seq, 3)
    # for whitespace-/hyphen-free sequence strings, without text-wrapping overhead.
    # A trailing partial codon (<3 bases) can never match a 3-base codon.
    codons_all = {seq[i:i + 3] for i in range(frame_offset, len(seq), 3)}

    return codons_all.isdisjoint(codons_omit)

# Filter codon table

Read in main codon data frame:

In [12]:
rnas_codon_cp3_df = pd.read_csv(rnas_codon_cp3_filename,
                                sep="\t",
                                dtype={"CHR":str})
rnas_codon_cp3_df.head()

Unnamed: 0,CHR,POS,REF,NM_ID,entrezgene,symbol,Sense,CDS_Start,CDS_Stop,CDS_Length,Trans_Length,Trans_POS,PosRelative2Start,PosRelative2Stop,PosRelative2StartPct,WindowPosition,REF_Base,PrecedingBase,TrailingBase,PrecedingBases,TrailingBases,TribaseContext,CodonPosition,REF_Codon,PrecedingCodon,TrailingCodon,PrecedingBicodon,TrailingBicodon,TricodonContext,REF_AminoAcid,REF_AminoAcid_sub,REF_Sequence,REF_mfeValue,REF_cfeValue,REF_meafeValue,REF_efeValue,REF_cdValue,REF_endValue,REF_meaValue,REF_freqMfeEnsemble,REF_mfeStructure,REF_cfeStructure,REF_meafeStructure,REF_efeStructure,REF_BondMFE,REF_LocalMFEStructure
0,5,149977727,T,NM_000112.3,1836,SLC26A2,+,269,2488,2220,8082,343,75,2146,0.034,51,T,C,G,ATC,GGG,CTG,3,TCT,CCA,GGG,CCATCT,TCTGGG,CCATCTGGG,S,S4,CATAACGTTTCACCCAGAGACTCAGCTGAAGGAAATGACAGTTATC...,-18.1,-9.4,-10.2,-20.33,22.62,29.89,63.62,0.026939,......((((((((((((.....(((((..........)))))......,......(((((.....)))))...((.((((..................,......(((((.....)))))...((.((((.............((...,"......(((((.,,,,||}}}..,(((({{(....,..|}}},.{{...",),.))))))..
1,5,149981141,T,NM_000112.3,1836,SLC26A2,+,269,2488,2220,8082,1816,1548,673,0.697,51,T,T,G,GTT,GTT,TTG,3,TTT,TGG,GTT,TGGTTT,TTTGTT,TGGTTTGTT,F,F,GATCTTCCCAAAATGTGGAGTATTAGTAGAATGGATACAGTTATCT...,-17.7,-11.3,-16.6,-20.7,17.0,25.15,75.2,0.007678,.......(((.....)))(((((((((((..((((.(((((....(...,..................(((((((((((...(((.(((((........,.......(((.....)))(((((((((((...(((.(((((....(...,"......,(({....,}))(((((((((((..,(((.(((((....{...",.,((.....))
2,5,149981291,G,NM_000112.3,1836,SLC26A2,+,269,2488,2220,8082,1966,1698,523,0.765,51,G,T,T,TGT,TCT,TGT,3,GTG,TCT,TCT,TCTGTG,GTGTCT,TCTGTGTCT,V,V,AAGAGTTCACTGCTTGGCTTGGTGGAAGAGTCTGAGGTCTTTGAAT...,-26.9,-19.9,-24.2,-30.09,16.5,25.12,74.62,0.005638,.((((((...(((((((((((........(((((((((.((((((....,..(((((...(((((((((((........(((((.(((...........,..(((((...(((((((((((........(((((((((((.((......,".,(((({...(((((((((((,......,(((((((((,({,,......",.,.......).
3,5,149978135,T,NM_000112.3,1836,SLC26A2,+,269,2488,2220,8082,751,483,1738,0.218,51,T,C,G,CTC,GTG,CTG,3,TCT,ATC,GTG,ATCTCT,TCTGTG,ATCTCTGTG,S,S4,TTTTTTGCCAGCATCATTTATTTTCTCTTGGGTACCTCCCGTCACA...,-20.2,-17.8,-18.2,-22.46,17.01,24.82,74.72,0.025643,..((((((((..(((((............(((.....)))..((((...,..((((((((...................(((.....)))..((((...,..((((((((...(((.............(((.....)))..((((...,"..{{{(((((,.,{{{,...........,(((.....)))}.((((...",),...))))((
4,5,149978075,G,NM_000112.3,1836,SLC26A2,+,269,2488,2220,8082,691,423,1798,0.191,51,G,T,T,TCT,TAC,TGT,3,CTG,GGT,TAC,GGTCTG,CTGTAC,GGTCTGTAC,L,L4,CAGTCCATTGCTTATTCCCTGCTGGCTGGCCAAGAACCTGTCTATG...,-20.2,-14.7,-20.2,-21.88,13.43,20.31,80.37,0.065354,...................(((((((.((((((((.....))).))...,...................(((((((..((((.(.......)..))...,...................(((((((.((((((((.....))).))...,...................{((((((.((({((((.....|}}.})...,.,)).......


In [13]:
rnas_codon_cp3_df.dtypes

CHR                       object
POS                        int64
REF                       object
NM_ID                     object
entrezgene                 int64
symbol                    object
Sense                     object
CDS_Start                  int64
CDS_Stop                   int64
CDS_Length                 int64
Trans_Length               int64
Trans_POS                  int64
PosRelative2Start          int64
PosRelative2Stop           int64
PosRelative2StartPct     float64
WindowPosition             int64
REF_Base                  object
PrecedingBase             object
TrailingBase              object
PrecedingBases            object
TrailingBases             object
TribaseContext            object
CodonPosition              int64
REF_Codon                 object
PrecedingCodon            object
TrailingCodon             object
PrecedingBicodon          object
TrailingBicodon           object
TricodonContext           object
REF_AminoAcid             object
REF_AminoA

Check style of CHR column

In [14]:
rnas_codon_cp3_df["CHR"].value_counts()

1     1097936
2      793446
19     701390
11     634475
3      621710
17     618977
12     572109
6      550857
7      507095
5      473065
16     452232
9      437438
4      433575
10     422225
X      415509
15     369945
8      359904
14     341870
20     254792
22     221614
13     196113
18     170845
21     105896
Y       27519
Name: CHR, dtype: int64

Check that we have the expected unique keys

In [15]:
# Count records per (CHR, POS, REF) key and show the most frequent keys first;
# a maximum count of 1 means the key is unique.
(
    rnas_codon_cp3_df
    .groupby(["CHR", "POS", "REF"])
    .agg({"NM_ID": len})
    .sort_values("NM_ID", ascending=False)
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,NM_ID
CHR,POS,REF,Unnamed: 3_level_1
1,69093,G,1
3,52331224,A,1
3,52331230,G,1
3,52331233,A,1
3,52331236,C,1


In [16]:
rnas_codon_cp3_df.shape

(10780537, 46)

In [17]:
rnas_codon_cp3_df["NM_ID"].unique().size

18718

## Remove near-end variants and single degeneracy codons

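Select codons whose CP3 lies more than 51 positions from both the CDS start and the CDS stop (`PosRelative2Start > 51` and `PosRelative2Stop > 51`), so that every record has a full sequence context: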

In [18]:
# Keep only records further than 51 positions from both CDS ends.
rnas_codon_cp3_noedge_df = rnas_codon_cp3_df.query(
    "PosRelative2Start > 51 & PosRelative2Stop > 51"
)

In [19]:
rnas_codon_cp3_noedge_df.shape

(10144879, 46)

Double check that the sequence context is centered on the codon in all cases now:

In [20]:
rnas_codon_cp3_noedge_df["WindowPosition"].value_counts()

51    10144879
Name: WindowPosition, dtype: int64

Grab codons that have synonyms

In [17]:
ccv.aa_with_syn_noSTOP

['F',
 'L',
 'I',
 'V',
 'S',
 'P',
 'T',
 'A',
 'Y',
 'H',
 'Q',
 'N',
 'K',
 'D',
 'E',
 'C',
 'R',
 'G']

In [21]:
# Keep records whose reference amino acid is in ccv.aa_with_syn_noSTOP.
rnas_codon_cp3_noedge_syn_df = rnas_codon_cp3_noedge_df.loc[
    lambda codons: codons["REF_AminoAcid"].isin(ccv.aa_with_syn_noSTOP)
]
rnas_codon_cp3_noedge_syn_df.shape

(9823355, 46)

## Remove records with STOP codons appearing within the sequence context

In [34]:
# Apply row-wise: filter_contexts_for_codons reads "WindowPosition",
# "CodonPosition" and "REF_Sequence" from each record, so it needs the whole
# row rather than the bare REF_Sequence string.
codon_context_noSTOP = rnas_codon_cp3_noedge_syn_df.apply(filter_contexts_for_codons, axis=1)

In [39]:
rnas_codon_cp3_noedge_syn_noSTOP_df = rnas_codon_cp3_noedge_syn_df[codon_context_noSTOP]

In [40]:
rnas_codon_cp3_noedge_syn_noSTOP_df.shape

(9822251, 46)

## Save filtered files

Remove the structure fields, which are rarely used

In [41]:
# Lighter copy of the filtered codon table without the four REF structure columns.
rnas_codon_cp3_noedge_syn_noSTOP_nostruct_df = rnas_codon_cp3_noedge_syn_noSTOP_df.drop(
    labels=[
        "REF_mfeStructure",
        "REF_cfeStructure",
        "REF_meafeStructure",
        "REF_efeStructure",
    ],
    axis="columns",
)

Save filtered files

In [45]:
rnas_codon_cp3_noedge_syn_noSTOP_df.to_csv(rnas_codon_cp3_noedge_syn_noSTOP_filename,
                                           sep="\t",
                                           index=False)

Save version that doesn't include structural data

In [46]:
rnas_codon_cp3_noedge_syn_noSTOP_nostruct_df.to_csv(rnas_codon_cp3_noedge_syn_noSTOP_nostruct_filename,
                                                    sep="\t",
                                                    index=False)

# Transcript information

In [47]:
transcripts = rnas_codon_cp3_noedge_syn_noSTOP_df["NM_ID"].unique()
len(transcripts)

18704

In [48]:
# Read CHR as string, consistent with the codon and variant tables: the column
# mixes numeric chromosomes with X/Y, which otherwise yields mixed int/str values.
transcript_anno_df = pd.read_csv(transcript_anno_filename,
                                 sep="\t",
                                 dtype={"CHR": str})
transcript_anno_df.head()

Unnamed: 0,NM_ID,CHR,Sense,CDS_Start,CDS_Stop,CDS_Length,Trans_Length,entrezgene,name,symbol
0,NM_019089.4,1,-,100,621,522,4273,54626.0,hes family bHLH transcription factor 2,HES2
1,NM_015484.4,1,-,56,787,732,1777,25949.0,SYF2 pre-mRNA splicing factor,SYF2
2,NM_015627.2,1,+,115,1041,927,2935,26119.0,low density lipoprotein receptor adaptor prote...,LDLRAP1
3,NM_001011547.2,1,+,53,2098,2046,3218,200010.0,solute carrier family 5 member 9,SLC5A9
4,NM_004153.3,1,-,232,2817,2586,3192,4998.0,origin recognition complex subunit 1,ORC1


In [49]:
transcript_anno_df.shape

(45433, 10)

In [50]:
# Restrict the annotation table to transcripts that survived the codon filters.
transcript_anno_selected_df = transcript_anno_df.loc[
    lambda anno: anno["NM_ID"].isin(transcripts)
]
transcript_anno_selected_df.shape

(18704, 10)

## Save filtered file

In [51]:
transcript_anno_selected_df.to_csv(transcript_anno_selected_0a_filename,
                                   sep="\t",
                                   index=False)

# Filter variant table

Read in synonymous variant table

In [6]:
rnas_variant_syn_df = pd.read_csv(rnas_variant_syn_filename,
                                  sep="\t",
                                  dtype={"CHR":str})
rnas_variant_syn_df.head()

Unnamed: 0,NM_ID,CHR,POS,REF,ALT,Sense,CDS_Start,CDS_Stop,CDS_Length,Trans_Length,Trans_POS,REF_Base,ALT_Base,WindowPosition,REF_Sequence,ALT_Sequence,deltaMFE,deltaCFE,deltaMEAFE,deltaEFE,MFEED,CFEED,MEAED,EFEED,deltaCD,deltaEND,REF_mfeValue,REF_cfeValue,REF_meafeValue,REF_efeValue,REF_cdValue,REF_endValue,REF_meaValue,REF_freqMfeEnsemble,ALT_mfeValue,ALT_cfeValue,ALT_meafeValue,ALT_efeValue,ALT_cdValue,ALT_endValue,ALT_meaValue,ALT_freqMfeEnsemble,REF_mfeStructure,REF_cfeStructure,REF_meafeStructure,REF_efeStructure,ALT_mfeStructure,ALT_cfeStructure,ALT_meafeStructure,ALT_efeStructure,GENE,HGVS,LOC_IN_GENE,EFFECT,DIST_TO_CODING_REGION,DIST_TO_SPLICE_SITE,IMPACT,gnomAD3_WG_AC,gnomAD3_WG_AN,gnomAD3_WG_AF,gnomAD3_WG_nhomalt,gnomAD3_Coverage,gnomAD3_Quality,gnomAD3_WG_InbreedingCoeff,gnomAD3_CVGE_OVER_20,gnomAD2_EX_AC,gnomAD2_EX_AN,gnomAD2_EX_AF,gnomAD2_EX_nhomalt,gnomAD2_Coverage,gnomAD2_Quality,gnomAD2_EX_InbreedingCoeff,gnomAD2_EX_rf_tp_probability,gnomAD2_CVGE_OVER_20,gnomAD3_WG_vep,entrezgene,name,symbol,NM_ID_noVersion,is_MANE,is_LongestCDS,is_LongestTrans,snpeffTYPE,PosRelative2Start,PosRelative2Stop,PosRelative2StartPct,PrecedingBase,TrailingBase,PrecedingBases,TrailingBases,SNVContext,TribaseContext,REF_BondMFE,ALT_BondMFE,REF_LocalMFEStructure,ALT_LocalMFEStructure,CodonPosition,REF_Codon,ALT_Codon,PrecedingCodon,TrailingCodon,PrecedingBicodon,TrailingBicodon,TricodonContext,REF_AminoAcid,ALT_AminoAcid,REF_AminoAcid_sub,ALT_AminoAcid_sub,manualTYPE,DupAlignment_select,maxGeneCount
0,NM_000112.3,5,149977727,T,A,+,269,2488,2220,8082,343,T,A,51,CATAACGTTTCACCCAGAGACTCAGCTGAAGGAAATGACAGTTATC...,CATAACGTTTCACCCAGAGACTCAGCTGAAGGAAATGACAGTTATC...,0.9,5.7,3.9,0.6,40,36,24,9.90843,0.34,0.19,-18.1,-9.4,-10.2,-20.33,22.62,29.89,63.62,0.026939,-17.2,-3.7,-6.3,-19.73,22.96,30.08,62.6,0.016616,......((((((((((((.....(((((..........)))))......,......(((((.....)))))...((.((((..................,......(((((.....)))))...((.((((.............((...,"......(((((.,,,,||}}}..,(((({{(....,..|}}},.{{...",.............(((((.......((((.((((((....))).))...,......(((((.....)))))............................,......(((((.....)))))...((.((((((...........))...,"......((({{..,,,||}}}...{(((({({,{,{....},}.||...",SLC26A2,NM_000112.3(SLC26A2):c.75T>A:p.Ser25Ser,coding_sequence,synonymous SNV,,,LOW,,,,,True,PASS,,0.973474,,,,,True,PASS,,,0.99905,,1836,solute carrier family 26 member 2,SLC26A2,NM_000112,True,True,True,synonymous,75,2146,0.034,C,G,ATC,GGG,T>A,CTG,),),.))))))..,.))))....,3,TCT,TCA,CCA,GGG,CCATCT,TCTGGG,CCATCTGGG,S,S,S4,S4,synonymous,False,1
1,NM_000112.3,5,149981141,T,C,+,269,2488,2220,8082,1816,T,C,51,GATCTTCCCAAAATGTGGAGTATTAGTAGAATGGATACAGTTATCT...,GATCTTCCCAAAATGTGGAGTATTAGTAGAATGGATACAGTTATCT...,-2.2,-6.3,-3.3,-1.28,38,46,40,22.7723,1.68,2.89,-17.7,-11.3,-16.6,-20.7,17.0,25.15,75.2,0.007678,-19.9,-17.6,-19.9,-21.98,18.68,28.04,72.11,0.03431,.......(((.....)))(((((((((((..((((.(((((....(...,..................(((((((((((...(((.(((((........,.......(((.....)))(((((((((((...(((.(((((....(...,"......,(({....,}))(((((((((((..,(((.(((((....{...",.............(((((((.((..(((((((((((.(((....))...,.............(((((((.....(((((((((((.(((....))...,.............(((((((.((..(((((((((((.(((....))...,"........,....{((((((.({,,(((((((((({.(((....))...",SLC26A2,NM_000112.3(SLC26A2):c.1548T>C:p.Phe516Phe,coding_sequence,synonymous SNV,,,LOW,,,,,True,PASS,,0.965803,,,,,True,PASS,,,0.998,,1836,solute carrier family 26 member 2,SLC26A2,NM_000112,True,True,True,synonymous,1548,673,0.697,T,G,GTT,GTT,T>C,TTG,.,),((.....)),))))))).),3,TTT,TTC,TGG,GTT,TGGTTT,TTTGTT,TGGTTTGTT,F,F,F,F,synonymous,False,1
2,NM_000112.3,5,149981291,G,A,+,269,2488,2220,8082,1966,G,A,51,AAGAGTTCACTGCTTGGCTTGGTGGAAGAGTCTGAGGTCTTTGAAT...,AAGAGTTCACTGCTTGGCTTGGTGGAAGAGTCTGAGGTCTTTGAAT...,-0.4,0.0,-2.5,-0.2,14,12,16,0.395924,-0.84,-1.82,-26.9,-19.9,-24.2,-30.09,16.5,25.12,74.62,0.005638,-27.3,-19.9,-26.7,-30.29,15.66,23.3,76.75,0.007827,.((((((...(((((((((((........(((((((((.((((((....,..(((((...(((((((((((........(((((.(((...........,..(((((...(((((((((((........(((((((((((.((......,".,(((({...(((((((((((,......,(((((((((,({,,......",.((((((...(((((((((((........(((((((((((.........,..(((((...(((((((((((........(((((.(((...........,..(((((...(((((((((((........(((((((((((.........,".,(((({...(((((((((((.......,(((((((((,{,,,......",SLC26A2,NM_000112.3(SLC26A2):c.1698G>A:p.Val566Val,coding_sequence,synonymous SNV,,,LOW,,,,,True,PASS,,0.954799,,,,,True,PASS,,,0.99916,,1836,solute carrier family 26 member 2,SLC26A2,NM_000112,True,True,True,synonymous,1698,523,0.765,T,T,TGT,TCT,G>A,TGT,.,(,.......).,((((.....,3,GTG,GTA,TCT,TCT,TCTGTG,GTGTCT,TCTGTGTCT,V,V,V,V,synonymous,False,1
3,NM_000112.3,5,149978135,T,C,+,269,2488,2220,8082,751,T,C,51,TTTTTTGCCAGCATCATTTATTTTCTCTTGGGTACCTCCCGTCACA...,TTTTTTGCCAGCATCATTTATTTTCTCTTGGGTACCTCCCGTCACA...,0.2,12.48,0.7,0.35,68,36,4,5.35652,2.78,3.67,-20.2,-17.8,-18.2,-22.46,17.01,24.82,74.72,0.025643,-20.0,-5.32,-17.5,-22.11,19.79,28.49,68.23,0.032343,..((((((((..(((((............(((.....)))..((((...,..((((((((...................(((.....)))..((((...,..((((((((...(((.............(((.....)))..((((...,"..{{{(((((,.,{{{,...........,(((.....)))}.((((...",........((((.................(((.....)))(((.((...,..((((((((...................(((.....))).........,..((((((((...(((.............(((.....)))..(((....,"..{{{((((({,,{{,,............(((.....)))|.((({...",SLC26A2,NM_000112.3(SLC26A2):c.483T>C:p.Ser161Ser,coding_sequence,synonymous SNV,,,LOW,,,,,True,PASS,,0.96968,,,,,True,PASS,,,0.89147,,1836,solute carrier family 26 member 2,SLC26A2,NM_000112,True,True,True,synonymous,483,1738,0.218,C,G,CTC,GTG,T>C,CTG,),(,...))))((,(..((((((,3,TCT,TCC,ATC,GTG,ATCTCT,TCTGTG,ATCTCTGTG,S,S,S4,S4,synonymous,False,1
4,NM_000112.3,5,149978075,G,A,+,269,2488,2220,8082,691,G,A,51,CAGTCCATTGCTTATTCCCTGCTGGCTGGCCAAGAACCTGTCTATG...,CAGTCCATTGCTTATTCCCTGCTGGCTGGCCAAGAACCTGTCTATG...,0.0,-0.09,1.8,0.16,0,20,4,1.49045,1.8,1.45,-20.2,-14.7,-20.2,-21.88,13.43,20.31,80.37,0.065354,-20.2,-14.79,-18.4,-21.72,15.23,21.76,76.45,0.085024,...................(((((((.((((((((.....))).))...,...................(((((((..((((.(.......)..))...,...................(((((((.((((((((.....))).))...,...................{((((((.((({((((.....|}}.})...,...................(((((((.((((((((.....))).))...,...................(((((((.......................,...................(((((((..(((((((.....))).))...,".,,................{((((((,(((,((((,..,,|,|,||...",SLC26A2,NM_000112.3(SLC26A2):c.423G>A:p.Leu141Leu,coding_sequence,synonymous SNV,,,LOW,,,,,True,PASS,,0.971716,2.0,246752.0,8e-06,0.0,True,PASS,0.0061,0.914492,0.92406,,1836,solute carrier family 26 member 2,SLC26A2,NM_000112,True,True,True,synonymous,423,1798,0.191,T,T,TCT,TAC,G>A,TGT,.,.,)).......,)).......,3,CTG,CTA,GGT,TAC,GGTCTG,CTGTAC,GGTCTGTAC,L,L,L4,L4,synonymous,False,1


In [33]:
for col in rnas_variant_syn_df.columns:
    print(col)

NM_ID
CHR
POS
REF
ALT
Sense
CDS_Start
CDS_Stop
CDS_Length
Trans_Length
Trans_POS
REF_Base
ALT_Base
WindowPosition
REF_Sequence
ALT_Sequence
deltaMFE
deltaCFE
deltaMEAFE
deltaEFE
MFEED
CFEED
MEAED
EFEED
deltaCD
deltaEND
REF_mfeValue
REF_cfeValue
REF_meafeValue
REF_efeValue
REF_cdValue
REF_endValue
REF_meaValue
REF_freqMfeEnsemble
ALT_mfeValue
ALT_cfeValue
ALT_meafeValue
ALT_efeValue
ALT_cdValue
ALT_endValue
ALT_meaValue
ALT_freqMfeEnsemble
REF_mfeStructure
REF_cfeStructure
REF_meafeStructure
REF_efeStructure
ALT_mfeStructure
ALT_cfeStructure
ALT_meafeStructure
ALT_efeStructure
GENE
HGVS
LOC_IN_GENE
EFFECT
DIST_TO_CODING_REGION
DIST_TO_SPLICE_SITE
IMPACT
gnomAD3_WG_AC
gnomAD3_WG_AN
gnomAD3_WG_AF
gnomAD3_WG_nhomalt
gnomAD3_Coverage
gnomAD3_Quality
gnomAD3_WG_InbreedingCoeff
gnomAD3_CVGE_OVER_20
gnomAD2_EX_AC
gnomAD2_EX_AN
gnomAD2_EX_AF
gnomAD2_EX_nhomalt
gnomAD2_Coverage
gnomAD2_Quality
gnomAD2_EX_InbreedingCoeff
gnomAD2_EX_rf_tp_probability
gnomAD2_CVGE_OVER_20
gnomAD3_WG_vep
entrezgene


In [8]:
rnas_variant_syn_df.shape

(22050128, 111)

## Remove near-end variants and single degeneracy codons

Select sites more than 51 positions from both the CDS start and the CDS stop (`PosRelative2Start > 51` and `PosRelative2Stop > 51`):

In [7]:
# Keep only variants further than 51 positions from both CDS ends.
rnas_variant_syn_noedge_df = rnas_variant_syn_df.query(
    "PosRelative2Start > 51 & PosRelative2Stop > 51"
)

In [9]:
rnas_variant_syn_noedge_df.shape

(20789723, 111)

Filter on REF and ALT amino acids for those with synonyms (should already be selected) and no STOP codons

In [11]:
# Both the REF and the ALT amino acid must be in ccv.aa_with_syn_noSTOP.
rnas_variant_syn_noedge_nostop_df = rnas_variant_syn_noedge_df.query(
    "REF_AminoAcid in @ccv.aa_with_syn_noSTOP & ALT_AminoAcid in @ccv.aa_with_syn_noSTOP"
)
rnas_variant_syn_noedge_nostop_df.shape

(20789723, 111)

In [13]:
rnas_variant_syn_noedge_nostop_df["WindowPosition"].value_counts()

51    20789723
Name: WindowPosition, dtype: int64

## Remove records with STOP codons appearing within sequence context

In [20]:
variant_codon_context_noSTOP = rnas_variant_syn_noedge_nostop_df.apply(filter_contexts_for_codons, axis=1)

In [21]:
rnas_variant_syn_noedge_noSTOP_df= rnas_variant_syn_noedge_nostop_df[variant_codon_context_noSTOP]

In [29]:
rnas_variant_syn_noedge_noSTOP_df.shape

(20787246, 111)

## Subset records within the same REF/ALT Amino Acid sub class 

Check how many sSNVs switch between different AA_sub classes

In [24]:
# Vectorised count of records whose REF and ALT amino acid subclasses differ
# (the builtin sum() would iterate the boolean Series element by element).
(
    rnas_variant_syn_noedge_noSTOP_df["REF_AminoAcid_sub"]
    != rnas_variant_syn_noedge_noSTOP_df["ALT_AminoAcid_sub"]
).sum()

1091987

In [27]:
# Keep only variants whose REF and ALT amino acid subclasses are identical.
rnas_variant_syn_noedge_noSTOP_CP3_df = rnas_variant_syn_noedge_noSTOP_df.loc[
    lambda variants: variants["REF_AminoAcid_sub"] == variants["ALT_AminoAcid_sub"]
]

In [28]:
rnas_variant_syn_noedge_noSTOP_CP3_df.shape

(19695259, 111)

## Save filtered files

Save per Amino Acid files

In [37]:
# Columns identifying a variant; written to both output files.
key_columns = ["CHR", "POS", "REF", "ALT"]

# Sequence-context columns, written to the "seq" file.
seq_columns = [
    "Sense",
    "REF_AminoAcid",
    "ALT_AminoAcid",
    "REF_AminoAcid_sub",
    "ALT_AminoAcid_sub",
    "REF_Codon",
    "ALT_Codon",
    "CodonPosition",
    "WindowPosition",
    "PrecedingBase",
    "TrailingBase",
    "PrecedingBases",
    "TrailingBases",
    "SNVContext",
    "TribaseContext",
    "PrecedingCodon",
    "TrailingCodon",
    "PrecedingBicodon",
    "TrailingBicodon",
    "TricodonContext",
    "REF_Sequence",
]

# Everything else, in original column order, goes to the "nonseq" file.
other_columns = [
    column
    for column in rnas_variant_syn_noedge_noSTOP_CP3_df.columns
    if column not in key_columns and column not in seq_columns
]
print(other_columns)

['NM_ID', 'CDS_Start', 'CDS_Stop', 'CDS_Length', 'Trans_Length', 'Trans_POS', 'REF_Base', 'ALT_Base', 'ALT_Sequence', 'deltaMFE', 'deltaCFE', 'deltaMEAFE', 'deltaEFE', 'MFEED', 'CFEED', 'MEAED', 'EFEED', 'deltaCD', 'deltaEND', 'REF_mfeValue', 'REF_cfeValue', 'REF_meafeValue', 'REF_efeValue', 'REF_cdValue', 'REF_endValue', 'REF_meaValue', 'REF_freqMfeEnsemble', 'ALT_mfeValue', 'ALT_cfeValue', 'ALT_meafeValue', 'ALT_efeValue', 'ALT_cdValue', 'ALT_endValue', 'ALT_meaValue', 'ALT_freqMfeEnsemble', 'REF_mfeStructure', 'REF_cfeStructure', 'REF_meafeStructure', 'REF_efeStructure', 'ALT_mfeStructure', 'ALT_cfeStructure', 'ALT_meafeStructure', 'ALT_efeStructure', 'GENE', 'HGVS', 'LOC_IN_GENE', 'EFFECT', 'DIST_TO_CODING_REGION', 'DIST_TO_SPLICE_SITE', 'IMPACT', 'gnomAD3_WG_AC', 'gnomAD3_WG_AN', 'gnomAD3_WG_AF', 'gnomAD3_WG_nhomalt', 'gnomAD3_Coverage', 'gnomAD3_Quality', 'gnomAD3_WG_InbreedingCoeff', 'gnomAD3_CVGE_OVER_20', 'gnomAD2_EX_AC', 'gnomAD2_EX_AN', 'gnomAD2_EX_AF', 'gnomAD2_EX_nhomalt',

In [43]:
# Write two TSVs per amino acid subclass: key + sequence-context columns, and
# key + all remaining columns. Output paths come from rnas_variant_syn_out.
for amin, amin_variant_filtered_df in rnas_variant_syn_noedge_noSTOP_CP3_df.groupby("REF_AminoAcid_sub"):
    print(amin, amin_variant_filtered_df.shape, sep="\n")

    # Key columns plus sequence information
    amin_variant_filtered_seq_df = amin_variant_filtered_df.loc[:, key_columns + seq_columns]
    # Key columns plus non-sequence columns
    amin_variant_filtered_nonseq_df = amin_variant_filtered_df.loc[:, key_columns + other_columns]

    # Write out, sequence file first
    for out_key, out_df in (("seq_filename", amin_variant_filtered_seq_df),
                            ("nonseq_filename", amin_variant_filtered_nonseq_df)):
        out_df.to_csv(rnas_variant_syn_out[amin][out_key],
                      sep="\t",
                      index=False)

A
(2097117, 111)
C
(230056, 111)
D
(483497, 111)
E
(723696, 111)
F
(369312, 111)
G
(1979203, 111)
H
(266642, 111)
I
(885905, 111)
K
(580497, 111)
L2
(211848, 111)
L4
(2362456, 111)
N
(366106, 111)
P
(1894828, 111)
Q
(481529, 111)
R2
(238434, 111)
R4
(976715, 111)
S2
(328278, 111)
S4
(1521162, 111)
T
(1613996, 111)
V
(1812349, 111)
Y
(271633, 111)
