-
Notifications
You must be signed in to change notification settings - Fork 0
/
RNALandscape_1_24_19.py
4213 lines (3571 loc) · 237 KB
/
RNALandscape_1_24_19.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 20 14:37:24 2018
@author: Ofer
"""
# =============================================================================
# Things left to do (not urgent so I haven't done them yet):
# make structure/graph histograms, parallelizing, specialLoops, different vs for the two strands,
# have inputs corruptE_Std, corruptS_Std, include corruptFE_Std
# =============================================================================
import numpy as np
import copy
import time
import scipy
import pickle #for saving and loading
import glob, os #for saving and loading
import cProfile #for profiling
import itertools
import networkx as nx #for graphs
import networkx.algorithms.isomorphism as iso #for graph isomorphism comparison
import matplotlib.pyplot as plt #for plotting
# =============================================================================
# # ===========================================================================
# # # =========================================================================
# # #
# # # TO RUN THE CODE:
# # #
# # # a = RNALandscape(['GCGCAAAAGCGC'],storeGraphs=True) #define the object with the parameters you want
# # # sol = a.calculateFELandscape() #calculate the FE Lanscape
# # # # The results are stored in, e.g., sol.sortedFEs (the free energies, sorted from lowest to highest)
# # #
# # # =========================================================================
# # ===========================================================================
# =============================================================================
class RNALandscape:
def __init__(self, sequences, DNA = [False,False], sequenceFrequencies = [1,1], b = 0.8/0.33, T = 310.15,
vs=0.02, duplexEntropyPenaltyInKB = 0, minBPInStem = 2, numParForLoops = 1,
storeGraphs = False, makeFigures = True, printProgressUpdate = True,
allowPseudoknots = True, minNumStemsInStructure = 0, substems = 'all', frozenBPs = [],
toSave = True, tryToLoadVariables = True, maxSizeSTable = 10**4,
onlyAllowSubsetsOfLongestStems = False, onlyConsiderSubstemsFromEdges = False,
onlyConsiderBondedStrands = False, considerTabulatedLoops = False,
unmatchedBPPenalty = True, minNtsInHairpin = 3, considerAllAsTerminalMismatches = False,
includeTerminalMismatches = True, includeFlushCoaxialStacks = True,
includeDanglingEnds = True, includeTerminalAUATPenalties = True,
considerC3andC4 = True, corruptFESeed = False, unboundButCouldBindPenalties = [0,0]):
#From the MatLab, we removed the following functionalities:
# pairwiseEnergies is now forced to be True;
# allowParallelStrands is now forced to be False
# =============================================================================
# Description of inputs:
#
# This function takes as input an array of sequences (eg:
# ['AUGCGC','GCGCAU']) and outputs the entire free energy landscape of these sequences interacting.
# These sequences can be DNA or RNA. The code can be run with only one sequence e.g.
# ['GCGAAAACGC'].
# DNA is a list the same length as sequences. For each sequence, it is True if
# that sequence is DNA and False if it is RNA
# sequenceFrequencies has no effect (it is a placeholder; in the future we should be able to give
# More precise entropic costs of binding based on the concentrations of each sequence)
# b is the persistence length of single stranded RNA, measured in units of nucleotides (nts) (~3.3 angstroms)
# This assumes a persistence length of RNA of ~0.8 nm found in The Snakelike Chain Character of Unstructured RNA
# by David R. Jacobson, Dustin B. McIntosh, and Omar A. Saleh
# As a plus, this agrees well with Siggia's formula b=2.5a (where a is the size of one ntd).
# This isn't very different for s.s. DNA. See Ionic strength-dependent persistence lengths of single-stranded RNA and DNA
# by Huimin Chen, Steve P. Meisburger, Suzette A. Pabit, Julie L. Sutton, Watt W. Webb, and Lois Pollack
#
# We use g = 3/(2*b) = gamma in place of b for practical reasons
# (that's the way b enters into the entropy formulae).
# T sets the temperature, which affects the free energy calculation FE=E-TS.
# It is measured in Kelvin
# vs is the volume (in ntds^3) within which two nucleotides (ntds or nts, I use
# them interchangably) can bond.
# duplexEntropyPenaltyInKB only affects runs with more than one sequence --
# it defines the entropy penalty (in units of Boltzmann's constant) of two
# strands being bound. This will in general depend on the size of the container and the
# relative concentrations of the two sequences. Future iterations of this
# code will take the latter into account as well.
# minBPInStem defines the minimum length of a stem (it is equal to the
# parameter m in the paper). It is an integer > 0. min value it can take is
# 1; Pipas & McMahon set it to 3.
#
# numParForLoops affects the run itself. Set it to 0 to use a for loop
# rather than a parfor loop (parallel computing) to calculate free energies.
# Otherwise set it to any integer >0 to define how many iterations of parfor
# will be run.
#
# pairwiseEnergies is a boolean. It is true if you want to use the Turner
# energy parameters. Otherwise, we have defined a simple model that uses
# only the energies of single base pair (bp) interactions based on work by
# Tristan Cragnolini. This functionality was removed in the Python code
# (it is now forced to be True)
#
# storeGraphs is a boolean. Set it to True if you'd like to store the
# topology of each structure and all the topologies. The code runs faster if
# it is set to False, but this disallows coarse-graining by topology.
#
# makeFigures defines whether you'd like the code to output figures at the
# end of the calculation. It is a boolean
#
# printProgressUpdate is a boolean. Set it to true to print messages
# updating you on how the algorithm is progressing (e.g. how long it's spent
# on each subfunction).
#
# allowParallelStrands defines whether to allow parallel stems in the
# computation. If pairwiseEnergies is True, it should be set to False;
# otherwise, you can set it to True if you'd like. This functionality was removed in the Python code
# (it is now forced to be False)
# allowPseudoknots defines whether to allow pseudoknots in the computation.
# It is a boolean. It does not affect ``pseudoknots" created by the binding of two strands.
# minNumStemsInStructure (integer) defines the minimum number of compatible stems that can
# comprise a structure. Set it to 0 to get the most general results.
# Otherwise set it to an integer > 0. If it is zero, it also includes the fully unfolded chain
# If it is 1 it includes structures created by single stems; etc.
# In the MatLab code, this parameter was called minNumCompatible.
# substems defines whether to allow all possible substems. If a stem of
# length l is found to be possible, but minBPInStem = m is < l, subsets of this
# stem of any length between l & m are also allowed. If substems is set to 'all', all of
# these possible stems will be considered as well. If substems is set to an
# integer >= 0, the code will only consider subsets of the full stem of
# length l-substems.
# frozenBPs is a list of base pairs that are forced to be in each returned structure.
# An example: frozenBPs = [[2, 9], [3, 8], [37, 46], [38, 45]]
# toSave is a boolean which tells youdetermines whether to save the results
# tryToLoadVariables is a boolean which tells the code whether to try to load
# the results from a saved file (it can also work even if the saved file has a different
# T, vs, or duplexEntropyInKB).
# maxSizeSTable is for the preallocation of space for STable. It tells the code how much space to preallocate
# (in other words, what we expect the maximum total number of stems will be).
# onlyAllowSubsetsOfLongestStems is a boolean which should be set to True if you want to only
# include the longest possible stems and ignore the rest. By default, the code also includes
# any stem of length >= 16 bps.
# onlyConsiderSubstemsFromEdges is a boolean. It should be set to True if you want to consider,
# only substems that include the edges of the full stem (in order to simulate zipperings).
# For example, for a stem made up of base pairs [[1,100], [2,99], [3,98], [4,97]],
# the substems [[1,100], [2,99], [3,98]], [[2,99], [3,98], [4,97]],
# [[1,100], [2,99]], and [[3,98], [4,97]] will be considered, but
# the substem [[2,99], [3,98]] will not be since it doesn't include the edge of the full stem.
# onlyConsiderBondedStrands is a boolean which should be set to True if you want all returned structures
# to include at least one bond between the two strands.
# considerTabulatedLoops is a boolean. Set it to True to use tabulated energy/entropy parameters
# from the Turner/Mathews groups for special hairpins and internal loops.
# unmatchedBPPenalty is a boolean. The standard Turner/Mathews parameters have a rule that if
# two nts are unpaired in the specific structure you're considering, but could pair in another structure,
# the purine should be replaced with A and the pyrimidine with C for the purposes of the energy calculation
# (i.e. for terminal mismatch purposes, if the terminal mismatch could in fact have paired, there's a penalty).
# Set this to True to include that penalty, and False to ignore this rule.
# minNtsInHairpin is an integer. It specifies the minimum number of nts in a hairpin. It should be set to 3.
# considerAllAsTerminalMismatches is a boolean. Set it to True to ignore both
# dangling ends and flush coaxial stacks, and to consider all ends of stems
# as terminal mismatches (even if the next nts over are both bound as part of different stems)
# If this is True, includeTerminalMismatches = True; includeFlushCoaxialStacks = False;
# and includeDanglingEnds = False.
# includeTerminalMismatches is a boolean. Set it to True to consider terminal mismatches
# when tabulating the bondFEs. If you change this the whole CHECK function needs to be rerun.
# includeFlushCoaxialStacks is a boolean. Set it to True to consider that if two stems are adjacent,
# the stems will stack for the purposes of considering terminal mismatches (or rather, not
# mismatches, in this case). See the main code for how our use of flush coaxial stacks differs
# from the NNDB's (Turner/Mathews). If you change this the whole CHECK function needs to be rerun.
# includeTerminalAUATPenalties is a boolean. Set it to True to include the Turner/Mathews
# energy and entropy penalties for AU/GU/AT base pairs at the ends of a stem. We tabulate
# these closures regardless since it's very inexpensive, but only include them in the energy/entropy
# calculations if this is set to True. Therefore, this can be changed with only running
# the postCalculationFxn.
# includeDanglingEnds is a boolean. Set it to True to use dangling ends when performing the
# energy/entropy calculations. We tabulate these regardless since it's relatively inexpensive,
# but only include them in the energy/entropy calculations if this is set to True.
# Therefore, this can be changed with only running the postCalculationFxn.
# considerC3andC4 is a boolean. Set it to True if you want to calculate the three- and four-way
# compatibility tensors C3 and C4 in order to get rid of many structures with high-order pseudoknots.
# This is most useful (speeds up the code most) if considerOnlyBondedStrands = False,
# since these tensors are not used to deduce compatibilities including stems that define
# intermolecular binding.
# corruptFESeed is a float. Set it to False (0) to use the Turner/Mathews parameters for
# energies/entropies. Set it to a (nonzero) float to use that float as a seed for corrupting
# those energy/entropy parameters.
# unboundButCouldBindPenalties is a list [x,y]. If two nts at the edge of a stem are unbound but could bind
# we may treat them (following the NNDB) as an AC pair. However, we also introduce
# corrections to this since that decision appears to have been arbitrary.
# x is the energy penalty we add, and y is the entropy penalty.
# Even if unmatchedBPPenalty = False, this penalty is introduced if unboundButCouldBindPenalties ~=[0,0]
# =============================================================================
# =============================================================================
# Save variables to self
# =============================================================================
self.version = '1_24_19'
sequences, DNA, sequenceFrequencies = self.basicCorrectionsToInput(sequences, DNA, sequenceFrequencies)
# perform basic corrections to input like sorting sequences, making sure they're in a list, etc.
self.sequences = sequences
self.DNA = DNA
self.sequenceFrequencies = sequenceFrequencies
self.b = b #in units of nucleotides (nts or ntds -- I use both interchangably)
self.g = 3 / (2*self.b)
self.T = T #in units of Kelvin
self.vs = vs #in units of ntds^3
self.duplexEntropyPenaltyInKB = duplexEntropyPenaltyInKB
self.minBPInStem = minBPInStem
self.numParForLoops = numParForLoops
self.pairwiseEnergies = True
self.storeGraphs = storeGraphs
self.makeFigures = makeFigures
self.printProgressUpdate = printProgressUpdate
self.tryToLoadVariables = tryToLoadVariables
self.allowParallelStrands = False
self.allowPseudoknots = allowPseudoknots
self.minNumStemsInStructure = minNumStemsInStructure
self.substems = substems
self.frozenBPs = frozenBPs
self.maxSizeSTable = maxSizeSTable
self.toSave = toSave
self.initiationPenalty = False
self.onlyAllowSubsetsOfLongestStems = onlyAllowSubsetsOfLongestStems
self.onlyConsiderSubstemsFromEdges = onlyConsiderSubstemsFromEdges
self.onlyConsiderBondedStrands = onlyConsiderBondedStrands
self.considerTabulatedLoops = considerTabulatedLoops
self.unmatchedBPPenalty = unmatchedBPPenalty
self.kB = 0.0019872 #units of kcal/(mol*Kelvin)
self.minNtsInHairpin = minNtsInHairpin #min number of nts in hairpin
self.considerAllAsTerminalMismatches = considerAllAsTerminalMismatches
if considerAllAsTerminalMismatches:
includeTerminalMismatches = True
includeFlushCoaxialStacks = False
includeDanglingEnds = False
self.includeTerminalMismatches = includeTerminalMismatches
self.includeFlushCoaxialStacks = includeFlushCoaxialStacks
self.includeDanglingEnds = includeDanglingEnds
self.includeTerminalAUATPenalties = includeTerminalAUATPenalties
self.considerC3andC4 = considerC3andC4
if not self.allowPseudoknots:
self.considerC3andC4 = False #since all those will already be not considered
#However, considerC3andC4 should still be True if numSequences > 1 since we automatically only
#consider only intramolecular pseudoknots for C3 and C4
self.corruptFESeed = corruptFESeed
self.unboundButCouldBindPenalties = unboundButCouldBindPenalties
self.completed = False
# =============================================================================
# Make the list of sequences into one long sequence.
# Also convert the sequence to numbers
# =============================================================================
self.sequence,self.sequenceInNumbers,self.numSequences,self.numNt,\
self.numNt1,self.numNt2,self.linkerPos = self.multipleStrandsSetup()
if self.numSequences == 1:
self.onlyConsiderBondedStrands = False #doesn't make sense otherwise
if self.onlyConsiderBondedStrands: #if only consider bonded strands, don't include completely unfolded structure
self.minNumStemsInStructure = max(1, self.minNumStemsInStructure)
# =============================================================================
# Make the filename to save the data
# =============================================================================
if self.toSave or self.tryToLoadVariables:
self.fileVars, self.fileTxt, self.folderName, self.fileFigs = self.makeFileName()
def basicCorrectionsToInput(self, sequences, DNA, sequenceFrequencies):
# =============================================================================
# perform basic corrections to input like sorting sequences, making sure they're in a list, etc.
# =============================================================================
#If we have only one sequence and we forgot to put it in a list,
#and instead just used a string, we should correct it
if not isinstance(sequences,list) and isinstance(sequences,str):
sequences = [sequences]
if len(sequences) > 2:
print('ERROR: We only accept at most two sequences at once for now')
#not myPrintFxn since the files haven't been created yet
return(None)
if len(DNA) < len(sequences):
print('ERROR: len(DNA) < len(sequences)')
return(None)
else:
DNA = DNA[:len(sequences)]
# Make sure sequences is sorted so that we load the results if we tried it with opposite order
if not sequences == sorted(sequences):
DNA = [DNA[1], DNA[0]]
sequences = [sequences[1], sequences[0]]
sequenceFrequencies = [sequenceFrequencies[1], sequenceFrequencies[0]]
# If we have DNA, change all U's to T's, and vice versa if we have RNA
for i in range(len(sequences)):
if DNA[i]:
sequences[i].replace('U','T')
else:
sequences[i].replace('T','U')
return(sequences, DNA, sequenceFrequencies)
def calculateFELandscape(self):
# =============================================================================
# Try to load the variables
# =============================================================================
if self.printProgressUpdate:
myPrintFxn('sequence = ' + self.sequence, self.fileTxt)
myPrintFxn('minBPInStem = ' + str(self.minBPInStem), self.fileTxt)
startTime = time.time()
self.startTime = startTime
if self.tryToLoadVariables:
loadedVars = self.loadVars(self.folderName)
if loadedVars:
return(loadedVars) #don't need returnFxn here since loadVars already returns the returnFxn
# =============================================================================
# If we couldn't load the variables (otherwise this would all be skipped)
# Start enumerating the structures. First, the START function:
# Enumerate the stems and define their compatibilities
# =============================================================================
self.numStems, self.STableStructure, self.STableBPs = self.createSTable()
if self.printProgressUpdate:
myPrintFxn('numStems = ' + str(self.numStems), self.fileTxt)
self.frozenStems = self.frozenStemsFromFrozenBPs()
# =============================================================================
# given a list of base pairs that we want fixed, what regions do they correspond to?
# frozenRegions is a cell array, where frozenRegions{i} is a vector of
# regions containing all the base pairs of the i'th stem in frozenBPs. Thus,
# in order to keep all of frozenBPs fixed, each possible structure must have
# one stem from each vector in each element of frozenRegions.
# =============================================================================
#Make a list of which stems correpsond to the two strands being bound
self.linkedStems = self.findLinkedStems() #a boolean np array of length numStems.
# linkedStems[i] == True if stem i corresponds to the two strands being bound, and False otherwise
#Make compatibility matrices
self.C = self.makeCompatibilityMatrix()
if not all(np.diagonal(self.C)): #i.e. if there are some disallowed stems (because of frozenBPs)
self.numStems, self.frozenStems, self.STableBPs, self.STableStructure, self.C, self.linkedStems = self.removeDisallowedStems()
if self.printProgressUpdate:
myPrintFxn('modified numStems = ' + str(self.numStems), self.fileTxt)
if self.considerC3andC4:
self.C3 = self.makeThreewayCompatibilityTensor()
self.C4 = self.makeFourwayCompatibilityTensor()
else:
self.C3 = True; self.C4 = True #so the variables exist
# =============================================================================
# I had thought to calculate bond energy and entropies at the level of stems, but it's inefficient and slow.
# =============================================================================
#Calculate the bond energy and entropy of each stem, not including what to do at the ends of the stems
#(i.e. dangling ends, terminal mismatches, and the like)
# if self.pairwiseEnergies:
# self.stemFECounts = self.calculateStemFreeEnergiesPairwise()
# self.stemEnergies, self.stemEntropies = self.calculateStemFreeEnergiesPairwise()
# else:
# print('non-pairwise FE has not yet been created')
# self.stemEnergies, self.stemEntropies = self.calculateStemFreeEnergiesSingle()
if self.printProgressUpdate:
myPrintFxn('Time for START function = ' + str(time.time() - startTime), self.fileTxt)
# =============================================================================
# Next, the PERMU function: Find all possibile allowed structures by
# ennumerating all allowed permutations of the stems (regions) found in STable
# =============================================================================
# We no longer do the following (explained below):
# Taking a leaf from TT2NE's book, we'll start by defining helipoints --
# These are combinations of helices separated by zero or one nts. The benefit of these
# is that their bond energies and entropies are entirely well-defined (with the exception of
# if we choose to include 2x1 or 2x2 internal loop parameters, but I don't
# think we should since those seem so wishy-washy).
# # Update: making the helipoints is fairly fast, but making the helipoint compatibility matrix is
# # very very slow. The reason for that is that the number of helipoints grows quickly
# # (almost as fast as the total number of structures) and so the helipoint compatibility matrix
# # is extremely large. Here is the code which now we don't use anymore.
# # helipointStartTime = time.time()
# # self.numHelipoints, self.helipoints, self.adjStems = self.makeHelipoints()
# # self.CHeli = self.makeHelipointCompatibilityMatrix()
# # #self.helipointEnergies, self.helipointEntropies = self.calculateHelipointFreeEnergies()
# #
# # #Define helipoint energies
# # helipointTime = time.time() - helipointStartTime
# # if self.printProgressUpdate:
# # print('Time for helipoint function = ' + str(helipointTime))
# =============================================================================
# =============================================================================
permuFxnStartTime = time.time()
self.numStructures, self.structures, self.linkedStructures = self.enumerateStructures()
if self.printProgressUpdate:
myPrintFxn('numStructures = ' + str(self.numStructures), self.fileTxt)
myPrintFxn('Time for PERMU function = ' + str(time.time() - permuFxnStartTime), self.fileTxt)
checkFxnStartTime = time.time()
self.checkFxnStartTime = checkFxnStartTime
(self.bondFECounts, self.dangling5Count, self.dangling3Count,
self.allNumVs, self.allComponentGraphs, self.structureGraphList,
self.allWhichStructureGraph,self.unboundButCouldBindCounts) = self.calculateFreeEnergies()
if self.printProgressUpdate:
myPrintFxn('Time for CHECK function = ' + str(time.time() - checkFxnStartTime), self.fileTxt)
return(self.postCalculationFxn())
def makeThreewayCompatibilityTensor(self):
# =============================================================================
# In order to avoid having any numerical integration, only consider pseduoknots of max order 2,
# meaning only containing maximum two stems (for example don't consider intramolecular kissing hairpins)
# If we have three (antiparallel) stems each defined by a base pair: A bound to a, B to b, C to c (A<a, B<b, C<c)
# such that A<B<C, what matters for whether or not they can all coexist is their respective order.
# The first has to be A. Then, either a or B can be next. But if it's a, then that makes a hairpin
# and whatever happens next will result in a pseudoknot of max order 2.
# So the game is to come up with an ordering for A,a,B,b,C,c -- or, more intuitively, using parentheses, of
# the six characters ([{}]) such that ( always comes before [ which always comes before {,
# such that no subset of consecutive brackets correspond to a real substructure.
# For example, in the sequence ([{]}), the subset '[{]}' corresponds to a real substructure (it is
# a complete set of opening and closing brackets).
# So ( has to be first, then ) or [, but as discussed, in coming up with disallowed structures, next has to be [.
# After ([ next can be either ) or { -- if it's ] that closes that bracket and the pair can be removed.
# Let's consider these one by one.
# First, consider ([). Next can't be ] since that would be a complete structure on its own.
# So it has to go ([){. Next can't come }, it has to be ].
# So we have our first disallowed set: ([){]}, or ABaCbc. The order of A,a,B,b,C,c is [0, 2, 1, 4, 3, 5]
#
# Next, consider ([{. After that can be either ] or ), so we consider both.
# So we have ([{]. Next can't be } since [{]} is a complete substructure,
# so we have our next disallowed set: ([{])}, or ABCbac. The order of A,a,B,b,C,c is [0, 2, 4, 3, 1, 5]
#
# Next we have ([{). Either possibilities after are acceptable, so our final two disallowed sets are:
# ([{)]} or ABCabc -- The order of A,a,B,b,C,c is [0, 2, 4, 1, 3, 5]
# and ([{)}] or ABCacb -- The order of A,a,B,b,C,c is [0, 2, 4, 1, 5, 3]
#
# This argument is where we get threewayDisallowedPermutations, the four sorted orders of
# A,a,B,b,C,c for the four disallowed sets.
# =============================================================================
numStems = self.numStems
STableBPs = self.STableBPs
CMat = self.C
linkedStems = self.linkedStems #a boolean np array of length numStems.
# linkedStems[i] == True iff stem i corresponds to a bond between two strands
threewayDisallowedPermutations = [[0, 2, 1, 4, 3, 5], [0, 2, 4, 3, 1, 5], [0, 2, 4, 1, 3, 5], [0, 2, 4, 1, 5, 3]]
C3 = np.zeros((numStems,numStems,numStems),dtype=bool)
# Go through each triplet of stems
for i in range(numStems):
if not linkedStems[i]: #things that appear to be high-order pseudoknots because of the linker of 'OOO's
# may in reality be pseudoknots of much lower order
for j in range(i+1,numStems):
if CMat[i,j] and not linkedStems[j]: #they need to be pairwise compatible
for k in range(j+1,numStems):
if CMat[j,k] and not linkedStems[k]:
A = STableBPs[i][0][0] #first nt of stem
a = STableBPs[i][0][1] #what it's bound to. a>A
B = STableBPs[j][0][0]
b = STableBPs[j][0][1] #b>B
C = STableBPs[k][0][0]
c = STableBPs[k][0][1] #c>C
#put the stems into order such that a<c<e
l = sorted([[A,a],[B,b],[C,c]]) #python automatically sorts by first element in each sublist
A, a = l[0]
B, b = l[1]
C, c = l[2]
#What matters for determining whether this triplet is allowed is the order of a,b,c,d,e,f
if list(np.argsort([A,a,B,b,C,c])) not in threewayDisallowedPermutations:
C3[i,j,k] = True; C3[i,k,j] = True; C3[j,i,k] = True
C3[j,k,i] = True; C3[k,i,j] = True; C3[k,j,i] = True
return(C3)
def makeFourwayCompatibilityTensor(self):
# =============================================================================
# Similarly to three-way compatibility, we also consider fourway compatibility.
# There are far more than four disallowed ways of having four stems, and we'll show how you can enumerate them here.
# We have A,B,C,D,a,b,c,d, such that A<a, B<b, etc. and A<B<C<D. We want to find how we can arrange the 8 bps
# (within these rules) such that no subsequence forms a complete structure (contains all its own opening and closing nts
# where a is the closing nt of A, etc.)
#
# We have to start with AB. Next can be a or C.
# ABa. Next can't be b so it has to be C
# ABaC. Next can be b or D.
# ABaCb. Next can only be D. Then has to come c then d. --> ABaCbDcd
# ABaCD. Next can be b or c
# ABaCDbcd, ABaCDbdc
# ABaCDcbd, ABaCDcdb
# ABC. Next can be a, b, or D. If it's D, we can have any combination of the closing pairs as long as d isn't next and a isn't last.
# ABCDabcd, ABCDabdc, ABCDacbd, ABCDacdb, ABCDadbc, ABCDadcb
# ABCDbcad, ABCDbdac, ABCDbacd, ABCDbadc
# ABCDcbad, ABCDcdab, ABCDcabd, ABCDcadb
#
# ABCa. Next can be b,c, or D
# ABCabDcd
# ABCacDbd
# ABCaD. Next can be b or c
# ABCaDbcd, ABCaDbdc, ABCaDcbd, ABCaDcdb
# ABCb. Next can be a, c, or D (this is as above, but a can't be the last one)
# ABCbaDcd, ABCbcDad, ABCbDacd, ABCbDadc, ABCbDcad
# I checked this against the MatLab code -- besides parallel stems, the difference is that the MatLab code also
# includes those sets of four stems that lead to a pseduoknot of order 3 (we're only considering those of order 4 here)
# =============================================================================
numStems = self.numStems
STableBPs = self.STableBPs
CMat = self.C
C3 = self.C3
printProgressUpdate = self.printProgressUpdate
linkedStems = self.linkedStems #a boolean np array of length numStems.
# linkedStems[i] == True iff stem i corresponds to a bond between two strands
fourwayDisallowedPermutationsStr = ['ABaCbDcd','ABaCDbcd','ABaCDbdc','ABaCDcbd','ABaCDcdb','ABCDabcd',
'ABCDabdc', 'ABCDacbd', 'ABCDacdb', 'ABCDadbc', 'ABCDadcb','ABCDbcad',
'ABCDbdac', 'ABCDbacd', 'ABCDbadc','ABCDcbad', 'ABCDcdab','ABCDcabd',
'ABCDcadb','ABCabDcd','ABCacDbd','ABCaDbcd', 'ABCaDbdc', 'ABCaDcbd',
'ABCaDcdb','ABCbaDcd', 'ABCbcDad', 'ABCbDacd', 'ABCbDadc', 'ABCbDcad']
disallowedPermuStr2Num = {'A':0, 'a':1, 'B':2, 'b':3, 'C':4, 'c':5, 'D':6, 'd':7}
fourwayDisallowedPermutations = [[disallowedPermuStr2Num[i] for i in s] for s in fourwayDisallowedPermutationsStr]
C4 = np.zeros((numStems,numStems,numStems,numStems),dtype=bool)
# Go through each quadruplet of stems
counter = 0
for i in range(numStems):
if not linkedStems[i]:
for j in range(i+1,numStems):
if CMat[i,j] and not linkedStems[j]: #they need to be pairwise compatible
for k in range(j+1,numStems):
if C3[i,j,k] and not linkedStems[k]: #they need to be pairwise compatible and three-way compatible
for q in range(k+1,numStems):
if C3[i,j,q] and C3[i,k,q] and C3[j,k,q] and not linkedStems[q]:
#C3 includes pairwise compatibility as well as three-way compatibility
A = STableBPs[i][0][0] #first nt of stem
a = STableBPs[i][0][1] #what it's bound to. a>A
B = STableBPs[j][0][0]
b = STableBPs[j][0][1] #b>B
C = STableBPs[k][0][0]
c = STableBPs[k][0][1] #c>C
D = STableBPs[q][0][0]
d = STableBPs[q][0][1]
#put the stems into order such that a<c<e
l = sorted([[A,a],[B,b],[C,c],[D,d]]) #python automatically sorts by first element in each sublist
A, a = l[0]
B, b = l[1]
C, c = l[2]
D, d = l[3]
#What matters for determining whether this triplet is allowed is the order of a,b,c,d,e,f
if list(np.argsort([A,a,B,b,C,c,D,d])) not in fourwayDisallowedPermutations:
C4[i,j,k,q] = True; C4[i,j,q,k] = True; C4[i,k,j,q] = True; C4[i,k,q,j] = True
C4[i,q,j,k] = True; C4[i,q,k,j] = True; C4[j,i,k,q] = True; C4[j,i,q,k] = True
C4[j,k,i,q] = True; C4[j,k,q,i] = True; C4[j,q,i,k] = True; C4[j,q,k,i] = True
C4[k,i,j,q] = True; C4[k,i,q,j] = True; C4[k,j,i,q] = True; C4[k,j,q,i] = True
C4[k,q,i,j] = True; C4[k,q,j,i] = True; C4[q,i,j,k] = True; C4[q,i,k,j] = True
C4[q,j,i,k] = True; C4[q,j,k,i] = True; C4[q,k,i,j] = True; C4[q,k,j,i] = True
counter += 1
if counter % 5e5 == 0 and printProgressUpdate:
myPrintFxn('Setting up C4: i = ' + str(i) + '; length c4 indices = ' + str(counter), self.fileTxt)
return(C4)
def postCalculationFxn(self):
# =============================================================================
# Take the energies and entropies that have been calculated, add the
# relevant vs factors to the entropies, calculate free energies
# calculate probabilities, sort the structures, calculate prob of bonded/unbonded, etc.
# =============================================================================
postCalculationFxnTime = time.time()
numStructures = self.numStructures
numSequences = self.numSequences
linkedStructures = self.linkedStructures
DNA = self.DNA
corruptFESeed = self.corruptFESeed
g = self.g #gamma
T = self.T
duplexEntropyPenaltyInKB = self.duplexEntropyPenaltyInKB
allComponentGraphs = self.allComponentGraphs
allNumVs = self.allNumVs
bondFECounts = self.bondFECounts
dangling5Count = self.dangling5Count
dangling3Count = self.dangling3Count
unboundButCouldBindCounts = self.unboundButCouldBindCounts
logVs = np.log(self.vs)
kB = self.kB
printProgressUpdate = self.printProgressUpdate
storeGraphs = self.storeGraphs
unboundButCouldBindPenalties = self.unboundButCouldBindPenalties
# =============================================================================
# ********* Perform the loop entropy calculation given the component graphs ********
# each componentGraphs is a dictionary, where each entry of the dictionary yields a vector of vectors
# where each element of the vector is a different parameter set for a different instance of that net.
# The parameter sets are defined as vectors in the order [s1, s2, s3, s4, l1, l2].
# If the parameter is not defined for the graph its value is set to be -1 (like s4 for an open-net-2a)
# Example usage: componentGraphs['closedNet2a'][0][3] gives the value of s4 for the first
# instance of closedNet2a found in the structure. componentGraphs has these keys:
# ['openNet0','closedNet0','openNet1','closedNet1','openOpenNet2','openNet2a',
# 'closedNet2a','openNet2b','closedNet2b','openNet2c','closedNet2c','tooComplex']
# =============================================================================
compGraphKeys = ['openNet0','closedNet0','openNet1','closedNet1','openOpenNet2','openNet2a',
'closedNet2a','openNet2b','closedNet2b','openNet2c','closedNet2c','tooComplex']
allLoopEntropies = np.zeros(numStructures)
for whichStruct in range(numStructures):
loopEntropy = 0
componentGraphs = allComponentGraphs[whichStruct]
for netType in compGraphKeys:
for slVec in componentGraphs[netType]:
loopEntropy += entropyOfNet(netType, slVec, g)
allLoopEntropies[whichStruct] = kB*(loopEntropy + allNumVs[whichStruct]*logVs)
# =============================================================================
# Perform bond energy and bond entropy calculations (including dangling ends)
# =============================================================================
if not corruptFESeed:
energyMatrices, entropyMatrices = bondFreeEnergies()
else:
energyMatrices, entropyMatrices = corruptBondFreeEnergies(corruptFESeed)
allBondEnergies = np.zeros(numStructures)
allBondEntropies = np.zeros(numStructures)
# The fact that these matrices are sparse slows down the code (for ~3000 structures
# for non-sparse matrices postCalculationFxn took 0.17 s which changed to 0.8 s after introducing sparse matrices)
# We cut this in half by only considering RNA(DNA) for systems that only have RNA(DNA)
bpTypeList = []
if any([not i for i in DNA]):
bpTypeList.append(0) #RNA/RNA bonds
if any(DNA):
bpTypeList.append(1) #DNA/DNA bonds
if bpTypeList == [0,1]:
bpTypeList.append(2) #RNA/DNA bonds
if self.includeTerminalAUATPenalties:
bpTypeList.append(3) #terminal AU/GU/AT penalties
for bpType in bpTypeList: #RNARNA, DNADNA, RNADNA, terminalAUATPenalty (don't need unknown since it just gives 0)
allBondEnergies += np.sum(np.sum(np.multiply(energyMatrices[bpType],
np.array([bondFECounts[whichStruct][bpType].todense()
for whichStruct in range(numStructures)])),axis=1),axis=1)
allBondEntropies += np.sum(np.sum(np.multiply(entropyMatrices[bpType],
np.array([bondFECounts[whichStruct][bpType].todense()
for whichStruct in range(numStructures)])),axis=1),axis=1)
bpTypeList = []
if any([not i for i in DNA]):
bpTypeList.append(0) #RNA
if any(DNA):
bpTypeList.append(1) #DNA
if self.includeDanglingEnds:
if not corruptFESeed:
dangling5Energies, dangling5Entropies, dangling3Energies, dangling3Entropies = danglingEndMatrices()
else:
dangling5Energies, dangling5Entropies, dangling3Energies, dangling3Entropies = corruptDanglingEndMatrices(corruptFESeed)
for bpType in bpTypeList: #RNA, DNA, (don't need unknown)
allBondEnergies += np.sum(np.sum(np.multiply(dangling5Energies[bpType],
np.array([dangling5Count[whichStruct][bpType].todense()
for whichStruct in range(numStructures)])),axis=1),axis=1)
allBondEnergies += np.sum(np.sum(np.multiply(dangling3Energies[bpType],
np.array([dangling3Count[whichStruct][bpType].todense()
for whichStruct in range(numStructures)])),axis=1),axis=1)
allBondEntropies += np.sum(np.sum(np.multiply(dangling5Entropies[bpType],
np.array([dangling5Count[whichStruct][bpType].todense()
for whichStruct in range(numStructures)])),axis=1),axis=1)
allBondEntropies += np.sum(np.sum(np.multiply(dangling3Entropies[bpType],
np.array([dangling3Count[whichStruct][bpType].todense()
for whichStruct in range(numStructures)])),axis=1),axis=1)
if numSequences > 1:
self.allDuplexEntropies = linkedStructures * kB * duplexEntropyPenaltyInKB
else:
self.allDuplexEntropies = np.zeros(numStructures)
if any(unboundButCouldBindPenalties):
allBondEnergies += unboundButCouldBindCounts * unboundButCouldBindPenalties[0]
allBondEntropies += unboundButCouldBindCounts * unboundButCouldBindPenalties[1]
self.allLoopEntropies = allLoopEntropies
self.allBondEnergies = allBondEnergies
self.allBondEntropies = allBondEntropies
allFreeEnergies = allBondEnergies - T * (allBondEntropies + allLoopEntropies + self.allDuplexEntropies)
probabilities = np.exp(-allFreeEnergies / (kB*T))
partitionFxn = np.sum(probabilities)
probabilities /= partitionFxn
self.indexSort = np.flip(np.argsort(probabilities)) #flip so that probabilities are decreasing
# The minFE structure will be structures[indexSort[0]]
# More generally, the structure with the n^th lowest FE will be structures[indexSort[n]]
# and its FE will be given by sortedFEs[n]
self.sortedProbs = probabilities[self.indexSort]
self.sortedFEs = allFreeEnergies[self.indexSort]
if storeGraphs:
structureGraphList = self.structureGraphList
allWhichStructureGraph = self.allWhichStructureGraph
numGraphs = len(structureGraphList)
graphProbs = np.array([np.sum([probabilities[i] for i in range(numStructures) if
allWhichStructureGraph[i] == j]) for j in range(numGraphs)])
self.indexSortedGraphProbs = np.flip(np.argsort(graphProbs))
self.sortedGraphProbs = graphProbs[self.indexSortedGraphProbs]
else:
self.indexSortedGraphProbs = []
self.sortedGraphProbs = []
if numSequences > 1:
probBonded = np.sum([probabilities[i] for i in range(numStructures) if linkedStructures[i]])
if printProgressUpdate:
myPrintFxn('probability of sequences being bound = ' + str(probBonded), self.fileTxt)
else:
probBonded = 0
self.probSeparateStrands = 1 - probBonded
if printProgressUpdate:
myPrintFxn('Time for postCalculationFxn function = ' + str(time.time() - postCalculationFxnTime), self.fileTxt)
self.completed = True
if self.toSave: #saving can take some time
save(self,self.fileVars)
return(self.returnFxn())
def calculateFreeEnergies(self):
numStructures = self.numStructures
numParForLoops = self.numParForLoops
considerTabulatedLoops = self.considerTabulatedLoops
storeGraphs = self.storeGraphs
STableStructure = self.STableStructure
structures = self.structures
# =============================================================================
# Initialize the tables we're going to return
# =============================================================================
bondFECounts = [None] * numStructures
dangling5Count = [None] * numStructures
dangling3Count = [None] * numStructures
allComponentGraphs = [None] * numStructures
allNumVs = np.zeros(numStructures,dtype=int)
unboundButCouldBindCounts = np.zeros(numStructures,dtype=int)
if considerTabulatedLoops:
allSpecialLoopEnergies = np.zeros(numStructures)
allSpecialLoopEntropies = np.zeros(numStructures)
else:
allSpecialLoopEnergies = []
allSpecialLoopEntropies = []
#keep track of weight matrices and numBonds matrices, and which of
#the graphs in weightMatrixList,numBondsMatrixList each structure corresponds to
structureGraphList = []
if storeGraphs:
allWhichStructureGraph = np.zeros(numStructures,dtype=int)
else:
allWhichStructureGraph = []
if numParForLoops > 0:
listOfAllNewStructureGraphs = [None]*numStructures
# =============================================================================
# Create various arrays once so we don't need to do it for each structure over again
# =============================================================================
allPerms = []
for i in range(2,6):
allPerms.append(list(itertools.permutations(range(1,i))))
# anyDNA = any(DNA)
self.refNets = createRefNets()
# [specialHairpins, IL11FE, IL11E, IL12FE, IL12E, IL22FE, IL22E] = makeVars(anyDNA,considerTabulatedLoops);
# =============================================================================
# Iterate over structures, calculating the free energy for each
# =============================================================================
#Parallelize the next part: Also, Maybe use map?
storedGraphEigs = []
#Potentially get rid of graphs if they are showing that they don't have a high probability of occurring (add cutoff as input var)
for whichStruct in range(numStructures):
structure = [None]*len(structures[whichStruct])
for i in range(len(structure)):
structure[i] = STableStructure[structures[whichStruct][i]]
allNumVs[whichStruct], allComponentGraphs[whichStruct], structureGraph = self.calculateLoopEntropy(structure)
(bondFECounts[whichStruct], dangling5Count[whichStruct], dangling3Count[whichStruct],
specialLoopEnergies, specialLoopEntropies,unboundButCouldBindCounts[whichStruct]) = \
self.calculateBondEnergy(structure, structureGraph)
if considerTabulatedLoops:
allSpecialLoopEnergies[whichStruct] = specialLoopEnergies
allSpecialLoopEntropies[whichStruct] = specialLoopEntropies
if storeGraphs: #faster_could_be_isomorphic barely adds time to
# case without storeGraphs, but makes a lot of mistakes (thinks graphs are isomorphic when they're not)
eigGraph = np.sort(np.linalg.eigvals(nx.convert_matrix.to_numpy_matrix(structureGraph)))
GisoTest = [i for i in range(len(structureGraphList))
if nx.faster_could_be_isomorphic(structureGraph,structureGraphList[i]) and
all(np.abs(eigGraph - storedGraphEigs[i]) < 1e-6)]
Giso = []
for i in GisoTest: #break makes this ~2x faster than list comprehension,
# but still much slower than not having storeGraphs
# Considering eigenvalue equality (above) cuts time in half again
if nx.is_isomorphic(structureGraph,structureGraphList[i]):
Giso = [i]
break
if not len(Giso):
structureGraphList.append(structureGraph)
storedGraphEigs.append(eigGraph)
allWhichStructureGraph[whichStruct] = len(structureGraphList) - 1
else: #which structureGraph does it correspond to?
allWhichStructureGraph[whichStruct] = Giso[0]
if whichStruct % 100000 == 0 and whichStruct > 0:
myPrintFxn('Calculating free energies: whichStruct = ' + str(whichStruct), self.fileTxt)
myPrintFxn('Time elapsed since start of CHECK = ' + str(time.time() - self.checkFxnStartTime), self.fileTxt)
#myPrintFxn('', self.fileTxt) #add an extra line for clarity
return(bondFECounts, dangling5Count, dangling3Count, allNumVs, allComponentGraphs,
structureGraphList, allWhichStructureGraph,unboundButCouldBindCounts)
def calculateBondEnergy(self, structure, G):
sequenceInNumbers = self.sequenceInNumbers
numNt = self.numNt
linkerPos = self.linkerPos
unmatchedBPPenalty = self.unmatchedBPPenalty
if len(linkerPos):
firstNts = [0, linkerPos[-1] + 1]
lastNts = [linkerPos[0] - 1, numNt - 1]
else:
firstNts = [0]
lastNts = [numNt - 1]
boundNts = structure2boundNt(structure,numNt)
bpsList = structure2bpsList(structure)
RNARNACount = scipy.sparse.lil_matrix((6,16),dtype=int) #np.zeros((6,4,4),dtype = int)
#First index tells you if the first bp of the set is AU (0) CG (1) GC (2) UA (3) GU (4) or UG (5)
#Second index tells you if the 3' ntd of the second bp is A (0) C (1) G(2) or U(3) (which row of table 1 in Serra & Turner).
#Third index tells you if the 5' ntd of the second bp is A (0) C (1) G(2) or U(3) (which column of table 1 in Serra & Turner).
# Had to make this and the others 2D array to be able to use scipy sparse functions, so
# second/third index is replaced by (4*second index) + third index. That's why it's
# np.zeros(6,16) and not np.zeros(6,4,4).
# lil_matrix was chosen because, from scipy reference page, it is fast for constructing
# sparse matrices incrementally. For operations (later we'll multiply it) it should be converted to
# another form.
DNADNACount = scipy.sparse.lil_matrix((4,16),dtype=int) #np.zeros((4,16),dtype = int)
#First index tells you if the first bp of the set is AT (0) CG (1) GC (2) or TA (3)
#Second index tells you if the 3' ntd of the second bp is A (0) C (1) G(2) or T(3)
#Third index tells you if the 5' ntd of the second bp is A (0) C (1) G(2) or T(3)
RNADNACount = scipy.sparse.lil_matrix((8,16),dtype=int) #np.zeros((8,16),dtype = int)
#First index tells you if the set is (putting RNA first)
#5'AX3'/3'TY5' (0) 5CX3/3GY5 (1) 5GX3/3CY5 (2) 5UX3/3AY5 (3) 5XA3/3YT5 (4) 5XC3/3YG5 (5) 5XG3/3YC5 (6) or 5XU3/3YA5 (7).
#Second index tells you if X is A (0) C (1) G (2) or U (3)
#Third index tells you if Y is A (0) C (1) G (2) or T (3)
terminalAUATCount = scipy.sparse.lil_matrix((1,2),dtype=int)
# Number of terminal AU (or GU) pairs, number of terminal AT pairs
unknownCount = scipy.sparse.lil_matrix((1,1),dtype=int) #np.zeros((1,1), dtype = int)
bondFECounts = [RNARNACount, DNADNACount, RNADNACount, terminalAUATCount, unknownCount]
dangling5RNARNACount = scipy.sparse.lil_matrix((4,4),dtype=int) #np.zeros((4,4),dtype = int)
dangling5DNADNACount = scipy.sparse.lil_matrix((4,4),dtype=int) #np.zeros((4,4),dtype = int)
dangling3RNARNACount = scipy.sparse.lil_matrix((4,4),dtype=int) #np.zeros((4,4),dtype = int)
dangling3DNADNACount = scipy.sparse.lil_matrix((4,4),dtype=int) #np.zeros((4,4),dtype = int)
unknownDanglingCount = scipy.sparse.lil_matrix((1,1),dtype=int) #np.zeros((1,1),dtype = int)
dangling5Count = [dangling5RNARNACount, dangling5DNADNACount, unknownDanglingCount]
dangling3Count = [dangling3RNARNACount, dangling3DNADNACount, unknownDanglingCount]
# first index tells you if the paired ntd is an A(1) C(2) G(3) U/T(4)
# second index tells you if the dangling ntd is A(1) C(2) G(3) or U/T(4).
# Uses dangling nt to determine if it's RNA/RNA or DNA/DNA
specialLoopEnergies = 0; specialLoopEntropies = 0
unboundButCouldBindCounts = 0
if self.considerTabulatedLoops: #XXX NOT CREATED YET
#check if any hairpin loops are special
for edge in G.selfloop_edges():
closingBPs = G.nodes[edge[0]]['bp'] #edge[0] and edge[1] are equal by definition
hairpinLoopSeq = sequenceInNumbers[closingBPs[0]:closingBPs[1]+1]
if (not any(hairpinLoopSeq == 0) #can't have unknown ntds or multiple strands bonded
and len(hairpinLoopSeq) <= 8): #%none of the special hairpin loops have more than 6 unpaired ntds
additionalLoopEnergy, additionalLoopEntropy = specialHairpinFxn(hairpinLoopSeq)
specialLoopEnergies += additionalLoopEnergy
specialLoopEntropies += additionalLoopEntropy
# from Zhang & Chen (2006) which cites Turner ,... Freier (1988)
# the energy gained from a single bp is zero (in the nearest neighbor model)
stemCounter = 0 #remove stems of length 1 for the nearest-neighbor model energy calculation
while stemCounter < len(structure):
if len(structure[stemCounter]) == 2:
del structure[stemCounter]
else:
stemCounter += 1
# #If we previously (at the level of stems) made the stemFECounts matrix
# for stemIndex in self.structures[whichStruct]: #bondFE from the stems
# for bpType in range(5): #RNA/RNA, DNA/DNA, RNA/DNA, terminalAU/AT, unknown
# bondFECounts[bpType] += self.stemFECounts[stemIndex][bpType]
for stem in structure: #base pairs in stem
numBonds = int(len(stem)/2)
for j in range(numBonds - 1):
firstNtIndex = stem[j]
firstBPIndex = stem[j+numBonds]
secondNtIndex = stem[j+1]
secondBPIndex = stem[j+1+numBonds]
index, bpType = freeEnergyMatrixIndices(sequenceInNumbers, firstNtIndex,firstBPIndex,
secondNtIndex,secondBPIndex, bound = [1,1],
unmatchedBPPenalty = unmatchedBPPenalty)
bondFECounts[bpType][index[0], index[1]] += 1
# Terminal AU/GU/AT penalties (bpType = 3). Tabulate them no matter what,
# but we only include them in the FE calculation if includeTerminalAUATPenalties==True.
for j in [0, numBonds - 1]: #penalties apply to ends of helices
if sequenceInNumbers[stem[j]] == 4 or sequenceInNumbers[stem[j + numBonds]] == 4:
bondFECounts[3][0,0] += 1 #terminal AU/GU was found
elif sequenceInNumbers[stem[j]] == 8 or sequenceInNumbers[stem[j + numBonds]] == 8:
bondFECounts[3][0,1] += 1 #terminal AT was found
for stem in structure: #two potential terminal mismatches for each stem
numBonds = int(len(stem)/2)
for c, (firstNtIndex, firstBPIndex) in enumerate(zip(
[stem[numBonds], stem[numBonds-1]], [stem[0], stem[-1]])):
secondNtIndex = firstNtIndex + 1
secondBPIndex = firstBPIndex - 1
bound = [1, 0] #secondNt is not actually bound to secondBP
# =============================================================================
# Terminal mismatches
# For the terminal mismatch just after stem ends, firstNt = stem[numBonds - 1], firstBP = stem[-1]
# For the mismatch just before stem starts, secondNt = stem[0] and secondBP = stem[numBonds]
# but if you flip it upside down, you get firstNt = stem[numBonds], firstBP = stem[0]