In [20]:
import re

# def gc(self):
#     """ Return the GC content of seq as a float
#     >>> x = Sequence(name='chr1', seq='ATCGTA')
#     >>> y = round(x.gc, 2)
#     >>> y == 0.33
#     True
#     """
#     g = seq.count('G')
#     g += seq.count('g')
#     c = seq.count('C')
#     c += seq.count('c')
#     return (g + c) / len(seq)
seq = "This is a test of this program's capability of filtering and counting IUPAC bases. NRSCYMTVAYKYKYKDYNGHVNGMRYARAVYTGADNDTDSAVVDCNCYWRNDVRNAVRWNKGCDWKYYACMDCVTTASYKGKRRNSBRBBVWTCVNHKYYRBNCMRGRCRSKSASGKWYBBGMHTCCVRNBVYHTYMSNAKGGVGCAVRWTTKCDDWHYTGKWKKCCBARYKGHGYTWSANGDYABTGAWANNRVSMACDVNYNKKRDAGNBNTTYRRNVABVARRBWCCWRTBMGMHHVTSKNWYWMHYCTHMNRBDCYWATYKYYDHBBDGYDVVSHCVAVVBMHAMBMHVAKKGVWRGCASBTGNASARMNGAHSTRDRGMVMCGWANSYHNBWSHMRKGCWVHDWCWWKBBSSYWWVSSVHMWHVNCVDRAYBATHHHTWDKKTBKHWDBGHYAHAHKDYMBNVNYNDBVAWGBCYABDSBCYDCDTABSHRHAHSYDYNNWKCNCGNNABWVCSYCASSRASADHVKMBVMHGSCHBMWMVTNVAAWBTBNMHVCHVBTGWSGMKVSGVWBNHWARTVSWNYAVACHYVGBSGTKCKDKYBRRVWSRWNMCGCKVSTSWMVBVAASAYRCGMSRKTKVCCSHBDRASSAANKTCDMWWKRBMBGACHHBRDNBGGTGKMABTBKCSCWAHASSWARBKHVAMMNKAKYAMMAAYWKYSMHYKGSCMTSSNWWBGDCGBKDYNVNHNGCVCRYSHWWMGWGSMBRYGGNGBKSCHHVKMSVBMYSGNVWYSTNAYGGNCYMDWYVWCSRVMAAMBVBKVCNAYKBAMMASTCVNDSNGCDMANWSCDRRWDNNTYCKMWVYMWBASDAMRKDHVBTMSSRTBTANKGHHKVVTMNKMAVCSYSGSYGTSBWAMKWTAGCYYWYYSSNTSCBHVAWMRVMDKBMADVKAYBBVBWMVNTYSWANRANRSDGKRAVSSVWNVVGRDRYSHNTKSCHSNDRBTSCNAKCHWAWACCGKGSTKMYBBMDMNBMSBRSCHYDYCDAKSMMNRTYYBBGHTW"

def gc_strict(seq):
    """ Return the GC content of seq as a float, ignoring non ACGT characters
    >>> x = Sequence(name='chr1', seq='NMRATCGTA')
    >>> y = round(x.gc, 2)
    >>> y == 0.33
    True
    """
    trimSeq = re.sub(r'[^ACGTacgt]', '', seq)
    g = seq.count('G')
    g += seq.count('g')
    c = seq.count('C')
    c += seq.count('c')
    return (g + c) / len(trimSeq)

def gc_iupac(seq):
    from collections import Counter
    """ Return the GC content of seq as a float, accounting for IUPAC ambiguity 
    >>> x = Sequence(name='chr1', seq='NMRATCGTA')
    >>> y = round(x.gc, 2)
    >>> y == 0.36
    True
    """
    trimSeq = re.sub(r'[^ACGTMRWSYKVHDBNacgtmrwsykvhdbn]', '', seq)
    # count all, including fractions of GC content
    seqCount = Counter(trimSeq)
    gc = seqCount['S'] + seqCount['C'] + seqCount['G']
    gc += 0.67 * (seqCount['B'] + seqCount['V'])
    gc += 0.5 * (seqCount['M'] + seqCount['R'] + seqCount['Y'] + seqCount['K'])
    gc += 0.33 * (seqCount['H'] + seqCount['D'])
    gc += 0.25 * (seqCount['N'])
    # return gc content
    return gc / len(trimSeq)


print(gc_strict(seq))

print(gc_iupac(seq))


0.3333333333333333
0.3611111111111111


In [3]:
'''
S	1
C	1
G	1

B	0.67
V	0.67

M	0.5
R	0.5
Y	0.5
K	0.5

H	0.33
D	0.33

N	0.25
'''

print("ACGTMRWSYKVHDBN".lower())

acgtmrwsykvhdbn
