In [5]:
import gzip

def loadFasta(filename):
    """ Parses a classically formatted and possibly
        compressed FASTA file into two parallel lists:
        one of headers and one of sequences.
        The ith header corresponds to the ith sequence.

        filename: path of the FASTA file. A name ending in ".gz"
            is read through gzip, anything else as plain text.

        Returns a (headers, sequences) tuple. Each header is the text
        that follows '>' on a header line. Each sequence is the
        record's remaining lines joined together, with a leading "+"
        so that the first base sits at index 1."""
    # Both branches must be opened in text mode: gzip.open(..., 'r') is
    # binary and returns bytes, which cannot be split on the str '>'.
    opener = gzip.open if filename.endswith(".gz") else open
    # The with-block closes the file even when reading raises.
    with opener(filename, 'rt') as fp:
        # split at headers
        data = fp.read().split('>')
    # ignore whatever appears before the 1st header
    data.pop(0)
    headers = []
    sequences = []
    for record in data:
        lines = record.split('\n')
        # the first line of a record is its header, the rest is sequence
        headers.append(lines.pop(0))
        # add an extra "+" to make string "1-referenced"
        sequences.append('+' + ''.join(lines))
    return (headers, sequences)

In [10]:
# returns a dictionary with the counts of each unique kmer present in a given sequence
# there will be N-k+1 total kmers, where N = len(seq)-1 (-1 to account for the "+")
# there are always 4**k total possible kmers

def kmerCounts(seq, k):
    """Tally the k-mers of a "1-referenced" sequence.

    seq: sequence string whose index 0 is a placeholder (the "+"
         prefix); windows are taken starting at index 1 only.
    k:   window length.

    Returns a dict mapping each k-mer that occurs to its number of
    occurrences, in order of first appearance.
    """
    lastStart = len(seq) - k
    tally = {}
    for start in range(1, lastStart + 1):
        window = seq[start:start + k]
        if window in tally:
            tally[window] += 1
        else:
            tally[window] = 1
    return tally

In [15]:
# Demo: count the 2-mers of a short sequence. kmerCounts starts at index 1,
# so without a "+" prefix the leading "T" is skipped and "TA" is not counted.
kmerCounts("TAGACATGTCAAC", 2)

{'AG': 1,
 'GA': 1,
 'AC': 2,
 'CA': 2,
 'AT': 1,
 'TG': 1,
 'GT': 1,
 'TC': 1,
 'AA': 1}

In [21]:
def kmerTwice(k, string):
    """List the k-mers that occur exactly twice in a "1-referenced" string.

    k:      window length.
    string: sequence whose index 0 is a placeholder (the "+" prefix);
            windows are taken starting at index 1 only.

    Returns the qualifying k-mers in order of first appearance.
    """
    occurrences = {}
    for start in range(1, len(string) - k + 1):
        window = string[start:start + k]
        occurrences.setdefault(window, 0)
        occurrences[window] += 1

    # keep only the k-mers seen exactly two times
    return [window for window, seen in occurrences.items() if seen == 2]

In [22]:
# Demo: 2-mers occurring exactly twice. kmerTwice starts at index 1, so
# without a "+" prefix the leading "T" is skipped when forming windows.
kmerTwice(2, "TAGACATGTCAAC")

['AC', 'CA']

In [7]:
codon = {  # Maps a DNA triplet of nucleotides to a 1-letter amino acid abbreviation ('*' marks the stop codons TAA, TAG and TGA)
    "AAA": 'K', "AAG": 'K', "AAC": 'N', "AAT": 'N',
    "AGA": 'R', "AGG": 'R', "AGC": 'S', "AGT": 'S',
    "ACA": 'T', "ACG": 'T', "ACC": 'T', "ACT": 'T',
    "ATA": 'I', "ATG": 'M', "ATC": 'I', "ATT": 'I',
    "GAA": 'E', "GAG": 'E', "GAC": 'D', "GAT": 'D',
    "GGA": 'G', "GGG": 'G', "GGC": 'G', "GGT": 'G',
    "GCA": 'A', "GCG": 'A', "GCC": 'A', "GCT": 'A',
    "GTA": 'V', "GTG": 'V', "GTC": 'V', "GTT": 'V',
    "CAA": 'Q', "CAG": 'Q', "CAC": 'H', "CAT": 'H',
    "CGA": 'R', "CGG": 'R', "CGC": 'R', "CGT": 'R',
    "CCA": 'P', "CCG": 'P', "CCC": 'P', "CCT": 'P',
    "CTA": 'L', "CTG": 'L', "CTC": 'L', "CTT": 'L',
    "TAA": '*', "TAG": '*', "TAC": 'Y', "TAT": 'Y',
    "TGA": '*', "TGG": 'W', "TGC": 'C', "TGT": 'C',
    "TCA": 'S', "TCG": 'S', "TCC": 'S', "TCT": 'S',
    "TTA": 'L', "TTG": 'L', "TTC": 'F', "TTT": 'F'
}

# maps a DNA sequence to a string of 1-letter amino acid abbreviations

def dna_to_protein(seq):
    """Translate a DNA string into 1-letter amino acid codes.

    seq: string of uppercase A/C/G/T read in frame from index 0
         (no leading "+" placeholder).

    Returns one character per complete triplet, looked up in `codon`.
    One or two leftover bases at the end are ignored.

    Raises KeyError if a complete triplet is not a key of `codon`.
    """
    # Stop after the last complete triplet: a 1- or 2-base tail is not a
    # key of `codon` and would otherwise raise KeyError.
    usable = len(seq) - len(seq) % 3
    return ''.join([codon[seq[i:i+3]] for i in range(0, usable, 3)])

In [11]:
# returns the reverse complement of a given DNA sequence

def revComp(dnaSeq):
    """Return the reverse complement of a DNA sequence.

    dnaSeq: sequence of uppercase 'A', 'C', 'G', 'T' symbols.

    Returns a string holding the complement of each base, read from
    the last base to the first.

    Raises KeyError for any symbol other than A, C, G or T.
    """
    # Build the lookup table once per call instead of once per base.
    complement = {'A':'T','C':'G','G':'C','T':'A'}
    return ''.join([complement[base] for base in reversed(dnaSeq)])