  # Clump Finding

In [2]:
from genome_functions import loadData, MapFrequencies, numberToPattern

In [3]:
def ClumpFinding(genome, k, L, t):
    """Find the windows (clumps) of a genome that hold repeated k-length patterns.

    A clump is a window of L consecutive characters of ``genome``. Every
    window start index from 0 up to and including ``len(genome) - L`` is
    examined. A window is recorded when at least one key of the frequency
    map returned by ``MapFrequencies(window, k)`` has a count of ``t`` or
    more.

    Args:
        genome: Sequence to scan; it is measured with ``len`` and sliced.
        k: Pattern length, forwarded unchanged to ``MapFrequencies``.
        L: Window (clump) length.
        t: Minimum count (inclusive) a pattern needs inside one window.

    Returns:
        Dictionary mapping the start index of each qualifying window to the
        list of ``MapFrequencies`` keys whose count is >= t in that window.
        Windows without any such key are left out. If ``genome`` is shorter
        than ``L`` the dictionary is empty.
    """
    clumpDict = {}
    # "+ 1" so the final window, the one ending exactly at len(genome),
    # is examined as well; stopping one short would silently miss any clump
    # located at the very end of the genome.
    for w_begin in range(len(genome) - L + 1):
        clump = genome[w_begin:w_begin + L]
        freqs = MapFrequencies(clump, k)
        valid_patterns = [key for key, v in freqs.items() if v >= t]
        # only windows with at least one qualifying pattern are stored
        if valid_patterns:
            clumpDict[w_begin] = valid_patterns
    return clumpDict



In [4]:
# Validate clump finding on a small genome before running it on a large one.
genome = loadData('../data/v_cholerae_oric.txt')
print('Genome length: ', len(genome))
L = 100 # window (clump) length, in characters
t = 3 # occurrence threshold: ClumpFinding keeps patterns whose count is >= t
k = 6 # pattern length
# Result maps window start index -> list of pattern keys reaching the threshold.
patternCodeDict = ClumpFinding(genome, k, L, t)
print(patternCodeDict)

Genome length:  540
{372: [1918], 373: [1918], 374: [1918], 375: [1918], 376: [1918], 377: [1918], 378: [1918], 379: [1918], 380: [1918], 381: [1918], 382: [1918], 383: [1918], 384: [1918], 385: [1918], 386: [1918], 387: [1918], 388: [1918], 389: [1918], 390: [1918], 391: [1918], 392: [1918], 393: [1918], 394: [1918], 395: [1918], 429: [1918], 430: [1918], 431: [1918], 432: [1918], 433: [1918, 3636], 434: [1918, 3636], 435: [1918, 3636], 436: [1918, 3636], 437: [1918, 3636], 438: [3636], 439: [3636]}


In [5]:
# Turn every pattern code back into its pattern of length k.
def nbr2pattern_k(v):
    """Decode pattern code ``v`` with the notebook-level pattern length ``k``."""
    return numberToPattern(v, k)


# Same keys as patternCodeDict; each list of codes becomes a list of patterns.
patternDict = {
    window_start: [nbr2pattern_k(code) for code in codes]
    for window_start, codes in patternCodeDict.items()
}
print(patternDict)

{372: ['CTCTTG'], 373: ['CTCTTG'], 374: ['CTCTTG'], 375: ['CTCTTG'], 376: ['CTCTTG'], 377: ['CTCTTG'], 378: ['CTCTTG'], 379: ['CTCTTG'], 380: ['CTCTTG'], 381: ['CTCTTG'], 382: ['CTCTTG'], 383: ['CTCTTG'], 384: ['CTCTTG'], 385: ['CTCTTG'], 386: ['CTCTTG'], 387: ['CTCTTG'], 388: ['CTCTTG'], 389: ['CTCTTG'], 390: ['CTCTTG'], 391: ['CTCTTG'], 392: ['CTCTTG'], 393: ['CTCTTG'], 394: ['CTCTTG'], 395: ['CTCTTG'], 429: ['CTCTTG'], 430: ['CTCTTG'], 431: ['CTCTTG'], 432: ['CTCTTG'], 433: ['CTCTTG', 'TGATCA'], 434: ['CTCTTG', 'TGATCA'], 435: ['CTCTTG', 'TGATCA'], 436: ['CTCTTG', 'TGATCA'], 437: ['CTCTTG', 'TGATCA'], 438: ['TGATCA'], 439: ['TGATCA']}


 # Explore Bacteroides Fragilis Genome

In [6]:
# Load the large Bacteroides fragilis genome; this rebinds the notebook-level
# `genome` that the following cells operate on.
genome = loadData('../data/bacteroides_fragilis.txt')
# total length
print('Total length : ', len(genome))
# Print only the first 500 elements as a quick sanity check of the content.
print(genome[:500])


Total length :  5205140
TTATCAACACCTATGTTAACAAGAAAAGAATTACTTTTGCAACATACTAACAGAAACGACATCATCATGCGAAAATTGAAAATAACCGAGCTGAACCGGATAAGTATAGAAGAGTTTAAAGAAGCTGATAAATTGCCTTTAGTTGTAGTGTTGGACGATATACGGAGTTTGCATAATATCGGTTCTGTGTTTCGTACGGCAGATGCTTTCCGGATTGAATGTATTTATCTGTGTGGAATTACGGCTACTCCTCCCCATCCCGAGATGCATAAGACAGCTTTGGGAGCCGAGTTTACAGTGGATTGGAAGTATGTTAATAACGCAGTTGAAACGGTTGATAACCTCCGGAGTGAAGGATATGTGGTATACTCTGTCGAACAGGCGGAAGGGAGTATCATGTTGGATGAGTTAACACTGGACCGTTCGAAGAAATATGCTGTAGTTATGGGAAATGAAGTAAAAGGAGTGCAGCAGGAGGTTATTGACCATTCGGATGGTTG


In [7]:
# Apply clump finding to the genome loaded in the previous cell.
# NOTE: the recorded run of this cell ended in KeyboardInterrupt. ClumpFinding
# calls MapFrequencies once per window position, so on a genome of this
# length the scan takes a long time.
L = 500 # window (clump) length, in characters
t = 3 # occurrence threshold: ClumpFinding keeps patterns whose count is >= t
k = 9 # pattern length
# Result maps window start index -> list of pattern keys reaching the threshold.
patternCodeDict = ClumpFinding(genome, k, L, t)
print(patternCodeDict)


KeyboardInterrupt: 

In [None]:
# Turn every pattern code back into its pattern of length k.

# Helper fixed to the current notebook-level k.
def nbr2pattern_k(v):
    """Decode pattern code ``v`` with the notebook-level pattern length ``k``."""
    return numberToPattern(v, k)


# Same keys as patternCodeDict; each list of codes becomes a list of patterns.
patternDict = {
    window_start: [nbr2pattern_k(code) for code in codes]
    for window_start, codes in patternCodeDict.items()
}

print(patternDict)
