In [7]:
import random
import toyplot

In [42]:
def get_kmer_count_from_sequence(sequence, k=4, cyclic=True):
    """
    Count every k-mer (window of length ``k``) found in ``sequence``.

    One window is started at each position of the sequence. Windows that
    run past the end are either completed by wrapping around to the start
    (``cyclic=True``) or discarded (``cyclic=False``).

    Parameters
    ----------
    sequence : str
        The sequence to scan.
    k : int
        Window length. Expected to be a positive integer; it is not
        validated here.
    cyclic : bool
        If True the sequence is treated as circular, so exactly
        ``len(sequence)`` windows are counted, each of length ``k`` even
        when ``k`` is larger than the sequence itself.

    Returns
    -------
    dict
        Maps each k-mer to the number of windows that produced it. Keys
        appear in order of first occurrence.
    """
    kmers = {}
    seqlen = len(sequence)

    for start in range(seqlen):
        kmer = sequence[start:start + k]
        missing = k - len(kmer)

        if missing:
            if not cyclic:
                # Windows only get shorter from here on, so no later start
                # position can yield a full-length k-mer either.
                break

            # Complete the window from the start of the sequence. The
            # sequence is repeated as often as needed so that the window
            # is still filled to length k when k > len(sequence), where a
            # single pass over the start would leave it short.
            repeats = -(-missing // seqlen)  # ceiling division
            kmer += (sequence * repeats)[:missing]

        kmers[kmer] = kmers.get(kmer, 0) + 1

    return kmers

In [43]:
def get_debruijn_edges_from_kmers(kmers):
    """
    Build the set of directed edges linking overlapping k-mers.

    Every k-mer is viewed as a step from its prefix (all but the last
    character) to its suffix (all but the first character). That step is
    emitted as an edge ``(prefix, suffix)`` only when some *other* k-mer
    in the collection starts with that suffix, i.e. when the k-mer has a
    distinct successor. A k-mer with no such successor contributes no
    edge, and a k-mer is never treated as its own successor.

    Parameters
    ----------
    kmers : iterable of str
        The k-mers to link; a dict keyed by k-mer works as well, since
        only the keys are iterated. Duplicates are ignored.

    Returns
    -------
    set of (str, str)
        The ``(prefix, suffix)`` pairs described above.
    """
    distinct_kmers = set(kmers)

    # Index the k-mers by prefix so the successors of a k-mer can be looked
    # up directly instead of comparing every pair of k-mers.
    kmers_by_prefix = {}
    for kmer in distinct_kmers:
        kmers_by_prefix.setdefault(kmer[:-1], set()).add(kmer)

    edges = set()
    for kmer in distinct_kmers:
        prefix, suffix = kmer[:-1], kmer[1:]
        successors = kmers_by_prefix.get(suffix, ())

        # A self-overlapping k-mer (e.g. "AAAA") shows up among its own
        # successors; it only counts when a different k-mer is there too.
        if any(other != kmer for other in successors):
            edges.add((prefix, suffix))

    return edges

In [44]:
def random_sequence(seqlen):
    """
    Return a random string of ``seqlen`` characters drawn from "ACGT".

    Each character is picked independently with ``random.choice`` on the
    module-level generator, so seeding ``random`` beforehand makes the
    result reproducible.
    """
    bases = []
    for _ in range(seqlen):
        bases.append(random.choice("ACGT"))
    return "".join(bases)

In [45]:
# Seed Python's global random generator so the sequence drawn below is the
# same on every run.
random.seed(123)

# Draw a 25-character random "ACGT" sequence to use as the example genome.
genome1 = random_sequence(25)
# Bare last expression: the notebook echoes the generated sequence.
genome1

'AGATGAATGGACCGGCCATATAAGT'

In [46]:
def plot_debruijn_graph(edges, width=1500, height=1600):
    """
    Draw a graph from a collection of (source, target) pairs with toyplot.

    Parameters
    ----------
    edges : collection of pairs
        Each item supplies the source node at index 0 and the target node
        at index 1.
    width : int
        Canvas width passed through to ``toyplot.graph``.
    height : int
        Canvas height passed through to ``toyplot.graph``.

    Returns
    -------
    Whatever ``toyplot.graph`` returns, unchanged.
    """
    # Split the pairs into the two parallel columns toyplot.graph takes.
    sources = [pair[0] for pair in edges]
    targets = [pair[1] for pair in edges]

    # Fruchterman-Reingold node placement with curved edges.
    graph_layout = toyplot.layout.FruchtermanReingold(
        edges=toyplot.layout.CurvedEdges(),
    )

    options = {
        "width": width,
        "height": height,
        # presumably draws an arrowhead showing edge direction; confirm
        # against the toyplot docs
        "tmarker": ">",
        "vsize": 25,
        # Unfilled nodes with a thin black outline, small labels, thin
        # black edges.
        "vstyle": {"stroke": "black", "stroke-width": 1, "fill": "none"},
        "vlstyle": {"font-size": "11px"},
        "estyle": {"stroke": "black", "stroke-width": 1},
        "layout": graph_layout,
    }

    return toyplot.graph(sources, targets, **options)

In [47]:
# Count the 4-mers of the example genome as a linear sequence: windows that
# would run past the end are dropped instead of wrapping around.
kmers = get_kmer_count_from_sequence(genome1, k=4, cyclic=False)

# Derive the directed edges between overlapping 4-mers.
edges = get_debruijn_edges_from_kmers(kmers)

# Draw the graph on a smaller canvas than the function's defaults; the
# trailing ";" keeps the returned value from being echoed as cell output.
plot_debruijn_graph(edges, width=600, height=400);

# Print the original sequence for comparison with the graph.
print("the true sequence: {}".format(genome1))

the true sequence: AGATGAATGGACCGGCCATATAAGT


In [48]:
# Recount the 4-mers with the default cyclic=True, so windows at the end of
# the sequence wrap around to its start. This rebinds `kmers`.
kmers = get_kmer_count_from_sequence(genome1, k=4)
# Bare last expression: the notebook echoes the k-mer counts.
kmers

{'AGAT': 1,
 'GATG': 1,
 'ATGA': 1,
 'TGAA': 1,
 'GAAT': 1,
 'AATG': 1,
 'ATGG': 1,
 'TGGA': 1,
 'GGAC': 1,
 'GACC': 1,
 'ACCG': 1,
 'CCGG': 1,
 'CGGC': 1,
 'GGCC': 1,
 'GCCA': 1,
 'CCAT': 1,
 'CATA': 1,
 'ATAT': 1,
 'TATA': 1,
 'ATAA': 1,
 'TAAG': 1,
 'AAGT': 1,
 'AGTA': 1,
 'GTAG': 1,
 'TAGA': 1}

In [49]:
# Rebuild the edge set from the current `kmers` and rebind `edges`.
edges = get_debruijn_edges_from_kmers(kmers)
# Bare last expression: the notebook echoes the edge set.
edges

{('AAG', 'AGT'),
 ('AAT', 'ATG'),
 ('ACC', 'CCG'),
 ('AGA', 'GAT'),
 ('AGT', 'GTA'),
 ('ATA', 'TAA'),
 ('ATA', 'TAT'),
 ('ATG', 'TGA'),
 ('ATG', 'TGG'),
 ('CAT', 'ATA'),
 ('CCA', 'CAT'),
 ('CCG', 'CGG'),
 ('CGG', 'GGC'),
 ('GAA', 'AAT'),
 ('GAC', 'ACC'),
 ('GAT', 'ATG'),
 ('GCC', 'CCA'),
 ('GGA', 'GAC'),
 ('GGC', 'GCC'),
 ('GTA', 'TAG'),
 ('TAA', 'AAG'),
 ('TAG', 'AGA'),
 ('TAT', 'ATA'),
 ('TGA', 'GAA'),
 ('TGG', 'GGA')}