## Développement d’une solution de mapping de données de séquençage à haut-débit sur un génome de référence

#### Mise en place d'un algorithme permettant de chercher un mot de longueur fixe dans un texte

#### Plan du notebook:
1/ Implémentation d'un algorithme "Difference Cover size 3" pour construire une table de suffixe à partir d'un génome en un temps linéaire

### DC3
#### Première étape de DC3: division des positions de notre génome, tri des positions

In [1]:
import time
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns



In [2]:
def asciiDC3 (seq) : 
    """
    Create sequence of ascii equivalent of each element of given list parameter.
    Add three sentinel numbers necessary for DC3 algorithm.
    
    Args:
        seq (list of str): list of elements to transform
    
    Return:
        (list of int)
    """
    asc=[]
    for i in seq : 
        asc.append(ord(i))
    
    return asc+[0,0,0]

def position12_radix_with_p12 (asc):
    """
    Create sequence of ascii equivalent of each element of given list parameter.
    Add three sentinel numbers necessary for DC3 algorithm.
    
    Args:
        seq (list of str): list of elements to transform
    
    Return:
        (list of int)
    """
    p12=[]
    r12=[]
    for k in range(len(asc)-2):
        #attention on a peut etre fait de la merde ici, pas sure du -2
        if k%3!=0 :
            p12.append(k)
            r12.append([[asc[k],asc[k+1], asc[k+2]],k])
    # dans le cas où T a un nombre d'élément multiple de 3, r12 ne contiendra pas le triplet [0,0,0].
    # Or, la fonction, "removesentinel" retire 1 élément notre suffix table, pour justement se débarrasser de [0,0,0]
    # J'ai donc décider de rajouter artificiellement [0,0,0] dans les cas où il n'est pas déjà là.
    if len(asc)%3==0:
        k=len(asc)-1-2
        p12.append(k)
        r12.append([[asc[k],asc[k+1], asc[k+2]],k])
        
    return p12,r12

def radix_with_p12(p,t):
    r=[]
    for i in range(len(p)):
        index=p[i]
        r.append([[t[index],t[index+1], t[index+2]],index])
    return r

In [3]:
def sort_with_p12(array, alphabet, columnNumber):
        """
    Sorts parameter "array" using Radix Sort
    
    Args:
        array (list of list): list of triplets and their positions.
        
        alphabet (dictionnary): alphabet of our array 
        
        columnNumber (int): number of last column of the lists in our paramater "array". 
                                    for r12, columnNumber=2
                                    for r0, columnNumber=1
    
    Return:
        array (list of list) : sorted list of triplets and their positions

    """
    #print("array")
    #print(array)
    #print("alphabet")
    #print(alphabet)
    if len(array) == 0:
        return array

  # Perform counting sort on each column, starting at the last

    column = columnNumber
    while column>=0: # nous on met "tant que l'indice est supérieur à 3"
        array = countingSortByDigit_with_p12(array, alphabet, column)
        column-=1 #change de colonne dans ton tableau

    return array

def countingSortByDigit_with_p12(array, alphabet, column):
    """
    ici possibleNumbers= nombre de chiffres possibles. Nous n'avons que 4 nombres (0,97,98,99)
    """

    countIndex = -1
    count = [0] * len(alphabet)
    output = [None] * len(array)

  # Count frequencies
    for i in range(0, len(array)):
        #print("i "+str(i))
        #print("array[i] "+ str(array[i]))
        #print("column "+str(column))
        
        countIndex = alphabet[array[i][0][column]]
        count[countIndex] += 1

  # Compute cumulates
    for i in range(1, len(alphabet)):
        count[i] += count[i - 1]

  # Move records
    for i in range(len(array) - 1, -1, -1):
        countIndex = alphabet[array[i][0][column]]
        count[countIndex] -= 1
        output[count[countIndex]] = array[i]
       
    return output

In [4]:
def triplets_are_equal(a,b):
    for i in range(len(a)):
        if a[i]!=b[i]:
            return False
    return True


def ordre_with_p12(R12p12sorted, use_index_12=False):
    # nous renovie la liste order du genre (1,2,2,3,4,4,5)  et un booléen indiquant s'il y a répétition
    index=1
    repetition=False
    order=[1]
    
    if use_index_12:
        indexdict={R12p12sorted[0][1]:1}
        
    for i in range(1, len(R12p12sorted)): 
        if triplets_are_equal(R12p12sorted[i-1][0], R12p12sorted[i][0]):
            order.append(index)
            repetition=True
        else : 
            index+=1
            order.append(index)
            
        if use_index_12:
            indexdict[R12p12sorted[i][1]]=order[i]
    if use_index_12:
        return order, repetition, indexdict
    else: 
        return order
            

In [5]:
def alphabetT(T):
        """
    Returns a dictionnary with the order of each "letter" constituting the parameter T.
    
    Example: 
        alphabetT([4,9,14,67])={4:0, 9:1, 14:2, 67:3}
    
    Args:
        T (list of int): the sequence we want a suffix array from
    
    Return:
        dic (dictionnary) : order of each "letter" constituting the parameter T

    """
    dic={}
    a=[]
    for i in range(len(T)):
        a.append(T[i])
    a.sort() 
    
    for element in a:
        if not (element in dic):
            dic[element]=len(dic)
    return dic

In [6]:
def alphabetR0_with_p12(R0p0):
        """
    Returns a dictionnary with the order of each "letter" constituting the parameter R0p0.
    Similar to the function alphabetT, coded specifically for the output of function position0_R0_p0
    
    Example: 
        alphabetR0_with_p12([[[65, 14], 0], [[67, 11], 3]]) = {11: 0, 14: 1, 65: 2, 67: 3}
    
    Args:
        R0p0 (list of list): list of couples and their position in the sequence we want a suffix array from.
                             output of function position0_R0_p0
                             [ [couple], position multiple of 3]
    
    Return:
        dic (dictionnary) : order of each "letter" constituting the parameter R0p0
    """

    dic={}
    a=[]
    for column in range(2):
        for i in range(len(R0p0)):
            a.append(R0p0[i][0][column])
    a.sort() 
    
    for element in a:
        if not (element in dic):
            dic[element]=len(dic)
    return dic

In [7]:
def Tprime_with_p12(p12, indexdict):
    t=[]
    for p in p12:
        t.append(indexdict[p])
    return t

In [8]:
def position0_R0_p0(T, index12dict):
    #attention T se finit avec trois 0
    #print("T")
    #print(T)
    position=[]
    R=[]
    for i in range(len(T)-3): #on ne prend pas en compte les trois 0 sentinelle
        if i%3==0:
            position.append(i)
            if i+1<len(T)-3:
                #print("i")
                #print(i)
                #print("T[i+1]")
                #print(T[i+1])
                R.append([[T[i],index12dict[i+1]],position[-1]])
            else:
                R.append([[T[i],1],position[-1]]) # car les sentinelles sont forcément les 1ères dans l'ordre alphabétique
    return position, R

In [9]:
def merge_with_p12(Tfinal, r0p0sorted, index12dict) : 
    index12dictkeys=list(index12dict.keys())
    liste=[]
    A=0
    B=0
    while A<len(r0p0sorted) and B<len(index12dictkeys):
        a=r0p0sorted[A][1]
        b=index12dictkeys[B]
        if Tfinal[a]!=Tfinal[b] :
            minimum=min(Tfinal[a], Tfinal[b])
            
            if minimum == Tfinal[a]:
                A+=1
                liste.append(a)
            else: 
                B+=1
                liste.append(b)

        else :
            if b%3==1 : 
                longueur=len(liste)
                i=0
                if index12dict[a+1]<index12dict[b+1]:
                    liste.append(a)
                
                    A+=1
                else:
                    liste.append(b)
                
                    B+=1
                    
                    
            elif b%3==2 :
            
                if Tfinal[a+1]!=Tfinal[b+1] :
                   
                    minimum=min(Tfinal[a+1], Tfinal[b+1])
                    if minimum == Tfinal[a+1]:
                        A+=1
                    
                        liste.append(a)
                    else: 
                        B+=1
                        
                        liste.append(b)

                else:
                  
                    if index12dict[a+2]<index12dict[b+2]:
                        liste.append(a)
                        A+=1
                    else:
                        liste.append(b)
                        B+=1
                        

    if A==len(r0p0sorted):

        for i in range(B,len(index12dictkeys)):
            liste.append(index12dictkeys[i])
                
    if B==len(index12dictkeys):

        for i in range(A, len(r0p0sorted)):
            liste.append(r0p0sorted[i][1])

    return liste

In [10]:
def merge_with_p12_debugg(Tfinal, r0p0sorted, index12dict) : 
    print("rappel des paramètres entrés pour la fonction merge")
    print("Tfinal")
    print(Tfinal)
    print("r0p0sorted")
    print(r0p0sorted)
    print("index12dict")
    print(index12dict)
    print("on construit une liste des clés du dictionnaire")
    print("qui est dans l'ordre normalement ahhh!")
    index12dictkeys=list(index12dict.keys())
    liste=[]
    A=0
    B=0
    while A<len(r0p0sorted) and B<len(index12dictkeys):
        print("rentre dans le while")
        a=r0p0sorted[A][1]
        b=index12dictkeys[B]
        if Tfinal[a]!=Tfinal[b] :
            minimum=min(Tfinal[a], Tfinal[b])
            
            if minimum == Tfinal[a]:
                print("a= "+str(a)+", b= "+str(b)+", on append le "+str(a))
                A+=1
                liste.append(a)
            else: 
                print("a= "+str(a)+", b= "+str(b)+", on append le "+str(b))
                B+=1
                liste.append(b)

        else :
            print("Tfinal[a]")
            print(Tfinal[a])
            print("Tfinal[b]")
            print(Tfinal[b])
            print("Tfinal[a]==Tfinal[b]")
            print(Tfinal[a]==Tfinal[b])
            if b%3==1 : 
                print(str(b)+" est congru à 1 modulo 3")
                longueur=len(liste)
                print("l'objet liste est pour l'instant constitué de: ")
                print(liste)
                i=0
                print("on compare les positions de "+str(a+1)+" et "+str(b+1))
                print("index12dict[" +str(a+1)+"]= ")
                print(index12dict[a+1])
                print("index12dict[" +str(b+1)+"]= ")
                print(index12dict[b+1])
                if index12dict[a+1]<index12dict[b+1]:
                    liste.append(a)
                    print("on append "+ str(a))
                    A+=1
                else:
                    liste.append(b)
                    print("on append "+ str(b))
                    B+=1
                    
                    
            elif b%3==2 :
                print(str(b)+"est congru à 2 modulo 3")
                if Tfinal[a+1]!=Tfinal[b+1] :
                    print("Tfinal[a+1]")
                    print(Tfinal[a+1])
                    print("Tfinal[b+1]")
                    print(Tfinal[b+1])
                    print("Tfinal[a+1]!=Tfinal[b+1]")
                    print(Tfinal[a+1]!=Tfinal[b+1])
                    minimum=min(Tfinal[a+1], Tfinal[b+1])
                    if minimum == Tfinal[a+1]:
                        A+=1
                        print("on append "+ str(a))
                        liste.append(a)
                    else: 
                        B+=1
                        print("on append "+ str(b))
                        liste.append(b)

                else:
                    print("Tfinal[a+1]")
                    print(Tfinal[a+1])
                    print("Tfinal[b+1]")
                    print(Tfinal[b+1])
                    print("Tfinal[a+1]==Tfinal[b+1]")
                    print(Tfinal[a+1]==Tfinal[b+1])
                    print("on compare a+2 et b+2")
                    if index12dict[a+2]<index12dict[b+2]:
                        liste.append(a)
                        A+=1
                    else:
                        liste.append(b)
                        B+=1
                        
    print("plus dans la boucle while")
    print("A "+str(A))
    print("B "+str(B))
    if A==len(r0p0sorted):
        print("on concatène les index12")
        for i in range(B,len(index12dictkeys)):
            liste.append(index12dictkeys[i])
                
    if B==len(index12dictkeys):
        print("on concatène les index0")
        for i in range(A, len(r0p0sorted)):
            liste.append(r0p0sorted[i][1])

    return liste

In [11]:
def removesentinel(index):
    return index[1:]

In [12]:
def resumeHigherOrder_with_p12(index012prime, P12):
    output={}
    for i in range(len(index012prime)):
        output[P12[index012prime[i]]]=i
        
    return output

### test zone avec test 2

In [13]:
test_2='ATGCTAGCTGCCCTGATCTCTCTGA!'
p12_test_2=[1, 4, 7, 10, 13, 16, 19, 22, 25, 2, 5, 8, 11, 14, 17, 20, 23, 26]
r12_test_2=[[84, 71, 67], [84, 65, 71], [67, 84, 71], [67, 67, 67], [84, 71, 65], [84, 67, 84], [67, 84, 67], [84, 71, 65], [33, 0, 0], [71, 67, 84], [65, 71, 67], [84, 71, 67], [67, 67, 84], [71, 65, 84], [67, 84, 67], [84, 67, 84], [71, 65, 33], [0, 0, 0]]
alphabet_test_2={0: 0, 33: 1, 65: 2, 67: 3, 71: 4, 84: 5}


p12,r12_p12_test_2=position12_radix_with_p12(asciiDC3(test_2))
print(r12_p12_test_2)



[[[84, 71, 67], 1], [[71, 67, 84], 2], [[84, 65, 71], 4], [[65, 71, 67], 5], [[67, 84, 71], 7], [[84, 71, 67], 8], [[67, 67, 67], 10], [[67, 67, 84], 11], [[84, 71, 65], 13], [[71, 65, 84], 14], [[84, 67, 84], 16], [[67, 84, 67], 17], [[67, 84, 67], 19], [[84, 67, 84], 20], [[84, 71, 65], 22], [[71, 65, 33], 23], [[33, 0, 0], 25], [[0, 0, 0], 26]]


In [14]:
r12_p12_sorted=sort_with_p12(r12_p12_test_2, alphabet_test_2, 2)
print(r12_p12_sorted)

[[[0, 0, 0], 26], [[33, 0, 0], 25], [[65, 71, 67], 5], [[67, 67, 67], 10], [[67, 67, 84], 11], [[67, 84, 67], 17], [[67, 84, 67], 19], [[67, 84, 71], 7], [[71, 65, 33], 23], [[71, 65, 84], 14], [[71, 67, 84], 2], [[84, 65, 71], 4], [[84, 67, 84], 16], [[84, 67, 84], 20], [[84, 71, 65], 13], [[84, 71, 65], 22], [[84, 71, 67], 1], [[84, 71, 67], 8]]


In [15]:
print(r12_p12_sorted[5][0])
print(r12_p12_sorted[6][0])
print(triplets_are_equal(r12_p12_sorted[5][0],r12_p12_sorted[6][0]))
ordre_test_2, repetition, indexdict_test_2=ordre_with_p12(r12_p12_sorted, True)
print(ordre_test_2)
print(indexdict_test_2)

[67, 84, 67]
[67, 84, 67]
True
[1, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 12, 12, 13, 13, 14, 14]
{26: 1, 25: 2, 5: 3, 10: 4, 11: 5, 17: 6, 19: 6, 7: 7, 23: 8, 14: 9, 2: 10, 4: 11, 16: 12, 20: 12, 13: 13, 22: 13, 1: 14, 8: 14}


In [16]:
p0, R0_p0_test_2=position0_R0_p0(asciiDC3(test_2), indexdict_test_2)

print(p0)
print(R0_p0_test_2)

alphabet_r0_test_2=alphabetR0_with_p12(R0_p0_test_2)
print(alphabet_r0_test_2)
    
r0p0sorted=sort_with_p12(R0_p0_test_2,alphabet_r0_test_2 ,1)
print(r0p0sorted)

order0=ordre_with_p12(r0p0sorted)
    

[0, 3, 6, 9, 12, 15, 18, 21, 24]
[[[65, 14], 0], [[67, 11], 3], [[71, 7], 6], [[71, 4], 9], [[67, 13], 12], [[65, 12], 15], [[84, 6], 18], [[67, 13], 21], [[65, 2], 24]]
{2: 0, 4: 1, 6: 2, 7: 3, 11: 4, 12: 5, 13: 6, 14: 7, 65: 8, 67: 9, 71: 10, 84: 11}
[[[65, 2], 24], [[65, 12], 15], [[65, 14], 0], [[67, 11], 3], [[67, 13], 12], [[67, 13], 21], [[71, 4], 9], [[71, 7], 6], [[84, 6], 18]]


### test zone avec abcabcabc

In [17]:
S="abcabcacab"
T=asciiDC3(S)
columnnumber=2
print(T)


[97, 98, 99, 97, 98, 99, 97, 99, 97, 98, 0, 0, 0]


In [18]:
p12,R12=position12_radix_with_p12(T)
print(p12)
print(R12)
alph=alphabetT(T)
alph
Rs12= sort_with_p12(R12, alph, columnnumber)
print(Rs12)

use_index12=True
order12, boolean, index12dict =ordre_with_p12(Rs12,use_index12)
print(order12)
print(boolean)
print(index12dict)

tprime=Tprime_with_p12(p12, index12dict)
print(tprime)

tprime=tprime+[0,0,0]
tprime

[1, 2, 4, 5, 7, 8, 10]
[[[98, 99, 97], 1], [[99, 97, 98], 2], [[98, 99, 97], 4], [[99, 97, 99], 5], [[99, 97, 98], 7], [[97, 98, 0], 8], [[0, 0, 0], 10]]
[[[0, 0, 0], 10], [[97, 98, 0], 8], [[98, 99, 97], 1], [[98, 99, 97], 4], [[99, 97, 98], 2], [[99, 97, 98], 7], [[99, 97, 99], 5]]
[1, 2, 3, 3, 4, 4, 5]
True
{10: 1, 8: 2, 1: 3, 4: 3, 2: 4, 7: 4, 5: 5}
[3, 4, 3, 5, 4, 2, 1]


[3, 4, 3, 5, 4, 2, 1, 0, 0, 0]

In [19]:
p12prime,R12prime=position12_radix_with_p12(tprime)
print(p12prime)
print(R12prime)
alphprime=alphabetT(tprime)
print(alphprime)
Rs12prime= sort_with_p12(R12prime, alphprime, columnnumber)
print(Rs12prime)

order12prime, boolean, index12primedict =ordre_with_p12(Rs12prime, True)
print(order12prime)
print(boolean)
print(index12primedict)

[1, 2, 4, 5, 7]
[[[4, 3, 5], 1], [[3, 5, 4], 2], [[4, 2, 1], 4], [[2, 1, 0], 5], [[0, 0, 0], 7]]
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
[[[0, 0, 0], 7], [[2, 1, 0], 5], [[3, 5, 4], 2], [[4, 2, 1], 4], [[4, 3, 5], 1]]
[1, 2, 3, 4, 5]
False
{7: 1, 5: 2, 2: 3, 4: 4, 1: 5}


In [20]:
p0prime, r0prime=position0_R0_p0(tprime, index12primedict)
print(p0prime)
print(r0prime)

[0, 3, 6]
[[[3, 5], 0], [[5, 4], 3], [[1, 1], 6]]


In [21]:
alph0prime=alphabetR0_with_p12(r0prime)
print(alph0prime)
r0sprime= sort_with_p12(r0prime, alph0prime, columnnumber-1)
print(r0sprime)

index120prime=merge_with_p12(tprime, r0sprime, index12primedict)
index120prime

{1: 0, 3: 1, 4: 2, 5: 3}
[[[1, 1], 6], [[3, 5], 0], [[5, 4], 3]]


[7, 6, 5, 0, 2, 4, 1, 3]

# Assemblage final

In [22]:
def almost_dc3_with_p12(T):
    # T est une liste de int

    
    columnNumber=2
    p12,r12=position12_radix_with_p12(T)
    alphabet_T=alphabetT(T)
    r12sorted=sort_with_p12(r12, alphabet_T, columnNumber)    
    order12,repetition, index12dict=ordre_with_p12(r12sorted, True)
    if repetition:
        Tprim=Tprime_with_p12(p12, index12dict)+[0,0,0]
        index012=almost_dc3_with_p12(Tprim)
        index12dict=resumeHigherOrder_with_p12(index012, p12)
    p0,r0p0=position0_R0_p0(T, index12dict)
    alphabet_r0=alphabetR0_with_p12(r0p0)
    r0sorted=sort_with_p12(r0p0,alphabet_r0 ,columnNumber-1)
    index012=removesentinel(merge_with_p12(T, r0sorted, index12dict))
    
    return index012
   
    
    

In [23]:
def almost_dc3_with_p12_debugg(T):
    # T est une liste de int

    
    columnNumber=2
    p12,r12=position12_radix_with_p12(T)
    
    
    print("p12")
    print(p12)
    print("taille de p12 avant récursion")
    print(len(p12))
    print("\n")
    
    
    print("r12")
    print(r12)
    print("\n")

    alphabet_T=alphabetT(T)
    print("alphabet de T")
    print(alphabet_T)
    
    r12sorted=sort_with_p12(r12, alphabet_T, columnNumber)
    print("r12sorted")
    print(r12sorted)
    print("\n")
    
    order12,repetition, index12dict=ordre_with_p12(r12sorted, True)
    print("order12")
    print(order12)
    print("index12dict")
    print(index12dict)
    print("entre-t-on dans la boucle de répétition?")
    print(repetition)
    print("\n")
    

    if repetition:

        
        Tprim=Tprime_with_p12(p12, index12dict)+[0,0,0]
        print("tprime")
        print(Tprim)
        print("taille de tprime")
        print(len(Tprim))
        print("\n")
        
        index012=almost_dc3_with_p12_debugg(Tprim)
        print("index012")
        print(index012)
        print("taille de index012")
        print(len(index012))
        print("\n")

        index12dict=resumeHigherOrder_with_p12(index012, p12)
        print("resume higher order index12dict")
        print(index12dict)
        print("taille de index12dict après retour à taille initiale")
        print(len(index12dict))
        print("\n")

    p0,r0p0=position0_R0_p0(T, index12dict)
    print("rappel de T")
    print(T)
    print("r0p0")
    print(r0p0)
    print("taille de r0p0")
    print(len(r0p0))
    print("\n")

    alphabet_r0=alphabetR0_with_p12(r0p0)
    print("alphabet_r0")
    print(alphabet_r0)
    print("\n")
    
    r0sorted=sort_with_p12(r0p0,alphabet_r0 ,columnNumber-1)
    print("r0sorted")
    print(r0sorted)
    print("taille de r0sorted")
    print(len(r0sorted))
    print("\n")

    #order0=ordre(r0sorted, index12)
    
    
    index012=removesentinel(merge_with_p12(T, r0sorted, index12dict))
    print("taille après la fonction removesentinel")
    print(len(index012))
    print("rappel de T")
    print(T)
    print("index012")
    print(index012)
    print("\n")
    
    for i in range(len(index012)):
        print(T[index012[i]:])
    
    return index012
   
    
    

## Test avec biopython 


In [24]:
pip install biopython

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [25]:
from Bio import SeqIO
genome=[]
nucleotide_genome=0
for seq_record in SeqIO.parse("/home/azarkua/Documents/2023-2024/omiques2/developement/omique2/genome.fna", "fasta"):
    genome.append(seq_record)
    nucleotide_genome+=len(seq_record.seq)
print(genome[13:])

[SeqRecord(seq=Seq('tgaaccctaaaccctaaaccctaaaccctaaacccctaaaccctaaaccctgaa...gtt'), id='NC_037283.1', name='NC_037283.1', description='NC_037283.1 Plasmodium falciparum 3D7 genome assembly, chromosome: 14', dbxrefs=[]), SeqRecord(seq=Seq('ATGATAAAATTTTTAAAACCTAAAATAAAAATATTAAAAAAATTAAATATACCT...ATT'), id='NC_036769.1', name='NC_036769.1', description='NC_036769.1 Plasmodium falciparum strain 3D7, apicoplast', dbxrefs=[])]


In [26]:
reads=[]
nucleotide_reads=0
for seq_record in SeqIO.parse("/home/azarkua/Documents/2023-2024/omiques2/developement/omique2/reads.fq", "fastq"):
    reads.append(seq_record)
    nucleotide_reads+=len(seq_record.seq)
    #if len(seq_record.seq)!= 100:
    #    print("False")
print(reads[:1])

[SeqRecord(seq=Seq('TTTCCTTTTTAAGCGTTTTATTTTTTAATAAAAAAAATATAGTATTATATAGTA...TAA'), id='NC_004325.2-100000', name='NC_004325.2-100000', description='NC_004325.2-100000', dbxrefs=[])]


In [27]:
temp=genome[1].seq.upper()
print(len(temp)//4)
temp[50:60]

236775


Seq('CCCTGAACCC')

Pour l'instant, notre code met environ 2min d'attente pour faire la suffix table de 25 000 nucléotides.

In [28]:
T=asciiDC3(temp)
#print(T)

In [29]:
suffix_array=almost_dc3_with_p12(T)
print(suffix_array[:100])
#print(timemeasuring)

[947101, 207415, 207416, 441107, 238221, 388233, 441108, 207417, 238222, 207423, 656253, 388234, 724613, 168552, 468510, 724614, 468513, 724617, 434619, 533169, 537111, 441109, 576520, 633567, 207420, 633566, 207419, 207418, 434614, 434615, 633569, 157883, 238223, 207422, 434616, 576522, 633570, 633573, 458613, 654534, 157884, 238224, 369057, 684126, 747225, 437319, 157302, 128385, 399897, 207424, 481497, 656254, 422508, 656256, 422510, 385979, 388235, 481500, 771120, 633582, 751761, 68823, 548505, 422511, 385980, 388236, 168553, 724611, 724610, 468511, 724615, 468512, 468514, 533174, 533175, 724618, 468516, 574871, 388242, 168555, 656262, 74142, 458622, 168561, 574872, 434620, 533170, 537112, 724620, 468518, 699572, 399892, 399893, 247439, 441110, 576521, 633568, 207421, 422507, 434617]


## Mesurer le temps que prend notre fonction

In [30]:
import pstats
import cProfile

In [31]:
cProfile.run("almost_dc3_with_p12(T)", "dc3_stats")
p = pstats.Stats("dc3_stats")
p.sort_stats("cumulative").print_stats()

Sun Nov  5 12:55:47 2023    dc3_stats

         29865615 function calls (29865608 primitive calls) in 26.240 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   26.240   26.240 {built-in method builtins.exec}
        1    0.441    0.441   26.240   26.240 <string>:1(<module>)
      8/1    0.984    0.123   25.799   25.799 /tmp/ipykernel_5847/3235192990.py:1(almost_dc3_with_p12)
       16    0.128    0.008    7.102    0.444 /tmp/ipykernel_5847/2101628537.py:1(sort_with_p12)
       40    6.974    0.174    6.974    0.174 /tmp/ipykernel_5847/2101628537.py:18(countingSortByDigit_with_p12)
        8    3.609    0.451    4.217    0.527 /tmp/ipykernel_5847/4110635339.py:1(merge_with_p12)
        8    3.738    0.467    3.911    0.489 /tmp/ipykernel_5847/738299132.py:18(position12_radix_with_p12)
        8    3.391    0.424    3.531    0.441 /tmp/ipykernel_5847/3786488779.py:1(position0_R0_p0)
        8    1

<pstats.Stats at 0x7f329159f820>

### Idées d'optimisation  
-réussir à ne plus faire appel aux fonctions "alphabet"  

### BWT à l'aide de notre suffix table

In [32]:
def BWT_suffix_table(T,end_of_string=False):
    """
    Compute the BWT from the suffix table
    
    Args:
        T (str): string
        end_of_string (char): end of string character to append
    
    Return:
        bwt (str): BWT
    """
    if end_of_string==False:
        T += '!'
    suffix_array=almost_dc3_with_p12(asciiDC3(T)) 
    bwt = ""
    for i in suffix_array:
        bwt += T[i-1]
    return(bwt)

In [33]:
def BWT(suffix_array, sequence):
    #the sequence needs an end of string
    bwt = ""
    for i in suffix_array:
        bwt += sequence[i-1]
    return bwt

In [34]:
def run_length_encoding(S):
    """
    Encode sequence using the Run Length method
    
    Args:
        text (str): string to be shifted
    
    Return:
        str: run length
    """
    encoded_S= ""
    i=0
    number=1
    while i<len(S):
        encoded_S+=S[i]
        i+=1
        while i<len(S) and S[i-1]==S[i]:
            number+=1
            i+=1
        if number>1:
            encoded_S+=str(number)
        number=1
    return encoded_S

In [35]:
def print_suffix_table(sequence, sf, visualize_bwt=False):
    for i in range(len(sf)):
        if visualize_bwt:
            print(sequence[sf[i]-1:])
        else:
            print(sequence[sf[i]:])
    return

#### Testons notre fonction

In [36]:
test_2='ATGCTAGCTGCCCTGATCTCTCTGA!'
suffix_array_2_with_p12=almost_dc3_with_p12(asciiDC3(test_2)) 
print(suffix_array_2_with_p12)


[25, 24, 5, 15, 0, 10, 11, 3, 17, 19, 21, 12, 7, 23, 14, 9, 2, 6, 4, 16, 18, 20, 22, 13, 8, 1]


In [37]:
print_suffix_table(test_2,suffix_array_2_with_p12)

!
A!
AGCTGCCCTGATCTCTCTGA!
ATCTCTCTGA!
ATGCTAGCTGCCCTGATCTCTCTGA!
CCCTGATCTCTCTGA!
CCTGATCTCTCTGA!
CTAGCTGCCCTGATCTCTCTGA!
CTCTCTGA!
CTCTGA!
CTGA!
CTGATCTCTCTGA!
CTGCCCTGATCTCTCTGA!
GA!
GATCTCTCTGA!
GCCCTGATCTCTCTGA!
GCTAGCTGCCCTGATCTCTCTGA!
GCTGCCCTGATCTCTCTGA!
TAGCTGCCCTGATCTCTCTGA!
TCTCTCTGA!
TCTCTGA!
TCTGA!
TGA!
TGATCTCTCTGA!
TGCCCTGATCTCTCTGA!
TGCTAGCTGCCCTGATCTCTCTGA!


In [38]:
print(BWT_suffix_table(test_2, True))

AGTG!GCGTTTCGTTTTACACCCCCA


In [39]:
print(run_length_encoding(BWT_suffix_table(test_2, True)))

AGTG!GCGT3CGT4ACAC5A


BWT sur notre génome

In [40]:
bwt_T=BWT_suffix_table(genome[1].seq.upper())
print(run_length_encoding(bwt_T)[:100])

ACTA2T2A7GTA5TACA2TA2TA9GA5TACA3CA4TA4TA7CA21CTAGA4TA2TA2CA7T2CA7TA4TA4GA4CA4C2A8GA4GA4CGA2GA17GATA6


Mesurons le temps que l'opération prend

In [41]:
cProfile.run("run_length_encoding(BWT_suffix_table(genome[1].seq.upper()))", "bwt_stats")
p = pstats.Stats("bwt_stats")
p.sort_stats("cumulative").print_stats()

Sun Nov  5 12:56:35 2023    bwt_stats

         39896150 function calls (39896143 primitive calls) in 28.699 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   28.699   28.699 {built-in method builtins.exec}
        1    0.035    0.035   28.699   28.699 <string>:1(<module>)
        1    0.780    0.780   28.196   28.196 /tmp/ipykernel_5847/2127043640.py:1(BWT_suffix_table)
      8/1    1.006    0.126   25.779   25.779 /tmp/ipykernel_5847/3235192990.py:1(almost_dc3_with_p12)
       16    0.131    0.008    7.161    0.448 /tmp/ipykernel_5847/2101628537.py:1(sort_with_p12)
       40    7.029    0.176    7.030    0.176 /tmp/ipykernel_5847/2101628537.py:18(countingSortByDigit_with_p12)
        8    3.625    0.453    4.258    0.532 /tmp/ipykernel_5847/4110635339.py:1(merge_with_p12)
        8    3.552    0.444    3.704    0.463 /tmp/ipykernel_5847/3786488779.py:1(position0_R0_p0)
        8    3.439    0

<pstats.Stats at 0x7f322b402e60>

## String search with BWT

version du prof

In [42]:
from collections import Counter

In [43]:
def occurrence_indexer(S):
    K = []
    last_index = {}
    for s in S:
        if s not in last_index: 
            last_index[s] = 0
        K.append(last_index[s])
        last_index[s] += 1
    return(K)

def initialize_suffix_table_positions(counts, pattern_letter):
    start=sum([counts[char] for char in counts if char < pattern_letter])
    end=start+counts[pattern_letter]
    return [i for i in range(start, end)]


In [44]:
def lettre_et_occurence(BWT,pattern_letter, position_in_BWT, occurence_index,counts):
    output=[]
    for i in range(len(position_in_BWT)):
        if BWT[position_in_BWT[i]]==pattern_letter:
            L=[BWT[position_in_BWT[i]],occurence_index[position_in_BWT[i]]]
            letter=L[0]
            occurence=L[1]
            new_position_to_evaluate=occurence + sum([counts[char] for char in counts if char < letter])
            output.append(new_position_to_evaluate)
    return output

In [45]:
def find_patterns_in_sequence(sequence, pattern, sequence_has_end_character=False):
    output=[]
    
    if sequence_has_end_character==False:
        sequence += '!'
        
    sf=almost_dc3_with_p12(asciiDC3(sequence)) 
    BWT = ""
    for i in sf:
        BWT += sequence[i-1]
    
    
    
    #print("BWT")
    #print(BWT)
    #print("\n")
    occurence_index = occurrence_indexer(BWT)
    #print('occurence_index')
    #print(occurence_index)
    #print("\n")
    counts = Counter(BWT)
    #print("counts")
    #print(counts)
    #print("\n")
    index=len(pattern)-1
    pattern_letter=pattern[index]
    
    suffix_table_positions=initialize_suffix_table_positions(counts, pattern_letter)
    #print("suffix_table_positions")
    #print(suffix_table_positions)
    #print("\n")
    while index>0 and len(suffix_table_positions)>0:
        
        index-=1
        #print("index")
        #print(index)
        #print("\n")
        pattern_letter=pattern[index]
        #print("pattern_letter")
        #print(pattern_letter)
        #print("\n")
        
        suffix_table_positions=lettre_et_occurence(BWT,pattern_letter,suffix_table_positions, occurence_index,counts)
        #print("A")
        #print(A)
        #print("\n")



    for i in range(len(suffix_table_positions)):
        output.append(sf[suffix_table_positions[i]])
        

    return output
        
    

#### Testons notre fonction

In [46]:
test_mltpl='ABCDEFGHIJKLMNOPPPPPPABCDE!'
sf=almost_dc3_with_p12(asciiDC3(test_mltpl))
print(sf)

[26, 21, 0, 22, 1, 23, 2, 24, 3, 25, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 20, 19, 18, 17, 16, 15]


In [47]:
positions_of_pattern=find_patterns_in_sequence(test_mltpl, 'JKLMNO', sequence_has_end_character=True)
print(positions_of_pattern)
for p in positions_of_pattern:
    print(test_mltpl[p:p+6])


[9]
JKLMNO


In [48]:
#print_suffix_table(test_mltpl, sf, visualize_bwt=True)

### Testons avec notre génome

In [49]:
genome[1].seq.upper()

Seq('AACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCTAAACCCT...TCA')

In [50]:
pattern=genome[1].seq.upper()[:10]
pattern

Seq('AACCCTAAAC')

In [51]:
positions_of_pattern=find_patterns_in_sequence(genome[1].seq.upper(), pattern, sequence_has_end_character=False)
print(sorted(positions_of_pattern))
#for p in positions_of_pattern:
    #print(genome[1].seq.upper()[p:p+len(pattern)])

[]


In [52]:
cProfile.run("find_patterns_in_sequence(genome[1].seq.upper(), pattern, sequence_has_end_character=False)", "pattern_matching_stats")
p = pstats.Stats("pattern_matching_stats")
p.sort_stats("cumulative").print_stats()

Sun Nov  5 12:57:24 2023    pattern_matching_stats

         39496424 function calls (39496416 primitive calls) in 26.951 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   26.951   26.951 {built-in method builtins.exec}
        1    0.049    0.049   26.951   26.951 <string>:1(<module>)
        1    0.775    0.775   26.901   26.901 /tmp/ipykernel_5847/2690991672.py:1(find_patterns_in_sequence)
      8/1    0.997    0.125   24.075   24.075 /tmp/ipykernel_5847/3235192990.py:1(almost_dc3_with_p12)
       16    0.133    0.008    7.381    0.461 /tmp/ipykernel_5847/2101628537.py:1(sort_with_p12)
       40    7.247    0.181    7.248    0.181 /tmp/ipykernel_5847/2101628537.py:18(countingSortByDigit_with_p12)
        8    3.710    0.464    4.345    0.543 /tmp/ipykernel_5847/4110635339.py:1(merge_with_p12)
        8    3.749    0.469    3.939    0.492 /tmp/ipykernel_5847/738299132.py:18(position12_radix_w

<pstats.Stats at 0x7f322b459e70>

### Travaillons avec des kmers

In [53]:
def assembler_des_kmers(numero_du_kmer, position_ds_genome,longueur, liste_position_kmers):
    compteur_de_kmer=1 #compte le nb de kmers qu'on arrive à aligner
    last_position=0
    i=numero_du_kmer+1
    difference=1
    
    while i<len(liste_position_kmers):
        print("i au début du while")
        print(i)
        print("\n")
        j=0
        previous_compteur=compteur_de_kmer
        print("le compteur est à")
        print(compteur_de_kmer)
        print("\n")
        print("on s'interesse au kmer numero")
        print(i)
        print("qui est")
        print(liste_position_kmers[i])
        k=0
        while j<len(liste_position_kmers[i]) and compteur_de_kmer==previous_compteur:
            print("on cherche la position")
            print(position_ds_genome+difference)
            if position_ds_genome+difference==liste_position_kmers[i][j]:
                compteur_de_kmer+=1
                last_position=liste_position_kmers[i][j]
            j+=1
            
        if compteur_de_kmer==previous_compteur:
            i+=longueur
            difference+=longueur
            if  i>= len(liste_position_kmers):
                return [[position_ds_genome,last_position+longueur],compteur_de_kmer]
            else:
        
                ### cas d'une mutation
                print("cas d'une mutation")
                print("on s'interesse au kmer numero")
                print(i)
                print("qui est")
                print(liste_position_kmers[i])
                k=0

                while k<len(liste_position_kmers[i]) and compteur_de_kmer==previous_compteur:
                    print("on cherche la position")
                    print(position_ds_genome+difference)
                    if position_ds_genome+difference==liste_position_kmers[i][k]:
                        compteur_de_kmer+=1
                        last_position=liste_position_kmers[i][k]
                    k+=1



                if compteur_de_kmer==previous_compteur:
                    ###cas d'une addition
                    print("cas d'une addition")
                    difference-=1
                    l=0
                    print("on cherche la position")
                    print(position_ds_genome+difference)
                    while l<len(liste_position_kmers[i]) and compteur_de_kmer==previous_compteur:

                        if position_ds_genome+difference==liste_position_kmers[i][l]:
                            compteur_de_kmer+=1
                            last_position=liste_position_kmers[i][l]
                        l+=1

                    if compteur_de_kmer==previous_compteur:
                        ###cas d'une déletions
                        print("cas d'une délétion")
                        difference+=1
                        i-=1
                        print("on s'interesse au kmer numero")
                        print(i)
                        print("qui est")
                        print(liste_position_kmers[i])
                        print("on cherche la position")
                        print(position_ds_genome+difference)
                        m=0
                        while m<len(liste_position_kmers[i]) and compteur_de_kmer==previous_compteur:

                            if position_ds_genome+difference==liste_position_kmers[i][m]:
                                compteur_de_kmer+=1
                                last_position=liste_position_kmers[i][m]
                            m+=1

                        if compteur_de_kmer==previous_compteur:
                            ### cas où l'alignement est terminé
                            return [[position_ds_genome,last_position+longueur],compteur_de_kmer]

        i+=1
        difference+=1   
        
    return [[position_ds_genome,last_position+longueur],compteur_de_kmer]

In [54]:
def kmer(sequence, longueur):
    n = len(sequence)
    kmer=[]
    for i in range(n-longueur+1):
        kmer.append(sequence[i:i+longueur])
    return kmer
    

In [55]:
read_list=["ABCDEFGHIJKLMNOPQRSZUV","PPPABCDEFGHW"]
read="ABCDEFGHIJKLMNOPQRSZUV"
print(len(read))
print(kmer(read,5))

22
['ABCDE', 'BCDEF', 'CDEFG', 'DEFGH', 'EFGHI', 'FGHIJ', 'GHIJK', 'HIJKL', 'IJKLM', 'JKLMN', 'KLMNO', 'LMNOP', 'MNOPQ', 'NOPQR', 'OPQRS', 'PQRSZ', 'QRSZU', 'RSZUV']


In [56]:
genom="ABCDEFGHIJKLMNOPQRSTUVWPPPPPABCDEFGHWWWWWWWWWWNOPQRSTUV"
liste_kmer=kmer(read, 5)
print(liste_kmer)
print(len(liste_kmer))

['ABCDE', 'BCDEF', 'CDEFG', 'DEFGH', 'EFGHI', 'FGHIJ', 'GHIJK', 'HIJKL', 'IJKLM', 'JKLMN', 'KLMNO', 'LMNOP', 'MNOPQ', 'NOPQR', 'OPQRS', 'PQRSZ', 'QRSZU', 'RSZUV']
18


In [57]:
positions=[]
for i in range(len(liste_kmer)):
    positions.append(find_patterns_in_sequence(genom, liste_kmer[i],sequence_has_end_character=True))
print(positions)
print(positions[5][0])

[[0, 28], [1, 29], [2, 30], [3, 31], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13, 46], [14, 47], [], [], []]
5


In [58]:
def find_kmer_positions(sf, BWT, read, longueur_des_kmers):
    """ la différence avec la fonction "find_pattern_in_sequence", c'est que la fonction ne calcule pas la suffix table 
    et la BWT, et qu'elle cherche elle même les kmers du read.
    """
    output=[]
    occurence_index = occurrence_indexer(BWT)
    counts = Counter(BWT)
    n = len(read)
    for i in range(n-longueur_des_kmers+1):
        pattern=read[i:i+longueur_des_kmers]
        index=len(pattern)-1
        pattern_letter=pattern[index]

        suffix_table_positions=initialize_suffix_table_positions(counts, pattern_letter)
        while index>0 and len(suffix_table_positions)>0:

            index-=1
            pattern_letter=pattern[index]
            suffix_table_positions=lettre_et_occurence(BWT,pattern_letter,suffix_table_positions, occurence_index,counts)

        temp=[]
        for i in range(len(suffix_table_positions)):
            temp.append(sf[suffix_table_positions[i]])
        output.append(temp)
        

    return output

In [59]:
genom="IJKLMNOPQRWWWWWWIJKLMNWWWWABCDEFGHIJKLMNOPQRSTUVWPPPPPABCDEFGHWWWWWWWWWWNOPQRSTUV!"
read="ABCDEFZGHIJKLMNOPQRSTUV"
suffix_array=almost_dc3_with_p12(asciiDC3(genom))
bwt=BWT(suffix_array, genom)
positions=find_kmer_positions(suffix_array, bwt, read, 5)
print(positions)

[[26, 54], [27, 55], [], [], [], [], [], [32], [33], [0, 34, 16], [1, 35, 17], [2, 36], [3, 37], [4, 38], [5, 72, 39], [6, 73], [7, 74], [75, 42], [76, 43]]


In [60]:
assembler_des_kmers(0, positions[0][0],5,positions)

i au début du while
1


le compteur est à
1


on s'interesse au kmer numero
1
qui est
[27, 55]
on cherche la position
27
i au début du while
2


le compteur est à
2


on s'interesse au kmer numero
2
qui est
[]
cas d'une mutation
on s'interesse au kmer numero
7
qui est
[32]
on cherche la position
33
cas d'une addition
on cherche la position
32
i au début du while
8


le compteur est à
3


on s'interesse au kmer numero
8
qui est
[33]
on cherche la position
33
i au début du while
9


le compteur est à
4


on s'interesse au kmer numero
9
qui est
[0, 34, 16]
on cherche la position
34
on cherche la position
34
i au début du while
10


le compteur est à
5


on s'interesse au kmer numero
10
qui est
[1, 35, 17]
on cherche la position
35
on cherche la position
35
i au début du while
11


le compteur est à
6


on s'interesse au kmer numero
11
qui est
[2, 36]
on cherche la position
36
on cherche la position
36
i au début du while
12


le compteur est à
7


on s'interesse au kmer numero
12
qui est


[[26, 44], 10]

In [61]:
def alignement_maximum_de_kmer(liste_position_kmers, longueur):
    output=[[0,0],0]
    maximum=0
    for i in range(longueur+2):
        current_kmer=liste_position_kmers[i]
        for p in current_kmer:
            alignement=assembler_des_kmers(i, p, longueur, liste_position_kmers)
            if maximum<alignement[1]:
                output=alignement
                maximum=alignement[1]
    return output

In [62]:
alignement_maximum_de_kmer(positions, 5)

i au début du while
1


le compteur est à
1


on s'interesse au kmer numero
1
qui est
[27, 55]
on cherche la position
27
i au début du while
2


le compteur est à
2


on s'interesse au kmer numero
2
qui est
[]
cas d'une mutation
on s'interesse au kmer numero
7
qui est
[32]
on cherche la position
33
cas d'une addition
on cherche la position
32
i au début du while
8


le compteur est à
3


on s'interesse au kmer numero
8
qui est
[33]
on cherche la position
33
i au début du while
9


le compteur est à
4


on s'interesse au kmer numero
9
qui est
[0, 34, 16]
on cherche la position
34
on cherche la position
34
i au début du while
10


le compteur est à
5


on s'interesse au kmer numero
10
qui est
[1, 35, 17]
on cherche la position
35
on cherche la position
35
i au début du while
11


le compteur est à
6


on s'interesse au kmer numero
11
qui est
[2, 36]
on cherche la position
36
on cherche la position
36
i au début du while
12


le compteur est à
7


on s'interesse au kmer numero
12
qui est


[[26, 44], 10]

In [63]:
def complementaireInverse(sequence):
    ci = []
    for i in range(len(sequence) - 1, -1, -1):
        if sequence[i] == 'A':
            ci.append('T')
        elif sequence[i] == 'T':
            ci.append('A')
        elif sequence[i] == 'C':
            ci.append('G')
        elif sequence[i] == 'G':
            ci.append('C')
    return ''.join(ci)

In [64]:
complementaireInverse("ATCG")

'CGAT'

### Testons avec notre génome

In [65]:
suffix_array_first_chromosome=almost_dc3_with_p12(asciiDC3(genome[0].seq.upper()))
BWT_first_chromosome=BWT(suffix_array_first_chromosome,genome[0].seq.upper()+"!")


In [66]:
longueur_kmer=20
liste_de_kmer=kmer(reads[10].seq.upper(), longueur_kmer)

In [67]:
position=find_kmer_positions(suffix_array_first_chromosome, BWT_first_chromosome, complementaireInverse(reads[10].seq.upper()), longueur_kmer)
print(position)

[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]


In [68]:
alignment=alignement_maximum_de_kmer(position, longueur_kmer)
print(alignment)

[[0, 0], 0]


In [69]:
def find_aligned_reads(BWT,suffix_array, reads, number_of_reads, size_of_kmers):
    output=[]
    for i in range(number_of_reads):
        position=find_kmer_positions(suffix_array, BWT, reads[i].seq.upper(), size_of_kmers)
        alignment=alignement_maximum_de_kmer(position, size_of_kmers)
        if alignment[1]>4:
            output.append({'read_number':i, 'alignment':alignment})
    
    return output

In [70]:
find_aligned_reads(BWT_first_chromosome,suffix_array_first_chromosome, reads, 2, longueur_kmer)

[]

In [71]:
cProfile.run("find_aligned_reads(BWT_first_chromosome,suffix_array_first_chromosome, reads, 1, longueur_kmer)", "pattern_matching_stats")
p = pstats.Stats("pattern_matching_stats")
p.sort_stats("cumulative").print_stats()

Sun Nov  5 12:58:22 2023    pattern_matching_stats

         34466264 function calls (34466182 primitive calls) in 17.098 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   17.098   17.098 {built-in method builtins.exec}
        1    0.000    0.000   17.097   17.097 <string>:1(<module>)
        1    0.009    0.009   17.097   17.097 /tmp/ipykernel_5847/1422914090.py:1(find_aligned_reads)
        1    0.226    0.226   17.089   17.089 /tmp/ipykernel_5847/992215275.py:1(find_kmer_positions)
      968    9.797    0.010   16.096    0.017 /tmp/ipykernel_5847/1019946815.py:1(lettre_et_occurence)
 11271283    4.648    0.000    4.648    0.000 /tmp/ipykernel_5847/1019946815.py:8(<listcomp>)
 11271364    1.036    0.000    1.036    0.000 {built-in method builtins.sum}
 11912215    0.650    0.000    0.650    0.000 {method 'append' of 'list' objects}
       81    0.000    0.000    0.547    0.007 /tmp/ipykernel

<pstats.Stats at 0x7f322b459f00>