# Burrows-Wheeler Transform Practice without Library

Nayoung Ku | English Version | Fall 2024 Bioinformatics | School of Life Science | Handong Global University

* Generated Date: Sep 12, 2024
* Last modified: Sep 22, 2024

# Procedures 

In [1]:
import numpy as np

## 1. Prepare the sequence in string form

In [2]:
#Input of the assignment = attaggact
sequence = input("sequence:") 

sequence: attaggact


In [3]:
sequence += "$"

In [4]:
sequence_list = list(sequence)

In [5]:
sequence_list

['a', 't', 't', 'a', 'g', 'g', 'a', 'c', 't', '$']

## 2. Generate the rotation matrix
* This step generates ***every rotation*** of the given sequence.
* ***Rotation***: Move the first character of the string to the back, so that every other character shifts one position to the left. Repeating this once per character produces all rotations.

***EXAMPLE: Sequence = ATGC***

***Versions:***
1. Original sequence: 
**ATGC$**  (No change)

2. Push one character to the left:
**TGC$A**  (Send the front "A" to the back)

3. Push one character to the left again:
**GC$AT** (Send the front "T" to the back)

4. Push one character to the left again:
**C$ATG** (Send the front "G" to the back)

5. Push one character to the left for the last time:
**$ATGC** (Send the front "C" to the back)

In [6]:
# Build every rotation of the sequence: rotation number `shift` moves the
# first `shift` characters to the back while keeping their relative order.
# Slicing creates new lists, so sequence_list itself is left unchanged.
bwm = [
    sequence_list[shift:] + sequence_list[:shift]
    for shift in range(len(sequence_list))
]

In [7]:
bwm 

[['a', 't', 't', 'a', 'g', 'g', 'a', 'c', 't', '$'],
 ['t', 't', 'a', 'g', 'g', 'a', 'c', 't', '$', 'a'],
 ['t', 'a', 'g', 'g', 'a', 'c', 't', '$', 'a', 't'],
 ['a', 'g', 'g', 'a', 'c', 't', '$', 'a', 't', 't'],
 ['g', 'g', 'a', 'c', 't', '$', 'a', 't', 't', 'a'],
 ['g', 'a', 'c', 't', '$', 'a', 't', 't', 'a', 'g'],
 ['a', 'c', 't', '$', 'a', 't', 't', 'a', 'g', 'g'],
 ['c', 't', '$', 'a', 't', 't', 'a', 'g', 'g', 'a'],
 ['t', '$', 'a', 't', 't', 'a', 'g', 'g', 'a', 'c'],
 ['$', 'a', 't', 't', 'a', 'g', 'g', 'a', 'c', 't']]

In [8]:
# Convert the list of rotations into a 2-D NumPy array (one rotation per row)
# so that whole columns can be selected with slicing such as bwm[:, 0].
bwm = np.array(bwm) 

## 3. Sort the rotation matrix 
* Sort all of the generated rotations in dictionary (ascending lexicographic) order.
* ***Suffix array***: the array that lists the original starting positions of the suffixes, in sorted order.

***EXAMPLE: "attaggact"***
#### (1) Organize based on each suffix of the string

0. attaggact
1. ttaggact
2. taggact
3. aggact
4. ggact
5. gact
6. act 
7. ct 
8. t
9. $

In [9]:
bwm[:, 0] 

array(['a', 't', 't', 'a', 'g', 'g', 'a', 'c', 't', '$'], dtype='<U1')

#### (2) Sort by alphabetical order
***EXAMPLE: "attaggact"***
* Order: 9. ***$*** -> 6. ***act*** -> 3. ***aggact*** -> 0. ***attaggact*** -> 7. ***ct*** -> 5. ***gact*** -> 4. ***ggact*** -> 8. ***t*** -> 2. ***taggact*** -> 1. ***ttaggact***
* ***9*** -> ***6*** -> ***3*** -> ***0*** -> ***7*** -> ***5*** -> ***4*** -> ***8*** -> ***2*** -> ***1***

In [10]:
# np.lexsort uses the LAST key as the primary sort key, so the columns are
# supplied from the last one down to the first; column 0 then decides the
# order first and later columns only break ties.
column_keys = tuple(bwm[:, col] for col in reversed(range(bwm.shape[1])))
sorted_bwm = bwm[np.lexsort(column_keys)]

In [11]:
sorted_bwm

array([['$', 'a', 't', 't', 'a', 'g', 'g', 'a', 'c', 't'],
       ['a', 'c', 't', '$', 'a', 't', 't', 'a', 'g', 'g'],
       ['a', 'g', 'g', 'a', 'c', 't', '$', 'a', 't', 't'],
       ['a', 't', 't', 'a', 'g', 'g', 'a', 'c', 't', '$'],
       ['c', 't', '$', 'a', 't', 't', 'a', 'g', 'g', 'a'],
       ['g', 'a', 'c', 't', '$', 'a', 't', 't', 'a', 'g'],
       ['g', 'g', 'a', 'c', 't', '$', 'a', 't', 't', 'a'],
       ['t', '$', 'a', 't', 't', 'a', 'g', 'g', 'a', 'c'],
       ['t', 'a', 'g', 'g', 'a', 'c', 't', '$', 'a', 't'],
       ['t', 't', 'a', 'g', 'g', 'a', 'c', 't', '$', 'a']], dtype='<U1')

#### (+) Expressing suffix array

***EXAMPLE: "attaggact"***
* Order: 9 -> 6 -> 3 -> 0 -> 7 -> 5 -> 4 -> 8 -> 2 -> 1
* [9, 6, 3, 0, 7, 5, 4, 8, 2, 1]

In [12]:
# The row order returned by the lexicographic sort is the suffix array:
# each entry is the index of the original rotation that landed in that
# sorted position. Keys go last column first because np.lexsort treats the
# last key as the primary one.
column_keys = tuple(bwm[:, col] for col in reversed(range(bwm.shape[1])))
suffix_array = np.lexsort(column_keys)

## 4. Final result: BWT array (and suffix array)

In [13]:
bwt_first = sorted_bwm[:, 0]
bwt_last = sorted_bwm[:, -1]

In [14]:
bwt_first

array(['$', 'a', 'a', 'a', 'c', 'g', 'g', 't', 't', 't'], dtype='<U1')

In [15]:
bwt_last

array(['t', 'g', 't', '$', 'a', 'g', 'a', 'c', 't', 'a'], dtype='<U1')

In [16]:
bwt_result_string = ''.join(bwt_last)

In [17]:
print(f"BWT of {sequence}:", bwt_result_string)
print(f"Suffix Array of {sequence}:", suffix_array)

BWT of attaggact$: tgt$agacta
Suffix Array of attaggact$: [9 6 3 0 7 5 4 8 2 1]


# Conclusion

In [18]:
def burrows_wheeler_transform(sequence):
    """Compute the Burrows-Wheeler Transform of a sequence.

    The terminator "$" is appended to the sequence, every rotation of the
    terminated sequence is generated, and the rotations are sorted in full
    lexicographic order. ("$" sorts before the letters used in DNA
    sequences.)

    Args:
        sequence: The input string, without the "$" terminator.

    Returns:
        A tuple ``(bwt_first, bwt_last)`` of 1-D NumPy arrays of single
        characters: the first and last columns of the sorted rotation
        matrix. ``bwt_last`` is the BWT of the sequence.
    """
    sequence += "$"
    sequence_list = list(sequence)
    length = len(sequence_list)

    # Rotation number `shift` moves the first `shift` characters to the back.
    bwm = np.array(
        [sequence_list[shift:] + sequence_list[:shift] for shift in range(length)]
    )

    # Sort the rows lexicographically over ALL columns. Sorting on the first
    # column alone leaves rotations that share a first character in an
    # arbitrary order, which yields an incorrect last column.
    # np.lexsort uses the last key as the primary key, hence the reversal.
    column_keys = tuple(bwm[:, col] for col in reversed(range(length)))
    sorted_bwm = bwm[np.lexsort(column_keys)]

    bwt_first = sorted_bwm[:, 0]
    bwt_last = sorted_bwm[:, -1]

    return bwt_first, bwt_last

## Contribution
* The main logic is based on the content of the lecture slides
* Some of the code and explanations were written with the help of ChatGPT