In [27]:
from collections import defaultdict
import re

def suffix_array_construction(s):
    suffixes = sorted((s[i:], i) for i in range(len(s)))
    suffix_array = [suffix[1] for suffix in suffixes]
    return suffix_array

def burrows_wheeler_transform(s):
    rotations = sorted(s[i:] + s[:i] for i in range(len(s)))
    bwt = ''.join(rotation[-1] for rotation in rotations)
    return bwt

def run_length_encoding_custom(s):
    if not s:
        return ""

    encoded_text = []
    current_char = s[0]
    count = 1

    for i in range(1, len(s)):
        if s[i] == current_char:
            count += 1
        else:
            if count >= 3:
                encoded_text.append(f"{current_char}#{count}")  # Use hash # and count for three or more repeats
            else:
                encoded_text.append(current_char * count)  # Directly append character repeated if less than three
            current_char = s[i]
            count = 1

    # Handle the last run of characters
    if count >= 3:
        encoded_text.append(f"{current_char}#{count}")  # Append last character and count with escape
    else:
        encoded_text.append(current_char * count)  # Append character repeated if less than three

    return ''.join(encoded_text)


In [28]:
RLE_BWT = """
:.#3fnl.n.#3a!sfhs!nanna.!!..e:.."e,nos,rgdsteae,e,,e#5fd,d,ryre,,seede,g,owenhegysneys#4a#3deasae,a,ado,a"a,,sassehh,s#4tageyaasofe"sestosseea#3edrya,y,,eaaenr"ommyfnsr;"syssgge,ee,s,g,a#7fhesa#3tcns!?,!! #5!tnesendga#3dwsrs#3gsgtsddaareks#6ysstasd$  "  " #18" #5"  n#3 n#4 n#19 nzzn#6ztpoemmenmmt#4wecTb rrhn#34B#3b#4Bb#3Bb#17BbbBb#4BbBBb#13n#20d  ehzmn#3hC#4nnmnmn#3  n#13enccss #27" #23i #4i#4n #3ea e#5 #7es #3ao nleenlnleelran  n#3 alnnaa rwrre ahmh#7shhmrmrnhrrvmlrvwhrvhchsslrrplr db#4dnhttrhwib tddmpetpudnhhwhvhvsv#3moitdlrrnnr#5 #3tthho#6uiaf  n#10uiiaat#4gtccTt#8 t#3 scw W t#3 wWctggrfgllwfretzra#4oogsbddhdmnfk s#4lrzhurbWwWwaDac arrureapcdppliarlapcoo #3I aao#4uomi  o#8ioi#3Ioaiio#3a#88o#3a#20oa#3uooueeieaddi#10ok oroeeaiea#5hthsrtldh #6JJrrc#7C#6i i#3bbB ccocilesw#4mmw#3arc  nnrms soiursm#7ia#4nse#3ue#3oaao#4eoe#3oplc p#7c-btco#3eueffhcue#3cee'eattna#3ddaasruaaieunaepa#4ts#3da#7esdoa  eann i s#4—s i  e#10 #3acuhuIxnsp#4aahti  i#4 #14y #3n xnius qtoqocd dots lBcliaaore#3ooao #15eenneder#3crn#3 #3s
"""
print( "RLE_BWT_MSG:", RLE_BWT )


story = """
Banana Split Manana Banana
"""

# Remove extra new lines and prepare the string
input_string = ' '.join(story.split()) + "$"

# Generate the Suffix Array
suffix_array = suffix_array_construction(input_string)
print("Suffix Array:", suffix_array)

# Generate the Burrows-Wheeler Transform
bwt = burrows_wheeler_transform(input_string)
print("BWT:", bwt)

# Generate RLE-encoded text for original and BWT
original_rle = run_length_encoding_custom(input_string)
bwt_rle = run_length_encoding_custom(bwt)

# Print results
print("\nOriginal Text       :", input_string)
print("RLE Original Text   :", original_rle)
print("BWT                 :", bwt)
print("RLE BWT             :", bwt_rle)

RLE_BWT_MSG: 
:.#3fnl.n.#3a!sfhs!nanna.!!..e:.."e,nos,rgdsteae,e,,e#5fd,d,ryre,,seede,g,owenhegysneys#4a#3deasae,a,ado,a"a,,sassehh,s#4tageyaasofe"sestosseea#3edrya,y,,eaaenr"ommyfnsr;"syssgge,ee,s,g,a#7fhesa#3tcns!?,!! #5!tnesendga#3dwsrs#3gsgtsddaareks#6ysstasd$  "  " #18" #5"  n#3 n#4 n#19 nzzn#6ztpoemmenmmt#4wecTb rrhn#34B#3b#4Bb#3Bb#17BbbBb#4BbBBb#13n#20d  ehzmn#3hC#4nnmnmn#3  n#13enccss #27" #23i #4i#4n #3ea e#5 #7es #3ao nleenlnleelran  n#3 alnnaa rwrre ahmh#7shhmrmrnhrrvmlrvwhrvhchsslrrplr db#4dnhttrhwib tddmpetpudnhhwhvhvsv#3moitdlrrnnr#5 #3tthho#6uiaf  n#10uiiaat#4gtccTt#8 t#3 scw W t#3 wWctggrfgllwfretzra#4oogsbddhdmnfk s#4lrzhurbWwWwaDac arrureapcdppliarlapcoo #3I aao#4uomi  o#8ioi#3Ioaiio#3a#88o#3a#20oa#3uooueeieaddi#10ok oroeeaiea#5hthsrtldh #6JJrrc#7C#6i i#3bbB ccocilesw#4mmw#3arc  nnrms soiursm#7ia#4nse#3ue#3oaao#4eoe#3oplc p#7c-btco#3eueffhcue#3cee'eattna#3ddaasruaaieunaepa#4ts#3da#7esdoa  eann i s#4—s i  e#10 #3acuhuIxnsp#4aahti  i#4 #14y #3n xnius qtoqocd dots lBclia

In [29]:
def decode_rle(rle_string):
    """
    Decodes the RLE-encoded string back to the original string.
    The RLE encoding format is like '3a' meaning 'aaa'.
    """
    decoded = []
    i = 0
    while i < len(rle_string):
        # Find the length of the repetition (the number before the character)
        run_length = 0
        while i < len(rle_string) and rle_string[i].isdigit():
            run_length = run_length * 10 + int(rle_string[i])  # Handling multi-digit numbers
            i += 1
        # Now, rle_string[i] should be the character that repeats
        if i < len(rle_string):
            decoded.append(rle_string[i] * run_length)
            i += 1
    return ''.join(decoded)

In [37]:
def reverse_bwt(bwt):
    """
    Reverses the Burrows-Wheeler Transform (BWT) to get the original string.
    """
    # Create a table to store BWT's intermediate results
    n = len(bwt)
    table = ['' for _ in range(n)]  # Create empty rows for the table
    
    # Sort the BWT string, this will be used to generate the sorted columns of the table
    for _ in range(n):
        table = [bwt[i] + table[i] for i in range(n)]  # Add the characters from BWT to each row
        table.sort()  # Sort the rows to get the proper order

    # The original string is the row that ends with the special end-of-string character (e.g., '$')
    for row in table:
        if row.endswith('$'):
            return row  # Return the original string
    return ''

In [47]:
decoded_long_message = decode_rle(RLE_BWT)
print(f"RLE Long Message: {RLE_BWT}")
print(f"Decoded RLE Long Message:  {decoded_long_message}")

RLE Long Message: 
:.#3fnl.n.#3a!sfhs!nanna.!!..e:.."e,nos,rgdsteae,e,,e#5fd,d,ryre,,seede,g,owenhegysneys#4a#3deasae,a,ado,a"a,,sassehh,s#4tageyaasofe"sestosseea#3edrya,y,,eaaenr"ommyfnsr;"syssgge,ee,s,g,a#7fhesa#3tcns!?,!! #5!tnesendga#3dwsrs#3gsgtsddaareks#6ysstasd$  "  " #18" #5"  n#3 n#4 n#19 nzzn#6ztpoemmenmmt#4wecTb rrhn#34B#3b#4Bb#3Bb#17BbbBb#4BbBBb#13n#20d  ehzmn#3hC#4nnmnmn#3  n#13enccss #27" #23i #4i#4n #3ea e#5 #7es #3ao nleenlnleelran  n#3 alnnaa rwrre ahmh#7shhmrmrnhrrvmlrvwhrvhchsslrrplr db#4dnhttrhwib tddmpetpudnhhwhvhvsv#3moitdlrrnnr#5 #3tthho#6uiaf  n#10uiiaat#4gtccTt#8 t#3 scw W t#3 wWctggrfgllwfretzra#4oogsbddhdmnfk s#4lrzhurbWwWwaDac arrureapcdppliarlapcoo #3I aao#4uomi  o#8ioi#3Ioaiio#3a#88o#3a#20oa#3uooueeieaddi#10ok oroeeaiea#5hthsrtldh #6JJrrc#7C#6i i#3bbB ccocilesw#4mmw#3arc  nnrms soiursm#7ia#4nse#3ue#3oaao#4eoe#3oplc p#7c-btco#3eueffhcue#3cee'eattna#3ddaasruaaieunaepa#4ts#3da#7esdoa  eann i s#4—s i  e#10 #3acuhuIxnsp#4aahti  i#4 #14y #3n xnius qtoqocd dots l

In [48]:
decoded_bwt_rle = decode_rle(bwt_rle)
print(f"RLE Original Message: {bwt_rle}")
print(f"Decoded RLE Message: {decoded_bwt_rle}")

RLE Original Message: ataa$ #3n#6MBBlpa#6Si
Decoded RLE Message: nnnMMMMMMSSSSSS


In [49]:
reversed_bwt = reverse_bwt(bwt)
print(f"BWT: {bwt}")
print(f"Reverse BWT: {reversed_bwt}")

BWT: ataa$   nnnnnnMBBlpaaaaaaSi
Reverse BWT: Banana Split Manana Banana$
