In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pylcs as LCS

In [None]:
# Load the per-cell table; the first CSV column is used as the DataFrame index.
cells_csv_path = 'data/cells_no_repeats.csv'
cell_data = pd.read_csv(cells_csv_path, index_col=0)

# Quick sanity check: dimensions followed by a preview of the first rows.
print(cell_data.shape)
display(cell_data.head())

In [None]:
# Load the dendritic sequence table; the first CSV column is used as the index.
sequences_csv_path = 'data/dendritic_sequences/dendritic_sequences_ids.csv'
sequences_df = pd.read_csv(sequences_csv_path, index_col=0)

# Quick sanity check: dimensions followed by a preview of the first ten rows.
print(sequences_df.shape)
display(sequences_df.head(10))

In [None]:
import math
import string

# Convert a pt_root_id index to a string of printable characters
def map_to_printable_string(number, length, max_number=56208):
    """Encode a non-negative integer as a fixed-width base-94 string.

    The digit alphabet is ``string.printable`` without its last six
    characters (the whitespace characters), i.e. digits, ASCII letters and
    punctuation. The first symbol of that alphabet is ``'0'``, which is also
    used for left-padding.

    Parameters
    ----------
    number : int
        Value to encode; must satisfy ``0 <= number <= max_number``.
    length : int
        Minimum width of the result. Shorter encodings are left-padded with
        ``'0'``; encodings that are already longer are returned unchanged.
    max_number : int, optional
        Largest accepted value. Defaults to 56208, the limit that used to be
        hard-coded, so existing callers behave exactly as before.

    Returns
    -------
    str
        The base-94 representation of ``number``, at least ``length`` long.

    Raises
    ------
    ValueError
        If ``number`` is outside ``[0, max_number]``.
    """
    if not 0 <= number <= max_number:
        raise ValueError(f"Number must be in the range 0 to {max_number}")

    # The trailing six characters of string.printable are whitespace
    # (space, \t, \n, \r, \x0b, \x0c); drop them so every digit is visible.
    printable_chars = string.printable[:-6]
    base = len(printable_chars)

    # Collect digits least-significant first, then reverse once at the end.
    digits = []
    while number > 0:
        number, index = divmod(number, base)
        digits.append(printable_chars[index])
    encoded = "".join(reversed(digits))

    # rjust never truncates, so an over-long encoding is returned as-is.
    # Note that 0 encodes to the empty string before padding.
    return encoded.rjust(length, printable_chars[0])

In [None]:
def get_printable(encoded_str, encoding_map):
    """Translate every character of ``encoded_str`` through ``encoding_map``.

    Each character is looked up in ``encoding_map`` and the resulting strings
    are concatenated in order. A character missing from the map raises
    ``KeyError``.
    """
    pieces = []
    for symbol in encoded_str:
        pieces.append(encoding_map[symbol])
    return "".join(pieces)

In [None]:
pt_root_ids = cell_data['pt_root_id'].unique()

# Largest index that has to be encodable (ids are numbered 0..max_value).
max_value = len(pt_root_ids) - 1
# Size of the digit alphabet: string.printable minus its six whitespace characters.
num_unique_chars = len(string.printable[:-6])

# Smallest width L such that num_unique_chars ** L > max_value.
# This is computed with exact integer arithmetic instead of
# ceil(log(max_value + 1, base)): the floating-point logarithm can land just
# above an integer at exact powers of the base (yielding one unnecessary
# padding character) and raises a math domain error when there are no ids.
min_string_length = 0
_capacity = 1  # number of distinct values representable with the current width
while _capacity <= max_value:
    _capacity *= num_unique_chars
    min_string_length += 1

In [None]:
def _index_to_char(index):
    """Return a unique single character for a non-negative integer index.

    Two code-point ranges are deliberately never produced:

    * U+0000 (NUL): NumPy fixed-width string arrays strip trailing NULs, so a
      sequence ending in the id mapped to NUL would silently lose elements.
    * U+D800..U+DFFF (surrogates): lone surrogates cannot be encoded as UTF-8,
      which string-consuming C/C++ extensions require
      (presumably including pylcs; confirm against its bindings).
    """
    code_point = index + 1          # start at U+0001, skipping NUL
    if code_point >= 0xD800:
        code_point += 0x800         # jump over the surrogate block
    return chr(code_point)


# Create dictionary mapping pt_root_id to a single unique unicode character
pt_root_id_to_char = {pt_root_id: _index_to_char(i) for i, pt_root_id in enumerate(pt_root_ids)}
pt_root_id_to_char[-1] = '' # Add empty string to replace padding

# Inverse lookup: character -> pt_root_id (the empty string maps back to -1).
char_to_pt_root_id = {v: k for k, v in pt_root_id_to_char.items()}
print(f'Number of unique pt_root_ids: {len(pt_root_ids)}')

# Create dictionary mapping pt_root_id char encoding to printable string.
# Each id becomes '<' + fixed-width base-94 index + '>'.
char_encoding_to_printable_string = {
    pt_root_id_to_char[pt_root_id]: '<' + map_to_printable_string(i, min_string_length) + '>'
    for i, pt_root_id in enumerate(pt_root_ids)
}
# Inverse lookup: printable token -> pt_root_id.
printable_string_to_pt_root_id = {
    printable: char_to_pt_root_id[char]
    for char, printable in char_encoding_to_printable_string.items()
}
print(f'Minimum string length: {min_string_length}')

In [None]:
# Round-trip one known id through the encoding tables:
# pt_root_id -> single character -> printable token -> pt_root_id.
test_id = 864691134884741370
print(f'pt_root_id: {test_id}')

test_char = pt_root_id_to_char[test_id]
print(f'Size of unique Encoding: {len(test_char)}')

test_printable = char_encoding_to_printable_string[test_char]
print(f'Encoded: {test_printable}')

test_decoded = printable_string_to_pt_root_id[test_printable]
print(f'Decoded: {test_decoded}')

In [None]:
# Pre-process sequences: convert from pandas dataframe to list of strings
sequences = []
sequences_printable = []

# The first column is skipped, exactly as before; the remaining columns are
# read left to right (presumably one column per rank; confirm against the CSV).
rank_columns = sequences_df.columns[1:]

# `row_label` replaces the former `_` loop variable, which shadowed IPython's
# "last output" variable for the rest of the session.
for row_label, sequence_row in sequences_df.iterrows():
    # One character per id; the padding id -1 maps to '' and therefore vanishes.
    cur_sequence = ''.join(pt_root_id_to_char[sequence_row[rank]] for rank in rank_columns)
    # Compute the printable form once and reuse it for storage and display.
    cur_printable = get_printable(cur_sequence, char_encoding_to_printable_string)
    sequences.append(cur_sequence)
    sequences_printable.append(cur_printable)
    print(row_label, len(cur_sequence), cur_printable)

# dtype=object keeps the Python strings intact. A fixed-width '<U' array would
# strip trailing '\x00' characters and so could silently shorten a sequence.
sequences = np.array(sequences, dtype=object)
sequences_printable = np.array(sequences_printable, dtype=object)
print(len(sequences))
        

In [None]:
# Order both representations from shortest to longest.
# list.sort is stable, so equal-length sequences keep their original order.
sequences_sorted = list(sequences)
sequences_sorted.sort(key=len)

sequences_printable_sorted = list(sequences_printable)
sequences_printable_sorted.sort(key=len)

print(sequences_printable_sorted[:10])

In [None]:
# Locate the start of every run of equal-length sequences in sequences_sorted.
# sequence_length_indexes[j] is the position of the first sequence of the
# j-th distinct length; position 0 always starts the first run.
sorted_lengths = np.array([len(seq) for seq in sequences_sorted], dtype=int)

# A positive difference between neighbours marks a length increase; +1 turns
# the position of the difference into the position of the longer sequence.
remaining_indexes = np.flatnonzero(np.diff(sorted_lengths) > 0) + 1
sequence_length_indexes = np.append(np.array([0]), remaining_indexes)

print(sequence_length_indexes)
print(len(sequence_length_indexes))

In [None]:
# Sanity check of the LCS helper: compare one string against several
# candidates and show the best score among them.
test_a = 'abcd'
test_b = ['abc', 'abcd', 'abcde', 'abcdef']
test_lcs_lengths = LCS.lcs_string_of_list(test_a, test_b)
print(max(test_lcs_lengths))

In [None]:
# all_distributions[k] holds, for every sequence of the k-th distinct length,
# the best LCS score against any OTHER sequence that is at least as long.
all_distributions = []

# Explicit [start, stop) bounds for every length group; the end sentinel
# removes the special case for the last group.
group_bounds = list(sequence_length_indexes) + [len(sequences_sorted)]

# NOTE: this runs one LCS batch per sequence and can take a while on the full
# data set. The leftover debugging `break`s that stopped after the first
# sequence have been removed so the distributions are actually filled in.
for k, length_index in enumerate(sequence_length_indexes):
    kth_distribution = []

    # Disregard sequences shorter than the current length
    cur_sequences = sequences_sorted[length_index:]

    # Single out the sequences of exactly the current length
    short_sequences = sequences_sorted[length_index:group_bounds[k + 1]]

    # Get the LCS for each sequence of the current length
    for offset, short_sequence in enumerate(short_sequences):
        lcs_lengths = LCS.lcs_string_of_list(short_sequence, cur_sequences)
        # short_sequences is a prefix of cur_sequences, so `offset` is the
        # position of the sequence itself. Skip it: a self-comparison always
        # scores the full length and would make the maximum meaningless.
        # default=0 covers the case where no other candidate exists.
        best_other = max(
            (score for position, score in enumerate(lcs_lengths) if position != offset),
            default=0,
        )
        kth_distribution.append(best_other)

    all_distributions.append(kth_distribution)
    print(f'Length group {k}: processed {len(kth_distribution)} sequences')
