In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd


In [16]:
file_path = 'MS_01_T0_TCRB.tsv'
n_rows = 10000
# Read the CSV file
df = pd.read_csv(file_path, sep = "\t", nrows = n_rows)

In [18]:
df = df.dropna(subset=['aminoAcid'])

In [20]:
df.to_csv('MS_without_na.csv', index=False)

In [28]:
amino_acids = df["aminoAcid"].tolist()

In [30]:
amino_acids = [item for item in amino_acids if '*' not in item]

In [31]:
amino_acids_length_15 = [item for item in amino_acids if len(item) == 15]

In [32]:
amino_acids_length_15

['CASSLAGTPYNEQFF',
 'CASSQARGPATEAFF',
 'CSVVRVGETTDTQYF',
 'CAWSVAGLSGDEQYF',
 'CASSMEVGRTDTQYF',
 'CAWSVLGSGANVLTF',
 'CSIWASGLSSYEQYF',
 'CASSSPDILNTEAFF',
 'CAWSRIRAPRTEAFF',
 'CASSIDPEYTDTQYF',
 'CAWTGTLDRTYGYTF',
 'CAWRPWTSGRGEQYF',
 'CAWSTRQNSNQPQHF',
 'CAWSVPGQQNTEAFF',
 'CAWSLEGSGNQPQHF',
 'CAWSVIAANTGELFF',
 'CAWSVPGPFSYEQYV',
 'CAWSVQMGVRQPQHF',
 'CSVESDFTPNYGYTF',
 'CAWSKSWLQGTEAFF',
 'CSVGFGSDGNQPQHF',
 'CSAKRGGPNTGELFF',
 'CAWSVRGQGWQPQHF',
 'CASSWTAGTSYEQYF',
 'CATSALAGGPHEQYV',
 'CASSSLGTSAYEQYF',
 'CAWSELAGEWTEAFF',
 'CAWSVRVNSNQPQHF',
 'CSVEDQGDGHYEQYF',
 'CASSQDRASNGGYTF',
 'CAWSVWGAVTYEQFF',
 'CAWSTGKVLNQPQHF',
 'CASSLGMGPSNEQYF',
 'CAWGLSGRNTGELFF',
 'CASSSHRAANTEAFF',
 'CAWSRVAGGQSEQYV',
 'CSIRDVRKGGTEAFF',
 'CAWSTGFSSGNTIYF',
 'CAWRGLWHHYNEQFF',
 'CSVEDPRGATYEQYF',
 'CAWSVGGASSYEQYV',
 'CSVGATGVTNEKLFF',
 'CACVPTTGGYNEQFF',
 'CSVENGGSGEGTQYF',
 'CASSYPDRVTYEQYV',
 'CAWSWGQGRDQPQHF',
 'CASSSATGPYYGYTF',
 'CASSDTRSLNQPQHF',
 'CASSYDVRGLNEQFF',
 'CAWKGSPLFSAEAFF',


In [49]:
unique_letters = list(set(letter for item in amino_acids_length_15 for letter in item))

In [52]:
# Create a StringLookup layer to map letters to integer indices
letter_lookup = tf.keras.layers.StringLookup()

# Adapt the layer to the unique letters
letter_lookup.adapt(unique_letters)

# Create the reverse mapping from indices to letters (optional for decoding)
reverse_lookup = tf.keras.layers.StringLookup(vocabulary=letter_lookup.get_vocabulary(), invert=True)

# Define a function to transform a string into a sequence of numbers
def encode_string(input_string):
    return letter_lookup(tf.strings.unicode_split(input_string, 'UTF-8')).numpy()

# Transform each string in the list into a sequence of numbers
encoded_list = [encode_string(s) for s in amino_acids_length_15]

# Display the mappings and the encoded sequences
print("Unique letters:")
print(unique_letters)
print("\nLetter to number mapping:")
print({letter: letter_lookup(letter).numpy() for letter in unique_letters})
print("\nEncoded sequences:")
for original, encoded in zip(amino_acids_length_15, encoded_list):
    print(f"{original} -> {encoded}")

Unique letters:
['P', 'C', 'D', 'N', 'Q', 'I', 'S', 'W', 'M', 'G', 'A', 'R', 'F', 'E', 'H', 'T', 'Y', 'V', 'K', 'L']

Letter to number mapping:
{'P': 8, 'C': 19, 'D': 18, 'N': 9, 'Q': 7, 'I': 13, 'S': 5, 'W': 2, 'M': 10, 'G': 15, 'A': 20, 'R': 6, 'F': 16, 'E': 17, 'H': 14, 'T': 4, 'Y': 1, 'V': 3, 'K': 12, 'L': 11}

Encoded sequences:
CASSLAGTPYNEQFF -> [19 20  5  5 11 20 15  4  8  1  9 17  7 16 16]
CASSQARGPATEAFF -> [19 20  5  5  7 20  6 15  8 20  4 17 20 16 16]
CSVVRVGETTDTQYF -> [19  5  3  3  6  3 15 17  4  4 18  4  7  1 16]
CAWSVAGLSGDEQYF -> [19 20  2  5  3 20 15 11  5 15 18 17  7  1 16]
CASSMEVGRTDTQYF -> [19 20  5  5 10 17  3 15  6  4 18  4  7  1 16]
CAWSVLGSGANVLTF -> [19 20  2  5  3 11 15  5 15 20  9  3 11  4 16]
CSIWASGLSSYEQYF -> [19  5 13  2 20  5 15 11  5  5  1 17  7  1 16]
CASSSPDILNTEAFF -> [19 20  5  5  5  8 18 13 11  9  4 17 20 16 16]
CAWSRIRAPRTEAFF -> [19 20  2  5  6 13  6 20  8  6  4 17 20 16 16]
CASSIDPEYTDTQYF -> [19 20  5  5 13 18  8 17  1  4 18  4  7  1 16]
CAWT

In [57]:
encoded_list

[array([19, 20,  5,  5, 11, 20, 15,  4,  8,  1,  9, 17,  7, 16, 16]),
 array([19, 20,  5,  5,  7, 20,  6, 15,  8, 20,  4, 17, 20, 16, 16]),
 array([19,  5,  3,  3,  6,  3, 15, 17,  4,  4, 18,  4,  7,  1, 16]),
 array([19, 20,  2,  5,  3, 20, 15, 11,  5, 15, 18, 17,  7,  1, 16]),
 array([19, 20,  5,  5, 10, 17,  3, 15,  6,  4, 18,  4,  7,  1, 16]),
 array([19, 20,  2,  5,  3, 11, 15,  5, 15, 20,  9,  3, 11,  4, 16]),
 array([19,  5, 13,  2, 20,  5, 15, 11,  5,  5,  1, 17,  7,  1, 16]),
 array([19, 20,  5,  5,  5,  8, 18, 13, 11,  9,  4, 17, 20, 16, 16]),
 array([19, 20,  2,  5,  6, 13,  6, 20,  8,  6,  4, 17, 20, 16, 16]),
 array([19, 20,  5,  5, 13, 18,  8, 17,  1,  4, 18,  4,  7,  1, 16]),
 array([19, 20,  2,  4, 15,  4, 11, 18,  6,  4,  1, 15,  1,  4, 16]),
 array([19, 20,  2,  6,  8,  2,  4,  5, 15,  6, 15, 17,  7,  1, 16]),
 array([19, 20,  2,  5,  4,  6,  7,  9,  5,  9,  7,  8,  7, 14, 16]),
 array([19, 20,  2,  5,  3,  8, 15,  7,  7,  9,  4, 17, 20, 16, 16]),
 array([19, 20,  2, 

In [60]:
sequence_length = np.array(encoded_list).shape[1] - 1

X = []
y = []

for row in encoded_list:
    X.append(row[:-5])
    y.append(row[-5])

X = np.array(X)
y = np.array(y)

print("Input sequences (X):")
print(X)
print("\nTarget sequences (y):")
print(y)

Input sequences (X):
[[19 20  5 ...  4  8  1]
 [19 20  5 ... 15  8 20]
 [19  5  3 ... 17  4  4]
 ...
 [19 20  5 ...  6  9  4]
 [19 20  5 ... 20 17  9]
 [19 20  4 ... 15  3  9]]

Target sequences (y):
[ 9  4 18 ... 15  7  4]


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Define parameters
vocab_size = 27  # Assuming 26 letters plus a special character (e.g., space or padding)
embedding_dim = 10
rnn_units = 64
batch_size = 1  # Use batch size 1 for prediction later

# Build the model
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=sequence_length),
    LSTM(rnn_units, return_sequences=False),
    Dense(vocab_size, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()
Step 3: Train the Model
python
Copy code
# Train the model
model.fit(X, y, epochs=50, batch_size=1)
Step 4: Make Predictions
python
Copy code
# Function to encode a single string
def encode_string(input_string):
    return np.array([letter_lookup(tf.strings.unicode_split(input_string, 'UTF-8')).numpy()])

# Predict the next letter for a given sequence
def predict_next_letter(sequence):
    encoded_sequence = encode_string(sequence)
    prediction = model.predict(encoded_sequence)
    predicted_index = np.argmax(prediction)
    return predicted_index

# Example: Predict the next letter for a given input sequence
input_sequence = X[0]  # Use the first sequence as an example
predicted_index = predict_next_letter(input_sequence)
print(f"Predicted index: {predicted_index}")