In [19]:
import pandas as pd
import numpy as np
# Load the dataset from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='only_five_missing.csv')

In [76]:
# One-hot encoding function: turns a nucleotide string into a (seq_len, 5) numpy array
def one_hot_encoding(nucleotide: str, seq_len: int) -> np.ndarray:
    """One-hot encode a nucleotide string into a fixed-size ``(seq_len, 5)`` array.

    Columns 0-3 correspond to 'A', 'C', 'G' and 'T'. Every other character
    (including lowercase letters, which are not keys of the mapping) is
    encoded in column 4. Sequences longer than ``seq_len`` are truncated;
    shorter ones are padded at the end with all-zero rows.

    Args:
        nucleotide: The sequence to encode.
        seq_len: Number of rows of the returned array; must be non-negative.

    Returns:
        A float array of shape ``(seq_len, 5)`` with at most one 1.0 per row.

    Raises:
        ValueError: If ``seq_len`` is negative.
    """
    if seq_len < 0:
        # A negative length would make the slice below drop characters from the
        # END of the sequence and silently return an array of the wrong size.
        raise ValueError(f'seq_len must be non-negative, got {seq_len}')

    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    # Slicing truncates over-long sequences and is a no-op for shorter ones.
    # Explicit integer dtype keeps the index array valid even when it is empty.
    indices = np.array(
        [mapping.get(base, 4) for base in nucleotide[:seq_len]],
        dtype=np.intp,
    )

    # Start from all zeros so the rows past the end of the sequence are
    # already the padding; then set a single 1.0 in each encoded row.
    encoded_sequence = np.zeros((seq_len, 5))
    encoded_sequence[np.arange(len(indices)), indices] = 1.0

    return encoded_sequence

In [21]:
# Keep only the rows whose 'nucleotide' value is present (not missing).
df = df[df['nucleotide'].notna()]

In [63]:
# Usage example of function one_hot_encoding

# Positional access (.iloc[0]) instead of df['nucleotide'][0]: the latter is a
# label lookup and raises KeyError when the row labelled 0 is no longer in df
# (for example because it was removed for having a missing nucleotide).
nucleotide = df['nucleotide'].iloc[0]
print(f'Original nucleotide:\n\n{nucleotide}\n\n')

# Encode the sequence into a fixed 658-row array (truncated or zero-padded).
encoded = one_hot_encoding(nucleotide, 658)
print(f'Encoded nucleotide with dimensions {encoded.shape}:\n\n{encoded}\n')


Original nucleotide:

TACATTATATTTTATTTTTGGAATTTGAGCTGGTATAGTTGGAACTTCATTAAGATTACTAATTCGAGCTGAATTAGGAACCCCCGGATCTTTAATTGGAGATGATCAAATTTATAATACTATTGTAACAGCTCATGCTTTTATTATAATTTTTTTTATAGTTATACCTATTATAATTGGAGGATTTGGTAATTGACTTGTTCCTTTAATATTAGGAGCTCCTGATATAGCATTTCCACGAATAAATAATATAAGTTTTTGATTACTCCCCCCTTCTTTAACTTTATTAATTTCAAGTAGAATTGTAGAAAATGGAGCAGGAACAGGATGAACAGTTTACCCCCCTCTCTCATCTAACATTGCTCATGGGGGAAGATCAGTAGATTTAGCCATTTTTTCTCTTCATCTTGCTGGTATTTCTTCTATTTTAGGAGCTATTAATTTTATTACTACAATTATTAATATACGATTAAACAGTTTATCTTTTGATCAAATACCTTTATTTATTTGAGCGGTAGGAATTACTGCATTTTTATTATTATTATCTTTACCTGTTTTAGCTGGAGCTATTACTATACTTTTAACTGATCGAAATCTTAATACATCTTTTTTCGATCCGGCAGGAGGAGGAGATCCAATTTTATATCAACATTTATTT


Encoded nucleotide with dimensions (658, 5):

[[0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]]



In [81]:
# Build a list holding the one-hot encoding (658 rows each) of every
# sequence in the 'nucleotide' column, in row order.
nucleotides = [
    one_hot_encoding(sequence, 658)
    for sequence in df['nucleotide'].values
]

In [82]:
# Display the full list of encoded arrays as the cell output.
# NOTE(review): this renders every array in the list; consider showing only
# len(nucleotides) and nucleotides[0].shape to keep the output short.
nucleotides

[array([[0., 0., 0., 1., 0.],
        [1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        ...,
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0.]]),
 array([[1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0.],
        ...,
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0.]]),
 array([[1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        ...,
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0.]]),
 array([[0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        ...,
        [0., 0., 0., 1., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0.]]),
 array([[0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        ...,
        [0., 0., 0., 1., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0.]]),
 array([[1., 0., 0., 0., 