In [1]:
import glob

import numpy as np
import pandas as pd


#####
# Getting amino acid sequences
#####

def oneHot(residue):
    """
    Convert a single amino-acid letter to a one-hot vector of length 20.

    The vector index follows the alphabet "ACDEFGHIKLMNPQRSTVWY". Any input
    that is not exactly one of these 20 upper-case letters (unknown residue
    codes, lower-case letters, the empty string, multi-character strings)
    is encoded as an all-zero vector.

    Parameters
    ----------
    residue : str
        One-letter amino-acid code.

    Returns
    -------
    numpy.ndarray
        Float array of shape (20,).

    Example usage (encode a sequence residue by residue):
    seq = "GSHSMRY"
    np.array([oneHot(res) for res in seq])
    """
    mapping = dict(zip("ACDEFGHIKLMNPQRSTVWY", range(20)))
    # Test membership against the dictionary keys. A substring test against
    # the alphabet string would also accept "" and runs such as "AC", which
    # then fail with KeyError on the mapping lookup below.
    if residue in mapping:
        return np.eye(20)[mapping[residue]]
    return np.zeros(20)
    
def reverseOneHot(encoding):
    """
    Decode a one-hot matrix into its amino-acid string.

    Each row of `encoding` is read as one position; the column holding the
    row maximum selects the letter from "ACDEFGHIKLMNPQRSTVWY". Rows whose
    maximum is not positive are skipped and contribute nothing to the result.
    """
    alphabet = dict(enumerate("ACDEFGHIKLMNPQRSTVWY"))
    letters = [
        alphabet[np.argmax(row)]
        for row in encoding
        if np.max(row) > 0
    ]
    return "".join(letters)

def extract_sequences(dataset_X, mhc_end=180, pep_end=192):
    """
    Return DataFrame with MHC, peptide and TCR a/b sequences from
    one-hot encoded complex sequences in dataset X

    Each complex is cut by row position *before* it is decoded.
    reverseOneHot drops all-zero rows, so decoding the whole complex first
    and cutting the resulting string at fixed offsets would shift every
    later boundary by the number of zero rows that precede it.

    Parameters
    ----------
    dataset_X : iterable of 2-D arrays
        One (positions, features) array per complex. Only the first 20
        feature columns are read as the one-hot part.
    mhc_end : int, default 180
        Rows [0, mhc_end) are decoded into the "MHC" column.
    pep_end : int, default 192
        Rows [mhc_end, pep_end) are decoded into the "peptide" column and
        all rows from pep_end onward into the "tcra" column.

    Returns
    -------
    pandas.DataFrame
        One row per complex with string columns "MHC", "peptide" and "tcra".
        NOTE(review): "tcra" receives everything after the peptide, which
        presumably is the concatenated TCR alpha and beta chains - the key
        is kept as-is because callers may index by it.
    """
    mhc_sequences = []
    pep_sequences = []
    tcrab_sequences = []
    for arr in dataset_X:
        one_hot = arr[:, 0:20]
        mhc_sequences.append(reverseOneHot(one_hot[0:mhc_end]))
        pep_sequences.append(reverseOneHot(one_hot[mhc_end:pep_end]))
        tcrab_sequences.append(reverseOneHot(one_hot[pep_end:]))

    df_sequences = pd.DataFrame({"MHC":mhc_sequences, "peptide":pep_sequences,
                                 "tcra":tcrab_sequences})
    return df_sequences


#####
# Load data and convert to amino acid sequences
#####

def load_data(data_dir="./data/train"):
    """
    Load every `*input.npz` archive in `data_dir` and decode it to sequences.

    For each input archive the matching labels archive is expected next to
    it, with the trailing "input.npz" of the file name replaced by
    "labels.npz". Both archives are read through their "arr_0" entry. The
    stacked shapes of inputs and labels are printed; only the decoded input
    sequences are returned.

    Parameters
    ----------
    data_dir : str or path-like, default "./data/train"
        Directory that holds the `*input.npz` / `*labels.npz` pairs.

    Returns
    -------
    pandas.DataFrame
        Result of extract_sequences() on the concatenated input arrays.

    Raises
    ------
    ValueError
        If no `*input.npz` file is found in `data_dir`.
    """
    input_suffix = "input.npz"
    # glob returns matches in arbitrary order; sorting keeps the row order
    # of the concatenated data reproducible between runs and machines.
    input_paths = sorted(glob.glob("{}/*{}".format(data_dir, input_suffix)))
    if not input_paths:
        raise ValueError(
            "no '*{}' files found in {!r}".format(input_suffix, str(data_dir))
        )

    data_list = []
    target_list = []
    for fp in input_paths:
        # Swap only the file-name suffix, so an "input" elsewhere in the
        # path (for example in a directory name) is left untouched.
        labels_fp = fp[:-len(input_suffix)] + "labels.npz"

        # np.load on an .npz returns a file-backed archive; the context
        # manager closes the handle once the array has been read.
        with np.load(fp) as archive:
            data_list.append(archive["arr_0"])
        with np.load(labels_fp) as archive:
            target_list.append(archive["arr_0"])

    X_data = np.concatenate(data_list)
    Y_data = np.concatenate(target_list)
    print("X data shape ", X_data.shape)
    print("Y data shape ", Y_data.shape)

    X_sequences = extract_sequences(X_data)
    return X_sequences


In [2]:
# Decode the training complexes into a DataFrame of sequence strings.
# load_data() also prints the shapes of the stacked X and Y arrays.
seqs = load_data()

X data shape  (5706, 420, 54)
Y data shape  (5706,)


In [3]:
# Preview the first rows of the decoded "MHC" column.
seqs["MHC"].head()

0    GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...
1    GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...
2    GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...
3    GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...
4    GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...
Name: MHC, dtype: object

In [4]:
from allennlp.commands.elmo import ElmoEmbedder
from pathlib import Path

# Build an ElmoEmbedder from the model files stored under ./data/seqvec.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'allennlp'", so the package has to be
# installed in the kernel environment before this cell can run.
model_dir = Path('./data/seqvec')
weights = model_dir / 'weights.hdf5'
options = model_dir / 'options.json'
# NOTE(review): cuda_device=0 presumably selects the first GPU - confirm the
# CPU fallback value against the allennlp documentation for GPU-less machines.
embedder = ElmoEmbedder(options, weights, cuda_device=0)

ModuleNotFoundError: No module named 'allennlp'

In [20]:
# Load one saved batch from ./data/train/tcra.
# NOTE(review): presumably these are embeddings computed outside this
# notebook - the code that writes embedding_batch_0.npy is not shown here.
arr = np.load("./data/train/tcra/embedding_batch_0.npy")


In [27]:
# A notebook cell only echoes its last expression, so a bare `arr.shape` on
# an earlier line is never displayed; print it explicitly instead.
print(arr.shape)
# Inspect the entry at index (0, 0, 207), using one multi-axis index rather
# than three chained lookups.
arr[0, 0, 207]

array([0., 0., 0., ..., 0., 0., 0.])

In [15]:
# Write `arr` to disk under ./data.
# NOTE(review): this cell's execution count (15) is lower than that of the
# cell that loads `arr` (20), so the array saved here may not be the one
# loaded above - confirm with Restart Kernel -> Run All.
np.save("./data/arr64", arr)