In [1]:
# import packages
import os, sys, glob
import numpy as np
import pandas as pd
import multiprocessing as mp

from functools import partial

sys.path.append('../')

In [2]:
# load data such that each row is a sample with one column for name and one for sequence
def load_data(filepath):
    with open(filepath, 'r') as f:
        lines = f.readlines()
        names = []
        seqs = []
        for i in range(0, len(lines), 2):
            names.append(lines[i].strip())
            seqs.append(lines[i+1].strip())
    return pd.DataFrame({'name': names, 'sequence': seqs})

seqs = load_data('../data/completeseq.oneline.filtered.170.to.300.txt')
seqs.head()

Unnamed: 0,name,sequence
0,>3LCC_1|Chain,MAEEQQNSDQSNGGNVIPTPEEVATFLHKTVEEGGWEKCWEEEITP...
1,>Arabidopsis_thaliana_1_,MAEEQQNSSYSIGGNILPTPEEAATFQPQVVAEGGWDKCWEDGVTP...
2,>XP_023640711.1,MAEEQQHSSGYSSGGNVIPTPEEAATFQPQVVAEGGWDKCWEDKVT...
3,>XP_010517973.1,MAEKQQSSNYSSGGNIIPTPEEAATFQPQVVAEGGWDKCWEDKVTP...
4,>XP_019101055.1,MAEKQQSSGYSSGGNIIPTPEEAATFQPQVVAEGGWDKCWEDKVTP...


In [3]:
# check sequence lengths
np.unique(seqs['sequence'].apply(len))

array([171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183,
       184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196,
       197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
       210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
       223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235,
       236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248,
       249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261,
       262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274,
       275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287,
       288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300])

In [4]:
# get list of unique amino acids
amino_acids = list(set(''.join(seqs['sequence'])))
amino_acids.sort()
len(amino_acids)

21

In [5]:
# create a dictionary mapping amino acids to specific features
# first, define features we want to include

# 1. hydrophobicity (ordinal)
hydrophobic = ['A', 'C', 'I', 'L', 'M', 'F', 'V', 'W'] # 0
neutral = ['G', 'H', 'P', 'S', 'T', 'Y'] # 1
hydrophilic = ['R', 'N', 'D', 'Q', 'E', 'K'] # 2

hydrophobicity = {0: hydrophobic, 
                  1: neutral, 
                  2: hydrophilic}

# 2. volume (ordinal)
very_small = ['A', 'G', 'S'] #0
small = ['N', 'D', 'C', 'P', 'T'] #1
medium = ['Q', 'E', 'H', 'V'] #2
large = ['R', 'I', 'L', 'K', 'M'] #3
very_large = ['F', 'W', 'Y'] #4

volume = {0: very_small, 
          1: small, 
          2: medium, 
          3: large, 
          4: very_large}

# 3. chemical (arbitrary)
aliphatic = ['A', 'G', 'I', 'L', 'P', 'V']
aromatic = ['F', 'W', 'Y']
sulfur = ['C', 'M']
hydroxyl = ['S', 'T']
basic = ['R', 'H', 'K']
acidic = ['D', 'E']
amide = ['N', 'Q']

chemical = {'aliphatic': aliphatic, 
            'aromatic': aromatic, 
            'sulfur': sulfur, 
            'hydroxyl': hydroxyl, 
            'basic': basic, 
            'acidic': acidic, 
            'amide': amide}

# 4. charge (ordinal)
positive = ['R', 'H', 'K'] #1
negative = ['D', 'E'] #-1
uncharged = ['A', 'N', 'C', 'Q', 'G', 'I', 'L', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] #0

charge = {1: positive, 
          0: negative, 
          int(-1): uncharged}

# 5. hydrogen donor/acceptor (arbitrary)
donor = ['R', 'K', 'W']
acceptor = ['D', 'E']
both = ['N', 'Q', 'H', 'S', 'T', 'Y']
none = ['A', 'C', 'G', 'I', 'L', 'M', 'F', 'P', 'V']

hydrogen = {'d': donor, 
            'a': acceptor, 
            'b': both,
            'n': none}

# 6. polarity (ordinal)
polar = ['R', 'N', 'D', 'Q', 'E', 'H', 'K', 'S', 'T', 'Y'] #1
nonpolar = ['A', 'C', 'G', 'I', 'L', 'M', 'F', 'P', 'W', 'V'] #0

polarity = {1: polar, 
            0: nonpolar}

# allows us to map amino acids to features, creating a (num_samples, seq_length, num_features) tensor
feature_list = [hydrophobicity, volume, chemical, charge, hydrogen, polarity]

In [6]:
# create feature dictionary that contains the properties of each amino acid
feature_dict = {}

# loop through each amino acid and create a dictionary of features

# loop through amino acids
for aa in amino_acids:

    # X is a placeholder for padding - it has no features
    if aa == 'X':
        features = ['NA', 'NA', 'NA', 'NA', 'NA', 'NA']
        feature_dict[aa] = features
    
    else:
        # make list of features for each amino acid
        features = []

        # loop through each feature
        for feature in feature_list:

            # loop through each key in the feature dictionary
            for key, list in feature.items():
                if aa in list:
                    features.append(key)
        feature_dict[aa] = features

# print dictionary for feature conversion
feature_dict

{'A': [0, 0, 'aliphatic', -1, 'n', 0],
 'C': [0, 1, 'sulfur', -1, 'n', 0],
 'D': [2, 1, 'acidic', 0, 'a', 1],
 'E': [2, 2, 'acidic', 0, 'a', 1],
 'F': [0, 4, 'aromatic', -1, 'n', 0],
 'G': [1, 0, 'aliphatic', -1, 'n', 0],
 'H': [1, 2, 'basic', 1, 'b', 1],
 'I': [0, 3, 'aliphatic', -1, 'n', 0],
 'K': [2, 3, 'basic', 1, 'd', 1],
 'L': [0, 3, 'aliphatic', -1, 'n', 0],
 'M': [0, 3, 'sulfur', -1, 'n', 0],
 'N': [2, 1, 'amide', -1, 'b', 1],
 'P': [1, 1, 'aliphatic', -1, 'n', 0],
 'Q': [2, 2, 'amide', -1, 'b', 1],
 'R': [2, 3, 'basic', 1, 'd', 1],
 'S': [1, 0, 'hydroxyl', -1, 'b', 1],
 'T': [1, 1, 'hydroxyl', -1, 'b', 1],
 'V': [0, 2, 'aliphatic', -1, 'n', 0],
 'W': [0, 4, 'aromatic', -1, 'd', 0],
 'X': ['NA', 'NA', 'NA', 'NA', 'NA', 'NA'],
 'Y': [1, 4, 'aromatic', -1, 'b', 1]}

In [7]:
# convert sequences to feature tensors
def seq_to_tensor(seq, feature_dict):
    # initialize tensor
    tensor = []
    
    # loop through each amino acid in the sequence
    for aa in seq:
        # get features for amino acid
        features = feature_dict[aa]
        tensor.append(features)
        
    return np.array(tensor)


In [8]:
# use pool to apply conversion function to all sequences
with mp.Pool(mp.cpu_count()) as pool:
    seq_tensors = pool.map(partial(seq_to_tensor, feature_dict=feature_dict), seqs['sequence'])
    names = seqs['name']

In [9]:
# check that all sequences are the same length (THEY ARE NOT)
np.unique([len(seq) for seq in seq_tensors])

array([171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183,
       184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196,
       197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
       210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
       223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235,
       236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248,
       249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261,
       262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274,
       275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287,
       288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300])