## Imports

In [2]:
import Bio
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import numpy as np
import pandas as pd
import sklearn

import torch
import torch.nn as nn
import torch.nn.functional as F
#import transformers
#from transformers import AutoTokenizer, AutoModel

## Create dictionary of sequences for each location

In [2]:
# Directory holding the per-location FASTA files, relative to this notebook.
# Change it here once if the data lives elsewhere.
DATA_DIR = "../data"

# One FASTA file per subcellular location, plus the unlabeled "blind" set.
# "/" is used as the separator on purpose so the resulting strings are the same
# on every platform.
blind_path = f"{DATA_DIR}/blind.fasta.txt"
cyto_path = f"{DATA_DIR}/cyto.fasta.txt"
mito_path = f"{DATA_DIR}/mito.fasta.txt"
nucleus_path = f"{DATA_DIR}/nucleus.fasta.txt"
other_path = f"{DATA_DIR}/other.fasta.txt"
secreted_path = f"{DATA_DIR}/secreted.fasta.txt"

In [3]:
def read_fasta(file):
    """
    This function takes a fasta file and outputs a dictionary of the sequences
    Input: - file: path to a fasta-formatted text file
    Output: - dict with keys (sequence header, without the leading ">" and
              stripped of surrounding whitespace) and values (the record's
              sequence lines, each stripped, joined into one string)

    Edge cases:
    - An empty file (or one with no header line at all) gives an empty dict.
    - Lines that appear before the first header belong to no record and are ignored.
    - A record whose header is empty (a bare ">") is stored under the key "".
    - If the same header appears more than once, the last record wins.
    """
    sequences = {}
    # None means "no record opened yet"; this is distinct from "", which is a
    # legal (if unusual) empty header.
    header = None
    # Sequence lines are collected in a list and joined once per record, which
    # avoids repeatedly re-copying a growing string.
    chunks = []
    with open(file, 'r') as f:
        for line in f:
            #in a fasta file a record starts with a line whose first character is a > sign
            if line.startswith(">"):
                if header is not None:
                    sequences[header] = "".join(chunks)
                header = line[1:].strip()
                chunks = []
            elif header is not None:
                chunks.append(line.strip())
    # Flush the last record; skipped when the file contained no header at all.
    if header is not None:
        sequences[header] = "".join(chunks)
    return sequences

In [4]:
# Parse every FASTA file into its own {header: sequence} dictionary.
# The files are read in the order listed; the names on the left line up
# one-to-one with the paths on the right.
(
    blind_sequences,
    cyto_sequences,
    mito_sequences,
    nucleus_sequences,
    other_sequences,
    secreted_sequences,
) = [
    read_fasta(fasta_path)
    for fasta_path in (
        blind_path,
        cyto_path,
        mito_path,
        nucleus_path,
        other_path,
        secreted_path,
    )
]

## Exploring the suggested correlated features

In [5]:
# NOTE(review): the next cell defines an identical seq_len again; one of the
# two definitions can be dropped.
def seq_len(seq):
    """Return the length of ``seq`` (its number of characters/residues)."""
    n_residues = len(seq)
    return n_residues

In [6]:
# NOTE(review): identical to the seq_len defined in the previous cell; this
# definition silently replaces it when both cells are run.
def seq_len(seq):
    """Return the length of ``seq`` (its number of characters/residues)."""
    n_residues = len(seq)
    return n_residues

def global_comp(seq):
    """Count the amino acids of the whole sequence.

    Input: - seq: protein sequence
    Output: - the result of ``ProteinAnalysis(seq).count_amino_acids()``
    """
    return ProteinAnalysis(seq).count_amino_acids()

def aa_pct(seq):
    """Relative amino-acid composition of the whole sequence.

    Input: - seq: protein sequence
    Output: - the result of ``ProteinAnalysis(seq).get_amino_acids_percent()``
              (NOTE(review): whether the values are fractions or percentages
              depends on the Biopython version -- confirm before using them
              as features)
    """
    return ProteinAnalysis(seq).get_amino_acids_percent()

def local_comp(seq, x,y):
    """Count the amino acids inside the slice ``seq[x:y]``.

    Input: - seq: protein sequence
           - x, y: slice bounds, applied as ``seq[x:y]``
    Output: - the result of ``count_amino_acids()`` for that slice
    """
    window = seq[x:y]
    return ProteinAnalysis(window).count_amino_acids()

def aa_local_pct(seq, x,y):
    """Relative amino-acid composition inside the slice ``seq[x:y]``.

    Input: - seq: protein sequence
           - x, y: slice bounds, applied as ``seq[x:y]``
    Output: - the result of ``get_amino_acids_percent()`` for that slice
    """
    window = seq[x:y]
    return ProteinAnalysis(window).get_amino_acids_percent()

def molecular_weight(seq):
    """Molecular weight of the sequence.

    Input: - seq: protein sequence
    Output: - the result of ``ProteinAnalysis(seq).molecular_weight()``
    """
    return ProteinAnalysis(seq).molecular_weight()

def isoelectric_pt(seq):
    """Isoelectric point of the sequence.

    Input: - seq: protein sequence
    Output: - the result of ``ProteinAnalysis(seq).isoelectric_point()``
    """
    return ProteinAnalysis(seq).isoelectric_point()

In [8]:
#for i in blind_sequences.values():
    #print(aa_local_pct(i, 0,500))

## Exploring other features

## Model building and training

LSTM

In [3]:
class ProteinClassifier(nn.Module):
    """Embedding -> LSTM -> linear layer -> softmax classifier.

    Args:
        vocab_size: number of distinct token ids the embedding table holds.
        embedding_dim: size of each token embedding (also the LSTM input size).
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output classes.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        # Attribute names are part of the model's state_dict keys; keep them stable.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class probabilities for a batch of token-id sequences.

        ``x`` is a batched tensor of token ids, (batch, seq_len); the output is
        (batch, output_dim) and each row sums to 1.
        """
        embedded = self.embedding(x)
        lstm_out, _state = self.lstm(embedded)
        # Only the output at the last time step is used.
        # NOTE(review): if batches are right-padded, this position is padding for
        # the shorter sequences -- confirm how inputs are batched.
        last_step = lstm_out[:, -1, :]
        logits = self.fc(last_step)
        # NOTE(review): probabilities are returned, not logits. A loss such as
        # nn.CrossEntropyLoss applies log-softmax itself -- confirm against the
        # training loop that softmax is not applied twice.
        return F.softmax(logits, dim=-1)


Transformer

In [1]:
import transformers
from transformers import AutoTokenizer, AutoModel

In [2]:
# Load a tokenizer (turns text into padded numerical token ids) together with
# its model. Both MUST come from the same checkpoint: the token ids produced
# by the tokenizer index into that model's vocabulary, so pairing the "cased"
# tokenizer with the "uncased" model feeds the model ids that mean something else.
# NOTE(review): this is a general-English BERT checkpoint; a checkpoint trained on
# protein sequences may be a better fit for this task -- confirm.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

KeyboardInterrupt: 

## Testing