In [6]:
import torch
from torch import nn
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import Dataset, DataLoader, RandomSampler, TensorDataset
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torchmetrics
import re
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

## Data Creation

In [20]:
# One FASTA file per labeled location class ...
cyto_path, mito_path, nucleus_path, other_path, secreted_path = (
    "../data/cyto.fasta.txt",
    "../data/mito.fasta.txt",
    "../data/nucleus.fasta.txt",
    "../data/other.fasta.txt",
    "../data/secreted.fasta.txt",
)
# ... plus the separate "blind" set.
blind_path = "../data/blind.fasta.txt"

In [21]:
def read_fasta(file):
    """
    Parse a FASTA file into a dictionary mapping each header to its sequence.

    A record starts at a line beginning with ">"; the rest of that line
    (stripped) is the header. Every following line, up to the next header,
    is stripped and concatenated to form the sequence.

    Input: - file: path of the FASTA file to read
    Output: - dict with keys (sequence header) and values (sequence); the
              dict is empty when the file contains no usable header line

    Notes:
    - Text that appears before the first header is ignored.
    - Records whose header is empty (a bare ">") are skipped.
    - If the same header occurs more than once, the last record wins,
      because headers are used as dictionary keys.
    """
    sequences = {}
    header = ""
    chunks = []
    with open(file, 'r') as f:
        for line in f:
            if line.startswith(">"):
                # A new record begins: store the one collected so far.
                if header:
                    sequences[header] = "".join(chunks)
                header = line[1:].strip()
                chunks = []
            else:
                chunks.append(line.strip())
    # Flush the final record. Guarded so that an empty or header-less file
    # does not produce a spurious {"": ...} entry.
    if header:
        sequences[header] = "".join(chunks)
    return sequences

In [46]:
def _labeled_frame(sequences, label):
    """Return a frame with columns Label and Sequences: one row per value of `sequences`, all tagged `label`."""
    values = list(sequences.values())
    return pd.DataFrame({'Label': [label] * len(values), 'Sequences': values})


def _mask_uzob(sequences):
    """Return a list in which every U, Z, O or B character of each sequence is replaced by X."""
    return [re.sub(r"[UZOB]", "X", sample) for sample in sequences]


# Parse one {header: sequence} dictionary per FASTA file.
blind_sequences = read_fasta(blind_path)
cyto_sequences = read_fasta(cyto_path)
mito_sequences = read_fasta(mito_path)
nucleus_sequences = read_fasta(nucleus_path)
other_sequences = read_fasta(other_path)
secreted_sequences = read_fasta(secreted_path)

# One frame per file. The FASTA headers are not kept; 'Label' holds the class name.
df_cyto = _labeled_frame(cyto_sequences, 'cyto')
df_mito = _labeled_frame(mito_sequences, 'mito')
df_nucleus = _labeled_frame(nucleus_sequences, 'nucleus')
df_other = _labeled_frame(other_sequences, 'other')
df_secreted = _labeled_frame(secreted_sequences, 'secreted')
df_blind = _labeled_frame(blind_sequences, 'blind')

# The blind sequences get the same U/Z/O/B -> X substitution as the labeled
# data below, so both inputs are preprocessed identically.
# NOTE(review): presumably this matches the pretrained tokenizer's expected
# alphabet -- confirm against the model card.
df_blind['Sequences'] = _mask_uzob(df_blind['Sequences'])

# Stack the five labeled classes (the blind set stays separate) and replace
# the string label by one indicator column per class.
df_labeled = pd.concat(
    [df_cyto, df_mito, df_nucleus, df_other, df_secreted],
    axis=0,
    ignore_index=True,
)
one_hot = pd.get_dummies(df_labeled['Label'])
df = pd.concat([df_labeled[['Sequences']], one_hot], axis=1)
df['Sequences'] = _mask_uzob(df['Sequences'])

In [31]:
# Remove the placeholder 'Label' column from the blind frame. Rebinding
# (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run after the column is already gone.
df_blind = df_blind.drop(columns=['Label'], errors='ignore')

In [34]:
# Names of the label columns.
target_list = [
    'cyto',
    'mito',
    'nucleus',
    'other',
    'secreted',
]

In [35]:
def train_test_split(df, validation_set=False, train_size=0.8, random_state=200):
    """
    Randomly split a DataFrame into train/test (and optionally validation) parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split. The frame itself is not modified.
    validation_set : bool, default False
        When True, the training part is split once more (with the same
        `train_size` fraction) to carve out a validation set.
    train_size : float, default 0.8
        Fraction of rows kept on the training side of each split; must lie
        strictly between 0 and 1.
    random_state : int, default 200
        Seed passed to `DataFrame.sample`, making the split reproducible.

    Returns
    -------
    (train_df, test_df) when `validation_set` is False, otherwise
    (train_df, test_df, val_df) -- note that the test frame comes second.
    Every returned frame has a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If `train_size` is not strictly between 0 and 1.
    """
    if not 0.0 < train_size < 1.0:
        raise ValueError("train_size must be strictly between 0 and 1")

    # Work on a fresh 0..n-1 index: the complement below is taken by dropping
    # the sampled index labels, which only removes exactly the sampled rows
    # when labels are unique. The sampled rows themselves are unaffected.
    rows = df.reset_index(drop=True)

    train_df = rows.sample(frac=train_size, random_state=random_state)
    test_df = rows.drop(train_df.index).reset_index(drop=True)
    train_df = train_df.reset_index(drop=True)

    if not validation_set:
        return train_df, test_df

    # Second split: carve the validation rows out of the training part.
    train2_df = train_df.sample(frac=train_size, random_state=random_state)
    val_df = train_df.drop(train2_df.index).reset_index(drop=True)
    train2_df = train2_df.reset_index(drop=True)

    return train2_df, test_df, val_df

# Three-way split; the function returns the frames in (train, test, validation) order.
train_df, test_df, val_df = train_test_split(df=df, validation_set=True)

In [36]:
# Cast the five label columns of every split to float.
for split_df in (train_df, test_df, val_df):
    split_df[['cyto','mito','nucleus','other','secreted']] = (
        split_df[['cyto','mito','nucleus','other','secreted']].astype('float')
    )

In [37]:
# Put a single space between the characters of every sequence. Existing
# whitespace is stripped first, so re-running this cell leaves already
# spaced sequences unchanged instead of widening the gaps.
for split_df in (train_df, test_df, val_df):
    split_df['Sequences'] = split_df['Sequences'].apply(
        lambda seq: ' '.join(''.join(seq.split()))
    )


In [44]:
# Display the validation split for a visual check.
val_df

Unnamed: 0,Sequences,cyto,mito,nucleus,other,secreted
0,M N A F Q D F E L G A K L Y L Q C L L S L S S ...,0.0,0.0,1.0,0.0,0.0
1,M A E I I Q E R I E D R L P E L E Q L E R I G ...,0.0,0.0,1.0,0.0,0.0
2,M G L K E E F E E H A E K V N T L T E L P S N ...,1.0,0.0,0.0,0.0,0.0
3,M G A A R L L P N L T L C L Q L L I L C C Q T ...,0.0,0.0,0.0,0.0,1.0
4,M A D S P G C C S I W A R C L H C L Y S C H W ...,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
1791,M R S W P G L F W L L T L A L L A D G G R R E ...,0.0,0.0,0.0,0.0,1.0
1792,M K K S Y S G G T R T S S G R L R R L G D S S ...,1.0,0.0,0.0,0.0,0.0
1793,S K G K K A N K D V E L A R G,0.0,0.0,0.0,0.0,1.0
1794,M T T Q Q I V L Q G P G P W G F R L V G G K D ...,1.0,0.0,0.0,0.0,0.0


In [42]:
# Give the blind frame the same five label columns, all filled with 0.0.
df_blind[['cyto', 'mito', 'nucleus', 'other', 'secreted']] = 0.0

In [43]:
# Put a single space between the characters of every blind sequence.
# Existing whitespace is stripped first, so re-running this cell leaves
# already spaced sequences unchanged.
df_blind['Sequences'] = df_blind['Sequences'].apply(
    lambda seq: ' '.join(''.join(seq.split()))
)

df_blind

Unnamed: 0,Sequences,cyto,mito,nucleus,other,secreted
0,M N E L I D S A L L R L ...,0.0,0.0,0.0,0.0,0.0
1,E E E S S T A K A F K I ...,0.0,0.0,0.0,0.0,0.0
2,M V G W K R N L Q T V I ...,0.0,0.0,0.0,0.0,0.0
3,M E R T R S K P V R N L ...,0.0,0.0,0.0,0.0,0.0
4,M S H H P S G L R A G F ...,0.0,0.0,0.0,0.0,0.0
5,M S L F S R A L M R F Q ...,0.0,0.0,0.0,0.0,0.0
6,M T W L K Q M W S S I L ...,0.0,0.0,0.0,0.0,0.0
7,M A T R E A C G Q F A A ...,0.0,0.0,0.0,0.0,0.0
8,M E T P A W P R V P R P ...,0.0,0.0,0.0,0.0,0.0
9,M Q N L K W V L M N L L ...,0.0,0.0,0.0,0.0,0.0


In [45]:
# Persist the prepared blind frame in the working directory.
df_blind.to_pickle(path='blind_df.pkl')

In [129]:
# Persist the training split.
# NOTE(review): redundant -- the following cell writes 'train_df.pkl' again
# together with the test and validation pickles; consider removing this cell.
train_df.to_pickle('train_df.pkl')

In [142]:
# Persist the three splits in the working directory.
for split_df, pickle_name in (
    (train_df, 'train_df.pkl'),
    (test_df, 'test_df.pkl'),
    (val_df, 'val_df.pkl'),
):
    split_df.to_pickle(pickle_name)

In [144]:
# Normalize spacing in the full frame: remove all whitespace from each
# sequence, then put exactly one space between its characters.
df['Sequences'] = df['Sequences'].map(
    lambda sample: " ".join("".join(sample.split()))
)

In [145]:
# Display the full labeled frame for a visual check.
df

Unnamed: 0,Sequences,cyto,mito,nucleus,other,secreted
0,M G Q Q V G R V G E A P G L Q Q P Q P R G I R ...,1,0,0,0,0
1,M A L E P I D Y T T H S R E I D A E Y L K I V ...,1,0,0,0,0
2,M N Q I E P G V Q Y N Y V Y D E D E Y M I Q E ...,1,0,0,0,0
3,M S E E P T P V S G N D K Q L L N K A W E I T ...,1,0,0,0,0
4,M G D W M T V T D P G L S S E S K T I S Q Y T ...,1,0,0,0,0
...,...,...,...,...,...,...
11219,M I P N I T Q L K T A A L V M L F A G Q A L S ...,0,0,0,0,1
11220,M L R K L V T G A L A A A L L L S G Q S N A Q ...,0,0,0,0,1
11221,M I F H Q F Y S I L I L C L I F P N Q V V Q S ...,0,0,0,0,1
11222,M K F Q V V L S A L L A C S S A V V A S P I E ...,0,0,0,0,1
