In [1]:
!pip install -q transformers datasets matplotlib
!pip install -q torchinfo 

In [2]:
import os
import requests
import tqdm
import json
import pandas as pd
import numpy as np
import zipfile
from collections import Counter
import torch 
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import PreTrainedTokenizerFast
from tqdm.notebook import tqdm
from transformers import T5ForConditionalGeneration, T5Config, AdamW
from tqdm.auto import tqdm
from accelerate import Accelerator

In [3]:
# Google Drive file id of the table that is downloaded below, and the local
# path it is written to.
file_id = '1yXvDTRRxjCLyAf4icU1sEMKF0NpssCnC'
destination = 'human_proteins.tsv'

def format_size(value):
  """Render a byte count as a short human-readable string.

  The largest binary unit (GB, MB, KB) that `value` reaches is chosen and the
  scaled number is rounded to three decimals. Values below 1024 are reported
  unscaled with a "B" suffix.
  """
  units = (("GB", 1024**3), ("MB", 1024**2), ("KB", 1024))
  for suffix, factor in units:
    if value >= factor:
      scaled = round(value / factor, 3)
      return f"{scaled} {suffix}"
  return f"{value} B"

# Code taken from https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url
def download_file_from_google_drive(id, destination):
    """Download a shared Google Drive file and write it to `destination`.

    Two requests are made on one session. The first is a probe whose only
    purpose is to collect cookies (a "download_warning" cookie carries the
    confirmation token when Drive issues one). The second request sends the
    confirmation value and its body is streamed to disk by
    `save_response_content`.

    Args:
        id: Google Drive file id.
        destination: Local path the downloaded bytes are written to.

    Raises:
        requests.HTTPError: If the download request returns an HTTP error
            status, so an error page is not saved as if it were the data.
    """
    URL = "https://docs.google.com/uc?export=download"

    with requests.Session() as session:
        # Probe request: only the cookies matter, so the connection is
        # released as soon as the token has been read.
        with session.get(URL, params={'id': id}, stream=True) as probe:
            token = get_confirm_token(probe)

        # Prefer the token Drive handed out; fall back to the generic
        # confirm=1 value when no warning cookie was set.
        params = {'id': id, 'confirm': token or 1}
        with session.get(URL, params=params, stream=True) as response:
            response.raise_for_status()
            save_response_content(response, destination)

def get_confirm_token(response):
    """Return the value of the first 'download_warning*' cookie, or None.

    Cookies are scanned in the order `response.cookies.items()` yields them.
    """
    warning_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(warning_values, None)

def save_response_content(response, destination):
    """Stream the body of `response` into the file at `destination`.

    The body is read in 32 KiB chunks through a tqdm progress bar. When the
    transfer finishes, the size of the written file is printed.

    Args:
        response: A streamed HTTP response exposing `iter_content`.
        destination: Path of the file to (over)write in binary mode.
    """
    CHUNK_SIZE = 32768

    b_total = 0
    with open(destination, "wb") as f:
        pb = tqdm(response.iter_content(CHUNK_SIZE))
        try:
            for i, chunk in enumerate(pb):
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)
                    b_total += len(chunk)

                    # Refresh the label only every 1000 chunks to keep the
                    # progress-bar overhead negligible.
                    if i % 1000 == 0:
                        pb.set_description(f"written : {format_size(b_total)}")
        finally:
            # Make sure the bar is closed even if a write or read fails.
            pb.close()
    print("\n")
    print(f"saved all the data to {destination}. total size : {format_size(os.stat(destination).st_size)}")


# Re-running the notebook should not fetch a file that is already on disk.
# Delete the file (e.g. after an interrupted download) to force a fresh copy.
if not os.path.exists(destination):
    download_file_from_google_drive(file_id, destination)

0it [00:00, ?it/s]



saved all the data to human_proteins.tsv. total size : 12.9 MB


In [4]:
# The full table is a large download, so skip it when the file is already on
# disk. Delete the file (e.g. after an interrupted download) to force a fresh copy.
if not os.path.exists("all_proteins.tsv"):
    download_file_from_google_drive("1FLrC9kK5-R_NwjmX_WTqT8YqchzS9zIN", "all_proteins.tsv")

0it [00:00, ?it/s]



saved all the data to all_proteins.tsv. total size : 294.963 MB


In [5]:
# Load the human-protein table once and keep sequences of at most 510
# characters. `.copy()` detaches the filtered rows from the raw frame so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
dataframe = pd.read_csv("human_proteins.tsv", sep="\t")
dataframe = dataframe[dataframe["Sequence"].str.len() <= 510].copy()

# Keep only the part of each protein name that precedes the first "(".
dataframe["Protein Name"] = [v.split("(")[0] for v in dataframe["Protein Name"]]
# Separate the residues with spaces ("MRW" -> "M R W").
dataframe["Sequence"] = [" ".join(v) for v in dataframe["Sequence"]]

# One spaced sequence per line; the encoding is pinned so the file does not
# depend on the platform default.
with open("Sequences.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(dataframe["Sequence"].values))

dataframe

Unnamed: 0,Sequence,Protein Name
0,M R W Q E M G Y I F Y P R K L R,Mitochondrial-derived peptide MOTS-c
4,M P G W F K K A W Y G L A S L L S F S S F I L ...,Clarin-2
6,M K R R Q K R K H L E N E E S Q E T A E K G G ...,Protein FAM170A
7,M D D A D P E E R N Y D N M L K M L S D L N K ...,Synaptonemal complex central element protein 3
8,M A F S D L T S R T V H L Y D N W I K D A D P ...,Elongation of very long chain fatty acids prot...
...,...,...
20417,M P L A S P I Q H H E V T R G V A P S M A L R ...,Putative uncharacterized protein PRO3102
20418,M V R P H L L K K K I L G R V W W L M P V V L ...,Putative uncharacterized protein PRO2829
20419,M A E T Y R R S R Q H E Q L P G Q R H M D L L ...,Putative uncharacterized protein DKFZp434L187
20420,M A H H S L N T F Y I W H N N V L H T H L V F ...,Putative uncharacterized protein encoded by LI...


In [6]:
# Load the full protein table and keep sequences of at most 510 characters.
# `.copy()` detaches the filtered rows so the column assignment below does not
# raise pandas' SettingWithCopyWarning.
dataframe2 = pd.read_csv("all_proteins.tsv", sep="\t")
dataframe2 = dataframe2[dataframe2["sequence"].str.len() <= 510].copy()

# Keep only the part of each protein name that precedes the first "(".
dataframe2["protein_name"] = [v.split("(")[0] for v in dataframe2["protein_name"]]

# One whitespace-trimmed name per line; the encoding is pinned so names with
# non-ASCII characters do not depend on the platform default.
with open("names.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(v.strip() for v in dataframe2["protein_name"]))

dataframe2

Unnamed: 0.1,Unnamed: 0,sequence,protein_name,length
0,0,MRWQEMGYIFYPRKLR,Mitochondrial-derived peptide MOTS-c,16
1,1,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...,Clarin-2,232
2,2,MKRRQKRKHLENEESQETAEKGGGMSKSQEDALQPGSTRVAKGWSQ...,Protein FAM170A,330
3,3,MDDADPEERNYDNMLKMLSDLNKDLEKLLEEMEKISVQATWMAYDM...,Synaptonemal complex central element protein 3,88
4,4,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,Elongation of very long chain fatty acids prot...,281
...,...,...,...,...
619007,529106,MNSNAPAAVIVLAAGAGTRMKSKLPKVLHEIGGRSLLMHAITAARG...,Bifunctional protein GlmU [Includes: UDP-N-ace...,492
619008,529107,MSATGSDPSRRPVDLPDLSREAVPGEKVALAPGQLQLRPTRRGKAP...,Probable dual-specificity RNA methyltransferas...,428
619009,529108,MVLASHNAKKLRELQRILAPAVPGLEAEQIVSAAGLGLPDVVEDAV...,dITP/XTP pyrophosphatase,203
619010,529109,MLPVLTADALRTAEQAHWDEHPGDDLMGRAAAEVARHATEMLGDGP...,ADP-dependent,492


In [7]:
from transformers import AutoTokenizer

# `model_max_length` is the tokenizer-level length limit. Padding and
# truncation are per-call options, so passing `max_length=`/`padding=` here
# had no effect on the ids printed below.
tokenizer = AutoTokenizer.from_pretrained("mlewand/PROT5-small", model_max_length=512)

# Sanity check: round-trip one spaced residue sequence and one protein name.
for sample in ("M R W Q E M G Y I F Y P R K L R", "Protein FAM170A"):
    ids = tokenizer.encode(sample)
    print(ids, tokenizer.decode(ids))

Downloading (…)okenizer_config.json:   0%|          | 0.00/186 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.51M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

[1, 118, 89, 204, 116, 100, 118, 92, 131, 110, 115, 131, 88, 89, 104, 85, 89, 2] <s> M R W Q E M G Y I F Y P R K L R</s>
[1, 249, 9701, 12, 25, 2] <s> Protein FAM170A</s>


In [8]:
max_length_inp = 512
max_length_out = 32

# Stage 1: tokenize every sequence / name into the underscore-prefixed staging
# lists and record the token counts.
_tokenized_X = []
_tokenized_Y = []
sizes_X = []
sizes_Y = []

for x in tqdm(dataframe["Sequence"]):
    input_tokens = tokenizer.encode(x)
    _tokenized_X.append(input_tokens)
    sizes_X.append(len(input_tokens))

for y in tqdm(dataframe["Protein Name"]):
    out_tokens = tokenizer.encode(y)
    _tokenized_Y.append(out_tokens)
    sizes_Y.append(len(out_tokens))

# Stage 2: keep only the pairs whose input AND output both fit the length
# budgets. Previously the staging lists were never filled, so this filter
# iterated over nothing and every pair reached the final lists unfiltered.
tokenized_X = []
tokenized_Y = []

for x, y in tqdm(zip(_tokenized_X, _tokenized_Y), total=len(_tokenized_X)):
    if len(x) <= max_length_inp and len(y) <= max_length_out:
        tokenized_X.append(x)
        tokenized_Y.append(y)

  0%|          | 0/12617 [00:00<?, ?it/s]

  0%|          | 0/12617 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [9]:
class CustomAADataset(Dataset):
  """Fixed-length (input_ids, labels) pairs built from pre-tokenized id lists.

  Every item is right-padded with `pad_token_id` (and truncated if it is
  longer) so that all tensors have exactly `max_length` / `max_length2`
  entries and can be stacked into a batch.

  Args:
    max_length: Length of every "input_ids" tensor.
    max_length2: Length of every "labels" tensor.
    inputs: Optional list of input token-id lists. When omitted, the
      notebook-level `tokenized_X` is read at access time.
    targets: Optional list of label token-id lists. When omitted, the
      notebook-level `tokenized_Y` is read at access time.
    pad_token_id: Id used for padding (0 by default).
  """

  def __init__(self, max_length=512, max_length2=32, inputs=None, targets=None, pad_token_id=0):
    self.max_length = max_length
    self.max_length2 = max_length2
    self.pad_token_id = pad_token_id
    self._inputs = inputs
    self._targets = targets

  def _sources(self):
    """Return (inputs, targets), falling back to the notebook globals."""
    inputs = tokenized_X if self._inputs is None else self._inputs
    targets = tokenized_Y if self._targets is None else self._targets
    return inputs, targets

  def _fit(self, tokens, length):
    """Return `tokens` truncated or right-padded to exactly `length` ids."""
    # Without the slice, an over-long item gives a negative pad count (i.e. no
    # padding) and a tensor whose size differs from the rest of the batch.
    fitted = list(tokens[:length])
    return fitted + [self.pad_token_id] * (length - len(fitted))

  def __len__(self):
    return len(self._sources()[0])

  def __getitem__(self, idx):
    inputs, targets = self._sources()

    # NOTE(review): labels are padded with the pad id rather than an ignore
    # index; presumably padded positions then count towards the loss - confirm
    # before changing, since downstream decoding filters on id 0.
    input_tokens = self._fit(inputs[idx], self.max_length)
    output_tokens = self._fit(targets[idx], self.max_length2)

    return {"input_ids": torch.tensor(input_tokens, dtype=torch.long),
            "labels": torch.tensor(output_tokens, dtype=torch.long),
            }


In [10]:
# Load data
from datasets import Dataset as Dataset_hf

# Raw (untokenized) text pairs wrapped as a Hugging Face dataset.
text_columns = {
    "source": dataframe["Sequence"].values,
    "target": dataframe["Protein Name"].values,
}
dataset = Dataset_hf.from_dict(text_columns)

# Create dataset instances (input length 512, label length 32)
train_dataset = CustomAADataset(max_length=512, max_length2=32)

# Create data loaders: shuffled mini-batches of 16 items
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# The collator pads batches through the tokenizer, so it must be given the
# real tokenizer; with `tokenizer=None` it fails as soon as it is called.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True, max_length=512)

# Load T5 model
# Only the configuration is fetched here; the model is then built from that
# config (there is no `from_pretrained` call for the weights in this cell).
config = T5Config.from_pretrained("mlewand/PROT5-small")
model = T5ForConditionalGeneration(config)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

In [14]:
# Take one batch from the training loader and run it through the model on CPU.
batch = next(iter(train_loader))
input_ids = batch["input_ids"].cpu()
labels = batch["labels"].cpu()

model.cpu()
# The labels are passed alongside the inputs in the same forward call.
# NOTE(review): this runs with gradients enabled and without switching the
# model to eval mode - confirm whether that is intended for an inspection cell.
outputs = model(input_ids=input_ids, labels=labels)

# Highest-scoring token id along dim 2 of the logits
# (assumes logits are (batch, position, vocab) - TODO confirm).
preds = outputs.logits.argmax(dim=2)

In [18]:
def _text_between_markers(decoded, start="<s>", end="</s>"):
    """Return the text that follows the first `start` marker, cut at `end`.

    Returns None when `decoded` contains no `start` marker, so the caller can
    handle that case explicitly instead of catching an IndexError.
    """
    pieces = decoded.split(start)
    if len(pieces) < 2:
        return None
    return pieces[1].split(end)[0]


# Print "<reference name> <predicted name>" for every example of the batch.
skipped = 0
for y_true_ids, y_hat_ids in zip(labels, preds):
    # Drop padding (token id 0) before decoding.
    y_true = y_true_ids[y_true_ids != 0]
    y_hat = y_hat_ids[y_hat_ids != 0]

    y_true_text = _text_between_markers(tokenizer.decode(y_true.cpu().detach().numpy()))
    y_hat_text = _text_between_markers(tokenizer.decode(y_hat.cpu().detach().numpy()))

    # An example is not comparable when either side lacks the start marker.
    if y_true_text is None or y_hat_text is None:
        skipped += 1
        continue

    print(y_true_text, y_hat_text)

# Report what was left out instead of dropping it silently.
if skipped:
    print(f"skipped {skipped} example(s) with no <s> marker in the decoded text")

 BET1 homolog   BET1 homolog 
 Putative FK506-binding protein 9-like protein   Putative FK506-binding Endophilin- 9-like Endophilin- 
 Upstream stimulatory factor 1   Upstream stimulatory factor 1 
 CD27 antigen   CD27 antigen 
 Protein Wnt-10a  Protein Wnt-10a
 Serine/threonine-protein phosphatase 2A activator   Serine/threonine-protein phosphatase 2A activator 
 Nuclear pore complex-interacting protein family member B6  Nuclearatrox complex-interacting protein family member B6
 Carboxypeptidase E   Carboxypeptidase E 
 WD repeat domain phosphoinositide-interacting protein 3   WD repeat phosphoinositide-interacting protein 3 
 Phospholipase A and acyltransferase 5   Phospholipase A and acyltransferase 5 
 Mast cell-expressed membrane protein 1  Mast cell-expressed membranefH
 1
 Urocortin  Urocortin
 Ribonuclease 8   Ribonuclease 8 
 Proteasome subunit alpha type-6   Proteasome subunit alpha type-6 
 Aspartate beta-hydroxylase domain-containing protein 2   Aspartate beta-hydroxylase d