In [None]:

# Ami G. Sangster 

!git clone https://github.com/moses-lab/zero-shot-protein-segmentation.git

import sys
working_path = '/content/zero-shot-protein-segmentation/'
sys.path.insert(0,working_path)

import csv
import h5py
from protT5_embedder import get_embeddings
from functions import *



In [None]:

# Recommended: skip the cells that save embeddings unless you actually need
#              the files on disk; they can be very large
#              (per-residue embeddings of the human proteome = 22GB).


In [None]:
# Compute per-residue protein embeddings with prot_t5_xl_half_uniref50-enc.
# Heads-up: Colab may temporarily ban you if you use too much GPU time.
#
# emb_dict maps each UniProt protein ID (as given in the FASTA file) to a data
# matrix holding that protein's per-residue embedding from the model above.
# NOTE(review): the meaning of model_dir="" and of the max_* limits is defined
# in protT5_embedder.get_embeddings; check there before changing them.
emb_dict = get_embeddings(
    seq_path="protein_sequences_demo.fasta",
    model_dir="",
    per_protein=False,
    max_residues=4000,
    max_seq_len=4000,
    max_batch=100,
)



In [None]:
# Save the per-residue embeddings of whole proteins to an HDF5 file, one
# dataset per protein, named by the protein's key in emb_dict.
#
# Note: files saved in Colab must be downloaded to your desktop before the
# session is closed. Open the file browser (folder icon on the left-hand side)
# and the file should be listed there.

whole_emb_path = "ProtT5_whole_protein_embeddings.hdf5"

# Append mode keeps datasets written by earlier runs (e.g. other proteins).
with h5py.File(whole_emb_path, "a") as hf:
    for sequence_id, embedding in emb_dict.items():
        # create_dataset raises if the name already exists, which made this
        # cell fail when re-run; replace the stale dataset instead.
        # (HDF5 does not reclaim the space of deleted datasets; remove the
        # file first if you want a compact, clean output.)
        if sequence_id in hf:
            del hf[sequence_id]
        hf.create_dataset(sequence_id, data=embedding)



In [None]:
# Split each protein's embedding into segments via change point analysis.
#
# protein_segment_boundaries is keyed by UniProt protein ID (as given in the
# FASTA file); each value is the list of boundaries between that protein's
# segments.
# NOTE(review): max_bkps_per100aa presumably caps the number of breakpoints
# per 100 amino acids; confirm against functions.get_protein_segment_boundaries.
protein_segment_boundaries = get_protein_segment_boundaries(
    emb_dict,
    max_bkps_per100aa=3,
)



In [None]:
# Write the segment boundaries to a tab-separated file, one protein per row.
# Column 1 is the protein ID; column 2 is that protein's boundary list, written
# as the string form the csv module produces for a Python list.
#
# Note: files saved in Colab must be downloaded to your desktop before the
# session is closed. Open the file browser (folder icon on the left-hand side)
# and the file should be listed there.

seg_bounds_path = "ZPS_segment_boundaries.tsv"

with open(seg_bounds_path, 'w', newline='') as out_handle:
    tsv_writer = csv.writer(out_handle, delimiter='\t', lineterminator='\n')
    # Stream the rows straight from the dictionary instead of looping by hand.
    tsv_writer.writerows(
        [prot_id, boundaries]
        for prot_id, boundaries in protein_segment_boundaries.items()
    )

        

In [None]:
# Build one embedding per protein segment.
#
# protein_segment_embeddings is keyed by "ID start-stop": ID is the UniProt
# protein ID, and start/stop are the zero-based start and stop positions of the
# segment (taken from the boundaries computed in the segmentation cell).
# Each value is a 1x1024 data vector holding the segment embedding.
protein_segment_embeddings = get_protein_segment_embeddings(
    emb_dict,
    protein_segment_boundaries,
)

In [None]:
# Save the segment embeddings to an HDF5 file, one dataset per segment, named
# by the segment's key in protein_segment_embeddings.
#
# Note: files saved in Colab must be downloaded to your desktop before the
# session is closed. Open the file browser (folder icon on the left-hand side)
# and the file should be listed there.

seg_emb_path = "ZPS_segment_embeddings.hdf5"

# Append mode keeps datasets written by earlier runs (e.g. other proteins).
with h5py.File(seg_emb_path, "a") as hf:
    for sequence_key, embedding in protein_segment_embeddings.items():
        # create_dataset raises if the name already exists, which made this
        # cell fail when re-run; replace the stale dataset instead.
        # (HDF5 does not reclaim the space of deleted datasets; remove the
        # file first if you want a compact, clean output.)
        if sequence_key in hf:
            del hf[sequence_key]
        hf.create_dataset(sequence_key, data=embedding)