In [5]:
import gzip
import os
import shutil
import sys
from pathlib import Path

import wget

# Make the parent of the working directory importable; the `src` package
# imported just below is presumably located there (TODO confirm repo layout).
curdir = Path.cwd()
sys.path.append(os.fspath(curdir.parent.absolute()))

from src.utils.data import read_fasta

# Remote archive and the local names of the compressed / decompressed copies.
# NOTE: the URL points at `current_release`, so the downloaded contents change
# whenever UniProt publishes a new release.
link = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz'
filename = 'uniprot_sprot.dat.gz'
unzipped_filename = 'uniprot_sprot.dat'

# Download the file from the web, but only when it is not already present.
# Re-running the cell would otherwise fetch the whole archive again, and the
# `wget` package does not overwrite an existing target (it saves the new copy
# under a different, numbered name), so the unzip step below would keep
# reading the old archive anyway.
if not os.path.exists(filename):
    wget.download(link, filename)

# Unzip the downloaded file. Stream it in chunks instead of reading the whole
# decompressed archive into memory at once.
with gzip.open(filename, 'rb') as f_in, open(unzipped_filename, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

print(f"File {filename} has been downloaded and unzipped to {unzipped_filename}.")


In [3]:
import pandas as pd
from Bio import SwissProt

# Extract data from SwissProt records.
# Each entry becomes one row:
#   [seq_id, sequence, go_ids, description, organism,
#    organism_classification, organelle, cc]
data = []
# See https://biopython.org/docs/1.75/api/Bio.SwissProt.html and https://web.expasy.org/docs/userman.html
with open('/home/ncorley/protein/ProteinFunctions/data/swissprot/uniprot_sprot.dat', 'r') as f:
    records = SwissProt.parse(f)
    for record in records:
        # Extract sequence ID (the first accession listed for the entry)
        seq_id = record.accessions[0]

        # Extract sequence
        sequence = record.sequence

        # Extract GO ids: keep the second field of every cross-reference whose
        # first field is "GO". The length check runs first so that a
        # too-short cross-reference is skipped instead of raising IndexError.
        go_ids = [ref[1] for ref in record.cross_references if len(ref) > 1 and ref[0] == "GO"]

        # Extract free-text description
        description = record.description

        # Extract organism and organism classification
        organism = record.organism
        organism_classification = record.organism_classification

        # Extract organelle
        organelle = record.organelle

        # Extract CC line as a dictionary keyed by the text before the first ": ".
        # partition() never raises: a comment without the separator is stored
        # under its full text with an empty value.
        # NOTE: a key that occurs more than once in an entry keeps only its
        # last comment.
        cc = {}
        for comment in record.comments:
            key, sep, value = comment.partition(": ")
            cc[key] = value

        data.append([seq_id, sequence, go_ids, description, organism, organism_classification, organelle, cc])

In [7]:
# Show the first parsed row followed by its number of fields
print(data[0], len(data[0]), sep="\n")

['Q6GZX4', 'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL', ['GO:0046782'], 'RecName: Full=Putative transcription factor 001R;', 'Frog virus 3 (isolate Goorha) (FV-3).', ['Viruses', 'Varidnaviria', 'Bamfordvirae', 'Nucleocytoviricota', 'Megaviricetes', 'Pimascovirales', 'Iridoviridae', 'Alphairidovirinae', 'Ranavirus'], '', {'FUNCTION': 'Transcription activation. {ECO:0000305}.'}]
8


In [35]:
# Tabulate the parsed rows, one DataFrame row per entry of `data`
df_2023 = pd.DataFrame(
    data,
    columns=[
        "seq_id",
        "sequence",
        "go_ids",
        "description",
        "organism",
        "organism_classification",
        "organelle",
        "cc",
    ],
)

# Copy the 'SUBCELLULAR LOCATION' comment into its own column (None when the entry has none)
df_2023['subcellular_location'] = df_2023['cc'].apply(
    lambda comments: comments.get('SUBCELLULAR LOCATION')
)

In [37]:
# import sequence embeddings from /home/ncorley/protein/ProteinFunctions/data/embeddings/frozen_proteinfer_sequence_embeddings.pkl
import pickle

# Read the pickled sequence embeddings.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('/home/ncorley/protein/ProteinFunctions/data/embeddings/frozen_proteinfer_sequence_embeddings.pkl', 'rb') as embeddings_file:
    sequence_embeddings = pickle.load(embeddings_file)

# Collect the keys of the embeddings into a set for fast membership tests
sequence_strings_2019 = set(sequence_embeddings.keys())

In [41]:
# Flag every row of df_2023 whose seq_id is a member of sequence_strings_2019.
# isin() performs the membership test in one vectorised call.
df_2023['in_ProteInfer_dataset'] = df_2023['seq_id'].isin(sequence_strings_2019)

# Print 5 example ids from df_2023.seq_id
print(df_2023.seq_id.head())

# Print 5 example members of sequence_strings_2019, to compare formats by eye
print(list(sequence_strings_2019)[:5])

# Count the rows whose seq_id is not in sequence_strings_2019. Summing the
# negated mask also works when no row is missing, whereas
# value_counts()[False] raises KeyError in that case.
print(f"Number of sequences in df_2023 but not in ProteInfer dataset: {(~df_2023['in_ProteInfer_dataset']).sum()}")
print(f"Number of sequences in df_2023: {len(df_2023)}")
print(f"Number of sequences in ProteInfer dataset: {len(sequence_strings_2019)}")

0    Q6GZX4
1    Q6GZX3
2    Q197F8
3    Q197F7
4    Q6GZX2
Name: seq_id, dtype: object
['Q5RDG8', 'Q027V0', 'B2UBB1', 'F4I893', 'Q6BJH5']
Number of sequences in df_2023 but not in ProteInfer dataset: 47493
Number of sequences in df_2023: 569793
Number of sequences in ProteInfer dataset: 522607


In [42]:
# Import label embeddings from /home/ncorley/protein/ProteinFunctions/data/embeddings/frozen_PubMedBERT_label_embeddings.pkl
import pickle

# Read the pickled label embeddings.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('/home/ncorley/protein/ProteinFunctions/data/embeddings/frozen_PubMedBERT_label_embeddings.pkl', 'rb') as embeddings_file:
    label_embeddings_2019 = pickle.load(embeddings_file)

In [45]:
# Labels that have an embedding: the keys of the label-embedding mapping
label_ids_2019 = set(label_embeddings_2019.keys())
print(len(label_ids_2019))

# Labels that occur in the go_ids list of at least one row of df_2023
label_ids_2023 = {
    go_id
    for entry_go_ids in df_2023['go_ids']
    for go_id in entry_go_ids
}
print(len(label_ids_2023))

47401
29283


In [46]:
# Report how many GO labels are in label_ids_2023 but not in label_ids_2019
print(f"Number of GO labels in go_label_strings but not in label_strings: {len(label_ids_2023.difference(label_ids_2019))}")

# Show up to 10 of those labels as examples
print(list(label_ids_2023.difference(label_ids_2019))[:10])

Number of GO labels in go_label_strings but not in label_strings: 666
['GO:0140915', 'GO:0140823', 'GO:0160074', 'GO:0110162', 'GO:0106292', 'GO:0140926', 'GO:0062158', 'GO:0140900', 'GO:0120317', 'GO:0106283']


In [48]:
# Labels present in label_ids_2023 but absent from label_ids_2019
new_go_labels = label_ids_2023.difference(label_ids_2019)

# For each row, the subset of its GO ids that belongs to new_go_labels (an empty set when none do)
df_2023['new_labels'] = df_2023['go_ids'].apply(
    lambda entry_go_ids: new_go_labels.intersection(entry_go_ids)
)

In [52]:
# Build each boolean mask once and reuse it for the counts and the final filter.
# astype(bool) keeps the negation a logical NOT even if the column was stored
# with object dtype.
not_in_proteinfer = ~df_2023['in_ProteInfer_dataset'].astype(bool)
# A row has new labels when its 'new_labels' set is non-empty; testing the
# length avoids relying on comparing a whole Series against `set()`.
has_new_labels = df_2023['new_labels'].map(len) > 0

# Create a new dataframe out of the rows that meet both criteria. copy() makes
# it independent of df_2023, so later edits to it do not hit a slice of the
# original frame.
df_2023_new_sequences_and_labels = df_2023[not_in_proteinfer & has_new_labels].copy()

# Count how many rows have 'in_ProteInfer_dataset' == False
print(f"Number of rows with 'in_ProteInfer_dataset' == False: {int(not_in_proteinfer.sum())}")

# Count how many rows have 'in_ProteInfer_dataset' == False and 'new_labels' != set()
print(f"Number of rows with 'in_ProteInfer_dataset' == False and 'new_labels' != set(): {len(df_2023_new_sequences_and_labels)}")

Number of rows with 'in_ProteInfer_dataset' == False: 47493
Number of rows with 'in_ProteInfer_dataset' == False and 'new_labels' != set(): 917


In [53]:
# Save df_2023_new_sequences_and_labels as a pickle file named
# "SwissProt_2023_unseen_sequences_and_labels.pkl" under data/zero_shot.
zero_shot_pickle = Path('/home/ncorley/protein/ProteinFunctions/data/zero_shot/SwissProt_2023_unseen_sequences_and_labels.pkl')
# to_pickle does not create missing directories, so make sure the target exists
zero_shot_pickle.parent.mkdir(parents=True, exist_ok=True)
df_2023_new_sequences_and_labels.to_pickle(zero_shot_pickle)
