# Import and define functions

Installs packages

In [1]:
"""!jupyter nbextension enable --py --sys-prefix qgrid
!jupyter nbextension enable --py widgetsnbextension
!pip install qgrid --upgrade
!pip install notebook --upgrade
!pip install nodejs
!pip install npm
!jupyter labextension install qgrid2
!pip install --upgrade openai
!pip install python-dotenv""";

Imports libraries and OpenAI key

In [25]:
from Bio import Entrez
# Set your email address for PubMed API (replace with your email)
Entrez.email = "kuhfeldrf@oregonstate.edu"
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import re
import time
import os
import string
#import qgrid
import os
#from dotenv import load_dotenv
import openai
#os.environ['OPENAI_API_KEY'] = "sk-..SI"
#os.environ.get('OPENAI_API_KEY')

<b>UniProt Function: </b>Extracts the protein sequence, entry name, and full protein name from the UniProt XML record

In [3]:
def fetch_sequence(lines):
    """
    Scan the lines of a UniProt XML record for the sequence and naming fields.

    Args:
    - lines (iterable of str): The XML document split into individual lines.

    Returns:
    - tuple: (protein_sequence, protein_name, protein_desc). protein_name and
      protein_desc are taken from the first line on which the <name> / <fullName>
      pattern matches; protein_sequence is taken from the last line on which the
      <sequence> pattern matches. A field that never matches is returned as ''.
    """
    name_re = re.compile(r'<name>(.*?)</name>')
    full_name_re = re.compile(r'<fullName.*?>(.*?)</fullName>')
    sequence_re = re.compile(r'<sequence[^>]*>(.*?)</sequence>')

    first_name = None   # None means "no <name> match seen yet"
    first_desc = None   # None means "no <fullName> match seen yet"
    sequence = ''

    for text in lines:
        if first_name is None and '<name' in text:
            hit = name_re.search(text)
            if hit:
                first_name = hit.group(1)

        if first_desc is None and '<fullName' in text:
            hit = full_name_re.search(text)
            if hit:
                first_desc = hit.group(1)

        # Later matches overwrite earlier ones, so the last <sequence> line wins.
        if '<sequence' in text:
            hit = sequence_re.search(text)
            if hit:
                sequence = hit.group(1)

    return (
        sequence,
        first_name if first_name is not None else '',
        first_desc if first_desc is not None else '',
    )

<b>UniProt Function: </b>Extract a mapping of evidence keys to their associated PubMed IDs from the XML root node.

In [4]:
def extract_evidence_to_pubmed_mapping(root):
    """
    Build a lookup from UniProt evidence keys to PubMed IDs.

    Args:
    - root (xml.etree.ElementTree.Element): The root node of the XML document.

    Returns:
    - dict: Maps each evidence 'key' attribute to the 'id' of the first PubMed
      dbReference found beneath that evidence element. Evidence elements with
      no PubMed dbReference, or with an empty key/id, are left out.
    """
    ns = "{http://uniprot.org/uniprot}"
    pubmed_by_key = {}

    for node in root.findall(f".//{ns}evidence"):
        pubmed_ref = node.find(f".//{ns}dbReference[@type='PubMed']")
        if pubmed_ref is None:
            continue

        key = node.attrib.get('key')
        pmid = pubmed_ref.attrib.get('id')
        if key and pmid:
            pubmed_by_key[key] = pmid

    return pubmed_by_key

<b>UniProt Function: </b>Extracts peptide features and function comments from the given XML data.

In [5]:
def extract_peptide_and_function(data, protein_id):
    """
    Extracts peptide features and function comments from a given XML data.

    Args:
    - data (str): The UniProt XML document as text.
    - protein_id (str): Identifier stored in the 'proteinID' field of every peptide record.

    Returns:
    - tuple: (peptide_features, function_comments, evidence_to_pubmed) where
      peptide_features is a list of dicts (one per <feature type="peptide">),
      function_comments is a list of dicts (one per <comment type="function">),
      and evidence_to_pubmed is the evidence-key -> PubMed-ID mapping that is
      also attached to every peptide record.

    Peptide features whose location has no numeric begin/end position are skipped
    with a printed notice instead of raising, so one unusual feature does not
    abort the whole protein. Function comments without a <text> child are skipped.
    """
    ns = "{http://uniprot.org/uniprot}"

    # Parsing XML content
    root = ET.fromstring(data)
    protein_sequence, protein_name, protein_desc = fetch_sequence(data.split('\n'))
    # Extract the mapping from evidence key to PubMed ID
    evidence_to_pubmed = extract_evidence_to_pubmed_mapping(root)

    def _position(feature, tag):
        """Return the integer 'position' of location/<tag>, or None if absent or non-numeric."""
        node = feature.find(f"{ns}location/{ns}{tag}")
        raw = node.attrib.get('position') if node is not None else None
        try:
            return int(raw)
        except (TypeError, ValueError):
            return None

    # Extract peptide features
    peptide_features = []
    for feature in root.findall(f".//{ns}feature[@type='peptide']"):
        begin = _position(feature, 'begin')
        end = _position(feature, 'end')
        if begin is None or end is None:
            print(f"Skipping peptide feature without numeric begin/end position in {protein_id}")
            continue

        peptide_features.append({
            'proteinID': protein_id,
            'protein_name': protein_name,
            'protein_desc': protein_desc,
            'interval': f'{begin}-{end}',
            # Positions are 1-based and inclusive, hence begin-1 for the slice start.
            'peptide': protein_sequence[begin-1:end],
            'description': feature.attrib.get('description', None),
            'feature_evidence': feature.attrib.get('evidence', None),
            'evidence_to_pubmed': evidence_to_pubmed,
        })

    # Extract function comments
    function_comments = []
    for comment in root.findall(f".//{ns}comment[@type='function']"):
        text_node = comment.find(f"{ns}text")
        if text_node is None:
            continue
        molecule_node = comment.find(f"{ns}molecule")
        function_comments.append({
            'molecule': molecule_node.text if molecule_node is not None else None,
            'text': text_node.text,
            'comment_evidence': text_node.attrib.get('evidence', None),
        })

    return peptide_features, function_comments, evidence_to_pubmed

<b>UniProt Function: </b>Associates peptide features with related function comments based on shared evidence or matching descriptions.

In [6]:
def associate_peptide_with_function(peptide_features, function_comments, evidence_to_pubmed):
    """
    To associate peptide features with related function comments based on shared evidence or matching descriptions.

    Args:
    - peptide_features (list of dict): Peptide records; each must have the keys
      'feature_evidence' (space-separated evidence keys or None) and 'description'.
    - function_comments (list of dict): Comment records with the keys 'text',
      'comment_evidence' (space-separated evidence keys or None) and optionally 'molecule'.
    - evidence_to_pubmed (dict): Maps evidence keys to PubMed IDs.

    Returns:
    - list: The same peptide dicts (mutated in place), each extended with
      'associated_function' (linked comment texts joined by '; '),
      'evidence_pubmed' (sorted list of PubMed IDs from the peptide's and the
      linked comments' evidence), 'comment_evidence' (evidence set of the last
      linked comment, only when at least one comment is linked) and
      'non_associated_function' (all comment texts, only when nothing was linked).

    A peptide with no description is never linked through the description/molecule
    comparison; otherwise None == None would link it to every comment that has no
    molecule.
    """
    associated_data = []

    for peptide in peptide_features:
        linked_texts = []
        unlinked_texts = []  # Comment texts that could not be linked to this peptide
        description = peptide['description']
        peptide_evidence = set(peptide['feature_evidence'].split()) if peptide['feature_evidence'] else set()

        # PubMed IDs reachable from the peptide's own evidence keys
        peptide_pubmed = {evidence_to_pubmed[eid] for eid in peptide_evidence if eid in evidence_to_pubmed}

        for comment in function_comments:
            comment_evidence = set(comment['comment_evidence'].split()) if comment['comment_evidence'] else set()

            shares_evidence = bool(peptide_evidence & comment_evidence)
            names_peptide = description is not None and description == comment.get('molecule')

            if shares_evidence or names_peptide:
                peptide['comment_evidence'] = comment_evidence
                linked_texts.append(comment['text'])
                # Add PubMed IDs associated with the comment's evidence as well
                peptide_pubmed.update(
                    evidence_to_pubmed[eid] for eid in comment_evidence if eid in evidence_to_pubmed
                )
            else:
                unlinked_texts.append(comment['text'])

        peptide['associated_function'] = '; '.join(linked_texts)
        if not linked_texts and unlinked_texts:
            peptide['non_associated_function'] = unlinked_texts

        # Sorted so the output does not depend on set iteration order
        peptide['evidence_pubmed'] = sorted(peptide_pubmed)
        associated_data.append(peptide)

    return associated_data


<b>UniProt Function: </b>Extracts references from the provided XML data and returns them as a DataFrame.

In [7]:
def extract_references(data, protein_id):
    """
    To extract references from the provided XML data and return them as a DataFrame.

    The XML is scanned line by line between '<reference key="...">' and
    '</reference>'; within each reference the title, person names, PubMed/DOI
    dbReference IDs and a 'PROTEIN SEQUENCE OF a-b' scope range are collected.

    Args:
    - data (str): The UniProt XML document as text.
    - protein_id (str): Identifier stored in the 'proteinID' column of every row.

    Returns:
    - pandas.DataFrame: One row per reference with the columns 'proteinID',
      'reference key', 'title', 'authors', 'pubmed', 'doi' and 'scope_range'.
      The columns are present even when no reference is found. 'authors' is
      formatted as "Last, F. et al." from the first person name, or None when
      the reference lists no person.

    All per-reference state is reset when a reference starts, and every regex
    match is checked before use, so an unexpected line can neither drop a
    reference nor leak values into the next one.
    """
    columns = ['proteinID', 'reference key', 'title', 'authors', 'pubmed', 'doi', 'scope_range']

    # Regular expressions for extracting information
    reference_key_pattern = re.compile(r'<reference key="(\d+)">')
    title_pattern = re.compile(r'<title>(.*?)</title>')
    person_name_pattern = re.compile(r'<person name="(.*?)"/>')
    dbReference_type_pattern = re.compile(r'<dbReference type="(.*?)"')
    dbReference_id_pattern = re.compile(r'id="(.*?)"/>')
    scope_pattern = re.compile(r'PROTEIN SEQUENCE OF (\d+-\d+)')

    def _format_first_author(authors):
        """Format the first author as 'Last, F. et al.'; tolerate single-token names."""
        if not authors:
            return None
        parts = authors[0].split()
        if not parts:
            return None
        if len(parts) == 1:
            return f"{parts[0]} et al."
        return f"{parts[0]}, {parts[1][0]}. et al."

    reference_data = []
    inside_reference = False
    title = pubmed_id = doi = reference_key = scope_range = None
    author_list = []

    for line in data.split('\n'):
        if '<reference key="' in line:
            inside_reference = True
            key_match = reference_key_pattern.search(line)
            reference_key = key_match.group(1) if key_match else None
            # Start every reference from a clean slate
            title = pubmed_id = doi = scope_range = None
            author_list = []
        elif not inside_reference:
            continue
        elif '<title>' in line:
            match = title_pattern.search(line)
            if match:
                title = match.group(1)
        elif '<person name="' in line:
            match = person_name_pattern.search(line)
            if match:
                author_list.append(match.group(1))
        elif '<dbReference type="' in line:
            type_match = dbReference_type_pattern.search(line)
            id_match = dbReference_id_pattern.search(line)
            if type_match and id_match:
                if type_match.group(1) == "PubMed":
                    pubmed_id = id_match.group(1)
                elif type_match.group(1) == "DOI":
                    doi = id_match.group(1)
        elif '<scope>' in line:
            match = scope_pattern.search(line)
            if match:
                scope_range = match.group(1)
        elif '</reference>' in line:
            inside_reference = False
            reference_data.append({
                'proteinID': protein_id,
                'reference key': reference_key,
                'title': title,
                'authors': _format_first_author(author_list),
                'pubmed': pubmed_id,
                'doi': doi,
                'scope_range': scope_range
            })

    return pd.DataFrame(reference_data, columns=columns)


<b>UniProt Function: </b>Fetch protein information from UniProt for a given protein ID.

In [8]:
def fetch_protein_info(protein_id, timeout=30):
    """
    Fetch protein information from UniProt for a given protein ID.

    Args:
    - protein_id (str): UniProt accession to download.
    - timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
    - tuple: (peptide_df, reference_df). peptide_df has one row per peptide
      feature with its associated function data; reference_df has one row per
      literature reference. Two empty DataFrames are returned when the request
      fails or the response status is not 200.
    """
    url = f'https://www.uniprot.org/uniprot/{protein_id}.xml'

    # A timeout keeps one stalled connection from hanging the whole batch, and a
    # network error is handled the same way as a non-200 response.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Request for {protein_id} failed: {e}")
        return pd.DataFrame(), pd.DataFrame()

    # Check if the request was successful
    if response.status_code != 200:
        return pd.DataFrame(), pd.DataFrame()

    data = response.text

    # Extract peptide features and function comments
    peptide_features, function_comments, evidence_to_pubmed = extract_peptide_and_function(data, protein_id)
    # Extract reference data
    reference_df = extract_references(data, protein_id)

    # Associate peptide with function
    associated_data = associate_peptide_with_function(peptide_features, function_comments, evidence_to_pubmed)

    # Convert associated_data list to a DataFrame
    peptide_df = pd.DataFrame(associated_data)

    return peptide_df, reference_df


<b>UniProt Function: </b>Fills in an empty associated_function column from the non_associated_function entries that match the row's PubMed ID or description

In [9]:
def update_associated_function(row):
    """
    Fill an empty 'associated_function' from matching 'non_associated_function' entries.

    Args:
    - row (pandas.Series or dict): Must contain 'associated_function'; may contain
      'non_associated_function' (a list of texts, or the string form of such a
      list as read back from CSV), 'description' and 'pubmed_id'.

    Returns:
    - str: The existing 'associated_function' when it is non-empty. Otherwise the
      non-associated entries that mention the row's description (compared
      case-insensitively with '-' treated as a space) or contain its PubMed ID,
      joined by a single space; '' when nothing matches.

    A missing description or PubMed ID never matches anything, and a PubMed ID
    stored as a float (e.g. 1369293.0 after a CSV round trip) is compared as
    its integer text.
    """
    import ast  # local import so the cell does not depend on the notebook's import cell

    current = row['associated_function']
    if not (pd.isna(current) or current == ''):
        return current

    raw_entries = row.get('non_associated_function')
    if isinstance(raw_entries, list):
        entries = raw_entries
    elif isinstance(raw_entries, str) and raw_entries.startswith('[') and raw_entries.endswith(']'):
        # literal_eval only accepts Python literals, so text from a CSV cannot execute code.
        try:
            parsed = ast.literal_eval(raw_entries)
        except (ValueError, SyntaxError):
            parsed = []
        entries = parsed if isinstance(parsed, list) else []
    else:
        entries = []

    raw_description = row.get('description')
    description = raw_description.lower().replace("-", " ") if isinstance(raw_description, str) else ""

    raw_pubmed = row.get('pubmed_id')
    if pd.isna(raw_pubmed):
        pubmed_id = None
    else:
        if isinstance(raw_pubmed, float) and raw_pubmed.is_integer():
            raw_pubmed = int(raw_pubmed)
        pubmed_id = str(raw_pubmed)

    matches = []
    for entry in entries:
        if not isinstance(entry, str):
            continue
        by_description = bool(description) and description in entry.lower().replace("-", " ")
        by_pubmed = pubmed_id is not None and pubmed_id in entry
        if by_description or by_pubmed:
            matches.append(entry)

    return ' '.join(matches)


<b>PubMed Function: </b>Function to fetch abstract, title, authors, and doi for a PubMed ID

In [10]:
def search_pubmed_by_title(title):
    """
    Search PubMed with the given title as the query term.

    Args:
    - title (str): Text passed to Entrez.esearch as the search term.

    Returns:
    - list: The "IdList" of the search result; an empty list when title is not
      a non-blank string (e.g. NaN), in which case no request is made.
    """
    if not isinstance(title, str) or not title.strip():
        return []

    handle = Entrez.esearch(db="pubmed", term=title)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing fails
        handle.close()
    return record["IdList"]

def fetch_details(row, row_num, total_rows):
    """
    Function to fetch abstract, title, authors, and doi based on either PubMed ID or title

    Args:
    - row (pandas.Series or dict): Must contain 'pubmed_id'; 'title' is read
      only when 'pubmed_id' is null, to look the article up by title.
    - row_num (int): Row counter used in progress messages.
    - total_rows (int): Total number of rows, used in progress messages.

    Returns:
    - dict or None: A dict with the keys 'abstract', 'title', 'authors', 'doi'
      and 'pubmed_id' (each field is None when it is not present in the record),
      or None when no PubMed ID could be determined or the fetch failed.

    The abstract joins all AbstractText sections with a space, so structured
    abstracts are not cut off after their first section. The DOI is returned
    exactly as PubMed reports it.
    """
    pubmed_id = row['pubmed_id']

    # Check if pubmed_id is NaN or None
    if pd.isnull(pubmed_id):
        # Try to search by title
        matching_ids = search_pubmed_by_title(row['title'])
        if matching_ids:
            pubmed_id = matching_ids[0]  # take the first matching ID
        else:
            print(f"Row {row_num}/{total_rows}: No PubMed ID found for title '{row['title']}'")
            return None

    # Errors that mean "this field is not in the record"; anything else is a real failure.
    lookup_errors = (KeyError, IndexError, TypeError, AttributeError)
    details = {'abstract': None, 'title': None, 'authors': None, 'doi': None}

    # Fetch details by PubMed ID
    try:
        handle = Entrez.efetch(db="pubmed", id=pubmed_id, retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()

        # Fetch abstract if available
        try:
            sections = record['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText']
            details['abstract'] = ' '.join(str(section) for section in sections) or None
        except lookup_errors:
            details['abstract'] = None

        # Fetch title if available
        try:
            details['title'] = record['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']
        except lookup_errors:
            details['title'] = None

        # Fetch authors if available; entries without LastName/Initials are skipped
        try:
            author_list = [
                author['LastName'] + ", " + author['Initials'] + "."
                for author in record['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList']
                if 'LastName' in author and 'Initials' in author
            ]
            if len(author_list) > 1:
                details['authors'] = author_list[0] + " et al."
            elif author_list:
                details['authors'] = author_list[0]
        except lookup_errors:
            details['authors'] = None

        # Fetch doi if available
        try:
            for article_id in record['PubmedArticle'][0]['PubmedData']['ArticleIdList']:
                if article_id.attributes['IdType'] == "doi":
                    # str(), not .title(): .title() would title-case the identifier
                    details['doi'] = str(article_id)
                    break
        except lookup_errors:
            details['doi'] = None

        details['pubmed_id'] = pubmed_id
        print(f"Row {row_num}/{total_rows}: Successfully fetched details for PubMed ID {pubmed_id}")
        return details
    except Exception as e:
        print(f"Row {row_num}/{total_rows}: Error fetching details for PubMed ID {pubmed_id}: {str(e)}")
        return None

<b>Function: </b>Loops through a species-specific list of protein IDs, fetching information from UniProt for each, and returns two DataFrames: the combined peptide data and the combined reference data

In [11]:
def process_species_data(protein_ids_list):
    """
    Fetch UniProt data for every protein ID and combine the results.

    Args:
    - protein_ids_list (list of str): UniProt accessions to process.

    Returns:
    - tuple: (processed_peptide_df, processed_reference_df), the row-wise
      concatenation of the per-protein peptide and reference DataFrames with a
      fresh index. Two empty DataFrames are returned for an empty ID list.

    A timing line is printed for each protein.
    """
    peptide_data_dfs = []
    reference_data_dfs = []
    total = len(protein_ids_list)

    # Iterate through the protein IDs and call the function
    for position, protein_id in enumerate(protein_ids_list, start=1):
        start_time = time.time()
        peptide_data_df, reference_data_df = fetch_protein_info(protein_id)
        elapsed_time = time.time() - start_time
        peptide_data_dfs.append(peptide_data_df)
        reference_data_dfs.append(reference_data_df)
        print(f"Processing {protein_id} (Index: {position} of {total}) took {elapsed_time:.2f} seconds.")

    # pd.concat raises on an empty list, so handle "nothing to process" explicitly
    if not peptide_data_dfs:
        return pd.DataFrame(), pd.DataFrame()

    processed_peptide_df = pd.concat(peptide_data_dfs, ignore_index=True)
    processed_reference_df = pd.concat(reference_data_dfs, ignore_index=True)
    return processed_peptide_df, processed_reference_df

<b>Function: </b>Prints important info on the dataframes

In [12]:
def print_critical_info(df, df_name):
    """
    Print a short summary of a DataFrame.

    Args:
    - df (pandas.DataFrame): The frame to summarise.
    - df_name (str): Label shown in the heading line.

    Prints the heading, the row count (labelled as the number of peptides), the
    shape, the column names and the per-column count of missing values,
    followed by a separator line. Returns None.
    """
    row_count = df.shape[0]

    # Heading, size and column overview
    overview = [
        f"Information for DataFrame: {df_name}",
        "-" * 40,
        f"Numbers of peptides: {row_count}",
        f"Shape: {df.shape}",
        f"Columns: {df.columns.tolist()}",
    ]
    print("\n".join(overview))

    # Number of missing values in each column
    missing_per_column = df.isnull().sum()
    print("\nMissing Values Count:")
    print(missing_per_column)

    separator = "=" * 40
    print("\n" + separator + "\n")


# Bring in protein & MBPDB lists

MBPDB list imported from Summer 2023

In [24]:
# Load the MBPDB export and rename its key columns to the names used by the
# UniProt-derived tables ('proteinID', 'interval').
mbpdb_df = pd.read_csv('data/exported_data.tsv', sep='\t').rename(
    columns={'protein_pid': 'proteinID', 'intervals': 'interval'}
)

# Distinct values of the 'function' column (order is not defined).
mbpdb_function_list = list(set(mbpdb_df['function']))

#mbpdb_df_qgrid = qgrid.show_grid(mbpdb_df, show_toolbar=True, grid_options={'forceFitColumns': False})
#mbpdb_df_qgrid
mbpdb_df

Unnamed: 0,peptide,proteinID,protein_desc,protein_species,interval,function,additional_details,ic50,inhibition_type,inhibited_microorganisms,ptm,title,authors,abstract,doi
0,YVPFP,P47710,Alpha-S1-casein,Homo sapiens,158-162,Anticancer,Inhibits TR7D breast cancer cell proliferation,,,,,Identification of a novel opioid peptide (Tyr-...,"Kampa, M. et al.",A new casomorphin pentapeptide (αS1-casomorphi...,10.1042/bj3190903
1,YVPFP,P47710,Alpha-S1-casein,Homo sapiens,158-162,Opioid,,,,,,Identification of a novel opioid peptide (Tyr-...,"Kampa, M. et al.",A new casomorphin pentapeptide (αS1-casomorphi...,10.1042/bj3190903
2,YLGYLE,P02662,Alpha-S1-casein,Bos taurus,106-111,ACE-inhibitory,,85.76,,,,In Silico and In Vitro Analysis of Multifuncti...,"Amigo, L. et al.","Currently, the associations between oxidative ...",10.3390/foods9080991
3,YLGYLE,P02662,Alpha-S1-casein,Bos taurus,106-111,Antioxidant,,,,,,In Silico and In Vitro Analysis of Multifuncti...,"Amigo, L. et al.","Currently, the associations between oxidative ...",10.3390/foods9080991
4,YLGYLE,P02662,Alpha-S1-casein,Bos taurus,106-111,Increase mucin secretion,,,,,,Opioid Activities and Structures of α-Casein-D...,"Loukas, S. et al.","Exorphins, peptides with opioid activity, have...",10.1021/bi00288a034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1068,KVGTKCCAKP,P14639,Serum albumin,Ovis aries,455-464,ACE-inhibitory,,30.00,,,,Quantitative structureeactivity relationship b...,"Sagardia, I. et al.",The peptide profile of a ripened cheese (Basqu...,10.1016/j.idairyj.2012.12.006
1069,FP,P02769,Serum albumin,Bos taurus,246-247,ACE-inhibitory,,315.00,,,,Characterisation of the hydrolytic specificity...,"Norris, R. et al.",The hydrolytic specificity of Aspergillus nige...,10.1016/j.foodchem.2014.01.056
1070,FP,P02769,Serum albumin,Bos taurus,246-247,ACE-inhibitory,,315.00,,,,Structural analysis of new antihypertensive pe...,"Abubakar, A. et al.",Whey protein was digested with one of seven ki...,10.3168/jds.S0022-0302(98)75878-3
1071,FP,P02769,Serum albumin,Bos taurus,246-247,ACE-inhibitory,,205.00,,,,Characterization of New Milk-derived Inhibitor...,"Fuglsang, A. et al.",inhibition of angiotensin converting enzyme (A...,10.1080/1475636031000138723


In [None]:
# Number of distinct peptide sequences in the MBPDB export.
len(set(mbpdb_df['peptide']))

List of human milk proteins from the OSU Proteome Discoverer database, which Dave compiled from a few papers (not comprehensive)

In [None]:
# Read the FASTA file. A forward slash is used as the path separator: it works
# on every platform, whereas the backslash form is Windows-only and contains
# the invalid string escape '\H'.
with open('protein_lists/HumanMilkProteinDatabase_v2.fasta', 'r') as file:
    content = file.readlines()

# Extract unique protein IDs: the second '|'-separated field of every header
# line that starts with '>sp|' (other header lines are ignored).
hum_protein_ids = {line.split('|')[1] for line in content if line.startswith('>sp|')}
hum_protein_ids=list(hum_protein_ids)
print(len(hum_protein_ids))


Test lists and the list of proteins found in MBPDB peptides, used for development

In [26]:
# The 42 UniProt accessions that currently have matches in the MBPDB.
uniprot_ids = [
    "P02666", "P47710", "P02662", "P04653", "P09115", "P18626",
    "O97943", "O62823", "P02663", "P04654", "P33049", "A0A1L6KYI1",
    "E9NZN2", "P05814", "P11839", "P09116", "Q9TSI0", "Q9TVD0",
    "P33048", "P86273", "A0A344X7B9", "P02668", "P07498", "P02669",
    "I6UFY2", "P02670", "P80195", "P02754", "P02755", "P02756",
    "P00711", "P00710", "P24627", "P02788", "P14632", "Q29477",
    "O77698", "P14639", "P02769", "P67976", "L8I8G5", "P01966",
]

# Short list of accessions for quick test runs.
test_list = ['P61278', 'O43612', 'P02788', 'P62158', 'P02666']

# Executes code and searches list

This is the code that runs the UniProt functions above on a protein list of interest; it averages ~2 sec per protein

In [27]:
# Fetch and parse the UniProt record for every accession in test_list; the call
# returns the combined peptide table and the combined reference table.
processed_peptide_df, processed_reference_df = process_species_data(test_list)

Processing P61278 (Index: 1 of 5) took 1.77 seconds.
Processing O43612 (Index: 2 of 5) took 1.69 seconds.
Processing P02788 (Index: 3 of 5) took 2.00 seconds.
Processing P62158 (Index: 4 of 5) took 1.62 seconds.
Processing P02666 (Index: 5 of 5) took 1.76 seconds.


In [28]:
# Display the combined peptide table.
processed_peptide_df

Unnamed: 0,proteinID,protein_name,protein_desc,interval,peptide,description,feature_evidence,evidence_to_pubmed,comment_evidence,associated_function,evidence_pubmed,non_associated_function
0,P61278,SMS_HUMAN,Somatostatin,31-43,LRQFLQKSLAAAA,Neuronostatin,,"{'5': '29615476', '6': '29615476'}","{5, 2, 3}",May enhance low-glucose-induced glucagon relea...,[29615476],
1,P61278,SMS_HUMAN,Somatostatin,89-116,SANSNPAMAPRERKAGCKNFFWKTFTSC,Somatostatin-28,,"{'5': '29615476', '6': '29615476'}",,,[],"[Inhibits the secretion of pituitary hormones,..."
2,P61278,SMS_HUMAN,Somatostatin,103-116,AGCKNFFWKTFTSC,Somatostatin-14,,"{'5': '29615476', '6': '29615476'}",{5},"Inhibits the secretion of pituitary hormones, ...",[29615476],
3,O43612,OREX_HUMAN,Hypocretin neuropeptide precursor,34-66,QPLPDCCRQKTCSCRLYELLHGAGNHAAGILTL,Orexin-A,1.0,"{'2': '10973318', '3': '15479620', '4': '16429...",{1},Neuropeptides that play a significant role in ...,[],
4,O43612,OREX_HUMAN,Hypocretin neuropeptide precursor,70-97,RSGPPGLQGRLQRLLQASGNHAAGILTM,Orexin-B,1.0,"{'2': '10973318', '3': '15479620', '4': '16429...",{1},Neuropeptides that play a significant role in ...,[],
5,P02788,TRFL_HUMAN,Lactotransferrin,20-67,GRRRSVQWCAVSQPEATKCFQWQRNMRKVRGPPVSCIKRDSPIQCIQA,Lactoferricin-H,,"{'2': '10089347', '3': '10792619', '4': '10828...",,,[],[Transferrins are iron binding transport prote...
6,P02788,TRFL_HUMAN,Lactotransferrin,171-201,FFSASCVPGADKGQFPNLCRLCAGTGENKCA,Kaliocin-1,,"{'2': '10089347', '3': '10792619', '4': '10828...",{13},Has antimicrobial activity and is able to perm...,[12693969],
7,P02788,TRFL_HUMAN,Lactotransferrin,338-343,YLGSGY,Lactoferroxin-A,,"{'2': '10089347', '3': '10792619', '4': '10828...",{14},Has opioid antagonist activity (PubMed:1369293...,[1369293],
8,P02788,TRFL_HUMAN,Lactotransferrin,543-547,RYYGY,Lactoferroxin-B,,"{'2': '10089347', '3': '10792619', '4': '10828...",{14},Has opioid antagonist activity (PubMed:1369293...,[1369293],
9,P02788,TRFL_HUMAN,Lactotransferrin,680-686,KYLGPQY,Lactoferroxin-C,,"{'2': '10089347', '3': '10792619', '4': '10828...",{14},Has opioid antagonist activity (PubMed:1369293...,[1369293],


A series of merge steps to combine peptide and reference data; expands the peptide list if multiple references are mentioned

In [None]:
# Explode the 'evidence_pubmed' column, creating a row for each entry in the list
# (a peptide whose list is empty becomes a single row with NaN).
expanded_df = processed_peptide_df.explode('evidence_pubmed')

# Rows that carry a PubMed ID are matched to references on (proteinID, PubMed ID)
filtered_df = expanded_df.dropna(subset=['evidence_pubmed'])
merged_df = pd.merge(filtered_df, processed_reference_df,
                     left_on=['proteinID', 'evidence_pubmed'],
                     right_on=['proteinID', 'pubmed'],
                     how='left')

# Rows without a PubMed ID are matched to references on (proteinID, interval == scope_range)
no_pubmed_df = expanded_df[expanded_df['evidence_pubmed'].isna()]
merged_df_nopubmed_df = pd.merge(no_pubmed_df, processed_reference_df,
                                 left_on=['proteinID', 'interval'],
                                 right_on=['proteinID', 'scope_range'],
                                 how='left')

# Interval-matched rows whose reference has no PubMed ID. Taken as an explicit
# copy so it is an independent frame rather than a view of merged_df_nopubmed_df.
interval_matches = merged_df_nopubmed_df['interval'] == merged_df_nopubmed_df['scope_range']
matching_interval_df_no_pubmed = merged_df_nopubmed_df[
    interval_matches & merged_df_nopubmed_df['pubmed'].isna()
].copy()

# Combine merged_df and merged_df_nopubmed_df into a single dataframe
full_merged_df = pd.concat([merged_df, merged_df_nopubmed_df], ignore_index=True)


def _resolve_pubmed_id(row):
    """Pick the PubMed ID for a merged row: whichever side is present, or the shared value; None on conflict."""
    if pd.isna(row['pubmed']):
        return row['evidence_pubmed']
    if pd.isna(row['evidence_pubmed']):
        return row['pubmed']
    return row['evidence_pubmed'] if row['evidence_pubmed'] == row['pubmed'] else None


full_merged_df['pubmed_id'] = full_merged_df.apply(_resolve_pubmed_id, axis=1)

# Rows with a usable reference: those with a PubMed ID plus the interval-matched rows
reff_merged_df = full_merged_df[full_merged_df['pubmed_id'].notna()].copy()
reff_merged_df = pd.concat([reff_merged_df, matching_interval_df_no_pubmed], ignore_index=True)

# Rows without any reference: no PubMed ID and not among the interval-matched peptides.
# Peptides are identified by "<proteinID>_<interval>"; the identifiers are kept as
# standalone Series so no temporary columns have to be added and dropped again.
full_uid = full_merged_df['proteinID'] + '_' + full_merged_df['interval'].astype(str)
matching_uid = matching_interval_df_no_pubmed['proteinID'] + '_' + matching_interval_df_no_pubmed['interval'].astype(str)
in_matching = full_uid.isin(matching_uid)
no_reff_merged_df = full_merged_df[full_merged_df['pubmed_id'].isna() & ~in_matching].copy()

# Kept from the original cell: disables pandas' chained-assignment warning for
# the cells that run after this one.
pd.options.mode.chained_assignment = None


Fetches abstract and reference information from PubMed; takes ~1 second per row

In [None]:
# Total number of rows, used only for the progress messages printed by fetch_details
total_rows = len(reff_merged_df)

# One fetch_details call per row; each result is either a dict of details or None
results = reff_merged_df.apply(lambda row: fetch_details(row, row.name + 1, total_rows), axis=1)

# Spread the result dicts over the columns of a new DataFrame (None where the fetch returned nothing)
pubmed_details = pd.DataFrame()
for field in ('pubmed_id', 'abstract', 'title', 'authors'):
    pubmed_details[field] = results.apply(lambda found, field=field: found[field] if found else None)

# 'doi' may be absent from a result dict, so check for the key before reading it
pubmed_details['doi'] = results.apply(lambda found: found['doi'] if found and 'doi' in found else None)

The extracted PubMed data can be explored below; where the PubMed ID was NaN, erroneous references can be extracted

In [None]:
# The notebook's import cell has `import qgrid` commented out, so import it here
# and fall back to the plain DataFrame display when qgrid is not installed.
try:
    import qgrid as _qgrid
except ImportError:
    _qgrid = None

if _qgrid is not None:
    pubmed_details_qgrid = _qgrid.show_grid(pubmed_details, show_toolbar=True, grid_options={'forceFitColumns': False})
else:
    pubmed_details_qgrid = pubmed_details
pubmed_details_qgrid

In [None]:
# Remove duplicate entries from the pubmed_details DataFrame based on the 'pubmed_id' column
pubmed_details_unique = pubmed_details.drop_duplicates(subset='pubmed_id')

# Merge the reff_merged_df with the unique pubmed_details based on 'pubmed_id'.
# Columns present in both frames keep their name for reff_merged_df and get the
# suffix '_pubmed' for pubmed_details_unique.
# NOTE(review): rows whose 'pubmed_id' is NaN in reff_merged_df are joined on NaN,
# so an ID that fetch_details found through a title search is not used as the
# join key here - confirm this is intended.
reff_merged_final_df = pd.merge(reff_merged_df, pubmed_details_unique, on='pubmed_id', how='left', suffixes=('', '_pubmed'))

# Where the UniProt-derived 'title', 'doi' or 'authors' is missing, take the PubMed value
for column in ('title', 'doi', 'authors'):
    reff_merged_final_df[column] = reff_merged_final_df[column].fillna(reff_merged_final_df[f'{column}_pubmed'])

# Helper columns that are no longer needed. errors='ignore' because some of them
# (e.g. 'comment_evidence') only exist when at least one peptide produced them.
reff_helper_columns = ['title_pubmed', 'doi_pubmed', 'authors_pubmed', 'pubmed', 'feature_evidence',
                       'evidence_to_pubmed', 'evidence_pubmed', 'comment_evidence', 'reference key', 'scope_range']
reff_merged_final_df = reff_merged_final_df.drop(columns=reff_helper_columns, errors='ignore')

no_reff_helper_columns = ['pubmed', 'feature_evidence', 'evidence_to_pubmed', 'evidence_pubmed', 'comment_evidence',
                          'reference key', 'title', 'authors', 'doi', 'scope_range', 'pubmed_id']
no_reff_merged_final_df = no_reff_merged_df.drop(columns=no_reff_helper_columns, errors='ignore')

print_critical_info(no_reff_merged_final_df, "no_reff_merged_final_df")
print_critical_info(reff_merged_final_df, "reff_merged_final_df")

In [None]:
# Display the peptides for which no reference could be matched.
no_reff_merged_final_df

Extracts the associated function from non_associated_function by referencing the PubMed ID and description columns

In [None]:
# How many rows start out with an empty-string 'associated_function' in each frame
initial_empty_reff = reff_merged_final_df['associated_function'].eq('').sum()
initial_empty_no_reff = no_reff_merged_final_df['associated_function'].eq('').sum()

# Try to fill the empty ones from the non-associated entries, row by row
reff_merged_final_df['associated_function'] = reff_merged_final_df.apply(update_associated_function, axis=1)
no_reff_merged_final_df['associated_function'] = no_reff_merged_final_df.apply(update_associated_function, axis=1)

# Difference between the empty counts before and after = number of functions assigned
change_reff = initial_empty_reff - reff_merged_final_df['associated_function'].eq('').sum()
change_no_reff = initial_empty_no_reff - no_reff_merged_final_df['associated_function'].eq('').sum()

print(f"In reff_merged dataframe, {change_reff} associated_functions were assigned.")
print(f"In no_reff_merged dataframe, {change_no_reff} associated_functions were assigned.")

# Once a row has a non-empty 'associated_function', clear its 'non_associated_function'
reff_has_function = reff_merged_final_df['associated_function'].notna() & reff_merged_final_df['associated_function'].ne("")
reff_merged_final_df.loc[reff_has_function, 'non_associated_function'] = np.nan

no_reff_has_function = no_reff_merged_final_df['associated_function'].notna() & no_reff_merged_final_df['associated_function'].ne("")
no_reff_merged_final_df.loc[no_reff_has_function, 'non_associated_function'] = np.nan

# Exports/Imports data to CSV

Exports real protein list to CSV

In [None]:
# Write both result tables to CSV files in the data/ directory.
no_reff_merged_final_df.to_csv('data/no_reff_merged_final_df.csv')
reff_merged_final_df.to_csv('data/reff_merged_final_df.csv')

Exports test protein list to CSV

In [None]:
# NOTE(review): uni_no_reff_merged_final_df and uni_reff_merged_final_df are not
# assigned in any earlier cell of this notebook as shown; the only assignment is
# in the later import cell, which reads the files written here. On a fresh kernel
# this cell would raise NameError - confirm where these frames should come from.
uni_no_reff_merged_final_df.to_csv('data/uni_no_reff_merged_final_df.csv')
uni_reff_merged_final_df.to_csv('data/uni_reff_merged_final_df.csv')

Import test dataframes from CSV

In [None]:
# Reload the test-list tables; the first CSV column is used as the index.
uni_reff_merged_final_df = pd.read_csv('data/uni_reff_merged_final_df.csv', index_col=0)
uni_no_reff_merged_final_df = pd.read_csv('data/uni_no_reff_merged_final_df.csv', index_col=0)
# Cast PubMed IDs to the nullable integer dtype 'Int64'.
uni_reff_merged_final_df['pubmed_id'] = uni_reff_merged_final_df['pubmed_id'].astype('Int64')

Import real dataframes from CSV

In [None]:
# Reload the real-data tables; the first CSV column is used as the index.
reff_merged_final_df = pd.read_csv('data/reff_merged_final_df.csv', index_col=0)
no_reff_merged_final_df = pd.read_csv('data/no_reff_merged_final_df.csv', index_col=0)

# Converts the PubMed ID column from float to the nullable integer dtype;
# missing values are replaced with pd.NA first.
reff_merged_final_df['pubmed_id'] = (
    reff_merged_final_df['pubmed_id']
    .where(pd.notna(reff_merged_final_df['pubmed_id']), pd.NA)
    .astype('Int64')
)


# Explores Organized data / make corrections to associated function

Prints key info on dataframe

In [None]:
# Print critical information about both real-data DataFrames.
# The second argument is presumably a label shown with the summary — confirm
# against the print_critical_info definition earlier in the notebook.
print_critical_info(no_reff_merged_final_df, "data/no_reff_merged_final_df")
print_critical_info(reff_merged_final_df, "data/reff_merged_final_df")

### Explores dataframe with references of real data

### Edit the dataframe to add the associated functions from non_associated_function entries that were not parsed:  
    1) Copy the associated line from non_associated_function to associated_function.  
            Example: Casocidin-1 in description is Casocidin-I in function, so this has to be manually parsed.     
    2) Delete the non_associated_function text, leaving the cell blank (not NaN) to indicate a manual edit.      
    3) If there is no associated_function, then insert "indeterminable" into the associated_function column and leave the non_associated_function text. 
    4) To save the changes, run the next cell, which calls ".get_changed_df()".

In [None]:
# Test-data variant of the grid, kept disabled:
#uni_reff_merged_final_df_qgrid = qgrid.show_grid(uni_reff_merged_final_df, show_toolbar=True, grid_options={'forceFitColumns': False})
#uni_reff_merged_final_df_qgrid

# Open the real-data frame (with references) in an editable qgrid widget; the
# manual edits are collected afterwards with .get_changed_df().
# NOTE(review): `import qgrid` is commented out in the imports cell, so this
# cell raises NameError on a fresh kernel unless that import is re-enabled.
reff_merged_final_df_qgrid = qgrid.show_grid(reff_merged_final_df, show_toolbar=True, grid_options={'forceFitColumns': False})
reff_merged_final_df_qgrid

In [None]:
# Capture the frame as currently shown in the grid widget, including manual edits.
# The result depends on interactive state, so it is not reproducible from code alone.
reff_merged_final_me_df = reff_merged_final_df_qgrid.get_changed_df()

### Explores dataframe w/o references of real data

### Edit the dataframe to add the associated functions from non_associated_function entries that were not parsed:  
    1) Copy the associated line from non_associated_function to associated_function.  
            Example: Casocidin-1 in description is Casocidin-I in function, so this has to be manually parsed.     
    2) Delete the non_associated_function text, leaving the cell blank (not NaN) to indicate a manual edit.      
    3) If there is no associated_function, then insert "indeterminable" into the associated_function column and leave the non_associated_function text. 
    4) To save the changes, run the next cell, which calls ".get_changed_df()".

In [None]:
# Test-data variant of the grid, kept disabled:
#uni_no_reff_merged_final_df_qgrid = qgrid.show_grid(uni_no_reff_merged_final_df, show_toolbar=True, grid_options={'forceFitColumns': False})
#uni_no_reff_merged_final_df_qgrid

# Open the real-data frame (without references) in an editable qgrid widget; the
# manual edits are collected afterwards with .get_changed_df().
# NOTE(review): `import qgrid` is commented out in the imports cell, so this
# cell raises NameError on a fresh kernel unless that import is re-enabled.
no_reff_merged_final_df_qgrid = qgrid.show_grid(no_reff_merged_final_df, show_toolbar=True, grid_options={'forceFitColumns': False})
no_reff_merged_final_df_qgrid

In [None]:
# Capture the frame as currently shown in the grid widget, including manual edits,
# then display it. The result depends on interactive state in the widget.
no_reff_merged_final_me_df =no_reff_merged_final_df_qgrid.get_changed_df()
no_reff_merged_final_me_df

# Determines biological function using OpenAI/ChatGPT API

In [None]:
def extract_info(response_string):
    """
    Extracts function names, evidence, and logic from the provided response string.

    The response is split into paragraphs on blank lines ("\n\n"). Every
    paragraph containing a "Function:" label yields one tuple; paragraphs
    without that label are skipped. Each field is the text between the first
    occurrence of its label and the following "|" separator, stripped of
    surrounding whitespace.

    A paragraph that has "Function:" but lacks "Evidence:" or "Logic:" yields
    an empty string for the missing field instead of raising IndexError, so a
    single malformed model reply cannot abort a long batch run.

    Args:
    - response_string (str): The response string containing function information.
      None or an empty string yields an empty list.

    Returns:
    - list: A list of tuples where each tuple contains function name, logic, and evidence.
    """

    def _field(chunk, label):
        # Text after the first `label`, cut at the next "|"; "" when the label is absent.
        pieces = chunk.split(label)
        if len(pieces) < 2:
            return ""
        return pieces[1].split("|")[0].strip()

    if not response_string:
        return []

    extracted_info = []
    for func_str in response_string.split("\n\n"):
        if "Function:" not in func_str:
            continue
        # Tuple order is (name, logic, evidence), which is what callers unpack.
        extracted_info.append((
            _field(func_str, "Function:"),
            _field(func_str, "Logic:"),
            _field(func_str, "Evidence:"),
        ))

    return extracted_info

def classify_bioactivity(peptide, description, associated_function, non_associated_function, abstract, title, mbpdb_function_list):
    """
    Asks the OpenAI chat API which bioactivity functions apply to a peptide.

    A single prompt is assembled from the peptide's annotations and the
    candidate function list, sent to the "gpt-4" chat model, and the text of
    the first returned choice is parsed with extract_info.

    Args:
    - peptide (str): The peptide of interest.
    - description (str): Description of the peptide.
    - associated_function (str): The associated function of the peptide.
    - non_associated_function (str): Function of the protein or all peptides from that protein.
    - abstract (str): The abstract related to the peptide.
    - title (str): The title of the source.
    - mbpdb_function_list (list): List of potential functions.

    Returns:
    - list: A list of tuples where each tuple contains function name, logic, and evidence.
    """
    # Prompt fragments in the order they are concatenated; every argument is
    # interpolated through its default string conversion.
    prompt_parts = [
        f"Given the peptide {peptide}, with the description: {description}, ",
        f"and the associated_function: {associated_function}, ",
        f"or function of the protein or all peptides from that protein called non_associated_function: {non_associated_function}, ",
        f"\n\nif associated_function = indeterminable then do ignore the non_associated_function input ",
        f"title: {title}, and abstract: {abstract}, ",
        f"please identify any bioactivity functions of the peptide from the list provided(mbpdb_function_list). ",
        f"For each identified function, please provide the response in the following format: ",
        f"'Function: [Function Name] | Evidence: [Specific evidence from the sources] | Logic: [Explanation for the choice]'",
        f"\n\nList of potential functions:{mbpdb_function_list}.",
    ]
    query = "".join(prompt_parts)

    # One system message followed by two user messages; the prompt goes last.
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "who is an expert in bioactive peptides"},
        {"role": "user", "content": query},
    ]

    # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface —
    # confirm the installed openai version still provides it.
    completion = openai.ChatCompletion.create(model="gpt-4", messages=conversation)

    # Only the first choice's message text is used.
    reply_text = completion['choices'][0]['message']['content']
    return extract_info(reply_text)


In [None]:
# Accumulates one output row per (input row, GPT-identified function) pair
results = []

# Row fields forwarded to classify_bioactivity, in its positional-argument order
prompt_columns = ['peptide', 'description', 'associated_function', 'non_associated_function', 'abstract', 'title']

# enumerate supplies a 0-based counter for the progress message;
# the frame's own index label is not used
for index, (_, row) in enumerate(reff_merged_final_me_df.iterrows()):
    # Time each API round trip
    start_time = time.time()

    functions_with_logic = classify_bioactivity(*[row[column] for column in prompt_columns], mbpdb_function_list)

    # Fan out: each returned (function, logic, evidence) tuple becomes its own copy of the row.
    # A row for which nothing is returned contributes no output rows.
    for func, logic, evidence in functions_with_logic:
        new_row = row.copy()
        new_row['gpt_function'], new_row['gpt_logic'], new_row['gpt_evidence'] = func, logic, evidence
        results.append(new_row)

    elapsed_time = time.time() - start_time
    print(f"Processed row {index + 1}/{len(reff_merged_final_me_df)} in {elapsed_time:.2f} seconds")

# Build the annotated frame in one step from the collected rows
reff_merged_final_me_gpt_df = pd.DataFrame(results)

# Display the updated DataFrame using qgrid
# updated_df = qgrid.show_grid(updated_df, show_toolbar=True, grid_options={'forceFitColumns': False})
# updated_df        


In [None]:
# Display the GPT-annotated DataFrame in an interactive qgrid widget.
# NOTE(review): `import qgrid` is commented out in the imports cell, so this
# cell raises NameError on a fresh kernel unless that import is re-enabled.
reff_merged_final_me_gpt_df_qgrid = qgrid.show_grid(reff_merged_final_me_gpt_df, show_toolbar=True, grid_options={'forceFitColumns': False})
reff_merged_final_me_gpt_df_qgrid

In [None]:
# Defining the cleaning function
def clean_gpt_function(func):
    """
    Reduces a raw GPT function label to its first line.

    Everything from the first newline onward is discarded; the remaining text
    is stripped of surrounding whitespace and then of surrounding single quotes.
    """
    first_line, _, _ = func.partition("\n")
    return first_line.strip().strip("'")

# Normalise every GPT-assigned label in the existing frame, overwriting the column
gpt_function_column = reff_merged_final_me_gpt_df['gpt_function']
reff_merged_final_me_gpt_df['gpt_function'] = gpt_function_column.apply(clean_gpt_function)
# Show the distinct labels that remain after cleaning
list(set(reff_merged_final_me_gpt_df['gpt_function']))


In [None]:
# Save the GPT-annotated frame (index written as the first CSV column), then print its summary.
reff_merged_final_me_gpt_df.to_csv('data/reff_merged_final_me_gpt_df.csv')
print_critical_info(reff_merged_final_me_gpt_df,'data/reff_merged_final_me_gpt_df')

In [None]:
# Disabled manual-edit path (grid edit, then export):
#updated_df = qgrid.show_grid(updated_df, show_toolbar=True, grid_options={'forceFitColumns': False})
#updated_df = updated_df.get_changed_df()
#updated_df.to_csv('data/uni_reff_merged_final_me_gpt_df.csv')
# updated_df is bound to the same object as reff_merged_final_me_gpt_df (no copy is made).
updated_df = reff_merged_final_me_gpt_df

In [None]:
# Disabled: reload the GPT-annotated frame from disk instead of recomputing it.
#reff_merged_final_me_gpt_df=pd.read_csv('reff_merged_final_me_gpt_df.csv', index_col=0)
# Summary of the frame before the protein name/description merge.
print_critical_info(updated_df,"b4merge")

In [None]:
# Shape before de-duplication.
print(processed_peptide_df.shape)
# Keep the first row of every (proteinID, peptide) pair. These are the keys of
# the merge in the next cell, so repeated pairs would presumably multiply rows there.
processed_peptide_df = processed_peptide_df.drop_duplicates(subset=['proteinID', 'peptide'])
# Shape after de-duplication.
print(processed_peptide_df.shape)

In [None]:
# Keys shared by both frames, and the columns pulled in from processed_peptide_df
merge_keys = ['proteinID', 'peptide']
protein_cols = ['protein_name', 'protein_desc']

# Remember the pre-merge column order so the final layout can be derived from it
original_cols = list(updated_df.columns)

# Inner merge: rows without a (proteinID, peptide) match in processed_peptide_df are dropped
merged_df = updated_df.merge(
    processed_peptide_df[merge_keys + protein_cols],
    on=merge_keys,
    how='inner',
)

# Final layout: the first original column, then the two protein columns,
# then the remaining original columns in their original order
cols = original_cols[:1] + protein_cols + original_cols[1:]
updated_df = merged_df[cols]
print_critical_info(updated_df,"aftermerge")


In [None]:
# Display the merged, column-reordered frame.
updated_df

In [None]:
# Save the merged frame.
# NOTE(review): the file name carries the 'uni_' prefix used for the test-data frames,
# although updated_df is derived from reff_merged_final_me_gpt_df — confirm the intended path.
updated_df.to_csv('data/uni_reff_merged_final_me_gpt_df.csv')

# Compares UniProt list to MBPDB

Generates a list of unique peptides that can be cross-referenced with the MBPDB website

In [None]:
# Distinct peptide sequences in the GPT-annotated frame (set order is arbitrary).
list(set(reff_merged_final_me_gpt_df['peptide']))

Creates a new df with the peptides discovered in the UniProt search that match MBPDB on peptide (DOI is not currently part of the match in the code below)

In [None]:
# Step 1: Create db_match_df

# Short alias for the GPT-annotated frame that is compared against MBPDB
gpt_df = reff_merged_final_me_gpt_df

# Keep the rows whose 'peptide' value also occurs in mbpdb_df['peptide'].
# Only the peptide sequence is compared.
peptide_in_mbpdb = gpt_df['peptide'].isin(mbpdb_df['peptide'])
db_match_df = gpt_df[peptide_in_mbpdb]

# For each column below: number of distinct values that occur in gpt_df
# but in none of the matched rows
unique_peptides_not_matched = gpt_df.loc[~gpt_df['peptide'].isin(db_match_df['peptide']), 'peptide'].nunique()
unique_doi_not_matched = gpt_df.loc[~gpt_df['doi'].isin(db_match_df['doi']), 'doi'].nunique()
unique_gpt_functions_not_matched = gpt_df.loc[~gpt_df['gpt_function'].isin(db_match_df['gpt_function']), 'gpt_function'].nunique()

# Row counts: all GPT-annotated records, then the records whose peptide is in MBPDB
print("Number of records from UniPort/GPT search:", len(gpt_df))
print("Number of peptides matched with MBPDB:", len(db_match_df))

# Distinct-value counts computed above
print("Number of unique peptides not matched with MBPDB:", unique_peptides_not_matched)
print("Number of unique DOIs not matched with MBPDB:", unique_doi_not_matched)
print("Number of unique functions from GPT not matched with MBPDB:", unique_gpt_functions_not_matched)

# Display the matched rows
db_match_df


In [None]:
# Display the candidate function list that is passed to classify_bioactivity.
mbpdb_function_list

Creates dataframe with peptides that did not match MBPDB

In [None]:
# Step 2: Create no_db_match_df

# db_match_df (Step 1) is a subset of reff_merged_final_me_gpt_df, so the unmatched
# rows must come from that same frame. The previous code dropped db_match_df's index
# labels from uni_reff_merged_final_df, a different (test-data) frame, so those labels
# did not refer to the same records.
# A boolean mask on 'peptide' is used rather than .drop(index): it mirrors the
# "not matched" counts computed in Step 1 and does not depend on index labels.
no_db_match_df = reff_merged_final_me_gpt_df[
    ~reff_merged_final_me_gpt_df['peptide'].isin(db_match_df['peptide'])
]

# Report distinct peptides, as the message states (the frame holds one row per
# peptide/function pair, so its row count can be larger).
print("Number of unique peptides w/o MBPDB match:", no_db_match_df['peptide'].nunique())

# Display no_db_match_df using qgrid for an interactive view.
# 'show_toolbar' enables the grid toolbar; 'forceFitColumns': False stops the columns
# from being squeezed into the grid's width, allowing horizontal scrolling.
# NOTE(review): `import qgrid` is commented out in the imports cell, so this
# line raises NameError on a fresh kernel unless that import is re-enabled.
no_db_match_df_qgrid = qgrid.show_grid(no_db_match_df, show_toolbar=True, grid_options={'forceFitColumns': False})

# Return the qgrid display
no_db_match_df_qgrid


# Downloads an XML file from UniProt or PubMed

UniProt page download

In [None]:
# Specify the protein ID
protein = 'P10451'

# Construct the URL using the protein ID to fetch the XML content from UniProt
url = f'https://www.uniprot.org/uniprot/{protein}.xml'

# Fetch the page. The timeout (seconds) makes a stalled connection raise
# instead of hanging the notebook indefinitely.
response = requests.get(url, timeout=30)

# Check to ensure the request was successful (HTTP status code 200 means "OK")
if response.status_code == 200:
    # Make sure the output folder exists so open() cannot fail on a fresh checkout
    os.makedirs('xml_examples', exist_ok=True)
    # Save the XML content to a local file with the name '{protein}.xml'
    with open(f'xml_examples/{protein}.xml', 'wb') as file:
        file.write(response.content)
else:
    # If the request wasn't successful, print an error message with the HTTP status code
    print(f'Failed to retrieve the file: {response.status_code}')

PubMed page download

In [None]:
# Specify the PubMed ID
pubmed_id = '1369293'

# Use the Entrez system (from the BioPython library) to fetch the XML details of the article with the specified PubMed ID
handle = Entrez.efetch(db="pubmed", id=pubmed_id, retmode="xml")

# Read the whole response first and always release the handle, so the
# connection is not left open and no empty file is created if the read fails.
try:
    xml_payload = handle.read()
finally:
    handle.close()

# Make sure the output folder exists so open() cannot fail on a fresh checkout
os.makedirs('xml_examples', exist_ok=True)

# Save the raw XML response to a local file named '{pubmed_id}.xml'
with open(f'xml_examples/{pubmed_id}.xml', 'wb') as file:
    file.write(xml_payload)