# 1. METHOD 1: Via Uniprot REST API and `requests`

In [2]:
import requests

#! Get the Uniprot request URL and the name of the local output file
api_endpoint = "https://www.uniprot.org/uniprot/"
protein = "P01308" # put the uniprot protein ID here
req_url = api_endpoint+protein+".fasta"
filename = protein+".fasta"

#! Request the protein from Uniprot
#! (the timeout keeps the cell from hanging forever if the server never answers)
response = requests.get(req_url, timeout=30)

#! Check the HTTP status once: on success print the FASTA text and save it,
#! otherwise report the failing status code a single time
if response.status_code == 200:
    print(response.text)
    with open(filename, 'w') as file:
        file.write(response.text)
    print(f"✓ Downloaded {protein} to {filename}")
else:
    print("Something wrong!")
    print(response.status_code)

>sp|P01308|INS_HUMAN Insulin OS=Homo sapiens OX=9606 GN=INS PE=1 SV=1
MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAED
LQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN

✓ Downloaded P01308 to P01308.fasta


# 2. METHOD 2: via `Entrez` and `SeqIO` module in `Biopython`

In [1]:
from Bio import Entrez
from Bio import SeqIO

def retrieve_prot_sequence(protein_id):
    """Fetch one protein record via Entrez, print a summary and save it as FASTA.

    Parameters
    ----------
    protein_id : str
        Identifier passed to ``Entrez.efetch`` as ``id``. It is also used as
        the stem of the output file name (``<protein_id>.fasta``).

    Returns
    -------
    str
        The amino-acid sequence of the fetched record.

    Raises
    ------
    ValueError
        Propagated from ``SeqIO.read`` when the response does not contain
        exactly one FASTA record.
    """
    #! Set my email (mandatory for Entrez)
    Entrez.email = 'nguyuling@graduate.utm.my'

    #! Fetch protein
    handle = Entrez.efetch(
        db='protein',
        id=protein_id,
        rettype='fasta',
        retmode='text'
    )

    #! Read the FASTA record from the handle; close it even if parsing fails
    try:
        record = SeqIO.read(handle, 'fasta')
    finally:
        handle.close()

    #! Extract the sequence string
    protein_sequence = str(record.seq)

    #! Build an 80-character preview; only mark it with "..." when it was truncated
    preview = protein_sequence[:80]
    if len(protein_sequence) > 80:
        preview += "..."

    #! Print the formatted data
    print("Protein ID:", record.id)
    print("Description:", record.description)
    print("Length:", len(record.seq), "amino acids")
    print("Sequence:")
    print(preview)

    #! Save to file
    filename = protein_id + ".fasta"
    with open(filename, "w") as file:
        SeqIO.write(record, file, "fasta")
    print(f"Successfully saved protein ID {protein_id} to {filename}")

    return protein_sequence

# Function Call: fetch P01308 and keep the returned sequence string for later cells
protein_sequence = retrieve_prot_sequence("P01308")


Protein ID: sp|P01308.1|INS_HUMAN
Description: sp|P01308.1|INS_HUMAN RecName: Full=Insulin; Contains: RecName: Full=Insulin B chain; Contains: RecName: Full=Insulin A chain; Flags: Precursor
Length: 110 amino acids
Sequence:
MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPL...
Successfully saved protein ID P01308 to P01308.fasta


In [1]:
from Bio import Entrez
from Bio import SeqIO

def retrieve_prot_sequence(protein_id):
    """Fetch one protein record via Entrez, print a summary and save it as FASTA.

    Parameters
    ----------
    protein_id : str
        Identifier passed to ``Entrez.efetch`` as ``id``. It is also used as
        the stem of the output file name (``<protein_id>.fasta``).

    Returns
    -------
    str
        The amino-acid sequence of the fetched record.

    Raises
    ------
    ValueError
        Propagated from ``SeqIO.read`` when the response does not contain
        exactly one FASTA record.
    """
    #! Set my email (mandatory for Entrez)
    Entrez.email = 'nguyuling@graduate.utm.my'

    #! Fetch protein
    handle = Entrez.efetch(
        db='protein',
        id=protein_id,
        rettype='fasta',
        retmode='text'
    )

    #! Read the FASTA record from the handle; close it even if parsing fails
    try:
        record = SeqIO.read(handle, 'fasta')
    finally:
        handle.close()

    #! Extract the sequence string
    protein_sequence = str(record.seq)

    #! Build an 80-character preview; only mark it with "..." when it was truncated
    preview = protein_sequence[:80]
    if len(protein_sequence) > 80:
        preview += "..."

    #! Print the formatted data
    print("Protein ID:", record.id)
    print("Description:", record.description)
    print("Length:", len(record.seq), "amino acids")
    print("Sequence:")
    print(preview)

    #! Save to file
    filename = protein_id + ".fasta"
    with open(filename, "w") as file:
        SeqIO.write(record, file, "fasta")
    print(f"Successfully saved protein ID {protein_id} to {filename}")

    return protein_sequence

# Function Call: fetch P01308 and keep the returned sequence string for later cells
protein_sequence = retrieve_prot_sequence("P01308")


Protein ID: sp|P01308.1|INS_HUMAN
Description: sp|P01308.1|INS_HUMAN RecName: Full=Insulin; Contains: RecName: Full=Insulin B chain; Contains: RecName: Full=Insulin A chain; Flags: Precursor
Length: 110 amino acids
Sequence:
MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPL...
Successfully saved protein ID P01308 to P01308.fasta


# 3. BASIC SEQUENCE ANALYSIS

In [None]:
from collections import Counter
import pandas as pd

def generate_basic_analysis(protein_seq):
    """Tabulate the amino-acid composition of a protein sequence.

    Parameters
    ----------
    protein_seq : str
        Sequence whose characters are tallied one by one.

    Returns
    -------
    pandas.DataFrame
        One row per distinct character with the columns 'Amino Acid',
        'Count' and 'Percentage' (share of the sequence length, 0-100),
        ordered by 'Count' from highest to lowest.
    """
    #! Tally every character and remember the sequence length
    tally = Counter(protein_seq)
    seq_len = len(protein_seq)

    #! Build the three columns in first-occurrence order of the residues
    residues = list(tally)
    counts = [tally[res] for res in residues]
    percentages = [tally[res] / seq_len * 100 for res in residues]

    composition = pd.DataFrame({
        'Amino Acid': residues,
        'Count': counts,
        'Percentage': percentages,
    })

    #! Most frequent residues first
    return composition.sort_values(by='Count', ascending=False)

#! Function Call
#! Build the composition table for the sequence string and print it as plain text
df_protein = generate_basic_analysis(protein_sequence) # protein_sequence from previous section
print(df_protein)


--- Amino Acid Composition Results ---
   Amino Acid  Count  Percentage
2           L     20   18.181818
6           G     12   10.909091
1           A     10    9.090909
15          E      8    7.272727
11          Q      7    6.363636
5           P      6    5.454545
9           V      6    5.454545
13          C      6    5.454545
4           R      5    4.545455
14          S      5    4.545455
16          Y      4    3.636364
10          N      3    2.727273
17          T      3    2.727273
8           F      3    2.727273
18          K      2    1.818182
0           M      2    1.818182
12          H      2    1.818182
7           D      2    1.818182
3           W      2    1.818182
19          I      2    1.818182


# PART 1: Working with Protein Sequences (Beginner)

In [None]:
# RUn this cell first to install reuired packages
# You only need to run this once

!pip install biopython requests pandas matplotlib networkx numpy

print("\n✓ All packages installed successfully!")

<hr>

### 1.1 Understanding Protein Sequences
#### Background:
Proteins are biological molecules made up of amino acids. Each protein has a unique sequence of amino acids that determines its structure and function. In bioinformatics, we represent proteins as strings of one-letter amino acid codes.

#### **The 20 Standard Amino Acids:**
![20 standard aa](https://i0.wp.com/www.compoundchem.com/wp-content/uploads/2014/09/20-Common-Amino-Acids.png?fit=2480%2C1754&ssl=1)
- A (Ala), C (Cys), D (Asp), E (Glu), F (Phe)
- G (Gly), H (His), I (Ile), K (Lys), L (Leu)
- M (Met), N (Asn), P (Pro), Q (Gln), R (Arg)
- S (Ser), T (Thr), V (Val), W (Trp), Y (Tyr)

### Use Case 1: Analyzing Insulin Protein
**Biological Context:** Insulin is a hormone that regulates blood sugar. It's a small protein, making it perfect for learning.

**Data Source:** UniProt Database (https://www.uniprot.org/)
- Protein: Human Insulin (UniProt ID: P01308)
- Format: FASTA (text format for biological sequences)

### 1.1 Retrieving Protein Sequence Data

#### Method 1: via Uniprot REST API and `requests`

![](http://demoapp.chat/images/api.png)

In this method, we use the UniProt API endpoint and send the request with Python's `requests` library. This works the same way as browsing the internet: to visit a website we need to know its URL, and likewise, to retrieve the data we need to know the "link".

That link has a fixed, general part, a.k.a. the **"API endpoint"**. For UniProt, we can use the following API endpoint: `https://www.uniprot.org/uniprot/{protein-id}.fasta`, where `protein-id` is the ID of any specific protein in UniProt.

In [None]:
import requests

# Request URL = <API endpoint> + <protein ID> + ".fasta"
api_endpoint = "https://www.uniprot.org/uniprot/"
protein = "P01308"  # put the uniprot protein ID here
req_url = f"{api_endpoint}{protein}.fasta"

print(req_url)

Now that the request URL is ready, we need to send the request to the UniProt server from Python and retrieve the data.

In [None]:
# Send the GET request; the timeout keeps the cell from hanging if the server never answers
response = requests.get(req_url, timeout=30)

# Status 200 means the request succeeded and response.text holds the FASTA text
if response.status_code == 200:
    print(response.text)
else:
    print("Something wrong!")
    print(response.status_code)

If we enhance the code above a little further, we can save the response text to a FASTA file. With that, we have everything needed to download protein sequence data (in FASTA format) from Python.

In [None]:
# Send the GET request; the timeout keeps the cell from hanging if the server never answers
response = requests.get(req_url, timeout=30)

# The output file is named after the protein ID
filename = protein+".fasta"

# Only write the file when the request succeeded
if response.status_code == 200:
    with open(filename, 'w') as file:
        file.write(response.text)
    print(f"✓ Downloaded {protein} to {filename}")
else:
    print("Something wrong!")
    print(response.status_code)

#### Method 2: via `Entrez` and `SeqIO` module in `Biopython`

Unlike Method 1 above, using `Entrez` and `SeqIO` from `Biopython` retrieves the same data for the same protein ID, but the result is returned as a record object, which is more flexible to work with. For example, one can extract only the sequence itself through the record's `.seq` attribute.

In [None]:
from Bio import Entrez
from Bio import SeqIO

In [None]:
# A contact email has to be set before sending requests through Entrez
Entrez.email = 'cwenghowe@utm.my'
protein = "P01308"

# Request the record from the 'protein' database as plain-text FASTA
handle = Entrez.efetch(
    db='protein',
    id=protein,
    rettype='fasta',
    retmode='text',
)

In [None]:
# Parse the single FASTA record from the handle, then close the handle
# (same read-then-close pattern used by retrieve_prot_sequence)
try:
    record = SeqIO.read(handle, 'fasta')
finally:
    handle.close()

In [None]:
# Show the record rendered as FASTA text
print(record.format('fasta'))

In [None]:
# Show only the sequence of the record
print(record.seq)

To save the retrieved protein data from `Entrez`, we can use `SeqIO.write()` as follows

In [12]:
# Write the record to "<protein>.fasta" in FASTA format
filename = f"{protein}.fasta"
with open(filename, "w") as file:
    SeqIO.write(record, file, "fasta")

#### Practice 1
Now create a function named `retrieve_prot_sequence()` that accepts one parameter, the protein ID, and does not return anything. The function takes the protein ID, retrieves the record via the `Entrez` method, and stores it in a FASTA file.

In [None]:
# my answer

from Bio import Entrez
from Bio import SeqIO

def retrieve_prot_sequence(protein_id):
    """Fetch one protein record via Entrez and store it in a FASTA file.

    Parameters
    ----------
    protein_id : str
        Identifier passed to ``Entrez.efetch`` as ``id``. It is also used as
        the stem of the output file name (``<protein_id>.fasta``).

    Returns
    -------
    None
        The record is written to disk and a confirmation line is printed.

    Raises
    ------
    ValueError
        Propagated from ``SeqIO.read`` when the response does not contain
        exactly one FASTA record.
    """
    # Set your email (mandatory for Entrez)
    Entrez.email = 'nguyuling@graduate.utm.my'

    # Fetch
    handle = Entrez.efetch(
        db='protein',
        id=protein_id,
        rettype='fasta',
        retmode='text'
    )

    # Read the FASTA record from the handle; close it even if parsing fails
    try:
        record = SeqIO.read(handle, 'fasta')
    finally:
        handle.close()

    # Write the SeqRecord object to the FASTA file
    filename = f"{protein_id}.fasta"
    with open(filename, "w") as file:
        SeqIO.write(record, file, "fasta")

    print(f"Successfully retrieved and saved protein ID {protein_id} to {filename}")

<hr>

### 1.2 Parsing Protein Sequence Data
For the protein data retrieved from `Entrez` method above, we can further parse the data using `SeqIO.parse()`

In [None]:
# lets read the file P01308.fasta

# Collect one dictionary per record found in the FASTA file
insulin_data = []

for record in SeqIO.parse('P01308.fasta', "fasta"):
    insulin_data.append(dict(
        id=record.id,
        description=record.description,
        sequence=str(record.seq),
        length=len(record.seq),
    ))

print(insulin_data)

In [None]:
# Print a short, labelled summary for every parsed record
for info in insulin_data:
    print(f"Protein ID: {info['id']}")
    print(f"Description: {info['description']}")
    print(f"Length: {info['length']} amino acids")
    print("Sequence:")
    print(f"{info['sequence'][:80]}...")  # Show first 80 characters

<hr>

### 1.3 Basic Sequence Analysis


In [None]:
# Sequence retrieval: take the amino-acid string out of the insulin_data list.
#   insulin_data[0]  -> the first dictionary in the list (the parsed FASTA file is
#                       expected to hold a single record, so this is the only one)
#   ['sequence']     -> the value stored under the 'sequence' key of that dictionary,
#                       i.e. the protein sequence string
insulin_sequence = insulin_data[0]['sequence']

print(insulin_sequence)

In [1]:
from collections import Counter
# Counter is an efficient tool for tallying the frequency of items in a list or characters in a string.

# Count each amino acid
# Counter walks over the insulin_sequence string and tallies every distinct character,
# storing the result as a dictionary-like object that maps residue -> number of occurrences.
aa_count = Counter(insulin_sequence)

# len() gives the total number of amino acids in the sequence.
total = len(insulin_sequence)

print(aa_count)
print(total)

In [None]:
# Calculate percentages
# The dictionary comprehension loops over every (aa, count) pair in aa_count, where aa is
# the amino acid and count is its frequency, and computes (count / total) * 100 for each.
# The result is a new dictionary, aa_percent, whose keys are amino acids and whose values
# are their percentage of the whole sequence.
aa_percent = {aa: (count/total)*100 for aa, count in aa_count.items()}

print(aa_percent)

<hr>