In [1]:
!pip install biopython --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.9/3.3 MB[0m [31m22.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m3.0/3.3 MB[0m [31m37.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.3/3.3 MB[0m [31m36.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25h

**FASTA File Setup in Notebook**

In [2]:
import pandas as pd

# Prefix for the raw-file URLs of the FASTA inputs. File names are appended
# to it directly (plain string concatenation), so the trailing slash matters.
base_url = "https://raw.githubusercontent.com/mouminx/BBIO_383_FASTA/main/"

# Names of the FASTA files to download and summarise.
# NOTE(review): names look like <organism>_<yesbio|nobio>_<country>; confirm
# what yesbio/nobio stands for. The "cChina" spelling is part of a real file
# name that is fetched by URL, so it must not be "corrected" here.
fasta_files = [
    "Oceanospirillum_yesbio_china.fasta",
    "Oleispira_nobio_cChina.fasta",
    "dinor_yesbio_germany.fasta",
    "novo_yesbio_sweden.fasta",
    "rhodop_nobio_germany.fasta",
    "ricket_nobio_sweden.fasta"
]

**Download Files into Notebook**

In [3]:
import os

def download_fasta_files(filenames, base_url, save_dir="data"):
    """Download FASTA files into ``save_dir``, skipping ones already present.

    Args:
        filenames: Iterable of file names; each is appended to ``base_url``
            to form the download URL and joined to ``save_dir`` to form the
            local path.
        base_url: URL prefix (including the trailing slash).
        save_dir: Directory to store the files in; created if missing.

    Returns:
        None. One status line is printed per file: "Downloaded", "Already
        exists", or "Failed to download" (with the underlying error).

    A failed download never leaves a file at the final path, so a later run
    retries it instead of reporting "Already exists" for an empty file.
    """
    # Function-scope imports keep this cell self-contained; pure stdlib, so it
    # also works outside IPython (no shell escape / wget required).
    import shutil
    from urllib.request import urlopen

    os.makedirs(save_dir, exist_ok=True)
    for filename in filenames:
        url = base_url + filename
        local_path = os.path.join(save_dir, filename)
        if os.path.exists(local_path):
            print(f"Already exists: {filename}")
            continue

        # Write to a temporary sibling first and rename only on success, so an
        # interrupted or failed transfer cannot masquerade as a complete file.
        partial_path = local_path + ".part"
        try:
            with urlopen(url) as response, open(partial_path, "wb") as out_file:
                shutil.copyfileobj(response, out_file)
        except (OSError, ValueError) as exc:
            # OSError covers URLError/HTTPError and local I/O problems;
            # ValueError is raised for malformed URLs.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            print(f"Failed to download: {filename} ({exc})")
            continue

        os.replace(partial_path, local_path)
        print(f"Downloaded: {filename}")

# Fetch every file listed in fasta_files into the default "data" directory.
download_fasta_files(fasta_files, base_url)

Downloaded: Oceanospirillum_yesbio_china.fasta
Downloaded: Oleispira_nobio_cChina.fasta
Downloaded: dinor_yesbio_germany.fasta
Downloaded: novo_yesbio_sweden.fasta
Downloaded: rhodop_nobio_germany.fasta
Downloaded: ricket_nobio_sweden.fasta


**Count Product Types**

In [4]:
def count_product_types(filepath):
    """Tally the FASTA header lines of ``filepath`` by annotation status.

    A header is any line that starts with '>'. A header whose text contains
    "hypothetical" (case-insensitive) is counted as hypothetical; every other
    header is counted as identified.

    Args:
        filepath: Path of the FASTA file to scan.

    Returns:
        Tuple ``(hypothetical, identified, total)`` of header counts.
    """
    with open(filepath, 'r') as handle:
        # Only header lines matter here; sequence lines are ignored.
        headers = [entry for entry in handle if entry.startswith('>')]

    total = len(headers)
    hypothetical = sum(1 for header in headers if 'hypothetical' in header.lower())
    # Every header is either hypothetical or identified.
    identified = total - hypothetical
    return hypothetical, identified, total

**Concatenate All Sequences**

In [5]:
def concat_fasta_sequence(filepath):
    """Return all sequences of a FASTA file joined into a single string.

    Records are read with Biopython's ``SeqIO.parse`` and concatenated in
    file order with no separator between them.

    Args:
        filepath: Path of the FASTA file to read.

    Returns:
        One ``str`` holding every record's sequence back to back.
    """
    from Bio import SeqIO

    pieces = []
    for record in SeqIO.parse(filepath, "fasta"):
        pieces.append(str(record.seq))
    return "".join(pieces)

In [6]:
def calculate_dinucleotide_content(sequence):
    """Calculate the percentage of each dinucleotide in ``sequence``.

    Every overlapping pair of adjacent characters is counted (positions
    0-1, 1-2, 2-3, ...), so a sequence of length ``n`` yields ``n - 1``
    dinucleotides. The sequence is upper-cased first, making the result
    case-insensitive. Characters are not validated: any pair that occurs
    (e.g. one containing ``N``) gets its own entry and counts towards the total.

    Args:
        sequence: Nucleotide sequence as a string.

    Returns:
        Dict mapping each observed dinucleotide to its share of all
        dinucleotides, in percent (0-100). Pairs that never occur are absent.
        An empty or single-character sequence returns an empty dict.
    """
    # Function-scope import keeps this cell self-contained.
    from collections import Counter

    sequence = sequence.upper()

    # A sequence of length n has n - 1 overlapping two-character windows.
    total_dinucleotides = len(sequence) - 1

    # range() is empty for lengths 0 and 1, so no window is ever shorter than
    # two characters and the division below is never reached with a zero total.
    counts = Counter(sequence[i:i + 2] for i in range(total_dinucleotides))

    # Convert raw counts to percentages of all dinucleotides.
    return {
        dinucleotide: (count / total_dinucleotides) * 100
        for dinucleotide, count in counts.items()
    }

**MAIN**

In [7]:
import pandas as pd

# All 16 dinucleotides, used as table columns: the first base varies slowest
# and both positions run through A, T, C, G (AA, AT, AC, AG, TA, ...).
all_dinucleotides = [first + second for first in "ATCG" for second in "ATCG"]

# One dict per FASTA file; turned into a DataFrame in a single step below.
summary = []

for filename in fasta_files:
    filepath = os.path.join("data", filename)

    # Per-file statistics: header counts and dinucleotide percentages
    hypo, ident, total = count_product_types(filepath)
    sequence = concat_fasta_sequence(filepath)
    d_content = calculate_dinucleotide_content(sequence)

    # Product-type columns come first ...
    row = dict(
        zip(
            ("File", "# Hypothetical", "# Identified", "# Total"),
            (filename, hypo, ident, total),
        )
    )

    # ... followed by one column per dinucleotide (percent, 2 decimals);
    # a dinucleotide that never occurs is reported as 0.0
    for d in all_dinucleotides:
        row[d] = round(d_content.get(d, 0.0), 2)

    summary.append(row)

# Assemble the summary table and display it as the cell output
df = pd.DataFrame(summary)
df

Unnamed: 0,File,# Hypothetical,# Identified,# Total,AA,AT,AC,AG,TA,TT,TC,TG,CA,CT,CC,CG,GA,GT,GC,GG
0,Oceanospirillum_yesbio_china.fasta,1266,2649,3915,6.53,6.13,5.0,5.87,3.8,5.9,5.8,8.63,6.08,6.47,5.95,6.2,7.13,5.63,7.95,6.95
1,Oleispira_nobio_cChina.fasta,1352,1629,2981,9.52,8.1,4.62,6.62,6.87,9.03,4.89,7.36,6.16,5.73,4.13,4.01,6.31,5.29,6.39,4.98
2,dinor_yesbio_germany.fasta,894,3564,4458,3.21,4.37,5.06,4.18,1.2,3.03,6.16,6.77,5.31,5.51,9.29,12.27,7.1,4.26,11.87,10.41
3,novo_yesbio_sweden.fasta,1167,2343,3510,3.53,4.63,4.66,4.3,1.43,3.47,6.11,6.8,5.82,5.56,7.89,12.93,6.34,4.14,13.55,8.84
4,rhodop_nobio_germany.fasta,1090,3069,4159,3.83,4.89,4.46,4.29,1.29,3.18,6.58,6.74,5.72,5.3,7.59,13.54,6.63,4.42,13.51,8.0
5,ricket_nobio_sweden.fasta,1133,1146,2279,6.93,5.56,5.6,4.25,2.72,5.07,4.96,6.89,6.79,4.54,7.26,10.64,5.91,4.46,11.41,7.01
