# BLAST



In [1]:
from collections.abc import Iterator

import requests

# URL to the raw content of the file
fasta_url = "https://github.com/oasci/pitt-biosc1540-2025s/raw/refs/heads/main/content/data/fasta/synthetic.fasta"

# Fetch the file content. The timeout keeps a stalled connection from
# blocking the kernel indefinitely.
response = requests.get(fasta_url, timeout=30)

# Stop here on anything other than HTTP 200. Merely printing a message would
# leave FASTA_FILE undefined and make the next cell fail with an unrelated
# NameError instead of pointing at the download.
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch file. Status code: {response.status_code}")

# Decoded body of the response: the FASTA text parsed by the cells below.
FASTA_FILE = response.text


In [2]:
def read_fasta(data: str) -> dict[str, str]:
    """Parse FASTA-formatted text into a ``{header: sequence}`` dictionary.

    Args:
        data: Full FASTA text. Lines are stripped of surrounding whitespace
            and blank lines are ignored.

    Returns:
        A dictionary keyed by header text (the leading ``>`` and surrounding
        whitespace removed). Each value is the concatenation of the lines
        that follow that header, up to the next header or the end of input.
        Lines appearing before the first header are discarded, and a header
        that occurs more than once keeps only its last sequence.
    """
    chunks_by_header: dict[str, list[str]] = {}
    # List collecting the lines of the record currently being read; stays
    # None until the first header has been seen.
    active_chunks = None

    for raw_line in data.splitlines():
        text = raw_line.strip()
        if text == "":
            continue  # Nothing to record on a blank line
        if text[0] == ">":
            # A header opens a new record; everything after '>' is its key.
            active_chunks = []
            chunks_by_header[text[1:].strip()] = active_chunks
        elif active_chunks is not None:
            active_chunks.append(text)

    # Join the collected pieces of each record into one sequence string.
    return {header: "".join(chunks) for header, chunks in chunks_by_header.items()}

# Parse the downloaded FASTA text into a {header: sequence} dictionary and
# print the parsed result.
seqs = read_fasta(data=FASTA_FILE)
print(seqs)

{'synthetic_seq1': 'GCATGTAGGCGCTGGACTCGCTAGTAGTTTTGGGGCTGGAGACCGGAAAACATGTGCTACCTCACTTAGTACTAGCGGGGCAAGACATGCTGCTCTGCGAGTTATGACAGCGGAGAATTACTTTAGGATTTATTAAATCCGAGCCGGCATCCTTTTTCGTCTATGTCTACGAAAATTACAATGGCCGCCTCAGTGATGCGCGTAACCTAGTACGATGCCTAGTGAATT', 'synthetic_seq2': 'GGCGATAAGTTAAATTGTGTCAAGGGATGTCTTCGGAGTTCGAGCAACTGCATACCCCCAGTTAACGTCGTCCTGCCGGCAACGAGCAGCAATACAAGAGCGCCACTATCCTCCCCTACAAACGTATGCACCAAGCCAAGTCCCCATATCAAGGTATCCACGAGCTCAAGGTACTGTCTATAGTCTGCTGCTACAG', 'synthetic_seq3': 'TTCGGAGCGTCCACCGCCTGTCCAAATTTCCATTGTAATGTTGTTGTTAAGGTTGGTAATATGTAGCCCCTGGTAGCAAGACTACGCAGTGAAGGTTCGCCCTACGGACTCTGCGACCAAAGTCGCCCGCGCCGCCAATGACCTCTGCGTTGTGCGCGATTGGTTCCGGATCTCGGGAGCTAGGTCCCGCTGGATTTTGTGGGCAAGCCCTCTCTCTCTTACTTCACCGTGATTATTCCTGGAAACCGCATTTCTAGACTGACCAGTTAGCGT', 'synthetic_seq4': 'TAGCCCCAGCGGCTACTCTCCAGTCCGTCCTAACCTGTCCGAATGGAAGCGTCAATGATATGTATCACTTAATAGCAGGTCGGTTGTCCCGCGTGTCTTGGCAACAATCGATTTTGGACCAAGGCTCAAGCCTTGTTGGAAGTTATCTCAGCATAGCATTCACTCTTCCAGGCGACTCACATTCAGCAAAGATACAGCCTACTATTGCAAACTTGAG

In [3]:
def gen_kmers(sequence: str, k: int) -> Iterator[tuple[str, int]]:
    """Yield each k-mer in the sequence along with its starting index.

    Args:
        sequence: String to slide the length-``k`` window over.
        k: Length of each k-mer; must be a positive integer.

    Yields:
        ``(kmer, start)`` tuples in increasing order of ``start``, where
        ``kmer == sequence[start:start + k]``. Nothing is yielded when ``k``
        is larger than ``len(sequence)``.

    Raises:
        ValueError: If ``k`` is zero or negative. Because this is a
            generator, the error surfaces when iteration starts, not when
            the function is called.
    """
    # A non-positive k would otherwise produce empty or wrapped-around slices
    # rather than real k-mers.
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    for start in range(len(sequence) - k + 1):
        yield sequence[start:start + k], start

In [4]:
# Take the first record of the parsed FASTA dictionary (insertion order).
seq_id, seq = next(iter(seqs.items()))
print(f"Length of sequence: {len(seq)}")

# Enumerate every k-mer of that sequence together with its start index.
k = 11
seq_kmers = list(gen_kmers(seq, k))
print(f"Number of {k}-mers: {len(seq_kmers)}")

# Show one k-mer from the list. Position 32 is not the first k-mer, so the
# message is labelled "Example" rather than "First".
# NOTE(review): the names `first_kmer` and `index` are left unchanged in case
# later cells reference them.
kmer_position = 32
first_kmer, index = seq_kmers[kmer_position]
print(f"Example {k}-mer: {first_kmer} (starting at index {index})")

Length of sequence: 228
Number of 11-mers: 218
First 11-mer: GGGCTGGAGAC (starting at index 32)


In [None]:
from collections import defaultdict

def build_lookup(
    sequences: dict[str, str],
    word_size: int
) -> dict[str, list[tuple[str, int]]]:
    """
    Index every k-mer of every sequence by where it occurs.

    Args:
        sequences: Mapping of sequence ID to sequence string.
        word_size: Length of the k-mers to index.

    Returns:
        A plain dictionary mapping each k-mer to a list of
        ``(sequence ID, start index)`` tuples, one per occurrence, in the
        order the occurrences are encountered.
    """
    hits_by_kmer = defaultdict(list)

    for name in sequences:
        occurrences = gen_kmers(sequences[name], word_size)
        for word, start in occurrences:
            hits_by_kmer[word].append((name, start))

    # Convert to a regular dict so that looking up an absent k-mer raises
    # KeyError instead of silently inserting an empty list.
    return dict(hits_by_kmer)

In [25]:
# Index all 8-mers across the parsed sequences.
lookup_table = build_lookup(seqs, word_size=8)

# Preview the first 12 entries of the table (fewer if the table is smaller).
preview = list(lookup_table.items())[:12]
for i, (kmer, locations) in enumerate(preview):
    print(f"{kmer} has {len(locations)} hits")

GCATGTAG has 26 hits
CATGTAGG has 25 hits
ATGTAGGC has 22 hits
TGTAGGCG has 21 hits
GTAGGCGC has 20 hits
TAGGCGCT has 25 hits
AGGCGCTG has 25 hits
GGCGCTGG has 25 hits
GCGCTGGA has 17 hits
CGCTGGAC has 19 hits
GCTGGACT has 27 hits
CTGGACTC has 25 hits
