<a href="https://colab.research.google.com/github/mervekldrm/bioinformatics/blob/main/hw2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install BioPython on local computer
#!pip3 install biopython


In [2]:
try:
    import google.colab
    # Running on Google Colab, so install Biopython first
    !pip install biopython
except ImportError:
    pass



Collecting biopython
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [3]:
import os
import sys

from urllib.request import urlretrieve

import Bio
from Bio import SeqIO, SearchIO, Entrez
from Bio.Seq import Seq
from Bio import SeqUtils # Import the SeqUtils module
from Bio.Blast import NCBIWWW
from Bio.Data import CodonTable

print("Python version:", sys.version_info)
print("Biopython version:", Bio.__version__)

Python version: sys.version_info(major=3, minor=12, micro=12, releaselevel='final', serial=0)
Biopython version: 1.85


In [19]:
# --- Gerekli kütüphaneler ---
import os
from Bio import Entrez, SeqIO

# --- Çıktı klasörü oluştur ---
outdir = "hw_2"
os.makedirs(outdir, exist_ok=True)

# --- Entrez ayarları ---
Entrez.email = "mervekaldirim@posta.mu.edu.tr"   # kendi mailini yaz


In [20]:
# The line below uses the Entrez.einfo() function to
# ask NCBI what databases are available. The result is
# 'stored' in a variable called 'handle'
handle = Entrez.einfo()

# In the line below, the response from NCBI is read
# into a record, that organises NCBI's response into
# something you can work with.
record = Entrez.read(handle)

In [24]:
# TP53 gene araması
handle = Entrez.esearch(db="gene", term="7157[uid] AND Homo sapiens[orgn]")
record = Entrez.read(handle)
print(record["IdList"])   # ['7157'] görmelisin


['7157']


In [28]:
# Gene özetinden kromozom ve koordinat bilgisi
summ = Entrez.esummary(db="gene", id="7157")
sd = Entrez.read(summ)

# DocumentSummarySet içinden ilk kaydı al
docsum = sd["DocumentSummarySet"]["DocumentSummary"][0]

# GenomicInfo içinden koordinatları çek
gi = docsum["GenomicInfo"][0]
chr_acc = gi["ChrAccVer"]     # NC_000017.11

s = int(gi["ChrStart"])
e = int(gi["ChrStop"])

# efetch için 1-based → küçük olana +1 ekle
seq_start = min(s, e) + 1
seq_stop  = max(s, e)

# gen ters iplikteyse (ChrStart > ChrStop) strand=2
strand = 2 if s > e else 1

print("Chrom:", chr_acc,
      "seq_start:", seq_start,
      "seq_stop:", seq_stop,
      "strand:", strand,
      "len:", seq_stop - seq_start + 1)



Chrom: NC_000017.11 seq_start: 7668421 seq_stop: 7687489 strand: 2 len: 19069


In [29]:
# Genomik DNA FASTA indir
genomic_fasta = os.path.join(outdir, "TP53_genomic_19070bp.fasta")

with Entrez.efetch(db="nuccore", id=chr_acc, rettype="fasta", retmode="text",
                   seq_start=seq_start, seq_stop=seq_stop, strand=strand) as h:
    open(genomic_fasta, "w").write(h.read())

# kontrol
g_rec = next(SeqIO.parse(genomic_fasta, "fasta"))
print("Genomic length:", len(g_rec.seq), "bp →", genomic_fasta)


Genomic length: 19069 bp → hw_2/TP53_genomic_19070bp.fasta


In [37]:
# Ödevde istenen mRNA accession sabit:
MRNA_ACC = "NM_000546.6"


In [38]:
cds_fasta = os.path.join(outdir, "TP53_CDS_1182bp.fasta")

# mRNA'nın GenBank kaydını indir
with Entrez.efetch(db="nucleotide", id=MRNA_ACC, rettype="gb", retmode="text") as h:
    gb_rec = SeqIO.read(h, "genbank")

# CDS feature'ını bul (TP53'te tek ana CDS olmalı)
cds_feat = next(f for f in gb_rec.features if f.type == "CDS")
cds_seq  = cds_feat.extract(gb_rec.seq)

# FASTA'ya yaz
SeqIO.write(
    SeqIO.SeqRecord(cds_seq, id=f"TP53_CDS_from_{MRNA_ACC}", description=""),
    cds_fasta,
    "fasta"
)

# kontrol
c_rec = next(SeqIO.parse(cds_fasta, "fasta"))
print("CDS length:", len(c_rec.seq), "bp →", cds_fasta)


CDS length: 1182 bp → hw_2/TP53_CDS_1182bp.fasta


In [39]:
combined_fasta = os.path.join(outdir, "TP53_combined_genomic_mRNA_CDS.fasta")

records = []
for fp in [genomic_fasta, mrna_fasta, cds_fasta]:
    records.extend(list(SeqIO.parse(fp, "fasta")))

SeqIO.write(records, combined_fasta, "fasta")
print("Combined FASTA yazıldı:", combined_fasta, "| record count:", len(records))


Combined FASTA yazıldı: hw_2/TP53_combined_genomic_mRNA_CDS.fasta | record count: 3


In [40]:
expected = {"Genomic (~19070)": 19070, "mRNA v1": 2512, "CDS": 1182}
obs = {"Genomic (~19070)": len(g_rec.seq), "mRNA v1": len(m_rec.seq), "CDS": len(c_rec.seq)}

for k in expected:
    mark = "OK" if obs[k] == expected[k] else f"⚠ {obs[k]} bp (expected {expected[k]})"
    print(f"{k}: {mark}")


Genomic (~19070): ⚠ 19069 bp (expected 19070)
mRNA v1: OK
CDS: OK


In [42]:
# Her bir FASTA dosyasını ekranda göster (ilk 300 bp örnek)
for f in [genomic_fasta, mrna_fasta, cds_fasta, combined_fasta]:
    print("\n=== ", f, " ===")
    for record in SeqIO.parse(f, "fasta"):
        print(">", record.id)
        print(str(record.seq)[:300] + ("..." if len(record.seq) > 300 else ""))



===  hw_2/TP53_genomic_19070bp.fasta  ===
> NC_000017.11:c7687489-7668421
TCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGGTAAGCTCCTGACTGAACTTGATGAGTCCTCTCTGAGTCACGGGCTCTCGGCTCCGTGTATTTTCAGCTCGGGAAAATCGCTGGGGCTGGGGGTGGGGCAGTGGGGACTTAGCGAGTTTGGGGGTGAGTGGGATGGAAGCTTGGCTAGAGGGATCATCATAGGAGTTGCATTGTTGGGAGACCTGGG...

===  hw_2/TP53_mRNA_variant1_2512bp.fasta  ===
> NM_001276760.3
CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATG...

===  hw_2/TP53_CDS_1182bp.fasta  ===
> TP53_CDS_from_NM_000546.6
ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAA

In [43]:
from google.colab import files

files.download(combined_fasta)   # TP53_combined_genomic_mRNA_CDS.fasta dosyasını indirir


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>