In [1]:
!apt install ncbi-blast+ -y
!pip install biopython pandas


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  ncbi-data
The following NEW packages will be installed:
  ncbi-blast+ ncbi-data
0 upgraded, 2 newly installed, 0 to remove and 41 not upgraded.
Need to get 15.8 MB of archives.
After this operation, 71.8 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 ncbi-data all 6.1.20170106+dfsg1-9 [3,519 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 ncbi-blast+ amd64 2.12.0+ds-3build1 [12.3 MB]
Fetched 15.8 MB in 2s (10.1 MB/s)
Selecting previously unselected package ncbi-data.
(Reading database ... 125080 files and directories currently installed.)
Preparing to unpack .../ncbi-data_6.1.20170106+dfsg1-9_all.deb ...
Unpacking ncbi-data (6.1.20170106+dfsg1-9) ...
Selecting previously unselected package ncbi-blast+.
Preparing to unpack .../ncbi-blast+_2.12.0+ds-3build1_amd64.deb .

Similarity search between DNA or protein sequences (treated as character strings)

In [156]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [157]:
# Healthy sequence: a 39-nt toy example used as the baseline for comparison.
healthy_seq = SeqRecord(
    seq=Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"),
    id="Healthy_1",
    description="Healthy gene",
)

In [158]:
# Cancer-like version of the toy sequence: same length as the healthy one,
# with two single-base substitutions (base 13 A->G and base 31 G->A).
cancer_seq = SeqRecord(
    seq=Seq("ATGGCCATTGTAGTGGGCCGCTGAAAGGGTACCCGATAG"),
    id="Cancer_1",
    description="Mutated (Cancer) gene",
)


In [159]:
# Write both toy records into a single FASTA file.
with open("samples.fasta", "w") as fasta_out:
    SeqIO.write([healthy_seq, cancer_seq], fasta_out, "fasta")
print("Sample FASTA file created.")

Sample FASTA file created.


In [160]:
!makeblastdb -in samples.fasta -dbtype nucl -out cancerdb
!blastn -query samples.fasta -db cancerdb -out results.txt -outfmt 7



Building a new DB, current time: 11/02/2025 22:00:14
New DB name:   /content/cancerdb
New DB title:  samples.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /content/cancerdb
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 2 sequences in 0.000589848 seconds.




In [161]:
import pandas as pd

# Read the BLAST report: drop "#" comment lines and blank lines, keep the rest stripped.
with open("results.txt") as report:
    lines = []
    for raw_line in report:
        if raw_line.startswith("#"):
            continue
        stripped = raw_line.strip()
        if stripped:
            lines.append(stripped)

# Each remaining line is one tab-separated hit; only the first five fields are used.
data = [
    {
        "Query": fields[0],
        "Subject": fields[1],
        "Identity(%)": float(fields[2]),
        "Alignment Length": int(fields[3]),
        "Mismatches": int(fields[4]),
    }
    for fields in (hit_line.split("\t") for hit_line in lines)
]

df = pd.DataFrame(data)
df


Unnamed: 0,Query,Subject,Identity(%),Alignment Length,Mismatches
0,Healthy_1,Healthy_1,100.0,39,0
1,Cancer_1,Cancer_1,100.0,39,0


In [162]:
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

# Globally align the healthy and cancer-like sequences and print the first
# alignment returned.
# NOTE(review): Bio.pairwise2 is marked as deprecated in recent Biopython releases
# in favour of Bio.Align.PairwiseAligner — consider migrating.
alignments = pairwise2.align.globalxx(healthy_seq.seq, cancer_seq.seq)
first_alignment = alignments[0]
print(format_alignment(*first_alignment))


ATGGCCATTGTAA-TGGGCCGCTGAAAGGGTG-CCCGATAG
||||||||||| | |||||||||||||||||  ||||||||
ATGGCCATTGT-AGTGGGCCGCTGAAAGGGT-ACCCGATAG
  Score=37



In [164]:
# Position-by-position identity between the healthy and the cancer-like sequence.
# Samples whose identity falls below this fraction are flagged.
SIMILARITY_THRESHOLD = 0.95

reference_bases = str(healthy_seq.seq)
sample_bases = str(cancer_seq.seq)

compared_length = max(len(reference_bases), len(sample_bases))
if compared_length == 0:
    raise ValueError("Cannot compute similarity for two empty sequences.")

# zip() stops at the shorter sequence, so dividing by the longer length counts
# every unpaired trailing base as a mismatch instead of silently ignoring it.
# For equal-length sequences this is identical to dividing by either length.
matches = sum(a == b for a, b in zip(reference_bases, sample_bases))
score = matches / compared_length
print(f"Cancer Validation Similarity: {score*100:.2f}%")

if score < SIMILARITY_THRESHOLD:
    print(" Potential Cancer Mutation Detected.")
else:
    print(" Sequence appears normal.")


Cancer Validation Similarity: 94.87%
 Potential Cancer Mutation Detected.




# A more complex, pipeline-level example




In [13]:
!apt install ncbi-blast+ -y
!pip install biopython pandas


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ncbi-blast+ is already the newest version (2.12.0+ds-3build1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


# Collecting the cancer gene reference dataset from NCBI (BRCA1, TP53, KRAS)

In [165]:
from Bio import Entrez, SeqIO

# Contact address for NCBI requests — placeholder, replace with a real address.
Entrez.email = "example@mail.com"

# RefSeq mRNA accessions of three cancer-related genes: BRCA1, TP53, KRAS
GENE_IDS = ["NM_007294", "NM_000546", "NM_004985"]

# Download each accession as FASTA and parse it into a SeqRecord.
records = []
for gid in GENE_IDS:
    handle = Entrez.efetch(db="nucleotide", id=gid, rettype="fasta", retmode="text")
    try:
        record = SeqIO.read(handle, "fasta")
    finally:
        # Always release the network handle, even if parsing the response fails.
        handle.close()
    records.append(record)
    print(f" Downloaded: {record.id}")

# Store all downloaded records in one FASTA file for the BLAST database step.
SeqIO.write(records, "cancer_reference_genes.fasta", "fasta")
print("\n Reference cancer genes saved to cancer_reference_genes.fasta")


 Downloaded: NM_007294.4
 Downloaded: NM_000546.6
 Downloaded: NM_004985.5

 Reference cancer genes saved to cancer_reference_genes.fasta


In [166]:
# Print details of each downloaded gene
for record in records:
    print(f"ID: {record.id}")
    print(f"Name: {record.name}")
    print(f"Description: {record.description}")
    print(f"Sequence length: {len(record.seq)}")
    print("-" * 20)

ID: NM_007294.4
Name: NM_007294.4
Description: NM_007294.4 Homo sapiens BRCA1 DNA repair associated (BRCA1), transcript variant 1, mRNA
Sequence length: 7088
--------------------
ID: NM_000546.6
Name: NM_000546.6
Description: NM_000546.6 Homo sapiens tumor protein p53 (TP53), transcript variant 1, mRNA
Sequence length: 2512
--------------------
ID: NM_004985.5
Name: NM_004985.5
Description: NM_004985.5 Homo sapiens KRAS proto-oncogene, GTPase (KRAS), transcript variant b, mRNA
Sequence length: 5306
--------------------


## Gene sequence from a (simulated) patient DNA sample
A few small mutations are added to a reference fragment.

In [167]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# Hand-written "reference" fragment that the simulated patient sample is derived from.
# NOTE(review): the labels below describe this as a 120bp BRCA1 fragment, but the
# literal appears to be 126 nt long (verify), and the BLAST table produced later in
# this notebook lists no hit against NM_007294 (BRCA1) — confirm where this
# sequence comes from.
ref_seq = "ATGGGACTGCTTCTTAGTGGCTGGACAGCAGTTTGTGTTTTGAGGTTTTTGTAGAGGATGGGCTTTCAGTTTTAAAGGAACTTCTGGTGAAGAGTGGAAGAGTGAAGAGCTGAGGAAGAGGAAGAG"

# Three simulated mutations, applied in this order. Indices used by later steps
# refer to the already-edited sequence, not to ref_seq.
#   1. substitution: base 30 (index 29, an "A" in ref_seq) becomes "T"
#   2. deletion:     base 60 (index 59) is removed
#   3. insertion:    "CC" is inserted at index 90
mutated = list(ref_seq)
mutated[29:30] = ["T"]        # substitution
mutated.pop(59)               # deletion
mutated[90:90] = ["C", "C"]   # insertion

mutated_seq = "".join(mutated)

# Wrap the mutated sequence in a record and store it as the BLAST query file.
patient_sample = SeqRecord(
    seq=Seq(mutated_seq),
    id="Patient_BRCA1_mut120",
    description="Simulated BRCA1 cancer-like variant (120bp, with 3 mutations)",
)

SeqIO.write([patient_sample], "patient_sample.fasta", "fasta")
print("Mutated cancer-like 120bp BRCA1 FASTA created.")
print("First 80 bp:", mutated_seq[:80])


Mutated cancer-like 120bp BRCA1 FASTA created.
First 80 bp: ATGGGACTGCTTCTTAGTGGCTGGACAGCTGTTTGTGTTTTGAGGTTTTTGTAGAGGATGGCTTTCAGTTTTAAAGGAAC


BLAST database generation from the reference genes

In [168]:
# Build a nucleotide BLAST database named "cancer_ref_db" from the reference gene
# FASTA file written by the download step.
!makeblastdb -in cancer_reference_genes.fasta -dbtype nucl -out cancer_ref_db




Building a new DB, current time: 11/02/2025 22:04:56
New DB name:   /content/cancer_ref_db
New DB title:  cancer_reference_genes.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /content/cancer_ref_db
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 3 sequences in 0.000592947 seconds.




Alignment: comparing the patient DNA against the cancer reference genes

In [169]:
# Search the patient sample against the reference database with the short-query task.
# The report has exactly these six columns, in this order:
#   qseqid sseqid pident length bitscore evalue
# NOTE: the report is written to results.txt, the same file name the toy example
# earlier in the notebook writes to, so that earlier report is overwritten.
!blastn -task blastn-short -query patient_sample.fasta -db cancer_ref_db -out results.txt -outfmt "6 qseqid sseqid pident length bitscore evalue"


Reading and analysing the BLAST output: percent identity, alignment length, bit score and E-value

In [171]:
import pandas as pd

# Column names for the six fields requested from BLAST, in output order.
cols = [
    "Query_ID",
    "Matched_Gene",
    "Identity(%)",
    "Alignment_Length",
    "Bitscore",
    "Evalue",
]

# The report has no header row, so the names are supplied explicitly.
df = pd.read_csv("results.txt", sep="\t", names=cols)

if not df.empty:
    display(df)
else:
    print(" No significant BLAST hits found(short sequence its normal)")


Unnamed: 0,Query_ID,Matched_Gene,Identity(%),Alignment_Length,Bitscore,Evalue
0,Patient_BRCA1_mut120,NM_004985.5,100.0,12,24.3,0.085
1,Patient_BRCA1_mut120,NM_004985.5,100.0,12,24.3,0.085
2,Patient_BRCA1_mut120,NM_004985.5,100.0,9,18.3,5.3
3,Patient_BRCA1_mut120,NM_004985.5,100.0,9,18.3,5.3
4,Patient_BRCA1_mut120,NM_004985.5,100.0,9,18.3,5.3
5,Patient_BRCA1_mut120,NM_004985.5,100.0,9,18.3,5.3
6,Patient_BRCA1_mut120,NM_000546.6,100.0,11,22.3,0.34
7,Patient_BRCA1_mut120,NM_000546.6,100.0,10,20.3,1.3
8,Patient_BRCA1_mut120,NM_000546.6,100.0,10,20.3,1.3
9,Patient_BRCA1_mut120,NM_000546.6,100.0,9,18.3,5.3


In [172]:
def interpret_identity(val, normal_min=99, benign_min=95):
    """Map a BLAST percent-identity value to a coarse interpretation label.

    Args:
        val: Percent identity of one hit (0-100). May be None or NaN when the
            value is missing.
        normal_min: Lowest identity still labelled as a reference match.
        benign_min: Lowest identity still labelled as a mild variant.

    Returns:
        "Normal / Reference match" if val >= normal_min,
        "Mild variant (Benign)" if benign_min <= val < normal_min,
        "Potential cancer mutation" if val < benign_min,
        "Unknown (no identity value)" if val is None or NaN.

    NOTE(review): the label depends on percent identity only. Very short hits
    can reach 100% identity without being meaningful, so consider filtering on
    alignment length / E-value before applying it.
    """
    import math

    # A missing value must not be reported as a mutation: NaN fails every ">="
    # comparison and would otherwise fall through to the last branch.
    if val is None or (isinstance(val, float) and math.isnan(val)):
        return "Unknown (no identity value)"

    if val >= normal_min:
        return "Normal / Reference match"
    if val >= benign_min:
        return "Mild variant (Benign)"
    return "Potential cancer mutation"

# Label every hit by its percent identity and show the annotated table.
# Nothing is added or displayed when there are no hits.
if not df.empty:
    identity_labels = df["Identity(%)"].apply(interpret_identity)
    df["Interpretation"] = identity_labels
    display(df)


Unnamed: 0,Query_ID,Matched_Gene,Identity(%),Alignment_Length,Bitscore,Evalue,Interpretation
0,Patient_BRCA1_mut120,NM_004985.5,100.0,12,24.3,0.085,Normal / Reference match
1,Patient_BRCA1_mut120,NM_004985.5,100.0,12,24.3,0.085,Normal / Reference match
2,Patient_BRCA1_mut120,NM_004985.5,100.0,9,18.3,5.3,Normal / Reference match
3,Patient_BRCA1_mut120,NM_004985.5,100.0,9,18.3,5.3,Normal / Reference match
4,Patient_BRCA1_mut120,NM_004985.5,100.0,9,18.3,5.3,Normal / Reference match
5,Patient_BRCA1_mut120,NM_004985.5,100.0,9,18.3,5.3,Normal / Reference match
6,Patient_BRCA1_mut120,NM_000546.6,100.0,11,22.3,0.34,Normal / Reference match
7,Patient_BRCA1_mut120,NM_000546.6,100.0,10,20.3,1.3,Normal / Reference match
8,Patient_BRCA1_mut120,NM_000546.6,100.0,10,20.3,1.3,Normal / Reference match
9,Patient_BRCA1_mut120,NM_000546.6,100.0,9,18.3,5.3,Normal / Reference match


In [173]:
import json

# Export the hit table as CSV (without the index column) and as a JSON list
# holding one object per row. Skipped entirely when the table is empty.
if not df.empty:
    df.to_csv("Cancer_Validation_Report.csv", index=False)
    with open("Cancer_Validation_Report.json", "w") as json_out:
        report_rows = df.to_dict(orient="records")
        json.dump(report_rows, json_out, indent=4)
    # The status message is Turkish: "kaydedildi" means "saved".
    print("Cancer_Validation_Report.csv / .json kaydedildi.")


Cancer_Validation_Report.csv / .json kaydedildi.
