In [None]:
from Bio import SeqIO, pairwise2
from Bio.Seq import Seq
from Bio.pairwise2 import format_alignment
from Bio.SeqUtils import gc_fraction, molecular_weight, GC
from Bio.SeqUtils import MeltingTemp as mt

import matplotlib.pyplot as plt
import numpy as np
import Levenshtein

In [None]:
# ---------- 1. Sequence Manipulation ----------

# Parse each FASTA file into a record object (SeqIO.read handles files
# that contain exactly one record)
seq_x_record = SeqIO.read('./x.fasta', 'fasta')
seq_y_record = SeqIO.read('./y.fasta', 'fasta')

# Keep plain-string copies of the raw sequences; the cleaning step works on str
seq_x_str, seq_y_str = (
    str(record.seq) for record in (seq_x_record, seq_y_record)
)

# Clean sequences by removing ambiguous bases (any character that's not A, T, G, or C)
def clean_sequence(sequence):
    """Return only the unambiguous DNA bases of ``sequence``, in upper case.

    Characters other than A, T, G or C (in either case) are dropped.
    Lowercase bases are kept but upper-cased, because the rest of the
    notebook searches and counts bases with case-sensitive string methods
    (e.g. ``find("CGC")``, ``count("A")``) and would otherwise miss them.

    Args:
        sequence: iterable of single-character strings, typically a ``str``.

    Returns:
        str: the retained bases, upper-cased, in their original order.
    """
    return ''.join(base.upper() for base in sequence if base in 'ATGCatgc')

# Strip every non-A/T/G/C character from the raw strings
seq_x, seq_y = clean_sequence(seq_x_str), clean_sequence(seq_y_str)

# Wrap the cleaned strings as Biopython Seq objects
seq_x_bio, seq_y_bio = Seq(seq_x), Seq(seq_y)

# Show how many characters the cleaning step removed from each sequence
print(f"Original X length: {len(seq_x_str)}, Cleaned X length: {len(seq_x)}")
print(f"Original Y length: {len(seq_y_str)}, Cleaned Y length: {len(seq_y)}")

In [None]:
# Display the index at which the codon (CGC) first appears in Sequence X and Y.
# str.find scans every offset (so the hit is not necessarily in reading frame)
# and returns -1 when the codon is absent.
codon = "CGC"
x_index = seq_x.find(codon)
y_index = seq_y.find(codon)

# Turn the -1 sentinel into an explicit message so it is not mistaken for a
# real position; the output for a found codon is unchanged.
x_report = f"index {x_index}" if x_index != -1 else "not found"
y_report = f"index {y_index}" if y_index != -1 else "not found"

print(f"First occurrence of codon {codon} in Sequence X: {x_report}")
print(f"First occurrence of codon {codon} in Sequence Y: {y_report}")

# Build Sequence Z from four slices: the first 10 and last 5 bases of X,
# followed by the first 20 and last 3 bases of Y
x_first_10, x_last_5 = seq_x[:10], seq_x[-5:]
y_first_20, y_last_3 = seq_y[:20], seq_y[-3:]

# Join the slices in order, then reverse the string
# (a plain reversal, not a reverse complement)
combined = ''.join((x_first_10, x_last_5, y_first_20, y_last_3))
seq_z = ''.join(reversed(combined))
seq_z_bio = Seq(seq_z)

print(f"Sequence Z: {seq_z}")
print(f"Length of Sequence Z: {len(seq_z)}")

In [None]:
# ---------- 2. Sequence Analysis & Plotting ----------

# Sequence C is the same string as Sequence Z
seq_c = seq_z

# Count occurrences of A and C (str.count is case-sensitive)
a_count, c_count = seq_c.count("A"), seq_c.count("C")

print(f"Frequency of A in Sequence C: {a_count}")
print(f"Frequency of C in Sequence C: {c_count}")

# Bar chart comparing the two counts
plt.bar(['A', 'C'], [a_count, c_count])
plt.ylabel('Count')
plt.title('Frequency of A and C in Sequence C')
plt.show()

In [None]:
# GC and AT content of Sequence C, both expressed as percentages.
# gc_fraction returns a 0-1 fraction, so it is scaled by 100 here; it replaces
# the deprecated Bio.SeqUtils.GC helper, which returned a percentage directly.
gc_content = gc_fraction(seq_c) * 100
# seq_c holds only A/T/G/C after cleaning, so everything that is not G/C is A/T
at_content = 100 - gc_content

print(f"GC content: {gc_content:.2f}%")
print(f"AT content: {at_content:.2f}%")

# Same values again, labelled as Sequence Z (Sequence C is Sequence Z)
print(f"\nGC content percentage for Sequence Z: {gc_content:.2f}%")
print(f"AT content percentage for Sequence Z: {at_content:.2f}%")

In [None]:
# Wallace rule: 2*(A+T) + 4*(G+C)
# Counts are case-sensitive, so seq_c is expected to be upper-case.
at = seq_c.count('A') + seq_c.count('T')
gc = seq_c.count('G') + seq_c.count('C')

melting_temp = 2 * at + 4 * gc

# Nearest-neighbor estimate from Biopython's MeltingTemp module, using its
# default parameters; reported alongside the Wallace value for comparison.
nn_tm = mt.Tm_NN(seq_c)

print(f"Melting temperature (Wallace rule): {melting_temp} °C")
print(f"Melting temperature (nearest-neighbor, Tm_NN): {nn_tm:.2f} °C")

In [None]:
# Molecular weight of Sequence C, computed by Biopython with the sequence
# treated as DNA; printed in daltons with two decimal places.
mw = molecular_weight(seq_c, seq_type="DNA")
print(f"Molecular Weight: {mw:.2f} Da\n")

In [None]:
# ---------- 3. Transcription and Translation ----------

seq_c_dna = Seq(seq_c)

# Transcribe the DNA (coding strand) into mRNA
mRNA = seq_c_dna.transcribe()

# Translate whole codons only. Sequence Z is normally 38 bases long, which is
# not a multiple of three; trimming the trailing partial codon explicitly
# gives the same protein without Biopython's "Partial codon" warning.
codon_span = len(seq_c_dna) - len(seq_c_dna) % 3
protein = seq_c_dna[:codon_span].translate()

print(f"\nmRNA sequence from Sequence Z: {mRNA}")
print(f"Amino acid sequence from Sequence Z: {protein}")

In [None]:
# ---------- 4. Alignment & Similarities ----------

# A and B are the cleaned X and Y sequences; C is Sequence Z
seq_a, seq_b, seq_c = seq_x, seq_y, seq_z

# Local alignment A vs B with localxx scoring. For custom scoring use localms
# instead; its arguments after seq_b are match, mismatch, gap-open and
# gap-extend scores.
# NOTE(review): pairwise2 is deprecated in recent Biopython releases in favour
# of Bio.Align.PairwiseAligner - confirm before upgrading.
local_alignments = pairwise2.align.localxx(seq_a, seq_b)

# Take the score of the first alignment, or 0 when none was returned
if local_alignments:
    local_score = local_alignments[0].score
else:
    local_score = 0

print(f"Local alignment score (A vs B): {local_score}")

In [None]:
# Global alignment B vs C with globalxx scoring
global_alignments = pairwise2.align.globalxx(seq_b, seq_c)

# Take the score of the first alignment, or 0 when none was returned
if global_alignments:
    global_score = global_alignments[0].score
else:
    global_score = 0

print(f"Global alignment score (B vs C): {global_score}")

In [None]:
# Hamming distance: number of mismatching positions, defined only when both
# sequences have the same length; otherwise an explanatory message is stored
if len(seq_a) != len(seq_b):
    hamming_distance = "Not available (different lengths)"
else:
    hamming_distance = sum(1 for a, b in zip(seq_a, seq_b) if a != b)
print(f"Hamming Distance (A vs B): {hamming_distance}")

# Levenshtein (edit) distance between A and B; works for unequal lengths
lev_distance = Levenshtein.distance(seq_a, seq_b)
print(f"Levenshtein Distance (A vs B): {lev_distance}")