In [4]:
!pip install biopython
!pip install pysam

Looking in indexes: https://pypi.python.org/simple, https://cognite.jfrog.io/cognite/api/pypi/snakepit/simple
Looking in indexes: https://pypi.python.org/simple, https://cognite.jfrog.io/cognite/api/pypi/snakepit/simple


In [42]:
import os

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [50]:
def read_fastq(filename):
    """Read a FASTQ file into a list of (header, sequence, quality) tuples.

    Records are assumed to span exactly four lines each (header, sequence,
    "+" separator, quality); multi-line sequences are not supported.
    Blank lines between records are skipped instead of being mistaken for
    the end of the file.

    Args:
        filename: Path of the FASTQ file to read.

    Returns:
        A list of ``(header, sequence, quality)`` tuples of stripped strings,
        in file order. The header keeps its leading "@".
    """
    sequences = []
    with open(filename, 'r') as file:
        while True:
            raw_header = file.readline()
            if not raw_header:
                # readline() returns "" only at the true end of the file.
                break
            header = raw_header.strip()
            if not header:
                # A blank (whitespace-only) line is padding, not EOF.
                continue
            sequence = file.readline().strip()
            file.readline()  # Skip the "+"
            quality = file.readline().strip()
            sequences.append((header, sequence, quality))
    return sequences

In [51]:
def find_overlap(read1, read2, min_overlap=10):
    """Return the length of the longest suffix of read1 that is a prefix of read2.

    Candidate lengths are tried from the longest possible (the shorter read's
    length) down to ``min_overlap``; the first match wins. Returns 0 when no
    candidate of at least ``min_overlap`` characters matches.
    """
    longest_possible = min(len(read1), len(read2))
    candidate_lengths = range(longest_possible, min_overlap - 1, -1)
    return next(
        (size for size in candidate_lengths if read1.endswith(read2[:size])),
        0,
    )

In [52]:
def merge_reads(read1, read2, overlap):
    """Join read1 with the part of read2 that follows its first ``overlap`` items."""
    unshared_tail = read2[overlap:]
    return read1 + unshared_tail

In [53]:
def olc_assembly(r1_reads, r2_reads):
    """Merge each R1 read with the R2 read that overlaps it the most.

    Both arguments are sequences of ``(header, sequence, quality)`` records.
    For every R1 record, all R2 records are scanned with ``find_overlap``
    (default minimum overlap) and the first one with the strictly longest
    overlap is merged onto it. R1 records with no qualifying R2 partner are
    left out of the result.

    NOTE(review): R2 sequences are compared as given; no reverse-complement
    is applied here -- confirm that matches the orientation of the input.

    Returns:
        A list of ``(r1_header, merged_sequence, merged_quality)`` tuples.
    """
    contigs = []
    for forward in r1_reads:
        top_length, top_mate = 0, None

        # Keep the first mate that achieves the strictly largest overlap.
        for mate in r2_reads:
            shared = find_overlap(forward[1], mate[1])
            if shared > top_length:
                top_length, top_mate = shared, mate

        if top_mate is None:
            continue

        joined_sequence = merge_reads(forward[1], top_mate[1], top_length)
        # Quality strings are stitched the same way as the bases.
        joined_quality = forward[2] + top_mate[2][top_length:]
        contigs.append((forward[0], joined_sequence, joined_quality))

    return contigs


In [54]:
# Read paired-end reads from FASTQ files.
# Each list holds one (header, sequence, quality) tuple per record, as returned
# by read_fastq; the paths are relative to the current working directory.
r1_fastq_reads = read_fastq("data/mutant_R1.fastq")
r2_fastq_reads = read_fastq("data/mutant_R2.fastq")

In [55]:
# Perform overlap-layout-consensus assembly
olc_assembled_sequences = olc_assembly(r1_reads=r1_fastq_reads, r2_reads=r2_fastq_reads)
# Preview only the first few records; dumping the whole list floods the
# cell output and bloats the saved notebook.
olc_assembled_sequences[:3]

[('@mutant-no_snps.gff-24960/1',
  'AATGTTGTCACTTGGATTCAAATGACATTTTAAATCTAATTATTCATGAATCGAACTAGTACGAAATGCAATGAGCATCTTGTCTAGTTCGATTTTTTAATGTCTAAAAATGTCGTATATGTAATCAGAGTAGAAAGTGTTGAGGCGTTTCAGAAGTTGTTTAGAAAAGTAAGTAAAATAAAAAATGCACTGAGCAACAAAAGATGTTGCTCGTGCATTTAG',
  '5??A9?BBBDDDBEDDBFF+FGHHIIHHHEIHIIHIIAHDHIIHIG#IIHIFHHHFGIII*IHHHIHFIIHGICIHHIHFFFHHHIIIIIHIHDHIIIAHHH?GHHHHHF@HGGH6GGHEEGBGGGGGFGFE6FGFEFE?GF5EGGEEACFHH*ICHHGCHGHFHIHHHFFFDHHDFFHHHEEFEDGHH+8GFG.EGF*FHEG@*FEE*DBGFF:+GGGGG0'),
 ('@mutant-no_snps.gff-24958/1',
  'CAAAGTCGTTGGTCATATAAAAAACCGCGTACAGTCAACTATAGATACAATCAAGATAAACTCATGCACAGATTGGGAGATATTTTAGTGCAATATGGAATTCAACATGACACAGGGTTATTACCACATGAATGGCATTATCACATTTCGCCAC',
  '?A????@?DDDABDE9FGGGFGICFHIIIBGHIIIGICHHIFH=IHAFIHIHHHHIFCIIEIHAIFGIHIDDIHEIIFIIIIHHHICIIIHIH=@HFHEHBFHIHHHH-HHHFCBHHDGHHEHGEH<GDEEEFHDEGGDGE*FGBGG,GDE?D('),
 ('@mutant-no_snps.gff-24956/1',
  'TATAAATTCAACTTTGCAACAGAACCATCTAATCTTCAACAAACTGGCCCGTTTGTTGAACTACTCTTTAATAAAATAATTTTTCCGTTCCCAATTCCACATTGCAATAATAGAAAATCC

In [66]:
# Write assembled sequences to a FASTA file
# Make sure the output directory exists so open() does not fail on a fresh run.
os.makedirs("results", exist_ok=True)
with open("results/olc_assembled_sequences.fasta", "w") as output_handle:
    for header, sequence, quality in olc_assembled_sequences:
        # The header still carries the FASTQ "@" record marker; drop it so the
        # FASTA ID is not written as ">@...".
        record_id = header[1:] if header.startswith("@") else header
        output_handle.write(f">{record_id}\n{sequence}\n")

In [58]:
def calculate_n50_length(contig_lengths):
    """Compute the N50 of a collection of contig lengths.

    The N50 is the length of the contig at which the running total, taken
    over contigs sorted from longest to shortest, first reaches at least half
    of the summed length of all contigs.

    Args:
        contig_lengths: Iterable of non-negative contig lengths.

    Returns:
        The N50 length, or 0 when ``contig_lengths`` is empty (previously the
        function fell through and returned ``None`` implicitly).
    """
    sorted_lengths = sorted(contig_lengths, reverse=True)
    target_length = sum(sorted_lengths) / 2
    cumulative_length = 0
    for length in sorted_lengths:
        cumulative_length += length
        if cumulative_length >= target_length:
            return length
    # Only reached when there are no contigs at all.
    return 0

In [59]:
def calculate_genome_coverage(assembled_sequences, reference_sequences):
    """Fraction of the reference covered by exactly-matching assembled sequences.

    Each assembled sequence is searched for as an exact substring in the
    reference sequences, in order; its first occurrence in the first reference
    that contains it marks the covered interval. Intervals on the same
    reference are merged before being summed, so duplicate or overlapping
    assembled sequences are not counted twice and the result never exceeds 1.

    Args:
        assembled_sequences: Iterable of assembled sequence strings.
        reference_sequences: Sequence of reference strings.

    Returns:
        Covered length divided by the total reference length, as a float.
        Returns 0.0 when the reference is empty instead of dividing by zero.
    """
    total_reference_length = sum(len(seq) for seq in reference_sequences)
    if total_reference_length == 0:
        return 0.0

    # reference index -> list of half-open (start, end) matched intervals
    intervals_by_reference = {}
    for assembled_seq in assembled_sequences:
        if not assembled_seq:
            continue  # an empty sequence covers nothing
        for ref_index, reference_seq in enumerate(reference_sequences):
            start = reference_seq.find(assembled_seq)
            if start != -1:
                intervals_by_reference.setdefault(ref_index, []).append(
                    (start, start + len(assembled_seq))
                )
                break

    covered_length = 0
    for intervals in intervals_by_reference.values():
        intervals.sort()
        current_start, current_end = intervals[0]
        for start, end in intervals[1:]:
            if start > current_end:
                # Disjoint from the running interval: close it and start anew.
                covered_length += current_end - current_start
                current_start, current_end = start, end
            else:
                current_end = max(current_end, end)
        covered_length += current_end - current_start

    return covered_length / total_reference_length

In [67]:
# Read reference genome
# Parse whole FASTA records: each reference entry must be ONE contiguous
# string. Keeping one string per file line (as before) means any assembled
# sequence longer than a single FASTA line can never be found in it.
reference_sequences_fna = [
    str(record.seq) for record in SeqIO.parse("data/wildtype.fna", "fasta")
]

# Calculate N50 length
old_contig_lengths = [len(seq) for _, seq, _ in olc_assembled_sequences]
n50_length = calculate_n50_length(old_contig_lengths)

# Calculate genome coverage
genome_coverage = calculate_genome_coverage([seq for _, seq, _ in olc_assembled_sequences], reference_sequences_fna)

In [68]:
# Show the coverage fraction returned by calculate_genome_coverage above.
genome_coverage

0.0