In [4]:
import os
import zipfile
import subprocess
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shutil

from Bio import SeqIO
from collections import Counter

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Archives to unpack, mapped to the directory each one is extracted into.
zip_targets = {
    'TrainFiles.zip': './',
    'TestFiles.zip': './'
}

for zip_path, extract_to in zip_targets.items():
    # Create the output directory if it doesn't exist
    os.makedirs(extract_to, exist_ok=True)

    # Extract zip content; the context manager closes the archive afterwards
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    # Report the destination exactly as configured. The previous message wrapped
    # it in an extra "./" and "/", which printed "././/" for the './' targets.
    print(f"✅ Extracted {zip_path} to {extract_to}")

✅ Extracted TrainFiles.zip to ././/
✅ Extracted TestFiles.zip to ././/


In [2]:
# Download the `muefab/genie:latest` image; the `docker run` commands in the
# decoding cells further down use this same image tag.
!docker pull muefab/genie:latest

latest: Pulling from muefab/genie
4f4fb700ef54: Pulling fs layer
b0907bbb508f: Pulling fs layer
846abc99b13f: Pulling fs layer
4619a895afbd: Pulling fs layer
bb6c686a0e98: Pulling fs layer
4f4fb700ef54: Already exists
4619a895afbd: Already exists
b0907bbb508f: Already exists
846abc99b13f: Already exists
bb6c686a0e98: Download complete
bb6c686a0e98: Pull complete
4619a895afbd: Pull complete
b0907bbb508f: Pull complete
846abc99b13f: Pull complete
4f4fb700ef54: Pull complete
Digest: sha256:c3112a3879cc18061bbab5ed8f76dec255ab1be46e2133cd59320dd5ba98ef89
Status: Downloaded newer image for muefab/genie:latest
docker.io/muefab/genie:latest


In [6]:
# Directory the notebook kernel is running in
notebook_dir = os.getcwd()

# Sample `.mgb` file to inspect.
# NOTE: despite its name, `train_dir` points at the TestFiles folder, and that
# is where the sample file is looked up.
mgb_filename = "ID_ZZWUCJ.mgb"
mgb_filename_no_mgb = os.path.splitext(mgb_filename)[0]
train_dir = os.path.join(notebook_dir, "TestFiles")
mgb_file_path = os.path.join(train_dir, mgb_filename)

# File name of the decoded FASTQ
output_fastq = mgb_filename_no_mgb + ".fastq"

# Docker mount endpoints: local directory holding the `.mgb` file, and the
# directory it is mapped to inside the container
host_dir = train_dir
container_dir = "/data"

# Echo the resolved locations, one per line
print(
    f"📁 Host path to `.mgb`: {mgb_file_path}",
    f"📁 Host directory mounted: {host_dir}",
    f"📦 Container directory will be: {container_dir}",
    f"📄 Output FASTQ: {output_fastq}",
    sep="\n",
)

📁 Host path to `.mgb`: C:\Users\milam\PycharmProjects\PythonProject\TestFiles\ID_ZZWUCJ.mgb
📁 Host directory mounted: C:\Users\milam\PycharmProjects\PythonProject\TestFiles
📦 Container directory will be: /data
📄 Output FASTQ: ID_ZZWUCJ.fastq


In [None]:
def inspect_mgb_structure(host_dir=".", container_dir="/work", mgb_filename=mgb_filename, subdir="TestFiles"):
    """Decode one `.mgb` file to FASTQ with the Genie Docker image and print the tool output.

    The host directory is bind-mounted into the container and the file
    `<host_dir>/<subdir>/<mgb_filename>` is passed to `genie run`; the FASTQ is
    written next to it with the same base name.

    Args:
        host_dir: Host directory to mount. Relative paths are resolved against
            the current working directory.
        container_dir: Mount point inside the container.
        mgb_filename: Name of the `.mgb` file to decode. The default is bound to
            the notebook-level `mgb_filename` at definition time.
        subdir: Folder below `host_dir` that contains the `.mgb` file.

    Returns:
        None. The command, its stdout/stderr and any non-zero exit code are printed.
    """
    # Resolve to an absolute path: a bare relative source such as "." is not
    # reliably accepted by `docker run -v` as a bind mount.
    host_dir = os.path.abspath(host_dir)

    # Derive the output name from the file actually being decoded, so passing a
    # different `mgb_filename` does not write to the default file's FASTQ name.
    fastq_filename = os.path.splitext(mgb_filename)[0] + ".fastq"

    command = [
        "docker", "run", "--rm",
        "-v", f"{host_dir}:{container_dir}",
        "muefab/genie:latest", "run",  # "run" is the Genie subcommand, not docker's
        "-f",  # presumably forces overwriting an existing output — TODO confirm with `genie run --help`
        "-i", f"{container_dir}/{subdir}/{mgb_filename}",
        "-o", f"{container_dir}/{subdir}/{fastq_filename}"
    ]
    print("Running:", " ".join(command))
    result = subprocess.run(command, capture_output=True, text=True)
    print("\n--- STDOUT ---\n")
    print(result.stdout)
    if result.stderr:
        print("\n--- STDERR ---\n")
        print(result.stderr)
    # Make failures explicit instead of leaving them to be inferred from the logs.
    if result.returncode != 0:
        print(f"❌ Genie exited with code {result.returncode}")

inspect_mgb_structure()

In [None]:
# Path of the FASTQ decoded from the sample `.mgb`. `os.path.join` inserts the
# platform's separator; because `train_dir` is already absolute, the leading
# `os.getcwd()` only matters if `train_dir` is ever made relative.
fastq_path = os.path.join(os.getcwd(), train_dir, f"{mgb_filename_no_mgb}.fastq")

# Check if the file exists before parsing
if not os.path.exists(fastq_path):
    print(f"❌ FASTQ file not found at: {fastq_path}")
else:
    total_reads = 0
    read_lengths = []      # one entry per read
    quality_scores = []    # one entry per base, across all reads

    for record in SeqIO.parse(fastq_path, "fastq"):
        total_reads += 1
        read_lengths.append(len(record.seq))
        quality_scores.extend(record.letter_annotations["phred_quality"])

    print(f"🔍 Total reads: {total_reads}")
    # Guard the averages: an empty FASTQ (or reads without any bases) would
    # otherwise raise ZeroDivisionError.
    if read_lengths:
        print(f"📏 Avg read length: {sum(read_lengths)/len(read_lengths):.1f} bp")
    else:
        print("📏 Avg read length: n/a (file contains no reads)")
    if quality_scores:
        print(f"🎯 Avg quality score: {sum(quality_scores)/len(quality_scores):.1f}")
    else:
        print("🎯 Avg quality score: n/a (no quality values found)")


In [None]:
print("🧪 First 3 reads:\n")
# Pairing the parser with range(3) stops after three records without pulling a
# fourth one from the FASTQ parser.
for _, record in zip(range(3), SeqIO.parse(fastq_path, "fastq")):
    seq_preview = record.seq[:50]  # just preview first 50 bp
    quality_preview = record.letter_annotations['phred_quality'][:10]
    print(f"🔹 ID: {record.id}")
    print(f"🔹 SEQ: {seq_preview}...")
    print(f"🔹 QUALITY: {quality_preview}...\n")


In [None]:
notebook_dir = os.getcwd()
container_dir = "/data"

def decode_all_mgb_in_folder(folder_name, verbose=False, skip_existing=False):
    """Decode every `.mgb` file in a folder to FASTQ using the Genie Docker image.

    The folder `<notebook_dir>/<folder_name>` is bind-mounted at `container_dir`
    and `genie run` is invoked once per `.mgb` file; each FASTQ is written into
    the same folder with the same base name.

    Args:
        folder_name: Folder (relative to `notebook_dir`) containing `.mgb` files.
        verbose: If True, print the full stdout/stderr of every run. Off by
            default because the per-file output is large.
        skip_existing: If True, files whose `.fastq` already exists are not
            decoded again. Off by default, so every file is (re)decoded.

    Returns:
        None. Progress, failures and a final failure summary are printed.

    Raises:
        FileNotFoundError: if the folder does not exist (from `os.listdir`).
    """
    host_dir = os.path.join(notebook_dir, folder_name)
    # Sorted so the processing order is the same on every run and platform.
    mgb_filenames = sorted(
        name for name in os.listdir(host_dir) if name.endswith(".mgb")
    )
    failed = []

    for mgb_filename in mgb_filenames:
        mgb_filename_no_ext = os.path.splitext(mgb_filename)[0]
        fastq_filename = f"{mgb_filename_no_ext}.fastq"

        if skip_existing and os.path.exists(os.path.join(host_dir, fastq_filename)):
            print(f"\n⏭️ Skipping (FASTQ already exists): {mgb_filename}")
            continue

        print(f"\n🔄 Decoding: {mgb_filename}")

        command = [
            "docker", "run", "--rm",
            "-v", f"{host_dir}:{container_dir}",
            "muefab/genie:latest", "run",
            "-f",  # presumably forces overwriting an existing output — TODO confirm with `genie run --help`
            "-i", f"{container_dir}/{mgb_filename}",
            "-o", f"{container_dir}/{fastq_filename}"
        ]

        print("Running:", " ".join(command))
        result = subprocess.run(command, capture_output=True, text=True)

        if verbose:
            print("\n--- STDOUT ---\n")
            print(result.stdout)
            if result.stderr:
                print("\n--- STDERR ---\n")
                print(result.stderr)

        # Previously the result was discarded, so a failed decode went unnoticed.
        if result.returncode != 0:
            failed.append(mgb_filename)
            print(f"❌ Decoding failed for {mgb_filename} (exit code {result.returncode})")
            if result.stderr and not verbose:
                # Only the tail of stderr, to keep the cell output small.
                print(result.stderr[-2000:])

    if failed:
        print(f"\n⚠️ {len(failed)} of {len(mgb_filenames)} file(s) failed in {folder_name}: {', '.join(failed)}")

In [None]:
# Decode the training archives first, then the test archives.
for folder_name in ("TrainFiles", "TestFiles"):
    decode_all_mgb_in_folder(folder_name)