In [1]:
import os
# import vcf
from pathlib import Path
from matplotlib import pyplot as plt
from matplotlib import cm
import pandas as pd
import yaml
import glob
import seaborn as sns
from Bio import SeqIO

# Mitochondria assembly

In [2]:
proj_dir="/master/nplatt/sch_hae_its-nigeria"
results_dir=f"{proj_dir}/results"

In [3]:
Path(f"{results_dir}/mito_assembly").mkdir(parents=True, exist_ok=True)
os.chdir(f"{results_dir}/mito_assembly")

In [4]:
info_df=pd.read_csv(f"{proj_dir}/its-nigeria_samplesheet.csv", sep=",")
info_df

Unnamed: 0,wgs_id,its_id,sra,species,country,locale,patient,miracidum
0,Sb_NG_ak_1.1,Sb.ng.ak.1.1F,,sbovis,nigeria,ak,1,1
1,Sb_NG_ak_2.1,sb_ng_ak_2.1,,sbovis,nigeria,ak,2,1
2,Sb_NG_ak_2.2,Sb.ng.ak.2.2F,,sbovis,nigeria,ak,2,2
3,Sb_NG_ak_2.3,Sb.ng.ak.2.3F,,sbovis,nigeria,ak,2,3
4,Sb_NG_ak_3.1,Sb.ng.ak.3.1R,,sbovis,nigeria,ak,3,1
...,...,...,...,...,...,...,...,...
200,Sh_NG_os_3_1,sh_ng_os_3_1,,shaematobium,nigeria,osun,3,1
201,c_Sh_NG_os_3_11,Sh.ng.os.3.11F,,shaematobium,nigeria,osun,3,11
202,c_Sh_NG_os_3_5,Sh.ng.os.3.5F,,shaematobium,nigeria,osun,3,5
203,c_Sh_NG_os_3_6,Sh.ng.os.3.6F,,shaematobium,nigeria,osun,3,6


In [5]:
info_df["wgs_id"].to_csv("samples.list", sep=",", header=False, index=False)

In [None]:
%%bash

bash /master/nplatt/sch_hae_its-nigeria/code/its-12-mitochondria-assembly.sh

In [11]:
%%bash

cd /master/nplatt/sch_hae_its-nigeria/results/mito_assembly

mkdir -p /master/nplatt/sch_hae_its-nigeria/results/mito_assembly/ragtag/

# Path to the CSV file
CSV_FILE="/master/nplatt/sch_hae_its-nigeria/results/mito_read_count/mito_confidence.csv"

# Loop through each line of the CSV file
while IFS= read -r LINE; do
    # Extract the first and last fields using awk
    SAMPLE_ID=$(echo "$LINE" | awk -F',' '{print $1}')
    REFERENCE=$(echo "$LINE" | awk -F',' '{print $2}')

    OUT_DIR="/master/nplatt/sch_hae_its-nigeria/results/mito_assembly/ragtag/${SAMPLE_ID}"
    LOG="$OUT_DIR/${SAMPLE_ID}.ragtag.log"
    SH_SCRIPT="$OUT_DIR/${SAMPLE_ID}.ragtag.sh"

    mkdir -p $OUT_DIR
    REF_FAS="/master/nplatt/sch_hae_its-nigeria/results/mito_assembly/references/${REFERENCE}.fas"

    # Use ls and select the last file in the list
    INITIAL_ASSEMBLY=$(ls /master/nplatt/sch_hae_its-nigeria/results/mito_assembly/get_organelle/${SAMPLE_ID}/animal_mt.K85.scaffolds.graph1.1.path_sequence.fasta 2>/dev/null | tail -n 1)

    # Check which assembly file to use
    if [[ -n "$INITIAL_ASSEMBLY" ]]; then
        QUERY="$INITIAL_ASSEMBLY"
    else
        echo "$SAMPLE_ID missing $INITIAL_ASSEMBLY" >>missing_assemblies.txt
    fi

    # Make script
    CMD="conda run -n ragtag --live-stream ragtag.py scaffold -w -t 12 --remove-small -f 500 -u -o $OUT_DIR $REF_FAS $QUERY >$LOG 2>&1"
    # Write to the script
    {
        echo '#!/bin/bash'
        echo ""
        echo "$CMD"
    } > "$SH_SCRIPT"
    chmod +x $SH_SCRIPT
    
    echo $SAMPLE_ID
    bash $SH_SCRIPT

done < <(tail -n +2 "$CSV_FILE")


c_Sh_NG_od_4_5
c_Sh_NG_bo_6_1
c_Sh_NG_ed_2_1
c_Sh_NG_kb_2_3
c_Sh_NG_kb_2_6
c_Sh_NG_kw_2_3
c_Sh_NG_kw_2_7
Sh_NG_kn_10_1
Sh_NG_od_5_4
Sh_NG_od_6_2
Sh_NG_od_8_3
Sb_NG_au_2.2
Sb_NG_ak_2.1
Sb_NG_au_1.2
Sb_NG_au_2.11
Sb_NG_au_2.12
Sb_NG_au_2.13
Sb_NG_au_2.14
Sb_NG_au_2.16
Sb_NG_au_2.17
Sb_NG_au_2.3
Sb_NG_au_2.4
Sb_NG_au_2.5
Sb_NG_au_2.6
Sb_NG_au_2.9
Sb_NG_be_1.1
Sb_NG_be_1.10
Sb_NG_be_1.11
Sb_NG_be_1.12
Sb_NG_be_1.3
Sb_NG_be_1.5
Sb_NG_be_1.6
Sb_NG_be_1.9
Sb_NG_be_2.1
Sb_NG_be_3.1
Sb_NG_en_1.1
SRR11907395
SRR11907458
Sb_NG_au_2.10
Sb_NG_au_2.7
c_Sh_NG_kn_4_7
c_Sh_NG_ed_2_3
c_Sh_NG_eb_2_3
c_Sh_NG_kn_4_1
c_Sh_NG_bo_3_1
c_Sh_NG_bo_3_2
c_Sh_NG_bo_5_2
c_Sh_NG_bo_6_2
c_Sh_NG_bo_7_3
c_Sh_NG_eb_6_2
c_Sh_NG_ed_1_1
c_Sh_NG_ed_2_2
c_Sh_NG_ed_3_1
c_Sh_NG_ed_7_1
c_Sh_NG_kb_2_1
c_Sh_NG_kb_2_2
c_Sh_NG_kb_2_9
c_Sh_NG_kn_11_3
c_Sh_NG_kn_14_2
c_Sh_NG_kn_3_2
c_Sh_NG_ks_1_2
c_Sh_NG_ks_1_3
c_Sh_NG_ks_1_7
c_Sh_NG_kw_1_10
c_Sh_NG_kw_1_8
c_Sh_NG_kw_2_2
c_Sh_NG_kw_2_5
c_Sh_NG_kw_2_8
c_Sh_NG_od_10_2
c_Sh_NG_od_4_4
c_S

In [12]:
# Base directory containing assemblies
base_dir = "/master/nplatt/sch_hae_its-nigeria/results/mito_assembly/ragtag"

# Output files
output_fasta = "mito_assemblies.fasta"
output_stats = "scaffold_counts.csv"
missing_files_log = "missing.files"
small_scaffolds_log = "small_scaffold.files"

# Open the output files
with open(output_fasta, "w") as fasta_out, \
     open(output_stats, "w") as stats_out, \
     open(missing_files_log, "w") as missing_out, \
     open(small_scaffolds_log, "w") as small_out:

    # Write the header for the stats file
    stats_out.write("sample_id,scaffold_count\n")
    
    # Iterate through each subdirectory in the assemblies directory
    for sample_id in os.listdir(base_dir):
        sample_path = os.path.join(base_dir, sample_id, "ragtag.scaffold.fasta")
        
        # Check if the fasta file exists
        if not os.path.isfile(sample_path):
            missing_out.write(f"{sample_path}\n")  # Log missing files
            #print(f"Warning: {sample_path} is missing. Logging to {missing_files_log}.")
            continue
        
        # Parse the fasta file
        longest_seq = None
        scaffold_count = 0
        for record in SeqIO.parse(sample_path, "fasta"):
            scaffold_count += 1
            if longest_seq is None or len(record.seq) > len(longest_seq.seq):
                longest_seq = record
        
        # If no sequences are found, skip this sample
        if scaffold_count == 0:
            print(f"Warning: No sequences found in {sample_path}. Skipping...")
            continue
        
        # Check if the longest sequence is smaller than 5kb
        if len(longest_seq.seq) < 5000:
            small_out.write(f"{sample_path}\n")  # Log small scaffold files
            print(f"Warning: Longest sequence in {sample_path} is smaller than 5kb. Logging to {small_scaffolds_log}.")

        # Modify the header of the longest sequence
        longest_seq.id = f"{sample_id}"
        longest_seq.description = ""  # Clear the description to avoid duplication
        
        # Write the longest sequence to the output fasta file
        SeqIO.write(longest_seq, fasta_out, "fasta")
        
        # Write the sample ID and scaffold count to the stats file
        stats_out.write(f"{sample_id},{scaffold_count}\n")
        
        print(f"Processed {sample_id}: {scaffold_count} scaffolds, longest sequence written to {output_fasta}")

print(f"Processing complete. Results written to {output_fasta}, {output_stats}, {missing_files_log}, and {small_scaffolds_log}.")


Processed c_Sh_NG_kn_4_5: 1 scaffolds, longest sequence written to mito_assemblies.fasta
Processed ERR3012901: 1 scaffolds, longest sequence written to mito_assemblies.fasta
Processed SRR11907524: 1 scaffolds, longest sequence written to mito_assemblies.fasta
Processed ERR5919560: 1 scaffolds, longest sequence written to mito_assemblies.fasta
Processed Sh_NG_od_8_3: 1 scaffolds, longest sequence written to mito_assemblies.fasta
Processed Sb_NG_au_2.8: 1 scaffolds, longest sequence written to mito_assemblies.fasta
Processed Sh_NG_kw_1_9: 1 scaffolds, longest sequence written to mito_assemblies.fasta
Processed SRR11907394: 1 scaffolds, longest sequence written to mito_assemblies.fasta
Processed Sb_NG_be_1.3: 1 scaffolds, longest sequence written to mito_assemblies.fasta
Processed Sb_NG_au_2.1: 1 scaffolds, longest sequence written to mito_assemblies.fasta
Processed c_Sh_NG_ed_7_1: 1 scaffolds, longest sequence written to mito_assemblies.fasta
Processed Sh_NG_kn_10_1: 1 scaffolds, longest