In [10]:
import csv
import os
import sys
import subprocess
import glob
import json
import pandas
import pickle
import pprint
from pathlib import Path
from collections import defaultdict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Blast import NCBIXML
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastpCommandline
import pysam

In [27]:
def run_command(command):
    """Utility function to run a shell command. Will output stderr if anything but 0 is returned."""
    result = subprocess.run(command, shell=True, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        print(f"Error running command: {command}")
        print(result.stderr)
    return result

def run_minimap2(reference_fasta, query_fasta, output_sam, threads):
    """Run minimap2 against a reference and output to a sam file"""
    # Align the genome assembly against the plasmid database and output the result in SAM format
    command = f"minimap2 -t {threads} -a {reference_fasta} {query_fasta} -o {output_sam}"
    print(command)
    return run_command(command)
    
def convert_sam_to_bam(sam_file, bam_file):
    """Generate and run the command to convert SAM to BAM."""
    command = f"samtools view -Sb {sam_file} > {bam_file}"
    print(command)
    return run_command(command)

def sort_bam(bam_file, sorted_bam_file):
    """Generate and run the command to sort the BAM file."""
    command = f"samtools sort {bam_file} -o {sorted_bam_file}"
    print(command)
    return run_command(command)

def index_bam(sorted_bam_file):
    """Generate and run the command to index the sorted BAM file."""
    command = f"samtools index {sorted_bam_file}"
    print(command)
    return run_command(command)

def update_ids_in_sam(input_samfile_path, output_samfile_path, id_lookup_dict):
    # Open the input SAM file
    with pysam.AlignmentFile(input_samfile_path, "r") as samfile:
        # Open the output SAM file for writing
        with pysam.AlignmentFile(output_samfile_path, "wh", template=samfile) as outfile:
            for read in samfile:
                # Update the query name if it exists in the lookup dictionary
                if read.query_name in id_lookup_dict:
                    plasmid_name = parsing_table.get(read.query_name, {}).get('name', 'uh-oh!')
                    plasmid_id = parsing_table.get(read.query_name, {}).get('ID', 'uh-oh!')
                    plasmid_strain = parsing_table.get(read.query_name, {}).get('strain', 'uh-oh!')
                    read.query_name = f'{plasmid_id}__{plasmid_strain}__{plasmid_name}'

                # Write the modified read to the output SAM file
                outfile.write(read)


In [15]:
# Lets pop this pack of pickled plasmids
parsing_table = pickle.load(open(f'{parsing_tables_dir}/blast_parsing_dict.pkl', 'rb'))

# ok now let's set up the columns for our matrix and drop the synthetic vector and the ultra-rare lp21-cp9 fusion plasmid that can only be validated manually ( for now c; )
plasmids_in_db = {k : v for k, v in parsing_table.items()}
matrix_cols = []
for k in plasmids_in_db:
    matrix_cols.append(plasmids_in_db[k]['name'])
matrix_cols = list(set(matrix_cols))
matrix_cols.sort()
matrix_cols.remove('pBSV2')
#matrix_cols.remove('lp21-cp9')
print(matrix_cols)

['chromosome', 'cp26', 'cp32-1', 'cp32-1+5', 'cp32-10', 'cp32-11', 'cp32-12', 'cp32-2', 'cp32-3', 'cp32-3+10', 'cp32-4', 'cp32-5', 'cp32-5+1', 'cp32-5-1', 'cp32-6', 'cp32-7', 'cp32-8', 'cp32-9', 'cp32-9-4', 'cp9', 'cp9-3', 'lp17', 'lp21', 'lp21-cp9', 'lp25', 'lp28-1', 'lp28-11', 'lp28-2', 'lp28-3', 'lp28-4', 'lp28-5', 'lp28-6', 'lp28-7', 'lp28-8', 'lp28-9', 'lp36', 'lp38', 'lp5', 'lp54', 'lp56']


In [14]:
VERSION='v8'
#### Define Inputs and Paths ####
# get CURRENT working directory
init_cwd = os.getcwd()
# Set working directory
source_dir = '/home/mf019/borrelia_plasmid_classifier_v3'
databases_dir = f'{source_dir}/dbs'
# Use the big ref fasta! with all of the noise!
plasmid_ref_fasta = '/home/mf019/borrelia_plasmid_classifier_v3/dbs/all_plasmids.fasta'
parsing_tables_dir = f'{source_dir}/parsing_tables'
# Set assemblies directory
assemblies_dir = f'{source_dir}/assemblies'
shortread_dir  = f'{assemblies_dir}/shortread'
longread_dir = f'{assemblies_dir}/longread'
# Specify where to find the annotations
shortread_annotations  = f'{shortread_dir}/annotation'
longread_annotations = f'{longread_dir}/annotation'
# okay where are we gonna dump our output?
output_dir = '/home/mf019/plasmid_caller_v4_test'
#sam n bam outputs
sam_out = f'{output_dir}/sam'
bam_out = f'{output_dir}/bam'

In [5]:
# find them ~~genbanks~~ *FASTAS*
sr_fnas = glob.glob(f'{shortread_annotations}/*/*.fna')
print(f'Found {len(sr_fnas)} shortread fna files')
lr_fnas = glob.glob(f'{longread_annotations}/*/*.fna')
print(f'Found {len(lr_fnas)} longread fna files')

Found 49 shortread fna files
Found 49 longread fna files


In [8]:
print(plasmid_ref_fasta)

/home/mf019/borrelia_plasmid_classifier_v3/dbs/all_plasmids.fasta


In [34]:
for assembly_fasta in lr_fnas:
    asm_id = os.path.basename(assembly_fasta).split('.')[0]
    temp_sam_file = f"{sam_out}/longread/temp/wp_ref_vs_{asm_id}_TEMP.sam"
    sam_file = f"{sam_out}/longread/wp_ref_vs_{asm_id}.sam"
    bam_file = f"{bam_out}/longread/wp_ref_vs_{asm_id}.bam"
    # we are aligning the references AGAINST our assembly so the output doesn't look super scuffed.
    # and so that we can look at it on a contig by contig basis to see which classification is the best hit.
    run_minimap2(assembly_fasta, plasmid_ref_fasta, temp_sam_file, 47)
    update_ids_in_sam(temp_sam_file, sam_file, parsing_table)
    convert_sam_to_bam(sam_file, bam_file)

minimap2 -t 47 -a /home/mf019/borrelia_plasmid_classifier_v3/assemblies/longread/annotation/URI87H/URI87H.fna /home/mf019/borrelia_plasmid_classifier_v3/dbs/all_plasmids.fasta -o /home/mf019/plasmid_caller_v4_test/sam/longread/temp/wp_ref_vs_URI87H_TEMP.sam
samtools view -Sb /home/mf019/plasmid_caller_v4_test/sam/longread/wp_ref_vs_URI87H.sam > /home/mf019/plasmid_caller_v4_test/bam/longread/wp_ref_vs_URI87H.bam
minimap2 -t 47 -a /home/mf019/borrelia_plasmid_classifier_v3/assemblies/longread/annotation/URI34H/URI34H.fna /home/mf019/borrelia_plasmid_classifier_v3/dbs/all_plasmids.fasta -o /home/mf019/plasmid_caller_v4_test/sam/longread/temp/wp_ref_vs_URI34H_TEMP.sam
samtools view -Sb /home/mf019/plasmid_caller_v4_test/sam/longread/wp_ref_vs_URI34H.sam > /home/mf019/plasmid_caller_v4_test/bam/longread/wp_ref_vs_URI34H.bam
minimap2 -t 47 -a /home/mf019/borrelia_plasmid_classifier_v3/assemblies/longread/annotation/URI88H/URI88H.fna /home/mf019/borrelia_plasmid_classifier_v3/dbs/all_plasmid

In [35]:
for assembly_fasta in sr_fnas:
    asm_id = os.path.basename(assembly_fasta).split('.')[0]
    temp_sam_file = f"{sam_out}/shortread/temp/wp_ref_vs_{asm_id}_TEMP.sam"
    sam_file = f"{sam_out}/shortread/wp_ref_vs_{asm_id}.sam"
    bam_file = f"{bam_out}/shortread/wp_ref_vs_{asm_id}.bam"
    # we are aligning the references AGAINST our assembly so the output doesn't look super scuffed.
    # and so that we can look at it on a contig by contig basis to see which classification is the best hit.
    run_minimap2(assembly_fasta, plasmid_ref_fasta, temp_sam_file, 47)
    update_ids_in_sam(temp_sam_file, sam_file, parsing_table)
    convert_sam_to_bam(sam_file, bam_file)

minimap2 -t 47 -a /home/mf019/borrelia_plasmid_classifier_v3/assemblies/shortread/annotation/URI47/URI47.fna /home/mf019/borrelia_plasmid_classifier_v3/dbs/all_plasmids.fasta -o /home/mf019/plasmid_caller_v4_test/sam/shortread/temp/wp_ref_vs_URI47_TEMP.sam
samtools view -Sb /home/mf019/plasmid_caller_v4_test/sam/shortread/wp_ref_vs_URI47.sam > /home/mf019/plasmid_caller_v4_test/bam/shortread/wp_ref_vs_URI47.bam
minimap2 -t 47 -a /home/mf019/borrelia_plasmid_classifier_v3/assemblies/shortread/annotation/URI103/URI103.fna /home/mf019/borrelia_plasmid_classifier_v3/dbs/all_plasmids.fasta -o /home/mf019/plasmid_caller_v4_test/sam/shortread/temp/wp_ref_vs_URI103_TEMP.sam
samtools view -Sb /home/mf019/plasmid_caller_v4_test/sam/shortread/wp_ref_vs_URI103.sam > /home/mf019/plasmid_caller_v4_test/bam/shortread/wp_ref_vs_URI103.bam
minimap2 -t 47 -a /home/mf019/borrelia_plasmid_classifier_v3/assemblies/shortread/annotation/UCT31/UCT31.fna /home/mf019/borrelia_plasmid_classifier_v3/dbs/all_plasm