In [1]:
import csv
import os
import sys
import subprocess
import glob
import json
import pandas
import pickle
import pprint
from pathlib import Path
from collections import defaultdict # I like this collection. I am going to use this much more often.
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Blast import NCBIXML
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastpCommandline



Due to the on going maintenance burden of keeping command line application
wrappers up to date, we have decided to deprecate and eventually remove these
modules.

We instead now recommend building your command line and invoking it directly
with the subprocess module.


In [205]:
RUN_BLAST = False

In [199]:
VERSION='v7'
#### Define Inputs and Paths ####
# get CURRENT working directory
init_cwd = os.getcwd()
# Set working directory
working_dir = '/home/mf019/borrelia_plasmid_classifier_v3'
# path out the databases
databases_dir = f'{working_dir}/dbs'
whole_plasmid_db = f'{databases_dir}/plasmid_db/wp_db'
pf32_db = f'{databases_dir}/pf32_db/pfam32db'
all_pf32_db = f'{databases_dir}/all_pf32_db/all_pf32_db'
# lets specify where parsing tables are to be found!
parsing_tables_dir = f'{working_dir}/parsing_tables'
# set assemblies directory # DO I WANT TO ARGV THIS?
assemblies_dir = f'{working_dir}/assemblies' #f'{working_dir}/assemblies'
shortread_dir  = f'{assemblies_dir}/shortread'
longread_dir = f'{assemblies_dir}/longread'
shortread_contigs  = f'{shortread_dir}/contigs'
longread_contigs = f'{longread_dir}/contigs'
shortread_annotations  = f'{shortread_dir}/annotation'
longread_annotations = f'{longread_dir}/annotation'
output_dir = f'{working_dir}/output'
blast_results_dir = f'{output_dir}/blast_results_{VERSION}'
pf32_blast_results_dir = f'{blast_results_dir}/pf32'
all_pf32_blast_results_dir = f'{blast_results_dir}/all_pf32'
wp_blast_results_dir = f'{blast_results_dir}/whole_plasmid'

In [200]:
# Lets pop this pack of pickled plasmids
parsing_table = pickle.load(open(f'{parsing_tables_dir}/blast_parsing_dict.pkl', 'rb'))

# ok now let's set up the columns for our matrix and drop the synthetic vector and the ultra-rare lp21-cp9 fusion plasmid that can only be validated manually ( for now c; )
plasmids_in_db = {k : v for k, v in parsing_table.items()}
matrix_cols = []
for k in plasmids_in_db:
    matrix_cols.append(plasmids_in_db[k]['name'])
matrix_cols = list(set(matrix_cols))
matrix_cols.sort()
matrix_cols.remove('pBSV2')
#matrix_cols.remove('lp21-cp9')
print(matrix_cols)

['chromosome', 'cp26', 'cp32-1', 'cp32-1+5', 'cp32-10', 'cp32-11', 'cp32-12', 'cp32-2', 'cp32-3', 'cp32-3+10', 'cp32-4', 'cp32-5', 'cp32-5+1', 'cp32-5-1', 'cp32-6', 'cp32-7', 'cp32-8', 'cp32-9', 'cp32-9-4', 'cp9', 'cp9-3', 'lp17', 'lp21', 'lp21-cp9', 'lp25', 'lp28-1', 'lp28-11', 'lp28-2', 'lp28-3', 'lp28-4', 'lp28-5', 'lp28-6', 'lp28-7', 'lp28-8', 'lp28-9', 'lp36', 'lp38', 'lp5', 'lp54', 'lp56']


In [201]:
print('analysis inputs')
print(f'CWD: {init_cwd}')
print(f'Working directory: {working_dir}')
print(f'Databases directory: {databases_dir}')
print(f'Plasmid database: {whole_plasmid_db}')
print(f'PF32 database: {pf32_db}')
print(f'ALL_PF32 database: {all_pf32_db}')
print(f'Parsing tables directory: {parsing_tables_dir}')
print(f'Assemblies directory: {assemblies_dir}')
print(f'Shortread directory: {shortread_dir}')
print(f'Longread directory: {longread_dir}')
print(f'Shortread contigs directory: {shortread_contigs}')
print(f'Shortread annotations directory: {shortread_annotations}')
print(f'Longread contigs directory: {longread_contigs}')
print(f'Longread annotations directory: {longread_annotations}')
print(f'Output directory: {output_dir}')
print(f'Whole contig blast results directory: {wp_blast_results_dir}')
print(f'PF32 Blast results: {all_pf32_blast_results}')
print('analysis begin')


analysis inputs
CWD: /home/mf019/borrelia_plasmid_classifier_v3
Working directory: /home/mf019/borrelia_plasmid_classifier_v3
Databases directory: /home/mf019/borrelia_plasmid_classifier_v3/dbs
Plasmid database: /home/mf019/borrelia_plasmid_classifier_v3/dbs/plasmid_db/wp_db
PF32 database: /home/mf019/borrelia_plasmid_classifier_v3/dbs/pf32_db/pfam32db
ALL_PF32 database: /home/mf019/borrelia_plasmid_classifier_v3/dbs/all_pf32_db/all_pf32_db
Parsing tables directory: /home/mf019/borrelia_plasmid_classifier_v3/parsing_tables
Assemblies directory: /home/mf019/borrelia_plasmid_classifier_v3/assemblies
Shortread directory: /home/mf019/borrelia_plasmid_classifier_v3/assemblies/shortread
Longread directory: /home/mf019/borrelia_plasmid_classifier_v3/assemblies/longread
Shortread contigs directory: /home/mf019/borrelia_plasmid_classifier_v3/assemblies/shortread/contigs
Shortread annotations directory: /home/mf019/borrelia_plasmid_classifier_v3/assemblies/shortread/annotation
Longread contigs d

In [202]:
# Function to determine the best match
def get_best_match(matches, key):
    best_match = None
    best_score = -1
    for match in matches:
        if match[key] > best_score:
            best_score = match[key]
            best_match = match
    return best_match
    
# Function to calculate overall percent identity and coverage using HSPs
def calculate_percent_identity_and_coverage(alignment):
    total_identities = 0
    total_alignment_length = 0
    covered_intervals = []
    query_intervals = []
    subject_hit_coords = []
    
    for hsp in alignment.hsps:
        total_identities += hsp.identities
        total_alignment_length += hsp.align_length

        # Add HSP interval to the hit location list. this is to check the annotation.
        subj_start = hsp.sbjct_start 
        subj_end = hsp.sbjct_end

        # Add 'em to our list of subject hit coords. 
        subject_hit_coords.append((subj_start, subj_end))
        
        # Add abs HSP interval to the covered intervals list, take the min/max to make sure the coordinates are correct! (This is for raw coverage)
        abs_subj_start = min(subj_start, subj_end)
        abs_subj_end = max(subj_start, subj_end)

        # Also add query coordinates so we can see where on the contig we are aligning.
        query_start = hsp.query_start
        query_end = hsp.query_end

        # Add this to the query_intervals list
        query_intervals.append((query_start,query_end))

        # Add abs HSP interval to the covered intervals list
        covered_intervals.append((abs_subj_start, abs_subj_end))

    # Merge overlapping intervals to calculate the covered length
    merged_intervals = merge_intervals(covered_intervals)
    covered_length = sum(end - start for start, end in merged_intervals)

    # Calculate overall percent identity
    overall_percent_identity = (total_identities / total_alignment_length) * 100 if total_alignment_length != 0 else 0

    # Calculate coverage percentage
    reference_length = alignment.length
    coverage_percentage = (covered_length / reference_length) * 100 if reference_length != 0 else 0

    return overall_percent_identity, coverage_percentage, covered_length, covered_intervals, query_intervals, subject_hit_coords

def merge_intervals(intervals):
    # Sort intervals by start position
    sorted_intervals = sorted(intervals, key=lambda x: x[0])
    merged_intervals = []

    for interval in sorted_intervals:
        if not merged_intervals or merged_intervals[-1][1] < interval[0]:
            merged_intervals.append(interval)
        else:
            merged_intervals[-1] = (merged_intervals[-1][0], max(merged_intervals[-1][1], interval[1]))

    return merged_intervals

def add_to_nested_dict(nested_dict, outer_key, inner_key, value):
    if outer_key not in nested_dict:
        nested_dict[outer_key] = {}  # Initialize a dictionary for this outer key
    if inner_key not in nested_dict[outer_key]:
        nested_dict[outer_key][inner_key] = value  # instantiate inner_key = value


In [203]:
# find them genbanks
sr_gbs = glob.glob(f'{shortread_annotations}/*/*.gbff')
print(f'Found {len(sr_gbs)} shortread gbff files')
lr_gbs = glob.glob(f'{longread_annotations}/*/*.gbff')
print(f'Found {len(lr_gbs)} longread gbff files')

Found 49 shortread gbff files
Found 49 longread gbff files


In [204]:
# Initialize the assembly dictionary
assembly_dict = {}
for assembly in sr_gbs:
    isolate_name = str(assembly.split('/')[-1].strip('.gbff'))
    print(isolate_name)
    assembly_dict[isolate_name] = {'shortread': defaultdict(dict), 'longread': defaultdict(dict), 'longread_name': None, 'longread_method' : None}
    for contig in SeqIO.parse(assembly, 'genbank'):
        assembly_dict[isolate_name]['shortread'][contig.name] = defaultdict(dict)
        assembly_dict[isolate_name]['shortread'][contig.name]['seqrecord'] = contig
        assembly_dict[isolate_name]['shortread'][contig.name]['wp_hits'] = []
        assembly_dict[isolate_name]['shortread'][contig.name]['pf32_hits'] = []
        assembly_dict[isolate_name]['shortread'][contig.name]['all_pf32_hits'] = []
        

for assembly in lr_gbs:
    isolate_name = assembly.split('/')[-1].strip('.gbff')
    print(isolate_name)
    sr_name = isolate_name[:-1]
    assembly_dict[sr_name]['longread_name'] = isolate_name
    assembly_dict[sr_name]['longread_method'] = 'pacbio' if isolate_name.endswith('P') else 'hybrid'
    for contig in SeqIO.parse(assembly, 'genbank'):
        assembly_dict[sr_name]['longread'][contig.name] = defaultdict(dict)
        assembly_dict[sr_name]['longread'][contig.name]['seqrecord'] = contig
        assembly_dict[sr_name]['longread'][contig.name]['wp_hits'] = []
        assembly_dict[sr_name]['longread'][contig.name]['pf32_hits'] = []
        assembly_dict[sr_name]['longread'][contig.name]['all_pf32_hits'] = []

print('dictionary construction complete! pickling!')
#pickle.dump(assembly_dict, open(f'{output_dir}/assembly_dict_v4.pkl', 'wb'))
print(f'dictionary pickled to: {output_dir}/assembly_dict_v4.pkl')


URI47
URI103
UCT31
UCT92
URI46
UWI247
UCT30
URI44
URI87
UCT50
UWI283
URI107
UCT110
URI48
UNY203
UWI263
URI117
UCT35
URI111
URI33
URI86
UNY149
UCT96
URI56
URI118
UNY172
URI112
UNY208
URI89
URI102
URI36
UWI248
ESI26
URI40
URI39
UNY169
UCT113
URI101
URI42
UCT109
UCT32
URI41
URI88
URI91
UCT29
URI34
URI120
UNY193
URI93
URI87H
URI34H
URI88H
URI33H
UCT110H
URI39H
URI91H
UCT35H
UWI247P
URI120H
URI107H
UWI263P
URI89H
URI42H
URI44H
UCT109H
URI40H
URI117H
URI47H
URI86H
URI36H
UNY208P
ESI26H
UCT31H
URI56H
UCT30H
URI103H
UCT29H
URI112H
UWI248P
UNY203P
UCT96H
UCT32H
UNY193P
UCT113H
URI93H
UNY169P
UWI283P
URI102H
URI41H
UNY172P
UNY149P
UCT92H
URI118H
UCT50H
URI101H
URI46H
URI48H
URI111H
dictionary construction complete! pickling!
dictionary pickled to: /home/mf019/borrelia_plasmid_classifier_v3/output/assembly_dict_v4.pkl


In [37]:
if RUN_BLAST is True:
    print(str('Start Blastin'))
    for isolate in assembly_dict:
        input_files = [] # empty list to store input files
        lr_data = assembly_dict[isolate]['longread']
        sr_data = assembly_dict[isolate]['shortread']
        lr_id = assembly_dict[isolate]['longread_name']
        #lr_id = isolate # since we're operating on lr first
        input_files.append(f'{longread_annotations}/{lr_id}/{lr_id}.fna') # add lr contigs to list
        if sr_data == 'NOT AVAILABLE':
            print(f'No shortread annotations found for {isolate}')
            sr_id = None
        else:
            sr_id = isolate#[:-1:]
            input_files.append(f'{shortread_annotations}/{sr_id}/{sr_id}.fna')
        print(f'setting up blast commands for {isolate}')
        for file in input_files:
            name = Path(file).stem
            wp_output_file = f'{wp_blast_results_dir}/{name}_whole_plasmid.xml'
            wp_blast_cmd = f'blastn -query {file} -task "blastn" '
            wp_blast_cmd += f'-db {whole_plasmid_db} '
            wp_blast_cmd += f' -out {wp_output_file} '
            wp_blast_cmd += "-evalue 1e-100 -num_threads 29 -outfmt 5 -max_target_seqs 5 -max_hsps 10"
            print(f'Running blast command:\n{wp_blast_cmd}')
            subprocess.run(wp_blast_cmd, shell=True) # run it
            print(f'whole plasmid blast results written to: {wp_output_file}')
            print(f'setting up blast command for pf32 blast')
            pf_output_file = f'{pf32_blast_results_dir}/{name}_pf32.xml'
            pf_blast_cmd = f'blastx -query {file} -task "blastx" '
            pf_blast_cmd += f'-db {pf32_db} '
            pf_blast_cmd += f' -out {pf_output_file} '
            pf_blast_cmd += "-evalue 1e-100 -num_threads 29 -outfmt 5 -max_target_seqs 5 -max_hsps 10"
            subprocess.run(pf_blast_cmd, shell=True, check=False) # run it, but don't check for errors to avoid stopping the script
            print(f'pf32 blast results written to: {pf_output_file}')
            print(f'setting up blast command for ALL pf32 blast')
            all_pf_output_file = f'{all_pf32_blast_results_dir}/{name}_all_pf32.xml'
            pf_blast_cmd = f'blastx -query {file} -task "blastx" '
            pf_blast_cmd += f'-db {all_pf32_db} '
            pf_blast_cmd += f' -out {all_pf_output_file} '
            pf_blast_cmd += "-evalue 1e-100 -num_threads 29 -outfmt 5 -max_target_seqs 5 -max_hsps 10"
            subprocess.run(pf_blast_cmd, shell=True, check=False) # run it, but don't check for errors to avoid stopping the script
            print(f'pf32 blast results written to: {all_pf_output_file}')
    print('saddle up, we are done here!')
else:
    print('Blast already run, skipping blast')

Start Blastin
setting up blast commands for URI47
Running blast command:
blastn -query /home/mf019/borrelia_plasmid_classifier_v3/assemblies/longread/annotation/URI47H/URI47H.fna -task "blastn" -db /home/mf019/borrelia_plasmid_classifier_v3/dbs/plasmid_db/wp_db  -out /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/whole_plasmid/URI47H_whole_plasmid.xml -evalue 1e-100 -num_threads 29 -outfmt 5 -max_target_seqs 5 -max_hsps 10
whole plasmid blast results written to: /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/whole_plasmid/URI47H_whole_plasmid.xml
setting up blast command for pf32 blast
pf32 blast results written to: /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/pf32/URI47H_pf32.xml
setting up blast command for ALL pf32 blast
pf32 blast results written to: /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/all_pf32/URI47H_all_pf32.xml
Running blast command:
blastn -query /home/mf019/borrelia_plasmid_classifier_

In [206]:
pf32_results = glob.glob(f'{pf32_blast_results}/*.xml')
all_pf32_results = glob.glob(f'{all_pf32_blast_results}/*.xml')
wp_results = glob.glob(f'{plasmid_blast_results}/*.xml')
print(f'Found {len(pf32_results)} pf32 blast results')
print(f'Found {len(all_pf32_results)} all_pf32 blast results')
print(f'Found {len(wp_results)} whole plasmid blast results')

Found 98 pf32 blast results
Found 98 all_pf32 blast results
Found 98 whole plasmid blast results


In [207]:
# dir(blast_record)
#['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', 
#'__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'alignments', 
#'application', 'blast_cutoff', 'database', 'database_length', 'database_letters', 'database_name', 'database_sequences', 'date', 'descriptions', 'dropoff_1st_pass', 'effective_database_length', 
#'effective_hsp_length', 'effective_query_length', 'effective_search_space', 'effective_search_space_used', 'expect', 'filter', 'frameshift', 'gap_penalties', 'gap_trigger', 'gap_x_dropoff', 'gap_x_dropoff_final',
#'gapped', 'hsps_gapped', 'hsps_no_gap', 'hsps_prelim_gapped', 'hsps_prelim_gapped_attemped', 'ka_params', 'ka_params_gap', 'matrix', 'multiple_alignment', 'num_good_extends', 'num_hits', 'num_letters_in_database', 
#'num_seqs_better_e', 'num_sequences', 'num_sequences_in_database', 'posted_date', 'query', 'query_id', 'query_length', 'query_letters', 'reference', 'sc_match', 'sc_mismatch', 'threshold', 'version', 'window_size']
####
# dir(alignment)
#['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', 
#'__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'accession', 'hit_def', 'hit_id', 'hsps', 'length', 'title']

# Parse BLAST results and update the assembly dictionary :)
for result in wp_results:
    with open(result, "r") as blast_file:
        assembly_name = Path(result).stem.split('.')[-1].split('_')[0]
        if assembly_name.endswith(('H','P')):
            method = 'longread'
            isolate = assembly_name[:-1:]
        else:
            method = 'shortread'
            isolate = assembly_name

        blast_records = NCBIXML.parse(blast_file)
        for blast_record in blast_records:
            contig = blast_record.query
            query_length = blast_record.query_length
            for isolate_name, data in assembly_dict.items():
                    hits = assembly_dict[isolate][method][contig]['wp_hits']
                    if query_length >= 1000:
                        for alignment in blast_record.alignments:
                            alignment_id = alignment.hit_id.split('|')[1]
                            ref_length = alignment.length
                            plasmid_name = parsing_table.get(alignment_id, {}).get('name', 'uh-oh!')
                            overall_percent_identity, coverage_percentage, covered_length, covered_intervals, query_intervals, subject_hit_coords = calculate_percent_identity_and_coverage(alignment)
                            hits.append({
                                "alignment_id": alignment_id,
                                "plasmid_name": plasmid_name,
                                "query_length": query_length,
                                "ref_length": ref_length,
                                "overall_percent_identity": overall_percent_identity,
                                "coverage_percentage": coverage_percentage,
                                'covered_positions': covered_length,
                                'covered_intervals': covered_intervals,
                                'query_intervals': query_intervals,
                                'subject_hit_coords': subject_hit_coords,
                            })
    print(f"Processed {result}!")

Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/whole_plasmid/URI36_whole_plasmid.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/whole_plasmid/URI91H_whole_plasmid.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/whole_plasmid/URI88H_whole_plasmid.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/whole_plasmid/URI46_whole_plasmid.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/whole_plasmid/UCT31H_whole_plasmid.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/whole_plasmid/UCT32_whole_plasmid.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/whole_plasmid/URI89_whole_plasmid.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/whole_plasmid/UCT109H_whole_plasmid.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_r

In [208]:
def get_hit_id(alignment_id):
    """Extract hit id from hit title"""
    if "|" in alignment_id:
        if 'pdb' in alignment_id: # I do not know why this singular db entry is different. thank you very much blast :)
            #print(alignment_id)
            plasmid_name = alignment_id.split("|")[-1]
            #print(plasmid_name)
        else:
            plasmid_name = alignment_id.split("|")[1]
    else:
        plasmid_name = alignment_id
    return plasmid_name

In [209]:
for result in pf32_results:
    with open(result, "r") as blast_file:
        assembly_name = Path(result).stem.split('.')[-1].split('_')[0]
        if assembly_name.endswith(('H','P')):
            method = 'longread'
            isolate = assembly_name[:-1:]
        else:
            method = 'shortread'
            isolate = assembly_name

        blast_records = NCBIXML.parse(blast_file)
        for blast_record in blast_records:
            contig = blast_record.query
            query_length = blast_record.query_length
            for isolate_name, data in assembly_dict.items():
                    hits = assembly_dict[isolate][method][contig]['pf32_hits']
                    if query_length >= 1000:
                        for alignment in blast_record.alignments:
                            alignment_id = alignment.hit_id
                            plasmid_name = get_hit_id(alignment.hit_id)
                            plasmid_name = plasmid_name.split('_')[-1] if len(plasmid_name.split('_')) > 1 else plasmid_name
                            strain = alignment.hit_id.split('_')[0]
                            ref_pf_length = alignment.length
                            for ncbi_id, data in parsing_table.items():
                                if data['strain'] == strain and data['name'] == plasmid_name:
                                    #print(ncbi_id)
                                    ref_total_length = parsing_table[ncbi_id]['length']
                                else:
                                    ref_total_length = 'NaN'
                                    pass
                            overall_percent_identity, coverage_percentage, covered_length, covered_intervals, query_intervals, subject_hit_coords = calculate_percent_identity_and_coverage(alignment)
                            hits.append({
                                "alignment_id": alignment_id,
                                "plasmid_name": plasmid_name,
                                "query_length": query_length,
                                "ref_length": ref_length,
                                "ref_total_length": ref_total_length,
                                "overall_percent_identity": overall_percent_identity,
                                "coverage_percentage": coverage_percentage,
                                'covered_positions': covered_length,
                                'covered_intervals': covered_intervals,
                                'query_intervals': query_intervals,
                                'subject_hit_coords': subject_hit_coords,
                            })
    print(f"Processed {result}!")

Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/pf32/UCT35H_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/pf32/ESI26_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/pf32/ESI26H_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/pf32/UCT32H_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/pf32/UNY169P_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/pf32/UWI248P_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/pf32/URI34H_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/pf32/URI111_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/pf32/UCT29H_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/pf32/URI33_pf32.xml!
Processed 

In [210]:
for result in all_pf32_results:
    with open(result, "r") as blast_file:
        assembly_name = Path(result).stem.split('.')[-1].split('_')[0]
        if assembly_name.endswith(('H','P')):
            method = 'longread'
            isolate = assembly_name[:-1:]
        else:
            method = 'shortread'
            isolate = assembly_name

        blast_records = NCBIXML.parse(blast_file)
        for blast_record in blast_records:
            contig = blast_record.query
            query_length = blast_record.query_length
            for isolate_name, data in assembly_dict.items():
                    hits = assembly_dict[isolate][method][contig]['all_pf32_hits']
                    if query_length >= 1000:
                        for alignment in blast_record.alignments:
                            alignment_id = alignment.hit_id
                            plasmid_name = get_hit_id(alignment.hit_id)
                            #print(alignment_id, plasmid_name)
                            plasmid_name = plasmid_name.split('_')[-3] if len(plasmid_name.split('_')) > 1 else plasmid_name
                            strain = alignment.hit_id.split('_')[0]
                            ref_pf_length = alignment.length
                            for ncbi_id, data in parsing_table.items():
                                if data['strain'] == strain and data['name'] == plasmid_name:
                                    #print(ncbi_id)
                                    ref_total_length = parsing_table[ncbi_id]['length']
                                else:
                                    ref_total_length = 'NaN'
                                    pass
                            overall_percent_identity, coverage_percentage, covered_length, covered_intervals, query_intervals, subject_hit_coords = calculate_percent_identity_and_coverage(alignment)
                            hits.append({
                                "alignment_id": alignment_id,
                                "plasmid_name": plasmid_name,
                                "query_length": query_length,
                                "ref_length": ref_length,
                                "ref_total_length": ref_total_length,
                                "overall_percent_identity": overall_percent_identity,
                                "coverage_percentage": coverage_percentage,
                                'covered_positions': covered_length,
                                'covered_intervals': covered_intervals,
                                'query_intervals': query_intervals,
                                'subject_hit_coords': subject_hit_coords,
                            })
    print(f"Processed {result}!")

Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/all_pf32/URI86H_all_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/all_pf32/URI101_all_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/all_pf32/URI103H_all_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/all_pf32/UWI263_all_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/all_pf32/UNY193P_all_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/all_pf32/URI44_all_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/all_pf32/UCT32_all_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/all_pf32/URI112_all_pf32.xml!
Processed /home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/all_pf32/UNY169_all_pf32.xml!
Processed /home/mf019/borrelia_plasmi

In [238]:

# Write the best match for each contig to a TSV file
with open(f'best_matches_{VERSION}_1000bp.tsv', 'w', newline='') as tsvfile:
    fieldnames = [
        'name', 'method', 'contig', 'contig_len',
        'wp_alignment_id', 'pf_alignment_id',
        'wp_query_length', 'wp_ref_length', 'pf_ref_length',
        'wp_percent_identity', 'pf_percent_identity', 'wp_coverage_percentage', 'pf_coverage_percentage',
        'wp_plasmid_name', 'pf_plasmid_name',
        'best_method', 'best_hit', 'completeness', 'pf_query_end', 'pf_query_start', 'wp_query_start', 'wp_query_end', 'wp_subj_start', 'wp_subj_end',
        'all_pf_alignment_id', 'all_pf_ref_length', 'all_pf_percent_identity', 'all_pf_coverage_percentage', 'all_pf_plasmid_name',
        'all_pf_query_end', 'all_pf_query_start',
    ]
    writer = csv.DictWriter(tsvfile, fieldnames=fieldnames, delimiter='\t')
    writer.writeheader()

    for isolate_name, data in assembly_dict.items():
        for method in ['shortread', 'longread']:
            if method == 'longread':
                display_name = data['longread_name']
            else:
                display_name = isolate_name
            for contig, homologies in data[method].items():
                wp_best_identity = get_best_match(homologies['wp_hits'], 'overall_percent_identity')
                wp_best_coverage = get_best_match(homologies['wp_hits'], 'coverage_percentage')

                pf32_best_identity = None
                all_pf32_best_identity = None
                if homologies['pf32_hits'] and (not homologies['wp_hits'] or homologies['wp_hits'][0]['query_length'] <= 100000):
                    pf32_best_identity = get_best_match(homologies['pf32_hits'], 'overall_percent_identity')
                    pf32_best_coverage = get_best_match(homologies['pf32_hits'], 'coverage_percentage')
                    all_pf32_best_identity = get_best_match(homologies['all_pf32_hits'], 'overall_percent_identity')
                
                best_hit = None
                best_method = None
                wp_best = None
                wp_alignment_id = ''
                wp_query_length = ''
                wp_ref_length = ''
                wp_subj_start = ''
                wp_subj_end = ''
                wp_coverage_percentage = ''
                wp_percent_identity = ''
                wp_plasmid_name = ''
                pf_best = None
                pf_alignment_id = ''
                pf_percent_identity = ''
                pf_ref_length = ''
                pf_query_start = ''
                pf_query_end = ''
                pf_coverage = ''
                pf_plasmid_name = ''
                all_pf_best = None
                all_pf_alignment_id = ''
                all_pf_percent_identity = ''
                all_pf_ref_length = ''
                all_pf_query_start = ''
                all_pf_query_end = ''
                all_pf_coverage = ''
                all_pf_plasmid_name = ''

                if wp_best_identity and wp_best_coverage and wp_best_coverage['coverage_percentage'] > wp_best_identity['coverage_percentage']:
                    wp_best = wp_best_identity
                else:
                    wp_best = wp_best_coverage

                if wp_best:
                    wp_alignment_id = wp_best['alignment_id']
                    wp_query_length = wp_best['query_length']
                    wp_ref_length = wp_best['ref_length']
                    wp_percent_identity = wp_best['overall_percent_identity']
                    wp_coverage_percentage = wp_best['coverage_percentage']
                    wp_plasmid_name = wp_best['plasmid_name']
                    wp_query_start, wp_query_end = wp_best['query_intervals'][0]
                    wp_subj_start, wp_subj_end = wp_best['subject_hit_coords'][0]
                    

                if pf32_best_identity:
                    pf_alignment_id = pf32_best_identity['alignment_id']
                    pf_plasmid_name = pf32_best_identity['plasmid_name']
                    pf_coverage = pf32_best_identity['coverage_percentage']
                    pf_percent_identity = pf32_best_identity['overall_percent_identity']
                    pf_ref_length = pf32_best_identity['ref_length']
                    pf_contig_start, pf_contig_end = pf32_best_identity['covered_intervals'][0]
                    pf_subject_start, pf_subject_end = pf32_best_identity['subject_hit_coords'][0]
                    pf_query_start, pf_query_end = pf32_best_identity['query_intervals'][0]

                if all_pf32_best_identity:
                    all_pf_alignment_id = all_pf32_best_identity['alignment_id']
                    all_pf_plasmid_name = all_pf32_best_identity['plasmid_name']
                    all_pf_coverage = all_pf32_best_identity['coverage_percentage']
                    all_pf_percent_identity = all_pf32_best_identity['overall_percent_identity']
                    all_pf_ref_length = all_pf32_best_identity['ref_length']
                    all_pf_contig_start, pf_contig_end = all_pf32_best_identity['covered_intervals'][0]
                    all_pf_subject_start, pf_subject_end = all_pf32_best_identity['subject_hit_coords'][0]
                    all_pf_query_start, pf_query_end = all_pf32_best_identity['query_intervals'][0]
                
                if wp_best and pf32_best_identity:
                    #if pf32_best_identity['overall_percent_identity'] > wp_best['overall_percent_identity']:
                    best_method = 'pf32'
                    best_hit = pf_plasmid_name
                    #else:
                    #    best_method = 'wp'
                    #    best_hit = wp_plasmid_name
                else:
                    best_method = 'wp' if wp_best else 'NaN'
                    best_hit = wp_plasmid_name

                completeness = 'presumed' if wp_best and wp_best['coverage_percentage'] >= 96 else 'incomplete'
                contig_len = len(homologies['seqrecord'].seq)
                
                writer.writerow({
                    'name': display_name,
                    'method': method,
                    'contig': contig,
                    'wp_alignment_id': wp_alignment_id,
                    'pf_alignment_id': pf_alignment_id,
                    'all_pf_alignment_id': all_pf_alignment_id,
                    'contig_len': contig_len,
                    'wp_query_length': wp_query_length,
                    'wp_query_start': wp_query_start,
                    'wp_query_end': wp_query_end,
                    'wp_subj_start': wp_subj_start,
                    'wp_subj_end': wp_subj_end,
                    'wp_ref_length': wp_ref_length,
                    'pf_ref_length': pf_ref_length,
                    'all_pf_ref_length': pf_ref_length,
                    'wp_percent_identity': wp_percent_identity,
                    'pf_percent_identity': pf_percent_identity,
                    'all_pf_percent_identity': pf_percent_identity,
                    'wp_coverage_percentage': wp_coverage_percentage,
                    'pf_coverage_percentage': pf_coverage,
                    'all_pf_coverage_percentage': pf_coverage,
                    'wp_plasmid_name': wp_plasmid_name,
                    'pf_plasmid_name': pf_plasmid_name,
                    'all_pf_plasmid_name' : all_pf_plasmid_name,
                    'best_method': best_method,
                    'best_hit': best_hit,
                    'completeness': completeness,
                    'pf_query_start': pf_query_start,
                    'pf_query_end': pf_query_end,
                    'all_pf_query_start': pf_query_start,
                    'all_pf_query_end': pf_query_end,
                })


In [None]:
plasmid_caller_v7_csv = f'best_matches_{VERSION}_1000bp.tsv'
v7_csv_df = pandas.read_csv(plasmid_caller_v7_csv, delimiter='\t')

subset = v7_csv_df[['name','contig','all_pf_plasmid_name', 'pf_plasmid_name', 'wp_plasmid_name', 'best_hit']].fillna('NONE')
filtered_subset = subset[(subset['all_pf_plasmid_name'] != subset['pf_plasmid_name']) | (subset['wp_plasmid_name'] != subset['best_hit'])]
filtered_subset.reset_index(drop=True, inplace=True)

In [257]:
filtered_subset.to_csv('discrepancies_v7_wp_pf_2500bp.csv')